Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/Kconfig | 1
-rw-r--r--  fs/9p/acl.c | 11
-rw-r--r--  fs/9p/acl.h | 27
-rw-r--r--  fs/9p/cache.c | 141
-rw-r--r--  fs/9p/cache.h | 97
-rw-r--r--  fs/9p/fid.c | 3
-rw-r--r--  fs/9p/v9fs.c | 22
-rw-r--r--  fs/9p/v9fs.h | 17
-rw-r--r--  fs/9p/v9fs_vfs.h | 11
-rw-r--r--  fs/9p/vfs_addr.c | 266
-rw-r--r--  fs/9p/vfs_dentry.c | 4
-rw-r--r--  fs/9p/vfs_dir.c | 6
-rw-r--r--  fs/9p/vfs_file.c | 32
-rw-r--r--  fs/9p/vfs_inode.c | 29
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 11
-rw-r--r--  fs/9p/vfs_super.c | 14
-rw-r--r--  fs/9p/xattr.c | 10
-rw-r--r--  fs/9p/xattr.h | 29
-rw-r--r--  fs/affs/super.c | 2
-rw-r--r--  fs/afs/dir.c | 229
-rw-r--r--  fs/afs/dir_edit.c | 154
-rw-r--r--  fs/afs/file.c | 82
-rw-r--r--  fs/afs/inode.c | 6
-rw-r--r--  fs/afs/internal.h | 49
-rw-r--r--  fs/afs/write.c | 354
-rw-r--r--  fs/afs/yfsclient.c | 32
-rw-r--r--  fs/aio.c | 9
-rw-r--r--  fs/anon_inodes.c | 29
-rw-r--r--  fs/binfmt_elf.c | 37
-rw-r--r--  fs/binfmt_elf_fdpic.c | 2
-rw-r--r--  fs/btrfs/block-group.c | 242
-rw-r--r--  fs/btrfs/block-group.h | 8
-rw-r--r--  fs/btrfs/btrfs_inode.h | 46
-rw-r--r--  fs/btrfs/check-integrity.c | 205
-rw-r--r--  fs/btrfs/compression.c | 682
-rw-r--r--  fs/btrfs/compression.h | 4
-rw-r--r--  fs/btrfs/ctree.c | 157
-rw-r--r--  fs/btrfs/ctree.h | 84
-rw-r--r--  fs/btrfs/delayed-inode.c | 41
-rw-r--r--  fs/btrfs/delayed-ref.c | 17
-rw-r--r--  fs/btrfs/delayed-ref.h | 51
-rw-r--r--  fs/btrfs/dev-replace.c | 19
-rw-r--r--  fs/btrfs/disk-io.c | 53
-rw-r--r--  fs/btrfs/disk-io.h | 5
-rw-r--r--  fs/btrfs/extent-tree.c | 326
-rw-r--r--  fs/btrfs/extent_io.c | 334
-rw-r--r--  fs/btrfs/extent_io.h | 10
-rw-r--r--  fs/btrfs/extent_map.c | 4
-rw-r--r--  fs/btrfs/file-item.c | 21
-rw-r--r--  fs/btrfs/file.c | 177
-rw-r--r--  fs/btrfs/free-space-cache.c | 24
-rw-r--r--  fs/btrfs/inode.c | 620
-rw-r--r--  fs/btrfs/ioctl.c | 1013
-rw-r--r--  fs/btrfs/locking.h | 7
-rw-r--r--  fs/btrfs/lzo.c | 289
-rw-r--r--  fs/btrfs/raid56.c | 175
-rw-r--r--  fs/btrfs/raid56.h | 22
-rw-r--r--  fs/btrfs/reada.c | 26
-rw-r--r--  fs/btrfs/ref-verify.c | 4
-rw-r--r--  fs/btrfs/reflink.c | 4
-rw-r--r--  fs/btrfs/relocation.c | 81
-rw-r--r--  fs/btrfs/root-tree.c | 6
-rw-r--r--  fs/btrfs/scrub.c | 139
-rw-r--r--  fs/btrfs/send.c | 38
-rw-r--r--  fs/btrfs/send.h | 7
-rw-r--r--  fs/btrfs/space-info.c | 28
-rw-r--r--  fs/btrfs/subpage.c | 290
-rw-r--r--  fs/btrfs/subpage.h | 56
-rw-r--r--  fs/btrfs/super.c | 28
-rw-r--r--  fs/btrfs/sysfs.c | 93
-rw-r--r--  fs/btrfs/tests/extent-buffer-tests.c | 2
-rw-r--r--  fs/btrfs/tests/extent-io-tests.c | 12
-rw-r--r--  fs/btrfs/tests/inode-tests.c | 4
-rw-r--r--  fs/btrfs/transaction.c | 11
-rw-r--r--  fs/btrfs/tree-log.c | 745
-rw-r--r--  fs/btrfs/tree-log.h | 18
-rw-r--r--  fs/btrfs/volumes.c | 602
-rw-r--r--  fs/btrfs/volumes.h | 119
-rw-r--r--  fs/btrfs/xattr.c | 2
-rw-r--r--  fs/btrfs/zoned.c | 531
-rw-r--r--  fs/btrfs/zoned.h | 39
-rw-r--r--  fs/buffer.c | 4
-rw-r--r--  fs/cachefiles/io.c | 12
-rw-r--r--  fs/cachefiles/rdwr.c | 16
-rw-r--r--  fs/ceph/addr.c | 80
-rw-r--r--  fs/ceph/file.c | 2
-rw-r--r--  fs/ceph/locks.c | 3
-rw-r--r--  fs/ceph/xattr.c | 3
-rw-r--r--  fs/cifs/cifsfs.c | 1
-rw-r--r--  fs/cifs/cifsglob.h | 3
-rw-r--r--  fs/cifs/connect.c | 26
-rw-r--r--  fs/cifs/file.c | 4
-rw-r--r--  fs/cifs/fs_context.c | 16
-rw-r--r--  fs/cifs/fs_context.h | 2
-rw-r--r--  fs/cifs/misc.c | 2
-rw-r--r--  fs/cifs/smb2maperror.c | 16
-rw-r--r--  fs/cifs/smb2misc.c | 47
-rw-r--r--  fs/cifs/smb2ops.c | 73
-rw-r--r--  fs/cifs/smb2pdu.c | 187
-rw-r--r--  fs/cifs/smb2pdu.h | 919
-rw-r--r--  fs/cifs/smb2proto.h | 2
-rw-r--r--  fs/cifs/smb2transport.c | 36
-rw-r--r--  fs/cifs/trace.h | 71
-rw-r--r--  fs/coda/cnode.c | 13
-rw-r--r--  fs/coda/coda_linux.c | 39
-rw-r--r--  fs/coda/coda_linux.h | 6
-rw-r--r--  fs/coda/dir.c | 20
-rw-r--r--  fs/coda/file.c | 12
-rw-r--r--  fs/coda/psdev.c | 14
-rw-r--r--  fs/coda/upcall.c | 3
-rw-r--r--  fs/coredump.c | 88
-rw-r--r--  fs/cramfs/inode.c | 2
-rw-r--r--  fs/crypto/bio.c | 32
-rw-r--r--  fs/crypto/fname.c | 3
-rw-r--r--  fs/crypto/fscrypt_private.h | 16
-rw-r--r--  fs/crypto/hkdf.c | 11
-rw-r--r--  fs/crypto/keysetup.c | 62
-rw-r--r--  fs/d_path.c | 8
-rw-r--r--  fs/direct-io.c | 16
-rw-r--r--  fs/erofs/Kconfig | 40
-rw-r--r--  fs/erofs/Makefile | 1
-rw-r--r--  fs/erofs/compress.h | 28
-rw-r--r--  fs/erofs/data.c | 75
-rw-r--r--  fs/erofs/decompressor.c | 139
-rw-r--r--  fs/erofs/decompressor_lzma.c | 290
-rw-r--r--  fs/erofs/erofs_fs.h | 73
-rw-r--r--  fs/erofs/inode.c | 2
-rw-r--r--  fs/erofs/internal.h | 105
-rw-r--r--  fs/erofs/pcpubuf.c | 6
-rw-r--r--  fs/erofs/super.c | 231
-rw-r--r--  fs/erofs/utils.c | 19
-rw-r--r--  fs/erofs/xattr.c | 4
-rw-r--r--  fs/erofs/zdata.c | 208
-rw-r--r--  fs/erofs/zdata.h | 8
-rw-r--r--  fs/erofs/zmap.c | 65
-rw-r--r--  fs/erofs/zpvec.h | 13
-rw-r--r--  fs/exec.c | 16
-rw-r--r--  fs/exfat/inode.c | 2
-rw-r--r--  fs/ext4/ext4.h | 3
-rw-r--r--  fs/ext4/extents.c | 175
-rw-r--r--  fs/ext4/fast_commit.c | 11
-rw-r--r--  fs/ext4/file.c | 7
-rw-r--r--  fs/ext4/inode.c | 331
-rw-r--r--  fs/ext4/mballoc.c | 5
-rw-r--r--  fs/ext4/namei.c | 2
-rw-r--r--  fs/ext4/page-io.c | 8
-rw-r--r--  fs/ext4/super.c | 26
-rw-r--r--  fs/f2fs/checkpoint.c | 8
-rw-r--r--  fs/f2fs/compress.c | 21
-rw-r--r--  fs/f2fs/data.c | 95
-rw-r--r--  fs/f2fs/f2fs.h | 54
-rw-r--r--  fs/f2fs/file.c | 8
-rw-r--r--  fs/f2fs/gc.c | 5
-rw-r--r--  fs/f2fs/inline.c | 2
-rw-r--r--  fs/f2fs/inode.c | 4
-rw-r--r--  fs/f2fs/namei.c | 32
-rw-r--r--  fs/f2fs/node.c | 1
-rw-r--r--  fs/f2fs/node.h | 5
-rw-r--r--  fs/f2fs/recovery.c | 14
-rw-r--r--  fs/f2fs/segment.c | 83
-rw-r--r--  fs/f2fs/segment.h | 1
-rw-r--r--  fs/f2fs/super.c | 40
-rw-r--r--  fs/f2fs/sysfs.c | 24
-rw-r--r--  fs/f2fs/verity.c | 2
-rw-r--r--  fs/f2fs/xattr.c | 2
-rw-r--r--  fs/fat/inode.c | 11
-rw-r--r--  fs/fs-writeback.c | 11
-rw-r--r--  fs/fuse/dax.c | 5
-rw-r--r--  fs/fuse/dev.c | 24
-rw-r--r--  fs/fuse/dir.c | 128
-rw-r--r--  fs/fuse/file.c | 110
-rw-r--r--  fs/fuse/fuse_i.h | 17
-rw-r--r--  fs/fuse/inode.c | 45
-rw-r--r--  fs/fuse/ioctl.c | 4
-rw-r--r--  fs/fuse/readdir.c | 6
-rw-r--r--  fs/fuse/virtio_fs.c | 2
-rw-r--r--  fs/fuse/xattr.c | 10
-rw-r--r--  fs/gfs2/bmap.c | 60
-rw-r--r--  fs/gfs2/file.c | 269
-rw-r--r--  fs/gfs2/glock.c | 471
-rw-r--r--  fs/gfs2/glock.h | 34
-rw-r--r--  fs/gfs2/glops.c | 29
-rw-r--r--  fs/gfs2/incore.h | 10
-rw-r--r--  fs/gfs2/inode.c | 12
-rw-r--r--  fs/gfs2/rgrp.c | 70
-rw-r--r--  fs/gfs2/rgrp.h | 2
-rw-r--r--  fs/gfs2/super.c | 4
-rw-r--r--  fs/gfs2/trace_gfs2.h | 9
-rw-r--r--  fs/gfs2/util.c | 2
-rw-r--r--  fs/hfs/inode.c | 6
-rw-r--r--  fs/hfs/mdb.c | 2
-rw-r--r--  fs/hfsplus/inode.c | 12
-rw-r--r--  fs/hfsplus/wrapper.c | 2
-rw-r--r--  fs/hpfs/hpfs.h | 8
-rw-r--r--  fs/hugetlbfs/inode.c | 23
-rw-r--r--  fs/inode.c | 53
-rw-r--r--  fs/internal.h | 12
-rw-r--r--  fs/io-wq.c | 78
-rw-r--r--  fs/io-wq.h | 59
-rw-r--r--  fs/io_uring.c | 1816
-rw-r--r--  fs/iomap/buffered-io.c | 2
-rw-r--r--  fs/iomap/direct-io.c | 88
-rw-r--r--  fs/isofs/inode.c | 2
-rw-r--r--  fs/jfs/jfs_metapage.c | 1
-rw-r--r--  fs/jfs/jfs_mount.c | 51
-rw-r--r--  fs/jfs/resize.c | 5
-rw-r--r--  fs/jfs/super.c | 5
-rw-r--r--  fs/kernfs/symlink.c | 3
-rw-r--r--  fs/libfs.c | 29
-rw-r--r--  fs/lockd/clntproc.c | 3
-rw-r--r--  fs/lockd/svc.c | 6
-rw-r--r--  fs/lockd/svc4proc.c | 2
-rw-r--r--  fs/lockd/svcproc.c | 2
-rw-r--r--  fs/lockd/xdr.c | 152
-rw-r--r--  fs/lockd/xdr4.c | 153
-rw-r--r--  fs/locks.c | 161
-rw-r--r--  fs/namei.c | 4
-rw-r--r--  fs/netfs/read_helper.c | 165
-rw-r--r--  fs/nfs/blocklayout/dev.c | 4
-rw-r--r--  fs/nfs/callback_proc.c | 3
-rw-r--r--  fs/nfs/callback_xdr.c | 4
-rw-r--r--  fs/nfs/client.c | 39
-rw-r--r--  fs/nfs/delegation.c | 10
-rw-r--r--  fs/nfs/dir.c | 119
-rw-r--r--  fs/nfs/direct.c | 4
-rw-r--r--  fs/nfs/export.c | 44
-rw-r--r--  fs/nfs/file.c | 9
-rw-r--r--  fs/nfs/filelayout/filelayout.c | 2
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c | 2
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayoutdev.c | 4
-rw-r--r--  fs/nfs/getroot.c | 21
-rw-r--r--  fs/nfs/inode.c | 130
-rw-r--r--  fs/nfs/internal.h | 12
-rw-r--r--  fs/nfs/namespace.c | 3
-rw-r--r--  fs/nfs/nfs3proc.c | 10
-rw-r--r--  fs/nfs/nfs3xdr.c | 2
-rw-r--r--  fs/nfs/nfs42proc.c | 9
-rw-r--r--  fs/nfs/nfs4_fs.h | 4
-rw-r--r--  fs/nfs/nfs4client.c | 65
-rw-r--r--  fs/nfs/nfs4file.c | 14
-rw-r--r--  fs/nfs/nfs4idmap.c | 2
-rw-r--r--  fs/nfs/nfs4proc.c | 292
-rw-r--r--  fs/nfs/nfs4session.c | 12
-rw-r--r--  fs/nfs/nfs4session.h | 1
-rw-r--r--  fs/nfs/nfs4state.c | 5
-rw-r--r--  fs/nfs/nfs4trace.h | 920
-rw-r--r--  fs/nfs/nfs4xdr.c | 81
-rw-r--r--  fs/nfs/nfstrace.h | 467
-rw-r--r--  fs/nfs/pagelist.c | 13
-rw-r--r--  fs/nfs/pnfs.h | 6
-rw-r--r--  fs/nfs/pnfs_nfs.c | 6
-rw-r--r--  fs/nfs/proc.c | 16
-rw-r--r--  fs/nfs/read.c | 11
-rw-r--r--  fs/nfs/super.c | 7
-rw-r--r--  fs/nfs/write.c | 73
-rw-r--r--  fs/nfsd/Kconfig | 1
-rw-r--r--  fs/nfsd/blocklayout.c | 158
-rw-r--r--  fs/nfsd/filecache.c | 3
-rw-r--r--  fs/nfsd/flexfilelayout.c | 2
-rw-r--r--  fs/nfsd/lockd.c | 2
-rw-r--r--  fs/nfsd/nfs2acl.c | 44
-rw-r--r--  fs/nfsd/nfs3acl.c | 48
-rw-r--r--  fs/nfsd/nfs3proc.c | 3
-rw-r--r--  fs/nfsd/nfs3xdr.c | 387
-rw-r--r--  fs/nfsd/nfs4callback.c | 2
-rw-r--r--  fs/nfsd/nfs4layouts.c | 5
-rw-r--r--  fs/nfsd/nfs4proc.c | 11
-rw-r--r--  fs/nfsd/nfs4state.c | 6
-rw-r--r--  fs/nfsd/nfs4xdr.c | 52
-rw-r--r--  fs/nfsd/nfscache.c | 17
-rw-r--r--  fs/nfsd/nfsctl.c | 6
-rw-r--r--  fs/nfsd/nfsd.h | 6
-rw-r--r--  fs/nfsd/nfsfh.c | 173
-rw-r--r--  fs/nfsd/nfsfh.h | 55
-rw-r--r--  fs/nfsd/nfsproc.c | 3
-rw-r--r--  fs/nfsd/nfssvc.c | 28
-rw-r--r--  fs/nfsd/nfsxdr.c | 187
-rw-r--r--  fs/nfsd/trace.h | 1
-rw-r--r--  fs/nfsd/vfs.c | 7
-rw-r--r--  fs/nfsd/xdr.h | 37
-rw-r--r--  fs/nfsd/xdr3.h | 63
-rw-r--r--  fs/nfsd/xdr4.h | 7
-rw-r--r--  fs/nilfs2/alloc.c | 2
-rw-r--r--  fs/nilfs2/alloc.h | 2
-rw-r--r--  fs/nilfs2/bmap.c | 2
-rw-r--r--  fs/nilfs2/bmap.h | 2
-rw-r--r--  fs/nilfs2/btnode.c | 2
-rw-r--r--  fs/nilfs2/btnode.h | 2
-rw-r--r--  fs/nilfs2/btree.c | 2
-rw-r--r--  fs/nilfs2/btree.h | 2
-rw-r--r--  fs/nilfs2/cpfile.c | 2
-rw-r--r--  fs/nilfs2/cpfile.h | 2
-rw-r--r--  fs/nilfs2/dat.c | 2
-rw-r--r--  fs/nilfs2/dat.h | 2
-rw-r--r--  fs/nilfs2/dir.c | 2
-rw-r--r--  fs/nilfs2/direct.c | 2
-rw-r--r--  fs/nilfs2/direct.h | 2
-rw-r--r--  fs/nilfs2/file.c | 2
-rw-r--r--  fs/nilfs2/gcinode.c | 2
-rw-r--r--  fs/nilfs2/ifile.c | 2
-rw-r--r--  fs/nilfs2/ifile.h | 2
-rw-r--r--  fs/nilfs2/inode.c | 2
-rw-r--r--  fs/nilfs2/ioctl.c | 4
-rw-r--r--  fs/nilfs2/mdt.c | 2
-rw-r--r--  fs/nilfs2/mdt.h | 2
-rw-r--r--  fs/nilfs2/namei.c | 2
-rw-r--r--  fs/nilfs2/nilfs.h | 2
-rw-r--r--  fs/nilfs2/page.c | 2
-rw-r--r--  fs/nilfs2/page.h | 2
-rw-r--r--  fs/nilfs2/recovery.c | 2
-rw-r--r--  fs/nilfs2/segbuf.c | 2
-rw-r--r--  fs/nilfs2/segbuf.h | 2
-rw-r--r--  fs/nilfs2/segment.c | 2
-rw-r--r--  fs/nilfs2/segment.h | 2
-rw-r--r--  fs/nilfs2/sufile.c | 2
-rw-r--r--  fs/nilfs2/sufile.h | 2
-rw-r--r--  fs/nilfs2/super.c | 4
-rw-r--r--  fs/nilfs2/sysfs.c | 78
-rw-r--r--  fs/nilfs2/sysfs.h | 2
-rw-r--r--  fs/nilfs2/the_nilfs.c | 4
-rw-r--r--  fs/nilfs2/the_nilfs.h | 2
-rw-r--r--  fs/notify/fanotify/fanotify.c | 117
-rw-r--r--  fs/notify/fanotify/fanotify.h | 54
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 157
-rw-r--r--  fs/notify/fsnotify.c | 10
-rw-r--r--  fs/notify/group.c | 2
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 5
-rw-r--r--  fs/notify/inotify/inotify_user.c | 6
-rw-r--r--  fs/notify/notification.c | 14
-rw-r--r--  fs/ntfs/file.c | 3
-rw-r--r--  fs/ntfs/super.c | 8
-rw-r--r--  fs/ntfs3/file.c | 3
-rw-r--r--  fs/ntfs3/inode.c | 2
-rw-r--r--  fs/ntfs3/super.c | 2
-rw-r--r--  fs/ocfs2/alloc.c | 21
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 1
-rw-r--r--  fs/ocfs2/file.c | 8
-rw-r--r--  fs/ocfs2/inode.c | 4
-rw-r--r--  fs/ocfs2/journal.c | 31
-rw-r--r--  fs/ocfs2/journal.h | 3
-rw-r--r--  fs/ocfs2/super.c | 40
-rw-r--r--  fs/open.c | 18
-rw-r--r--  fs/orangefs/dcache.c | 4
-rw-r--r--  fs/orangefs/inode.c | 2
-rw-r--r--  fs/orangefs/super.c | 5
-rw-r--r--  fs/overlayfs/copy_up.c | 23
-rw-r--r--  fs/overlayfs/dir.c | 3
-rw-r--r--  fs/overlayfs/file.c | 20
-rw-r--r--  fs/overlayfs/inode.c | 5
-rw-r--r--  fs/overlayfs/overlayfs.h | 1
-rw-r--r--  fs/overlayfs/super.c | 12
-rw-r--r--  fs/posix_acl.c | 3
-rw-r--r--  fs/proc/array.c | 13
-rw-r--r--  fs/proc/base.c | 40
-rw-r--r--  fs/proc/stat.c | 4
-rw-r--r--  fs/proc/task_mmu.c | 28
-rw-r--r--  fs/proc/uptime.c | 14
-rw-r--r--  fs/proc/vmcore.c | 109
-rw-r--r--  fs/pstore/blk.c | 8
-rw-r--r--  fs/quota/quota.c | 1
-rw-r--r--  fs/quota/quota_tree.c | 15
-rw-r--r--  fs/ramfs/inode.c | 12
-rw-r--r--  fs/read_write.c | 4
-rw-r--r--  fs/reiserfs/super.c | 14
-rw-r--r--  fs/seq_file.c | 16
-rw-r--r--  fs/smbfs_common/smb2pdu.h | 989
-rw-r--r--  fs/squashfs/super.c | 5
-rw-r--r--  fs/super.c | 3
-rw-r--r--  fs/sync.c | 62
-rw-r--r--  fs/sysfs/dir.c | 3
-rw-r--r--  fs/sysfs/file.c | 140
-rw-r--r--  fs/sysfs/group.c | 15
-rw-r--r--  fs/sysfs/sysfs.h | 8
-rw-r--r--  fs/sysv/super.c | 6
-rw-r--r--  fs/tracefs/inode.c | 3
-rw-r--r--  fs/ubifs/crypto.c | 1
-rw-r--r--  fs/udf/lowlevel.c | 5
-rw-r--r--  fs/udf/super.c | 9
-rw-r--r--  fs/xfs/kmem.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_ag.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_ag.h | 36
-rw-r--r--  fs/xfs/libxfs/xfs_ag_resv.c | 3
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 120
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h | 38
-rw-r--r--  fs/xfs/libxfs/xfs_alloc_btree.c | 63
-rw-r--r--  fs/xfs/libxfs/xfs_alloc_btree.h | 5
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 101
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 35
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.c | 62
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.h | 5
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c | 333
-rw-r--r--  fs/xfs/libxfs/xfs_btree.h | 99
-rw-r--r--  fs/xfs/libxfs/xfs_btree_staging.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.h | 3
-rw-r--r--  fs/xfs/libxfs/xfs_defer.c | 241
-rw-r--r--  fs/xfs/libxfs/xfs_defer.h | 41
-rw-r--r--  fs/xfs/libxfs/xfs_dquot_buf.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_format.h | 12
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 5
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.c | 90
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.h | 5
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.c | 24
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.c | 46
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.h | 7
-rw-r--r--  fs/xfs/libxfs/xfs_refcount_btree.c | 65
-rw-r--r--  fs/xfs/libxfs/xfs_refcount_btree.h | 5
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.c | 21
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.h | 7
-rw-r--r--  fs/xfs/libxfs/xfs_rmap_btree.c | 116
-rw-r--r--  fs/xfs/libxfs/xfs_rmap_btree.h | 5
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_trans_resv.c | 18
-rw-r--r--  fs/xfs/libxfs/xfs_trans_space.h | 9
-rw-r--r--  fs/xfs/scrub/agheader.c | 13
-rw-r--r--  fs/xfs/scrub/agheader_repair.c | 8
-rw-r--r--  fs/xfs/scrub/bitmap.c | 22
-rw-r--r--  fs/xfs/scrub/bmap.c | 2
-rw-r--r--  fs/xfs/scrub/btree.c | 121
-rw-r--r--  fs/xfs/scrub/btree.h | 17
-rw-r--r--  fs/xfs/scrub/dabtree.c | 62
-rw-r--r--  fs/xfs/scrub/repair.h | 3
-rw-r--r--  fs/xfs/scrub/scrub.c | 64
-rw-r--r--  fs/xfs/scrub/trace.c | 11
-rw-r--r--  fs/xfs/scrub/trace.h | 10
-rw-r--r--  fs/xfs/xfs_aops.c | 15
-rw-r--r--  fs/xfs/xfs_attr_inactive.c | 2
-rw-r--r--  fs/xfs/xfs_bmap_item.c | 18
-rw-r--r--  fs/xfs/xfs_bmap_item.h | 6
-rw-r--r--  fs/xfs/xfs_buf.c | 14
-rw-r--r--  fs/xfs/xfs_buf_item.c | 8
-rw-r--r--  fs/xfs/xfs_buf_item.h | 2
-rw-r--r--  fs/xfs/xfs_buf_item_recover.c | 2
-rw-r--r--  fs/xfs/xfs_dquot.c | 28
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 33
-rw-r--r--  fs/xfs/xfs_extfree_item.h | 6
-rw-r--r--  fs/xfs/xfs_file.c | 8
-rw-r--r--  fs/xfs/xfs_icache.c | 10
-rw-r--r--  fs/xfs/xfs_icreate_item.c | 6
-rw-r--r--  fs/xfs/xfs_icreate_item.h | 2
-rw-r--r--  fs/xfs/xfs_inode.c | 12
-rw-r--r--  fs/xfs/xfs_inode.h | 2
-rw-r--r--  fs/xfs/xfs_inode_item.c | 6
-rw-r--r--  fs/xfs/xfs_inode_item.h | 2
-rw-r--r--  fs/xfs/xfs_ioctl.c | 6
-rw-r--r--  fs/xfs/xfs_log.c | 6
-rw-r--r--  fs/xfs/xfs_log_priv.h | 2
-rw-r--r--  fs/xfs/xfs_log_recover.c | 12
-rw-r--r--  fs/xfs/xfs_mount.c | 14
-rw-r--r--  fs/xfs/xfs_mount.h | 5
-rw-r--r--  fs/xfs/xfs_mru_cache.c | 2
-rw-r--r--  fs/xfs/xfs_qm.c | 2
-rw-r--r--  fs/xfs/xfs_qm.h | 2
-rw-r--r--  fs/xfs/xfs_refcount_item.c | 18
-rw-r--r--  fs/xfs/xfs_refcount_item.h | 6
-rw-r--r--  fs/xfs/xfs_reflink.c | 2
-rw-r--r--  fs/xfs/xfs_rmap_item.c | 18
-rw-r--r--  fs/xfs/xfs_rmap_item.h | 6
-rw-r--r--  fs/xfs/xfs_super.c | 233
-rw-r--r--  fs/xfs/xfs_sysfs.c | 24
-rw-r--r--  fs/xfs/xfs_trace.h | 2
-rw-r--r--  fs/xfs/xfs_trans.c | 16
-rw-r--r--  fs/xfs/xfs_trans.h | 8
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 4
-rw-r--r--  fs/zonefs/super.c | 6
469 files changed, 15053 insertions, 11181 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 09fd4a185fd2..d7bc93447c85 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -2,6 +2,7 @@
config 9P_FS
tristate "Plan 9 Resource Sharing Support (9P2000)"
depends on INET && NET_9P
+ select NETFS_SUPPORT
help
If you say Y here, you will get experimental support for
Plan 9 resource sharing via the 9P2000 protocol.
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index c381499f5416..4dac4a0dc5f4 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* Copyright IBM Corporation, 2010
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
*/
#include <linux/module.h>
@@ -123,6 +115,7 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl)
char *name;
size_t size;
void *buffer;
+
if (!acl)
return 0;
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index d43c8949e807..ce5175d463dd 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -1,28 +1,21 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* Copyright IBM Corporation, 2010
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
*/
#ifndef FS_9P_ACL_H
#define FS_9P_ACL_H
#ifdef CONFIG_9P_FS_POSIX_ACL
-extern int v9fs_get_acl(struct inode *, struct p9_fid *);
-extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu);
-extern int v9fs_acl_chmod(struct inode *, struct p9_fid *);
-extern int v9fs_set_create_acl(struct inode *, struct p9_fid *,
- struct posix_acl *, struct posix_acl *);
-extern int v9fs_acl_mode(struct inode *dir, umode_t *modep,
- struct posix_acl **dpacl, struct posix_acl **pacl);
-extern void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl);
+int v9fs_get_acl(struct inode *inode, struct p9_fid *fid);
+struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type,
+ bool rcu);
+int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid);
+int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid,
+ struct posix_acl *dacl, struct posix_acl *acl);
+int v9fs_acl_mode(struct inode *dir, umode_t *modep,
+ struct posix_acl **dpacl, struct posix_acl **pacl);
+void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl);
#else
#define v9fs_iop_get_acl NULL
static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 1769a44f4819..f2ba131cede1 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -19,8 +19,8 @@
#define CACHETAG_LEN 11
struct fscache_netfs v9fs_cache_netfs = {
- .name = "9p",
- .version = 0,
+ .name = "9p",
+ .version = 0,
};
/*
@@ -199,140 +199,3 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
mutex_unlock(&v9inode->fscache_lock);
}
-
-int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
-{
- struct inode *inode = page->mapping->host;
- struct v9fs_inode *v9inode = V9FS_I(inode);
-
- BUG_ON(!v9inode->fscache);
-
- return fscache_maybe_release_page(v9inode->fscache, page, gfp);
-}
-
-void __v9fs_fscache_invalidate_page(struct page *page)
-{
- struct inode *inode = page->mapping->host;
- struct v9fs_inode *v9inode = V9FS_I(inode);
-
- BUG_ON(!v9inode->fscache);
-
- if (PageFsCache(page)) {
- fscache_wait_on_page_write(v9inode->fscache, page);
- BUG_ON(!PageLocked(page));
- fscache_uncache_page(v9inode->fscache, page);
- }
-}
-
-static void v9fs_vfs_readpage_complete(struct page *page, void *data,
- int error)
-{
- if (!error)
- SetPageUptodate(page);
-
- unlock_page(page);
-}
-
-/*
- * __v9fs_readpage_from_fscache - read a page from cache
- *
- * Returns 0 if the pages are in cache and a BIO is submitted,
- * 1 if the pages are not in cache and -error otherwise.
- */
-
-int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
-{
- int ret;
- const struct v9fs_inode *v9inode = V9FS_I(inode);
-
- p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
- if (!v9inode->fscache)
- return -ENOBUFS;
-
- ret = fscache_read_or_alloc_page(v9inode->fscache,
- page,
- v9fs_vfs_readpage_complete,
- NULL,
- GFP_KERNEL);
- switch (ret) {
- case -ENOBUFS:
- case -ENODATA:
- p9_debug(P9_DEBUG_FSC, "page/inode not in cache %d\n", ret);
- return 1;
- case 0:
- p9_debug(P9_DEBUG_FSC, "BIO submitted\n");
- return ret;
- default:
- p9_debug(P9_DEBUG_FSC, "ret %d\n", ret);
- return ret;
- }
-}
-
-/*
- * __v9fs_readpages_from_fscache - read multiple pages from cache
- *
- * Returns 0 if the pages are in cache and a BIO is submitted,
- * 1 if the pages are not in cache and -error otherwise.
- */
-
-int __v9fs_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
-{
- int ret;
- const struct v9fs_inode *v9inode = V9FS_I(inode);
-
- p9_debug(P9_DEBUG_FSC, "inode %p pages %u\n", inode, *nr_pages);
- if (!v9inode->fscache)
- return -ENOBUFS;
-
- ret = fscache_read_or_alloc_pages(v9inode->fscache,
- mapping, pages, nr_pages,
- v9fs_vfs_readpage_complete,
- NULL,
- mapping_gfp_mask(mapping));
- switch (ret) {
- case -ENOBUFS:
- case -ENODATA:
- p9_debug(P9_DEBUG_FSC, "pages/inodes not in cache %d\n", ret);
- return 1;
- case 0:
- BUG_ON(!list_empty(pages));
- BUG_ON(*nr_pages != 0);
- p9_debug(P9_DEBUG_FSC, "BIO submitted\n");
- return ret;
- default:
- p9_debug(P9_DEBUG_FSC, "ret %d\n", ret);
- return ret;
- }
-}
-
-/*
- * __v9fs_readpage_to_fscache - write a page to the cache
- *
- */
-
-void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
-{
- int ret;
- const struct v9fs_inode *v9inode = V9FS_I(inode);
-
- p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
- ret = fscache_write_page(v9inode->fscache, page,
- i_size_read(&v9inode->vfs_inode), GFP_KERNEL);
- p9_debug(P9_DEBUG_FSC, "ret = %d\n", ret);
- if (ret != 0)
- v9fs_uncache_page(inode, page);
-}
-
-/*
- * wait for a page to complete writing to the cache
- */
-void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
-{
- const struct v9fs_inode *v9inode = V9FS_I(inode);
- p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
- if (PageFsCache(page))
- fscache_wait_on_page_write(v9inode->fscache, page);
-}
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index 00f107af443e..7480b4b49fea 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -7,9 +7,10 @@
#ifndef _9P_CACHE_H
#define _9P_CACHE_H
-#ifdef CONFIG_9P_FSCACHE
+#define FSCACHE_USE_NEW_IO_API
#include <linux/fscache.h>
-#include <linux/spinlock.h>
+
+#ifdef CONFIG_9P_FSCACHE
extern struct fscache_netfs v9fs_cache_netfs;
extern const struct fscache_cookie_def v9fs_cache_session_index_def;
@@ -27,64 +28,6 @@ extern void v9fs_cache_inode_reset_cookie(struct inode *inode);
extern int __v9fs_cache_register(void);
extern void __v9fs_cache_unregister(void);
-extern int __v9fs_fscache_release_page(struct page *page, gfp_t gfp);
-extern void __v9fs_fscache_invalidate_page(struct page *page);
-extern int __v9fs_readpage_from_fscache(struct inode *inode,
- struct page *page);
-extern int __v9fs_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages);
-extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page);
-extern void __v9fs_fscache_wait_on_page_write(struct inode *inode,
- struct page *page);
-
-static inline int v9fs_fscache_release_page(struct page *page,
- gfp_t gfp)
-{
- return __v9fs_fscache_release_page(page, gfp);
-}
-
-static inline void v9fs_fscache_invalidate_page(struct page *page)
-{
- __v9fs_fscache_invalidate_page(page);
-}
-
-static inline int v9fs_readpage_from_fscache(struct inode *inode,
- struct page *page)
-{
- return __v9fs_readpage_from_fscache(inode, page);
-}
-
-static inline int v9fs_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
-{
- return __v9fs_readpages_from_fscache(inode, mapping, pages,
- nr_pages);
-}
-
-static inline void v9fs_readpage_to_fscache(struct inode *inode,
- struct page *page)
-{
- if (PageFsCache(page))
- __v9fs_readpage_to_fscache(inode, page);
-}
-
-static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
-{
- struct v9fs_inode *v9inode = V9FS_I(inode);
- fscache_uncache_page(v9inode->fscache, page);
- BUG_ON(PageFsCache(page));
-}
-
-static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
- struct page *page)
-{
- return __v9fs_fscache_wait_on_page_write(inode, page);
-}
-
#else /* CONFIG_9P_FSCACHE */
static inline void v9fs_cache_inode_get_cookie(struct inode *inode)
@@ -99,39 +42,5 @@ static inline void v9fs_cache_inode_set_cookie(struct inode *inode, struct file
{
}
-static inline int v9fs_fscache_release_page(struct page *page,
- gfp_t gfp) {
- return 1;
-}
-
-static inline void v9fs_fscache_invalidate_page(struct page *page) {}
-
-static inline int v9fs_readpage_from_fscache(struct inode *inode,
- struct page *page)
-{
- return -ENOBUFS;
-}
-
-static inline int v9fs_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
-{
- return -ENOBUFS;
-}
-
-static inline void v9fs_readpage_to_fscache(struct inode *inode,
- struct page *page)
-{}
-
-static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
-{}
-
-static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
- struct page *page)
-{
- return;
-}
-
#endif /* CONFIG_9P_FSCACHE */
#endif /* _9P_CACHE_H */
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index b8863dd0de5c..6aab046c98e2 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -103,6 +103,7 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
/* we'll recheck under lock if there's anything to look in */
if (!ret && dentry->d_fsdata) {
struct hlist_head *h = (struct hlist_head *)&dentry->d_fsdata;
+
spin_lock(&dentry->d_lock);
hlist_for_each_entry(fid, h, dlist) {
if (any || uid_eq(fid->uid, uid)) {
@@ -185,7 +186,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
return ERR_PTR(-EPERM);
if (v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses))
- uname = NULL;
+ uname = NULL;
else
uname = v9ses->uname;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 2e0fa7c932db..e32dd5f7721b 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/v9fs.c
- *
* This file contains functions assisting in mapping VFS to 9P2000
*
* Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -166,7 +164,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
substring_t args[MAX_OPT_ARGS];
char *p;
int option = 0;
- char *s, *e;
+ char *s;
int ret = 0;
/* setup defaults */
@@ -190,8 +188,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
while ((p = strsep(&options, ",")) != NULL) {
int token, r;
+
if (!*p)
continue;
+
token = match_token(p, tokens, args);
switch (token) {
case Opt_debug:
@@ -321,12 +321,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
v9ses->flags |= V9FS_ACCESS_CLIENT;
} else {
uid_t uid;
+
v9ses->flags |= V9FS_ACCESS_SINGLE;
- uid = simple_strtoul(s, &e, 10);
- if (*e != '\0') {
- ret = -EINVAL;
- pr_info("Unknown access argument %s\n",
- s);
+ r = kstrtouint(s, 10, &uid);
+ if (r) {
+ ret = r;
+ pr_info("Unknown access argument %s: %d\n",
+ s, r);
kfree(s);
continue;
}
@@ -520,7 +521,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
* mark transport as disconnected and cancel all pending requests.
*/
-void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
+void v9fs_session_cancel(struct v9fs_session_info *v9ses)
+{
p9_debug(P9_DEBUG_ERROR, "cancel session %p\n", v9ses);
p9_client_disconnect(v9ses->clnt);
}
@@ -659,6 +661,7 @@ static void v9fs_destroy_inode_cache(void)
static int v9fs_cache_register(void)
{
int ret;
+
ret = v9fs_init_inode_cache();
if (ret < 0)
return ret;
@@ -686,6 +689,7 @@ static void v9fs_cache_unregister(void)
static int __init init_v9fs(void)
{
int err;
+
pr_info("Installing v9fs 9p2000 file system support\n");
/* TODO: Setup list of registered trasnport modules */
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 4ca56c5dd637..1647a8e63671 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -124,15 +124,24 @@ static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
return container_of(inode, struct v9fs_inode, vfs_inode);
}
+static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inode)
+{
+#ifdef CONFIG_9P_FSCACHE
+ return v9inode->fscache;
+#else
+ return NULL;
+#endif
+}
+
extern int v9fs_show_options(struct seq_file *m, struct dentry *root);
-struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
- char *);
+struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
+ const char *dev_name, char *data);
extern void v9fs_session_close(struct v9fs_session_info *v9ses);
extern void v9fs_session_cancel(struct v9fs_session_info *v9ses);
extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags);
+ unsigned int flags);
extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
extern int v9fs_vfs_rename(struct user_namespace *mnt_userns,
@@ -158,7 +167,7 @@ extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
{
- return (inode->i_sb->s_fs_info);
+ return inode->i_sb->s_fs_info;
}
static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry)
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index d44ade76966a..bc417da7e9c1 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -44,9 +44,10 @@ extern struct kmem_cache *v9fs_inode_cache;
struct inode *v9fs_alloc_inode(struct super_block *sb);
void v9fs_free_inode(struct inode *inode);
-struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t);
+struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode,
+ dev_t rdev);
int v9fs_init_inode(struct v9fs_session_info *v9ses,
- struct inode *inode, umode_t mode, dev_t);
+ struct inode *inode, umode_t mode, dev_t rdev);
void v9fs_evict_inode(struct inode *inode);
ino_t v9fs_qid2ino(struct p9_qid *qid);
void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
@@ -59,8 +60,8 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
int v9fs_uflags2omode(int uflags, int extended);
void v9fs_blank_wstat(struct p9_wstat *wstat);
-int v9fs_vfs_setattr_dotl(struct user_namespace *, struct dentry *,
- struct iattr *);
+int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *iattr);
int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
int datasync);
int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode);
@@ -68,9 +69,9 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode);
static inline void v9fs_invalidate_inode_attr(struct inode *inode)
{
struct v9fs_inode *v9inode;
+
v9inode = V9FS_I(inode);
v9inode->cache_validity |= V9FS_INO_INVALID_ATTR;
- return;
}
int v9fs_open_to_dotl_flags(int flags);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 1c4f1b39cc95..fac918ccb305 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_addr.c
- *
* This file contians vfs address (mmap) ops for 9P2000.
*
* Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -19,7 +17,7 @@
#include <linux/idr.h>
#include <linux/sched.h>
#include <linux/uio.h>
-#include <linux/bvec.h>
+#include <linux/netfs.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
@@ -29,88 +27,99 @@
#include "fid.h"
/**
- * v9fs_fid_readpage - read an entire page in from 9P
- * @data: Opaque pointer to the fid being read
- * @page: structure to page
- *
+ * v9fs_req_issue_op - Issue a read from 9P
+ * @subreq: The read to make
*/
-static int v9fs_fid_readpage(void *data, struct page *page)
+static void v9fs_req_issue_op(struct netfs_read_subrequest *subreq)
{
- struct p9_fid *fid = data;
- struct inode *inode = page->mapping->host;
- struct bio_vec bvec = {.bv_page = page, .bv_len = PAGE_SIZE};
+ struct netfs_read_request *rreq = subreq->rreq;
+ struct p9_fid *fid = rreq->netfs_priv;
struct iov_iter to;
- int retval, err;
+ loff_t pos = subreq->start + subreq->transferred;
+ size_t len = subreq->len - subreq->transferred;
+ int total, err;
- p9_debug(P9_DEBUG_VFS, "\n");
+ iov_iter_xarray(&to, READ, &rreq->mapping->i_pages, pos, len);
- BUG_ON(!PageLocked(page));
+ total = p9_client_read(fid, pos, &to, &err);
+ netfs_subreq_terminated(subreq, err ?: total, false);
+}
- retval = v9fs_readpage_from_fscache(inode, page);
- if (retval == 0)
- return retval;
+/**
+ * v9fs_init_rreq - Initialise a read request
+ * @rreq: The read request
+ * @file: The file being read from
+ */
+static void v9fs_init_rreq(struct netfs_read_request *rreq, struct file *file)
+{
+ struct p9_fid *fid = file->private_data;
- iov_iter_bvec(&to, READ, &bvec, 1, PAGE_SIZE);
+ refcount_inc(&fid->count);
+ rreq->netfs_priv = fid;
+}
- retval = p9_client_read(fid, page_offset(page), &to, &err);
- if (err) {
- v9fs_uncache_page(inode, page);
- retval = err;
- goto done;
- }
+/**
+ * v9fs_req_cleanup - Cleanup request initialized by v9fs_init_rreq
+ * @mapping: unused mapping of request to cleanup
+ * @priv: private data to cleanup, a fid, guaranted non-null.
+ */
+static void v9fs_req_cleanup(struct address_space *mapping, void *priv)
+{
+ struct p9_fid *fid = priv;
- zero_user(page, retval, PAGE_SIZE - retval);
- flush_dcache_page(page);
- SetPageUptodate(page);
+ p9_client_clunk(fid);
+}
- v9fs_readpage_to_fscache(inode, page);
- retval = 0;
+/**
+ * v9fs_is_cache_enabled - Determine if caching is enabled for an inode
+ * @inode: The inode to check
+ */
+static bool v9fs_is_cache_enabled(struct inode *inode)
+{
+ struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(inode));
-done:
- unlock_page(page);
- return retval;
+ return fscache_cookie_enabled(cookie) && !hlist_empty(&cookie->backing_objects);
}
/**
- * v9fs_vfs_readpage - read an entire page in from 9P
- *
- * @filp: file being read
- * @page: structure to page
- *
+ * v9fs_begin_cache_operation - Begin a cache operation for a read
+ * @rreq: The read request
*/
-
-static int v9fs_vfs_readpage(struct file *filp, struct page *page)
+static int v9fs_begin_cache_operation(struct netfs_read_request *rreq)
{
- return v9fs_fid_readpage(filp->private_data, page);
+ struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode));
+
+ return fscache_begin_read_operation(rreq, cookie);
}
+static const struct netfs_read_request_ops v9fs_req_ops = {
+ .init_rreq = v9fs_init_rreq,
+ .is_cache_enabled = v9fs_is_cache_enabled,
+ .begin_cache_operation = v9fs_begin_cache_operation,
+ .issue_op = v9fs_req_issue_op,
+ .cleanup = v9fs_req_cleanup,
+};
+
/**
- * v9fs_vfs_readpages - read a set of pages from 9P
- *
- * @filp: file being read
- * @mapping: the address space
- * @pages: list of pages to read
- * @nr_pages: count of pages to read
+ * v9fs_vfs_readpage - read an entire page in from 9P
+ * @file: file being read
+ * @page: structure to page
*
*/
-
-static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static int v9fs_vfs_readpage(struct file *file, struct page *page)
{
- int ret = 0;
- struct inode *inode;
-
- inode = mapping->host;
- p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
+ struct folio *folio = page_folio(page);
- ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages);
- if (ret == 0)
- return ret;
+ return netfs_readpage(file, folio, &v9fs_req_ops, NULL);
+}
- ret = read_cache_pages(mapping, pages, v9fs_fid_readpage,
- filp->private_data);
- p9_debug(P9_DEBUG_VFS, " = %d\n", ret);
- return ret;
+/**
+ * v9fs_vfs_readahead - read a set of pages from 9P
+ * @ractl: The readahead parameters
+ */
+static void v9fs_vfs_readahead(struct readahead_control *ractl)
+{
+ netfs_readahead(ractl, &v9fs_req_ops, NULL);
}
/**
@@ -123,9 +132,18 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
static int v9fs_release_page(struct page *page, gfp_t gfp)
{
- if (PagePrivate(page))
+ struct folio *folio = page_folio(page);
+
+ if (folio_test_private(folio))
return 0;
- return v9fs_fscache_release_page(page, gfp);
+#ifdef CONFIG_9P_FSCACHE
+ if (folio_test_fscache(folio)) {
+ if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS))
+ return 0;
+ folio_wait_fscache(folio);
+ }
+#endif
+ return 1;
}
/**
@@ -138,63 +156,58 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
static void v9fs_invalidate_page(struct page *page, unsigned int offset,
unsigned int length)
{
- /*
- * If called with zero offset, we should release
- * the private state assocated with the page
- */
- if (offset == 0 && length == PAGE_SIZE)
- v9fs_fscache_invalidate_page(page);
+ struct folio *folio = page_folio(page);
+
+ folio_wait_fscache(folio);
}
-static int v9fs_vfs_writepage_locked(struct page *page)
+static int v9fs_vfs_write_folio_locked(struct folio *folio)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio_inode(folio);
struct v9fs_inode *v9inode = V9FS_I(inode);
- loff_t size = i_size_read(inode);
+ loff_t start = folio_pos(folio);
+ loff_t i_size = i_size_read(inode);
struct iov_iter from;
- struct bio_vec bvec;
- int err, len;
+ size_t len = folio_size(folio);
+ int err;
- if (page->index == size >> PAGE_SHIFT)
- len = size & ~PAGE_MASK;
- else
- len = PAGE_SIZE;
+ if (start >= i_size)
+ return 0; /* Simultaneous truncation occurred */
- bvec.bv_page = page;
- bvec.bv_offset = 0;
- bvec.bv_len = len;
- iov_iter_bvec(&from, WRITE, &bvec, 1, len);
+ len = min_t(loff_t, i_size - start, len);
+
+ iov_iter_xarray(&from, WRITE, &folio_mapping(folio)->i_pages, start, len);
/* We should have writeback_fid always set */
BUG_ON(!v9inode->writeback_fid);
- set_page_writeback(page);
+ folio_start_writeback(folio);
- p9_client_write(v9inode->writeback_fid, page_offset(page), &from, &err);
+ p9_client_write(v9inode->writeback_fid, start, &from, &err);
- end_page_writeback(page);
+ folio_end_writeback(folio);
return err;
}
static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(page);
int retval;
- p9_debug(P9_DEBUG_VFS, "page %p\n", page);
+ p9_debug(P9_DEBUG_VFS, "folio %p\n", folio);
- retval = v9fs_vfs_writepage_locked(page);
+ retval = v9fs_vfs_write_folio_locked(folio);
if (retval < 0) {
if (retval == -EAGAIN) {
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
retval = 0;
} else {
- SetPageError(page);
- mapping_set_error(page->mapping, retval);
+ mapping_set_error(folio_mapping(folio), retval);
}
} else
retval = 0;
- unlock_page(page);
+ folio_unlock(folio);
return retval;
}
@@ -207,15 +220,15 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
static int v9fs_launder_page(struct page *page)
{
+ struct folio *folio = page_folio(page);
int retval;
- struct inode *inode = page->mapping->host;
- v9fs_fscache_wait_on_page_write(inode, page);
- if (clear_page_dirty_for_io(page)) {
- retval = v9fs_vfs_writepage_locked(page);
+ if (folio_clear_dirty_for_io(folio)) {
+ retval = v9fs_vfs_write_folio_locked(folio);
if (retval)
return retval;
}
+ folio_wait_fscache(folio);
return 0;
}
@@ -242,11 +255,13 @@ v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
loff_t pos = iocb->ki_pos;
ssize_t n;
int err = 0;
+
if (iov_iter_rw(iter) == WRITE) {
n = p9_client_write(file->private_data, pos, iter, &err);
if (n) {
struct inode *inode = file_inode(file);
loff_t i_size = i_size_read(inode);
+
if (pos + n > i_size)
inode_add_bytes(inode, pos + n - i_size);
}
@@ -257,58 +272,49 @@ v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+ loff_t pos, unsigned int len, unsigned int flags,
+ struct page **subpagep, void **fsdata)
{
- int retval = 0;
- struct page *page;
- struct v9fs_inode *v9inode;
- pgoff_t index = pos >> PAGE_SHIFT;
- struct inode *inode = mapping->host;
-
+ int retval;
+ struct folio *folio;
+ struct v9fs_inode *v9inode = V9FS_I(mapping->host);
p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
- v9inode = V9FS_I(inode);
-start:
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page) {
- retval = -ENOMEM;
- goto out;
- }
BUG_ON(!v9inode->writeback_fid);
- if (PageUptodate(page))
- goto out;
- if (len == PAGE_SIZE)
- goto out;
+ /* Prefetch area to be written into the cache if we're caching this
+ * file. We need to do this before we get a lock on the page in case
+ * there's more than one writer competing for the same cache block.
+ */
+ retval = netfs_write_begin(filp, mapping, pos, len, flags, &folio, fsdata,
+ &v9fs_req_ops, NULL);
+ if (retval < 0)
+ return retval;
- retval = v9fs_fid_readpage(v9inode->writeback_fid, page);
- put_page(page);
- if (!retval)
- goto start;
-out:
- *pagep = page;
+ *subpagep = &folio->page;
return retval;
}
static int v9fs_write_end(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ loff_t pos, unsigned int len, unsigned int copied,
+ struct page *subpage, void *fsdata)
{
loff_t last_pos = pos + copied;
- struct inode *inode = page->mapping->host;
+ struct folio *folio = page_folio(subpage);
+ struct inode *inode = mapping->host;
p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
if (unlikely(copied < len)) {
copied = 0;
goto out;
- } else if (len == PAGE_SIZE) {
- SetPageUptodate(page);
}
+
+ folio_mark_uptodate(folio);
}
+
/*
* No need to use i_size_read() here, the i_size
* cannot change under us because we hold the i_mutex.
@@ -317,10 +323,10 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
inode_add_bytes(inode, last_pos - inode->i_size);
i_size_write(inode, last_pos);
}
- set_page_dirty(page);
+ folio_mark_dirty(folio);
out:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return copied;
}
@@ -328,7 +334,7 @@ out:
const struct address_space_operations v9fs_addr_operations = {
.readpage = v9fs_vfs_readpage,
- .readpages = v9fs_vfs_readpages,
+ .readahead = v9fs_vfs_readahead,
.set_page_dirty = __set_page_dirty_nobuffers,
.writepage = v9fs_vfs_writepage,
.write_begin = v9fs_write_begin,
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 4b4292123b3d..1c609e99d280 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_dentry.c
- *
* This file contians vfs dentry ops for the 9P2000 protocol.
*
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -52,6 +50,7 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry)
static void v9fs_dentry_release(struct dentry *dentry)
{
struct hlist_node *p, *n;
+
p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n",
dentry, dentry);
hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata)
@@ -76,6 +75,7 @@ static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
int retval;
struct v9fs_session_info *v9ses;
+
fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
return PTR_ERR(fid);
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index b6a5a0be444d..8c854d8cb0cd 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_dir.c
- *
* This file contains vfs directory ops for the 9P2000 protocol.
*
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -71,6 +69,7 @@ static inline int dt_type(struct p9_wstat *mistat)
static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen)
{
struct p9_fid *fid = filp->private_data;
+
if (!fid->rdir)
fid->rdir = kzalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
return fid->rdir;
@@ -108,6 +107,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
if (rdir->tail == rdir->head) {
struct iov_iter to;
int n;
+
iov_iter_kvec(&to, READ, &kvec, 1, buflen);
n = p9_client_read(file->private_data, ctx->pos, &to,
&err);
@@ -233,5 +233,5 @@ const struct file_operations v9fs_dir_operations_dotl = {
.iterate_shared = v9fs_dir_readdir_dotl,
.open = v9fs_file_open,
.release = v9fs_dir_release,
- .fsync = v9fs_file_fsync_dotl,
+ .fsync = v9fs_file_fsync_dotl,
};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 246235ebdb70..612e297f3763 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_file.c
- *
* This file contians vfs file ops for 9P2000.
*
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -408,6 +406,7 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(file);
loff_t i_size;
unsigned long pg_start, pg_end;
+
pg_start = origin >> PAGE_SHIFT;
pg_end = (origin + retval - 1) >> PAGE_SHIFT;
if (inode->i_mapping && inode->i_mapping->nrpages)
@@ -529,29 +528,38 @@ static vm_fault_t
v9fs_vm_page_mkwrite(struct vm_fault *vmf)
{
struct v9fs_inode *v9inode;
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
struct file *filp = vmf->vma->vm_file;
struct inode *inode = file_inode(filp);
- p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n",
- page, (unsigned long)filp->private_data);
+ p9_debug(P9_DEBUG_VFS, "folio %p fid %lx\n",
+ folio, (unsigned long)filp->private_data);
+
+ v9inode = V9FS_I(inode);
+
+ /* Wait for the page to be written to the cache before we allow it to
+ * be modified. We then assume the entire page will need writing back.
+ */
+#ifdef CONFIG_9P_FSCACHE
+ if (folio_test_fscache(folio) &&
+ folio_wait_fscache_killable(folio) < 0)
+ return VM_FAULT_NOPAGE;
+#endif
/* Update file times before taking page lock */
file_update_time(filp);
- v9inode = V9FS_I(inode);
- /* make sure the cache has finished storing the page */
- v9fs_fscache_wait_on_page_write(inode, page);
BUG_ON(!v9inode->writeback_fid);
- lock_page(page);
- if (page->mapping != inode->i_mapping)
+ if (folio_lock_killable(folio) < 0)
+ return VM_FAULT_RETRY;
+ if (folio_mapping(folio) != inode->i_mapping)
goto out_unlock;
- wait_for_stable_page(page);
+ folio_wait_stable(folio);
return VM_FAULT_LOCKED;
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
return VM_FAULT_NOPAGE;
}
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 08f48b70a741..328c338ff304 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_inode.c
- *
* This file contains vfs inode ops for the 9P2000 protocol.
*
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -49,6 +47,7 @@ static const struct inode_operations v9fs_symlink_inode_operations;
static u32 unixmode2p9mode(struct v9fs_session_info *v9ses, umode_t mode)
{
int res;
+
res = mode & 0777;
if (S_ISDIR(mode))
res |= P9_DMDIR;
@@ -110,7 +109,7 @@ static int p9mode2perm(struct v9fs_session_info *v9ses,
static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses,
struct p9_wstat *stat, dev_t *rdev)
{
- int res;
+ int res, r;
u32 mode = stat->mode;
*rdev = 0;
@@ -128,11 +127,16 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses,
res |= S_IFIFO;
else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses))
&& (v9ses->nodev == 0)) {
- char type = 0, ext[32];
+ char type = 0;
int major = -1, minor = -1;
- strlcpy(ext, stat->extension, sizeof(ext));
- sscanf(ext, "%c %i %i", &type, &major, &minor);
+ r = sscanf(stat->extension, "%c %i %i", &type, &major, &minor);
+ if (r != 3) {
+ p9_debug(P9_DEBUG_ERROR,
+ "invalid device string, umode will be bogus: %s\n",
+ stat->extension);
+ return res;
+ }
switch (type) {
case 'c':
res |= S_IFCHR;
@@ -223,6 +227,7 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
struct inode *v9fs_alloc_inode(struct super_block *sb)
{
struct v9fs_inode *v9inode;
+
v9inode = kmem_cache_alloc(v9fs_inode_cache, GFP_KERNEL);
if (!v9inode)
return NULL;
@@ -251,7 +256,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
{
int err = 0;
- inode_init_owner(&init_user_ns,inode, NULL, mode);
+ inode_init_owner(&init_user_ns, inode, NULL, mode);
inode->i_blocks = 0;
inode->i_rdev = rdev;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
@@ -440,7 +445,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
unsigned long i_ino;
struct inode *inode;
struct v9fs_session_info *v9ses = sb->s_fs_info;
- int (*test)(struct inode *, void *);
+ int (*test)(struct inode *inode, void *data);
if (new)
test = v9fs_test_new_inode;
@@ -499,8 +504,10 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
static int v9fs_at_to_dotl_flags(int flags)
{
int rflags = 0;
+
if (flags & AT_REMOVEDIR)
rflags |= P9_DOTL_AT_REMOVEDIR;
+
return rflags;
}
@@ -797,7 +804,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
static int
v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
- struct file *file, unsigned flags, umode_t mode)
+ struct file *file, unsigned int flags, umode_t mode)
{
int err;
u32 perm;
@@ -1084,7 +1091,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns,
fid = v9fs_fid_lookup(dentry);
use_dentry = 1;
}
- if(IS_ERR(fid))
+ if (IS_ERR(fid))
return PTR_ERR(fid);
v9fs_blank_wstat(&wstat);
@@ -1364,7 +1371,7 @@ v9fs_vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
char name[2 + U32_MAX_DIGITS + 1 + U32_MAX_DIGITS + 1];
u32 perm;
- p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n",
+ p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %x MAJOR: %u MINOR: %u\n",
dir->i_ino, dentry, mode,
MAJOR(rdev), MINOR(rdev));
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 01b9e1281a29..7dee89ba32e7 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_inode_dotl.c
- *
* This file contains vfs inode ops for the 9P2000.L protocol.
*
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
@@ -107,7 +105,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
unsigned long i_ino;
struct inode *inode;
struct v9fs_session_info *v9ses = sb->s_fs_info;
- int (*test)(struct inode *, void *);
+ int (*test)(struct inode *inode, void *data);
if (new)
test = v9fs_test_new_inode_dotl;
@@ -230,7 +228,7 @@ v9fs_vfs_create_dotl(struct user_namespace *mnt_userns, struct inode *dir,
static int
v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
- struct file *file, unsigned flags, umode_t omode)
+ struct file *file, unsigned int flags, umode_t omode)
{
int err = 0;
kgid_t gid;
@@ -261,7 +259,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
v9ses = v9fs_inode2v9ses(dir);
name = dentry->d_name.name;
- p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n",
+ p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%x\n",
name, flags, omode);
dfid = v9fs_parent_fid(dentry);
@@ -807,6 +805,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
/* Get the latest stat info from server. */
struct p9_fid *fid;
+
fid = v9fs_fid_lookup(old_dentry);
if (IS_ERR(fid))
return PTR_ERR(fid);
@@ -843,7 +842,7 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
struct p9_qid qid;
struct posix_acl *dacl = NULL, *pacl = NULL;
- p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n",
+ p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %x MAJOR: %u MINOR: %u\n",
dir->i_ino, dentry, omode,
MAJOR(rdev), MINOR(rdev));
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 5fce6e30bc5a..b739e02f5ef7 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -1,9 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/vfs_super.c
- *
- * This file contians superblock ops for 9P2000. It is intended that
- * you mount this file system on directories.
*
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
* Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
@@ -83,6 +79,9 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
if (!v9ses->cache) {
sb->s_bdi->ra_pages = 0;
sb->s_bdi->io_pages = 0;
+ } else {
+ sb->s_bdi->ra_pages = v9ses->maxdata >> PAGE_SHIFT;
+ sb->s_bdi->io_pages = v9ses->maxdata >> PAGE_SHIFT;
}
sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
@@ -113,7 +112,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
struct inode *inode = NULL;
struct dentry *root = NULL;
struct v9fs_session_info *v9ses = NULL;
- umode_t mode = S_IRWXUGO | S_ISVTX;
+ umode_t mode = 0777 | S_ISVTX;
struct p9_fid *fid;
int retval = 0;
@@ -157,6 +156,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
sb->s_root = root;
if (v9fs_proto_dotl(v9ses)) {
struct p9_stat_dotl *st = NULL;
+
st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
if (IS_ERR(st)) {
retval = PTR_ERR(st);
@@ -167,6 +167,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
kfree(st);
} else {
struct p9_wstat *st = NULL;
+
st = p9_client_stat(fid);
if (IS_ERR(st)) {
retval = PTR_ERR(st);
@@ -275,12 +276,13 @@ done:
static int v9fs_drop_inode(struct inode *inode)
{
struct v9fs_session_info *v9ses;
+
v9ses = v9fs_inode2v9ses(inode);
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
return generic_drop_inode(inode);
/*
* in case of non cached mode always drop the
- * the inode because we want the inode attribute
+ * inode because we want the inode attribute
* to always match that on the server.
*/
return 1;
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index ee331845e2c7..a824441b95a2 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* Copyright IBM Corporation, 2010
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
*/
#include <linux/module.h>
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index c63c3bea5de5..3e11fc3331eb 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -1,15 +1,7 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* Copyright IBM Corporation, 2010
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
*/
#ifndef FS_9P_XATTR_H
#define FS_9P_XATTR_H
@@ -22,13 +14,14 @@ extern const struct xattr_handler *v9fs_xattr_handlers[];
extern const struct xattr_handler v9fs_xattr_acl_access_handler;
extern const struct xattr_handler v9fs_xattr_acl_default_handler;
-extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *,
- void *, size_t);
-extern ssize_t v9fs_xattr_get(struct dentry *, const char *,
- void *, size_t);
-extern int v9fs_fid_xattr_set(struct p9_fid *, const char *,
- const void *, size_t, int);
-extern int v9fs_xattr_set(struct dentry *, const char *,
- const void *, size_t, int);
-extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t);
+ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
+ void *buffer, size_t buffer_size);
+ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t buffer_size);
+int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
+ const void *value, size_t value_len, int flags);
+int v9fs_xattr_set(struct dentry *dentry, const char *name,
+ const void *value, size_t value_len, int flags);
+ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer,
+ size_t buffer_size);
#endif /* FS_9P_XATTR_H */
diff --git a/fs/affs/super.c b/fs/affs/super.c
index c6c2a513ec92..c609005a9eaa 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -389,7 +389,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
* blocks, we will have to change it.
*/
- size = i_size_read(sb->s_bdev->bd_inode) >> 9;
+ size = bdev_nr_sectors(sb->s_bdev);
pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
affs_set_blocksize(sb, PAGE_SIZE);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 4579bbda4634..da9b4f8577a1 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -103,13 +103,13 @@ struct afs_lookup_cookie {
};
/*
- * Drop the refs that we're holding on the pages we were reading into. We've
+ * Drop the refs that we're holding on the folios we were reading into. We've
* got refs on the first nr_pages pages.
*/
static void afs_dir_read_cleanup(struct afs_read *req)
{
struct address_space *mapping = req->vnode->vfs_inode.i_mapping;
- struct page *page;
+ struct folio *folio;
pgoff_t last = req->nr_pages - 1;
XA_STATE(xas, &mapping->i_pages, 0);
@@ -118,65 +118,56 @@ static void afs_dir_read_cleanup(struct afs_read *req)
return;
rcu_read_lock();
- xas_for_each(&xas, page, last) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, folio, last) {
+ if (xas_retry(&xas, folio))
continue;
- BUG_ON(xa_is_value(page));
- BUG_ON(PageCompound(page));
- ASSERTCMP(page->mapping, ==, mapping);
+ BUG_ON(xa_is_value(folio));
+ ASSERTCMP(folio_file_mapping(folio), ==, mapping);
- put_page(page);
+ folio_put(folio);
}
rcu_read_unlock();
}
/*
- * check that a directory page is valid
+ * check that a directory folio is valid
*/
-static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page,
- loff_t i_size)
+static bool afs_dir_check_folio(struct afs_vnode *dvnode, struct folio *folio,
+ loff_t i_size)
{
- struct afs_xdr_dir_page *dbuf;
- loff_t latter, off;
- int tmp, qty;
+ union afs_xdr_dir_block *block;
+ size_t offset, size;
+ loff_t pos;
- /* Determine how many magic numbers there should be in this page, but
+ /* Determine how many magic numbers there should be in this folio, but
* we must take care because the directory may change size under us.
*/
- off = page_offset(page);
- if (i_size <= off)
+ pos = folio_pos(folio);
+ if (i_size <= pos)
goto checked;
- latter = i_size - off;
- if (latter >= PAGE_SIZE)
- qty = PAGE_SIZE;
- else
- qty = latter;
- qty /= sizeof(union afs_xdr_dir_block);
-
- /* check them */
- dbuf = kmap_atomic(page);
- for (tmp = 0; tmp < qty; tmp++) {
- if (dbuf->blocks[tmp].hdr.magic != AFS_DIR_MAGIC) {
- printk("kAFS: %s(%lx): bad magic %d/%d is %04hx\n",
- __func__, dvnode->vfs_inode.i_ino, tmp, qty,
- ntohs(dbuf->blocks[tmp].hdr.magic));
- trace_afs_dir_check_failed(dvnode, off, i_size);
- kunmap(page);
+ size = min_t(loff_t, folio_size(folio), i_size - pos);
+ for (offset = 0; offset < size; offset += sizeof(*block)) {
+ block = kmap_local_folio(folio, offset);
+ if (block->hdr.magic != AFS_DIR_MAGIC) {
+ printk("kAFS: %s(%lx): [%llx] bad magic %zx/%zx is %04hx\n",
+ __func__, dvnode->vfs_inode.i_ino,
+ pos, offset, size, ntohs(block->hdr.magic));
+ trace_afs_dir_check_failed(dvnode, pos + offset, i_size);
+ kunmap_local(block);
trace_afs_file_error(dvnode, -EIO, afs_file_error_dir_bad_magic);
goto error;
}
/* Make sure each block is NUL terminated so we can reasonably
- * use string functions on it. The filenames in the page
+ * use string functions on it. The filenames in the folio
* *should* be NUL-terminated anyway.
*/
- ((u8 *)&dbuf->blocks[tmp])[AFS_DIR_BLOCK_SIZE - 1] = 0;
- }
-
- kunmap_atomic(dbuf);
+ ((u8 *)block)[AFS_DIR_BLOCK_SIZE - 1] = 0;
+ kunmap_local(block);
+ }
checked:
afs_stat_v(dvnode, n_read_dir);
return true;
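
The validation loop above maps one directory block at a time with
kmap_local_folio(), clamping the walk to whichever is smaller, the folio size or
the part of it that lies below i_size. A rough sketch of just that pattern,
assuming AFS_DIR_BLOCK_SIZE is 2048 and with a hypothetical check_block()
standing in for the magic-number test:

static bool walk_dir_folio(struct folio *folio, loff_t i_size)
{
	union afs_xdr_dir_block *block;
	loff_t pos = folio_pos(folio);
	size_t offset, size;

	if (i_size <= pos)
		return true;	/* folio lies entirely beyond EOF */

	/* Only the part of the folio inside i_size holds valid blocks. */
	size = min_t(loff_t, folio_size(folio), i_size - pos);

	for (offset = 0; offset < size; offset += sizeof(*block)) {
		/* Map only the page that holds this 2 KiB block. */
		block = kmap_local_folio(folio, offset);
		if (!check_block(block)) {
			kunmap_local(block);
			return false;
		}
		kunmap_local(block);
	}
	return true;
}
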
@@ -190,11 +181,11 @@ error:
*/
static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
{
- struct afs_xdr_dir_page *dbuf;
+ union afs_xdr_dir_block *block;
struct address_space *mapping = dvnode->vfs_inode.i_mapping;
- struct page *page;
- unsigned int i, qty = PAGE_SIZE / sizeof(union afs_xdr_dir_block);
+ struct folio *folio;
pgoff_t last = req->nr_pages - 1;
+ size_t offset, size;
XA_STATE(xas, &mapping->i_pages, 0);
@@ -205,30 +196,28 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
req->pos, req->nr_pages,
req->iter->iov_offset, iov_iter_count(req->iter));
- xas_for_each(&xas, page, last) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, folio, last) {
+ if (xas_retry(&xas, folio))
continue;
- BUG_ON(PageCompound(page));
- BUG_ON(page->mapping != mapping);
-
- dbuf = kmap_atomic(page);
- for (i = 0; i < qty; i++) {
- union afs_xdr_dir_block *block = &dbuf->blocks[i];
+ BUG_ON(folio_file_mapping(folio) != mapping);
- pr_warn("[%02lx] %32phN\n", page->index * qty + i, block);
+ size = min_t(loff_t, folio_size(folio), req->actual_len - folio_pos(folio));
+ for (offset = 0; offset < size; offset += sizeof(*block)) {
+ block = kmap_local_folio(folio, offset);
+ pr_warn("[%02lx] %32phN\n", folio_index(folio) + offset, block);
+ kunmap_local(block);
}
- kunmap_atomic(dbuf);
}
}
/*
- * Check all the pages in a directory. All the pages are held pinned.
+ * Check all the blocks in a directory. All the folios are held pinned.
*/
static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req)
{
struct address_space *mapping = dvnode->vfs_inode.i_mapping;
- struct page *page;
+ struct folio *folio;
pgoff_t last = req->nr_pages - 1;
int ret = 0;
@@ -238,14 +227,13 @@ static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req)
return 0;
rcu_read_lock();
- xas_for_each(&xas, page, last) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, folio, last) {
+ if (xas_retry(&xas, folio))
continue;
- BUG_ON(PageCompound(page));
- BUG_ON(page->mapping != mapping);
+ BUG_ON(folio_file_mapping(folio) != mapping);
- if (!afs_dir_check_page(dvnode, page, req->file_size)) {
+ if (!afs_dir_check_folio(dvnode, folio, req->actual_len)) {
afs_dir_dump(dvnode, req);
ret = -EIO;
break;
@@ -274,15 +262,16 @@ static int afs_dir_open(struct inode *inode, struct file *file)
/*
* Read the directory into the pagecache in one go, scrubbing the previous
- * contents. The list of pages is returned, pinning them so that they don't
+ * contents. The list of folios is returned, pinning them so that they don't
* get reclaimed during the iteration.
*/
static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
__acquires(&dvnode->validate_lock)
{
+ struct address_space *mapping = dvnode->vfs_inode.i_mapping;
struct afs_read *req;
loff_t i_size;
- int nr_pages, i, n;
+ int nr_pages, i;
int ret;
_enter("");
@@ -320,43 +309,30 @@ expand:
req->iter = &req->def_iter;
/* Fill in any gaps that we might find where the memory reclaimer has
- * been at work and pin all the pages. If there are any gaps, we will
+ * been at work and pin all the folios. If there are any gaps, we will
* need to reread the entire directory contents.
*/
i = req->nr_pages;
while (i < nr_pages) {
- struct page *pages[8], *page;
-
- n = find_get_pages_contig(dvnode->vfs_inode.i_mapping, i,
- min_t(unsigned int, nr_pages - i,
- ARRAY_SIZE(pages)),
- pages);
- _debug("find %u at %u/%u", n, i, nr_pages);
-
- if (n == 0) {
- gfp_t gfp = dvnode->vfs_inode.i_mapping->gfp_mask;
+ struct folio *folio;
+ folio = filemap_get_folio(mapping, i);
+ if (!folio) {
if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
afs_stat_v(dvnode, n_inval);
ret = -ENOMEM;
- page = __page_cache_alloc(gfp);
- if (!page)
+ folio = __filemap_get_folio(mapping,
+ i, FGP_LOCK | FGP_CREAT,
+ mapping->gfp_mask);
+ if (!folio)
goto error;
- ret = add_to_page_cache_lru(page,
- dvnode->vfs_inode.i_mapping,
- i, gfp);
- if (ret < 0)
- goto error;
-
- attach_page_private(page, (void *)1);
- unlock_page(page);
- req->nr_pages++;
- i++;
- } else {
- req->nr_pages += n;
- i += n;
+ folio_attach_private(folio, (void *)1);
+ folio_unlock(folio);
}
+
+ req->nr_pages += folio_nr_pages(folio);
+ i += folio_nr_pages(folio);
}
/* If we're going to reload, we need to lock all the pages to prevent
@@ -424,7 +400,7 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode,
size_t nlen;
int tmp;
- _enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block);
+ _enter("%llx,%x", ctx->pos, blkoff);
curr = (ctx->pos - blkoff) / sizeof(union afs_xdr_dirent);
@@ -513,12 +489,10 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
struct key *key, afs_dataversion_t *_dir_version)
{
struct afs_vnode *dvnode = AFS_FS_I(dir);
- struct afs_xdr_dir_page *dbuf;
union afs_xdr_dir_block *dblock;
struct afs_read *req;
- struct page *page;
- unsigned blkoff, limit;
- void __rcu **slot;
+ struct folio *folio;
+ unsigned offset, size;
int ret;
_enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos);
@@ -540,43 +514,30 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
/* walk through the blocks in sequence */
ret = 0;
while (ctx->pos < req->actual_len) {
- blkoff = ctx->pos & ~(sizeof(union afs_xdr_dir_block) - 1);
-
- /* Fetch the appropriate page from the directory and re-add it
+ /* Fetch the appropriate folio from the directory and re-add it
* to the LRU. We have all the pages pinned with an extra ref.
*/
- rcu_read_lock();
- page = NULL;
- slot = radix_tree_lookup_slot(&dvnode->vfs_inode.i_mapping->i_pages,
- blkoff / PAGE_SIZE);
- if (slot)
- page = radix_tree_deref_slot(slot);
- rcu_read_unlock();
- if (!page) {
+ folio = __filemap_get_folio(dir->i_mapping, ctx->pos / PAGE_SIZE,
+ FGP_ACCESSED, 0);
+ if (!folio) {
ret = afs_bad(dvnode, afs_file_error_dir_missing_page);
break;
}
- mark_page_accessed(page);
- limit = blkoff & ~(PAGE_SIZE - 1);
+ offset = round_down(ctx->pos, sizeof(*dblock)) - folio_file_pos(folio);
+ size = min_t(loff_t, folio_size(folio),
+ req->actual_len - folio_file_pos(folio));
- dbuf = kmap(page);
-
- /* deal with the individual blocks stashed on this page */
do {
- dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) /
- sizeof(union afs_xdr_dir_block)];
- ret = afs_dir_iterate_block(dvnode, ctx, dblock, blkoff);
- if (ret != 1) {
- kunmap(page);
+ dblock = kmap_local_folio(folio, offset);
+ ret = afs_dir_iterate_block(dvnode, ctx, dblock,
+ folio_file_pos(folio) + offset);
+ kunmap_local(dblock);
+ if (ret != 1)
goto out;
- }
- blkoff += sizeof(union afs_xdr_dir_block);
+ } while (offset += sizeof(*dblock), offset < size);
- } while (ctx->pos < dir->i_size && blkoff < limit);
-
- kunmap(page);
ret = 0;
}
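
Worked example of the offset arithmetic in the iteration loop above, assuming
4 KiB single-page folios and 2 KiB directory blocks (sizeof(union
afs_xdr_dir_block) == 0x800):

/*
 *   ctx->pos                     = 0x1a40  (inside the fourth block)
 *   round_down(ctx->pos, 0x800)  = 0x1800  (start of that block)
 *   folio_file_pos(folio)        = 0x1000  (folio covers bytes 0x1000-0x1fff)
 *   offset                       = 0x1800 - 0x1000 = 0x800
 *
 * kmap_local_folio(folio, 0x800) therefore returns a pointer to the start of
 * the block containing ctx->pos, and the do/while advances in 0x800 steps
 * until it reaches size, the in-use portion of the folio.
 */
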
@@ -2037,42 +1998,42 @@ error:
}
/*
- * Release a directory page and clean up its private state if it's not busy
- * - return true if the page can now be released, false if not
+ * Release a directory folio and clean up its private state if it's not busy
+ * - return true if the folio can now be released, false if not
*/
-static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags)
+static int afs_dir_releasepage(struct page *subpage, gfp_t gfp_flags)
{
- struct afs_vnode *dvnode = AFS_FS_I(page->mapping->host);
+ struct folio *folio = page_folio(subpage);
+ struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio));
- _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index);
+ _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio_index(folio));
- detach_page_private(page);
+ folio_detach_private(folio);
/* The directory will need reloading. */
if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
afs_stat_v(dvnode, n_relpg);
- return 1;
+ return true;
}
/*
- * invalidate part or all of a page
- * - release a page and clean up its private data if offset is 0 (indicating
- * the entire page)
+ * Invalidate part or all of a folio.
*/
-static void afs_dir_invalidatepage(struct page *page, unsigned int offset,
+static void afs_dir_invalidatepage(struct page *subpage, unsigned int offset,
unsigned int length)
{
- struct afs_vnode *dvnode = AFS_FS_I(page->mapping->host);
+ struct folio *folio = page_folio(subpage);
+ struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio));
- _enter("{%lu},%u,%u", page->index, offset, length);
+ _enter("{%lu},%u,%u", folio_index(folio), offset, length);
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
/* The directory will need reloading. */
if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
afs_stat_v(dvnode, n_inval);
- /* we clean up only if the entire page is being invalidated */
- if (offset == 0 && length == thp_size(page))
- detach_page_private(page);
+ /* we clean up only if the entire folio is being invalidated */
+ if (offset == 0 && length == folio_size(folio))
+ folio_detach_private(folio);
}
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index 540b9fc96824..d98e109ecee9 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -105,6 +105,25 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block,
}
/*
+ * Get a new directory folio.
+ */
+static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index)
+{
+ struct address_space *mapping = vnode->vfs_inode.i_mapping;
+ struct folio *folio;
+
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mapping->gfp_mask);
+ if (!folio)
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ else if (folio && !folio_test_private(folio))
+ folio_attach_private(folio, (void *)1);
+
+ return folio;
+}
+
+/*
* Scan a directory block looking for a dirent of the right name.
*/
static int afs_dir_scan_block(union afs_xdr_dir_block *block, struct qstr *name,
@@ -188,13 +207,11 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
enum afs_edit_dir_reason why)
{
union afs_xdr_dir_block *meta, *block;
- struct afs_xdr_dir_page *meta_page, *dir_page;
union afs_xdr_dirent *de;
- struct page *page0, *page;
+ struct folio *folio0, *folio;
unsigned int need_slots, nr_blocks, b;
pgoff_t index;
loff_t i_size;
- gfp_t gfp;
int slot;
_enter(",,{%d,%s},", name->len, name->name);
@@ -206,10 +223,8 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
return;
}
- gfp = vnode->vfs_inode.i_mapping->gfp_mask;
- page0 = find_or_create_page(vnode->vfs_inode.i_mapping, 0, gfp);
- if (!page0) {
- clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ folio0 = afs_dir_get_folio(vnode, 0);
+ if (!folio0) {
_leave(" [fgp]");
return;
}
@@ -217,42 +232,35 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
/* Work out how many slots we're going to need. */
need_slots = afs_dir_calc_slots(name->len);
- meta_page = kmap(page0);
- meta = &meta_page->blocks[0];
+ meta = kmap_local_folio(folio0, 0);
if (i_size == 0)
goto new_directory;
nr_blocks = i_size / AFS_DIR_BLOCK_SIZE;
- /* Find a block that has sufficient slots available. Each VM page
+ /* Find a block that has sufficient slots available. Each folio
* contains two or more directory blocks.
*/
for (b = 0; b < nr_blocks + 1; b++) {
- /* If the directory extended into a new page, then we need to
- * tack a new page on the end.
+ /* If the directory extended into a new folio, then we need to
+ * tack a new folio on the end.
*/
index = b / AFS_DIR_BLOCKS_PER_PAGE;
- if (index == 0) {
- page = page0;
- dir_page = meta_page;
- } else {
- if (nr_blocks >= AFS_DIR_MAX_BLOCKS)
- goto error;
- gfp = vnode->vfs_inode.i_mapping->gfp_mask;
- page = find_or_create_page(vnode->vfs_inode.i_mapping,
- index, gfp);
- if (!page)
+ if (nr_blocks >= AFS_DIR_MAX_BLOCKS)
+ goto error;
+ if (index >= folio_nr_pages(folio0)) {
+ folio = afs_dir_get_folio(vnode, index);
+ if (!folio)
goto error;
- if (!PagePrivate(page))
- attach_page_private(page, (void *)1);
- dir_page = kmap(page);
+ } else {
+ folio = folio0;
}
+ block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_file_pos(folio));
+
/* Abandon the edit if we got a callback break. */
if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
goto invalidated;
- block = &dir_page->blocks[b % AFS_DIR_BLOCKS_PER_PAGE];
-
_debug("block %u: %2u %3u %u",
b,
(b < AFS_DIR_BLOCKS_WITH_CTR) ? meta->meta.alloc_ctrs[b] : 99,
@@ -266,7 +274,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
afs_set_i_size(vnode, (b + 1) * AFS_DIR_BLOCK_SIZE);
}
- /* Only lower dir pages have a counter in the header. */
+ /* Only lower dir blocks have a counter in the header. */
if (b >= AFS_DIR_BLOCKS_WITH_CTR ||
meta->meta.alloc_ctrs[b] >= need_slots) {
/* We need to try and find one or more consecutive
@@ -279,10 +287,10 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
}
}
- if (page != page0) {
- unlock_page(page);
- kunmap(page);
- put_page(page);
+ kunmap_local(block);
+ if (folio != folio0) {
+ folio_unlock(folio);
+ folio_put(folio);
}
}
@@ -298,8 +306,8 @@ new_directory:
i_size = AFS_DIR_BLOCK_SIZE;
afs_set_i_size(vnode, i_size);
slot = AFS_DIR_RESV_BLOCKS0;
- page = page0;
- block = meta;
+ folio = folio0;
+ block = kmap_local_folio(folio, 0);
nr_blocks = 1;
b = 0;
@@ -318,10 +326,10 @@ found_space:
/* Adjust the bitmap. */
afs_set_contig_bits(block, slot, need_slots);
- if (page != page0) {
- unlock_page(page);
- kunmap(page);
- put_page(page);
+ kunmap_local(block);
+ if (folio != folio0) {
+ folio_unlock(folio);
+ folio_put(folio);
}
/* Adjust the allocation counter. */
@@ -333,18 +341,19 @@ found_space:
_debug("Insert %s in %u[%u]", name->name, b, slot);
out_unmap:
- unlock_page(page0);
- kunmap(page0);
- put_page(page0);
+ kunmap_local(meta);
+ folio_unlock(folio0);
+ folio_put(folio0);
_leave("");
return;
invalidated:
trace_afs_edit_dir(vnode, why, afs_edit_dir_create_inval, 0, 0, 0, 0, name->name);
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
- if (page != page0) {
- kunmap(page);
- put_page(page);
+ kunmap_local(block);
+ if (folio != folio0) {
+ folio_unlock(folio);
+ folio_put(folio);
}
goto out_unmap;
@@ -364,10 +373,9 @@ error:
void afs_edit_dir_remove(struct afs_vnode *vnode,
struct qstr *name, enum afs_edit_dir_reason why)
{
- struct afs_xdr_dir_page *meta_page, *dir_page;
union afs_xdr_dir_block *meta, *block;
union afs_xdr_dirent *de;
- struct page *page0, *page;
+ struct folio *folio0, *folio;
unsigned int need_slots, nr_blocks, b;
pgoff_t index;
loff_t i_size;
@@ -384,9 +392,8 @@ void afs_edit_dir_remove(struct afs_vnode *vnode,
}
nr_blocks = i_size / AFS_DIR_BLOCK_SIZE;
- page0 = find_lock_page(vnode->vfs_inode.i_mapping, 0);
- if (!page0) {
- clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ folio0 = afs_dir_get_folio(vnode, 0);
+ if (!folio0) {
_leave(" [fgp]");
return;
}
@@ -394,30 +401,27 @@ void afs_edit_dir_remove(struct afs_vnode *vnode,
/* Work out how many slots we're going to discard. */
need_slots = afs_dir_calc_slots(name->len);
- meta_page = kmap(page0);
- meta = &meta_page->blocks[0];
+ meta = kmap_local_folio(folio0, 0);
- /* Find a page that has sufficient slots available. Each VM page
+ /* Find a block that has sufficient slots available. Each folio
* contains two or more directory blocks.
*/
for (b = 0; b < nr_blocks; b++) {
index = b / AFS_DIR_BLOCKS_PER_PAGE;
- if (index != 0) {
- page = find_lock_page(vnode->vfs_inode.i_mapping, index);
- if (!page)
+ if (index >= folio_nr_pages(folio0)) {
+ folio = afs_dir_get_folio(vnode, index);
+ if (!folio)
goto error;
- dir_page = kmap(page);
} else {
- page = page0;
- dir_page = meta_page;
+ folio = folio0;
}
+ block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_file_pos(folio));
+
/* Abandon the edit if we got a callback break. */
if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
goto invalidated;
- block = &dir_page->blocks[b % AFS_DIR_BLOCKS_PER_PAGE];
-
if (b > AFS_DIR_BLOCKS_WITH_CTR ||
meta->meta.alloc_ctrs[b] <= AFS_DIR_SLOTS_PER_BLOCK - 1 - need_slots) {
slot = afs_dir_scan_block(block, name, b);
@@ -425,10 +429,10 @@ void afs_edit_dir_remove(struct afs_vnode *vnode,
goto found_dirent;
}
- if (page != page0) {
- unlock_page(page);
- kunmap(page);
- put_page(page);
+ kunmap_local(block);
+ if (folio != folio0) {
+ folio_unlock(folio);
+ folio_put(folio);
}
}
@@ -449,10 +453,10 @@ found_dirent:
/* Adjust the bitmap. */
afs_clear_contig_bits(block, slot, need_slots);
- if (page != page0) {
- unlock_page(page);
- kunmap(page);
- put_page(page);
+ kunmap_local(block);
+ if (folio != folio0) {
+ folio_unlock(folio);
+ folio_put(folio);
}
/* Adjust the allocation counter. */
@@ -464,9 +468,9 @@ found_dirent:
_debug("Remove %s from %u[%u]", name->name, b, slot);
out_unmap:
- unlock_page(page0);
- kunmap(page0);
- put_page(page0);
+ kunmap_local(meta);
+ folio_unlock(folio0);
+ folio_put(folio0);
_leave("");
return;
@@ -474,10 +478,10 @@ invalidated:
trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_inval,
0, 0, 0, 0, name->name);
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
- if (page != page0) {
- unlock_page(page);
- kunmap(page);
- put_page(page);
+ kunmap_local(block);
+ if (folio != folio0) {
+ folio_unlock(folio);
+ folio_put(folio);
}
goto out_unmap;
diff --git a/fs/afs/file.c b/fs/afs/file.c
index e6c447ae91f3..cb6ad61eec3b 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,6 +19,7 @@
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
static int afs_readpage(struct file *file, struct page *page);
+static int afs_symlink_readpage(struct file *file, struct page *page);
static void afs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length);
static int afs_releasepage(struct page *page, gfp_t gfp_flags);
@@ -49,7 +50,7 @@ const struct inode_operations afs_file_inode_operations = {
.permission = afs_permission,
};
-const struct address_space_operations afs_fs_aops = {
+const struct address_space_operations afs_file_aops = {
.readpage = afs_readpage,
.readahead = afs_readahead,
.set_page_dirty = afs_set_page_dirty,
@@ -62,6 +63,12 @@ const struct address_space_operations afs_fs_aops = {
.writepages = afs_writepages,
};
+const struct address_space_operations afs_symlink_aops = {
+ .readpage = afs_symlink_readpage,
+ .releasepage = afs_releasepage,
+ .invalidatepage = afs_invalidatepage,
+};
+
static const struct vm_operations_struct afs_vm_ops = {
.open = afs_vm_open,
.close = afs_vm_close,
@@ -313,25 +320,28 @@ static void afs_req_issue_op(struct netfs_read_subrequest *subreq)
afs_put_read(fsreq);
}
-static int afs_symlink_readpage(struct page *page)
+static int afs_symlink_readpage(struct file *file, struct page *page)
{
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
struct afs_read *fsreq;
+ struct folio *folio = page_folio(page);
int ret;
fsreq = afs_alloc_read(GFP_NOFS);
if (!fsreq)
return -ENOMEM;
- fsreq->pos = page->index * PAGE_SIZE;
- fsreq->len = PAGE_SIZE;
+ fsreq->pos = folio_pos(folio);
+ fsreq->len = folio_size(folio);
fsreq->vnode = vnode;
fsreq->iter = &fsreq->def_iter;
iov_iter_xarray(&fsreq->def_iter, READ, &page->mapping->i_pages,
fsreq->pos, fsreq->len);
ret = afs_fetch_data(fsreq->vnode, fsreq);
- page_endio(page, false, ret);
+ if (ret == 0)
+ SetPageUptodate(page);
+ unlock_page(page);
return ret;
}
@@ -355,7 +365,7 @@ static int afs_begin_cache_operation(struct netfs_read_request *rreq)
}
static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
- struct page *page, void **_fsdata)
+ struct folio *folio, void **_fsdata)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
@@ -378,10 +388,9 @@ const struct netfs_read_request_ops afs_req_ops = {
static int afs_readpage(struct file *file, struct page *page)
{
- if (!file)
- return afs_symlink_readpage(page);
+ struct folio *folio = page_folio(page);
- return netfs_readpage(file, page, &afs_req_ops, NULL);
+ return netfs_readpage(file, folio, &afs_req_ops, NULL);
}
static void afs_readahead(struct readahead_control *ractl)
@@ -393,29 +402,29 @@ static void afs_readahead(struct readahead_control *ractl)
* Adjust the dirty region of the page on truncation or full invalidation,
* getting rid of the markers altogether if the region is entirely invalidated.
*/
-static void afs_invalidate_dirty(struct page *page, unsigned int offset,
+static void afs_invalidate_dirty(struct folio *folio, unsigned int offset,
unsigned int length)
{
- struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+ struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
unsigned long priv;
unsigned int f, t, end = offset + length;
- priv = page_private(page);
+ priv = (unsigned long)folio_get_private(folio);
/* we clean up only if the entire page is being invalidated */
- if (offset == 0 && length == thp_size(page))
+ if (offset == 0 && length == folio_size(folio))
goto full_invalidate;
/* If the page was dirtied by page_mkwrite(), the PTE stays writable
* and we don't get another notification to tell us to expand it
* again.
*/
- if (afs_is_page_dirty_mmapped(priv))
+ if (afs_is_folio_dirty_mmapped(priv))
return;
/* We may need to shorten the dirty region */
- f = afs_page_dirty_from(page, priv);
- t = afs_page_dirty_to(page, priv);
+ f = afs_folio_dirty_from(folio, priv);
+ t = afs_folio_dirty_to(folio, priv);
if (t <= offset || f >= end)
return; /* Doesn't overlap */
@@ -433,17 +442,17 @@ static void afs_invalidate_dirty(struct page *page, unsigned int offset,
if (f == t)
goto undirty;
- priv = afs_page_dirty(page, f, t);
- set_page_private(page, priv);
- trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page);
+ priv = afs_folio_dirty(folio, f, t);
+ folio_change_private(folio, (void *)priv);
+ trace_afs_folio_dirty(vnode, tracepoint_string("trunc"), folio);
return;
undirty:
- trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page);
- clear_page_dirty_for_io(page);
+ trace_afs_folio_dirty(vnode, tracepoint_string("undirty"), folio);
+ folio_clear_dirty_for_io(folio);
full_invalidate:
- trace_afs_page_dirty(vnode, tracepoint_string("inval"), page);
- detach_page_private(page);
+ trace_afs_folio_dirty(vnode, tracepoint_string("inval"), folio);
+ folio_detach_private(folio);
}
/*
@@ -454,14 +463,16 @@ full_invalidate:
static void afs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
- _enter("{%lu},%u,%u", page->index, offset, length);
+ struct folio *folio = page_folio(page);
+
+ _enter("{%lu},%u,%u", folio_index(folio), offset, length);
BUG_ON(!PageLocked(page));
if (PagePrivate(page))
- afs_invalidate_dirty(page, offset, length);
+ afs_invalidate_dirty(folio, offset, length);
- wait_on_page_fscache(page);
+ folio_wait_fscache(folio);
_leave("");
}
@@ -471,30 +482,31 @@ static void afs_invalidatepage(struct page *page, unsigned int offset,
*/
static int afs_releasepage(struct page *page, gfp_t gfp_flags)
{
- struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+ struct folio *folio = page_folio(page);
+ struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
_enter("{{%llx:%llu}[%lu],%lx},%x",
- vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
+ vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags,
gfp_flags);
/* deny if page is being written to the cache and the caller hasn't
* elected to wait */
#ifdef CONFIG_AFS_FSCACHE
- if (PageFsCache(page)) {
+ if (folio_test_fscache(folio)) {
if (!(gfp_flags & __GFP_DIRECT_RECLAIM) || !(gfp_flags & __GFP_FS))
return false;
- wait_on_page_fscache(page);
+ folio_wait_fscache(folio);
}
#endif
- if (PagePrivate(page)) {
- trace_afs_page_dirty(vnode, tracepoint_string("rel"), page);
- detach_page_private(page);
+ if (folio_test_private(folio)) {
+ trace_afs_folio_dirty(vnode, tracepoint_string("rel"), folio);
+ folio_detach_private(folio);
}
- /* indicate that the page can be released */
+ /* Indicate that the folio can be released */
_leave(" = T");
- return 1;
+ return true;
}
static void afs_add_open_mmap(struct afs_vnode *vnode)
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 8fcffea2daf5..16906eb592d9 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -95,7 +95,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,
inode->i_mode = S_IFREG | (status->mode & S_IALLUGO);
inode->i_op = &afs_file_inode_operations;
inode->i_fop = &afs_file_operations;
- inode->i_mapping->a_ops = &afs_fs_aops;
+ inode->i_mapping->a_ops = &afs_file_aops;
break;
case AFS_FTYPE_DIR:
inode->i_mode = S_IFDIR | (status->mode & S_IALLUGO);
@@ -113,11 +113,11 @@ static int afs_inode_init_from_status(struct afs_operation *op,
inode->i_mode = S_IFDIR | 0555;
inode->i_op = &afs_mntpt_inode_operations;
inode->i_fop = &afs_mntpt_file_operations;
- inode->i_mapping->a_ops = &afs_fs_aops;
+ inode->i_mapping->a_ops = &afs_symlink_aops;
} else {
inode->i_mode = S_IFLNK | status->mode;
inode->i_op = &afs_symlink_inode_operations;
- inode->i_mapping->a_ops = &afs_fs_aops;
+ inode->i_mapping->a_ops = &afs_symlink_aops;
}
inode_nohighmem(inode);
break;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 0ad97a8fc0d4..aa4c0d6c9780 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -876,59 +876,59 @@ struct afs_vnode_cache_aux {
} __packed;
/*
- * We use page->private to hold the amount of the page that we've written to,
+ * We use folio->private to hold the amount of the folio that we've written to,
* splitting the field into two parts. However, we need to represent a range
- * 0...PAGE_SIZE, so we reduce the resolution if the size of the page
+ * 0...FOLIO_SIZE, so we reduce the resolution if the size of the folio
* exceeds what we can encode.
*/
#ifdef CONFIG_64BIT
-#define __AFS_PAGE_PRIV_MASK 0x7fffffffUL
-#define __AFS_PAGE_PRIV_SHIFT 32
-#define __AFS_PAGE_PRIV_MMAPPED 0x80000000UL
+#define __AFS_FOLIO_PRIV_MASK 0x7fffffffUL
+#define __AFS_FOLIO_PRIV_SHIFT 32
+#define __AFS_FOLIO_PRIV_MMAPPED 0x80000000UL
#else
-#define __AFS_PAGE_PRIV_MASK 0x7fffUL
-#define __AFS_PAGE_PRIV_SHIFT 16
-#define __AFS_PAGE_PRIV_MMAPPED 0x8000UL
+#define __AFS_FOLIO_PRIV_MASK 0x7fffUL
+#define __AFS_FOLIO_PRIV_SHIFT 16
+#define __AFS_FOLIO_PRIV_MMAPPED 0x8000UL
#endif
-static inline unsigned int afs_page_dirty_resolution(struct page *page)
+static inline unsigned int afs_folio_dirty_resolution(struct folio *folio)
{
- int shift = thp_order(page) + PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1);
+ int shift = folio_shift(folio) - (__AFS_FOLIO_PRIV_SHIFT - 1);
return (shift > 0) ? shift : 0;
}
-static inline size_t afs_page_dirty_from(struct page *page, unsigned long priv)
+static inline size_t afs_folio_dirty_from(struct folio *folio, unsigned long priv)
{
- unsigned long x = priv & __AFS_PAGE_PRIV_MASK;
+ unsigned long x = priv & __AFS_FOLIO_PRIV_MASK;
/* The lower bound is inclusive */
- return x << afs_page_dirty_resolution(page);
+ return x << afs_folio_dirty_resolution(folio);
}
-static inline size_t afs_page_dirty_to(struct page *page, unsigned long priv)
+static inline size_t afs_folio_dirty_to(struct folio *folio, unsigned long priv)
{
- unsigned long x = (priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK;
+ unsigned long x = (priv >> __AFS_FOLIO_PRIV_SHIFT) & __AFS_FOLIO_PRIV_MASK;
/* The upper bound is immediately beyond the region */
- return (x + 1) << afs_page_dirty_resolution(page);
+ return (x + 1) << afs_folio_dirty_resolution(folio);
}
-static inline unsigned long afs_page_dirty(struct page *page, size_t from, size_t to)
+static inline unsigned long afs_folio_dirty(struct folio *folio, size_t from, size_t to)
{
- unsigned int res = afs_page_dirty_resolution(page);
+ unsigned int res = afs_folio_dirty_resolution(folio);
from >>= res;
to = (to - 1) >> res;
- return (to << __AFS_PAGE_PRIV_SHIFT) | from;
+ return (to << __AFS_FOLIO_PRIV_SHIFT) | from;
}
-static inline unsigned long afs_page_dirty_mmapped(unsigned long priv)
+static inline unsigned long afs_folio_dirty_mmapped(unsigned long priv)
{
- return priv | __AFS_PAGE_PRIV_MMAPPED;
+ return priv | __AFS_FOLIO_PRIV_MMAPPED;
}
-static inline bool afs_is_page_dirty_mmapped(unsigned long priv)
+static inline bool afs_is_folio_dirty_mmapped(unsigned long priv)
{
- return priv & __AFS_PAGE_PRIV_MMAPPED;
+ return priv & __AFS_FOLIO_PRIV_MMAPPED;
}
#include <trace/events/afs.h>
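
The helpers above pack the dirty byte range into the single folio->private
word: the low half holds "from", the high half holds the last dirty byte, and
both are right-shifted when the folio is too large for the field. A small
userspace sketch of the 64-bit arithmetic for the common case of a 4 KiB folio,
where the resolution shift is zero (the mmapped bit and the shift handling are
omitted; pack()/unpack() are made-up names):

#include <stdio.h>

#define PRIV_MASK  0x7fffffffUL		/* __AFS_FOLIO_PRIV_MASK, 64-bit */
#define PRIV_SHIFT 32			/* __AFS_FOLIO_PRIV_SHIFT, 64-bit */

static unsigned long pack(size_t from, size_t to)
{
	/* The upper bound is stored as "last dirty byte", hence to - 1. */
	return ((unsigned long)(to - 1) << PRIV_SHIFT) | from;
}

static void unpack(unsigned long priv, size_t *from, size_t *to)
{
	*from = priv & PRIV_MASK;
	*to   = ((priv >> PRIV_SHIFT) & PRIV_MASK) + 1;
}

int main(void)
{
	unsigned long priv = pack(512, 1536);	/* dirty bytes [512, 1536) */
	size_t f, t;

	unpack(priv, &f, &t);
	printf("dirty range: [%zu, %zu)\n", f, t);	/* [512, 1536) */
	return 0;
}
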
@@ -1055,7 +1055,8 @@ extern void afs_dynroot_depopulate(struct super_block *);
/*
* file.c
*/
-extern const struct address_space_operations afs_fs_aops;
+extern const struct address_space_operations afs_file_aops;
+extern const struct address_space_operations afs_symlink_aops;
extern const struct inode_operations afs_file_inode_operations;
extern const struct file_operations afs_file_operations;
extern const struct netfs_read_request_ops afs_req_ops;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index f24370f5c774..ca4909baf5e6 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -32,7 +32,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
struct page **_page, void **fsdata)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
- struct page *page;
+ struct folio *folio;
unsigned long priv;
unsigned f, from;
unsigned t, to;
@@ -46,12 +46,12 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
* file. We need to do this before we get a lock on the page in case
* there's more than one writer competing for the same cache block.
*/
- ret = netfs_write_begin(file, mapping, pos, len, flags, &page, fsdata,
+ ret = netfs_write_begin(file, mapping, pos, len, flags, &folio, fsdata,
&afs_req_ops, NULL);
if (ret < 0)
return ret;
- index = page->index;
+ index = folio_index(folio);
from = pos - index * PAGE_SIZE;
to = from + len;
@@ -59,14 +59,14 @@ try_again:
/* See if this page is already partially written in a way that we can
* merge the new write with.
*/
- if (PagePrivate(page)) {
- priv = page_private(page);
- f = afs_page_dirty_from(page, priv);
- t = afs_page_dirty_to(page, priv);
+ if (folio_test_private(folio)) {
+ priv = (unsigned long)folio_get_private(folio);
+ f = afs_folio_dirty_from(folio, priv);
+ t = afs_folio_dirty_to(folio, priv);
ASSERTCMP(f, <=, t);
- if (PageWriteback(page)) {
- trace_afs_page_dirty(vnode, tracepoint_string("alrdy"), page);
+ if (folio_test_writeback(folio)) {
+ trace_afs_folio_dirty(vnode, tracepoint_string("alrdy"), folio);
goto flush_conflicting_write;
}
/* If the file is being filled locally, allow inter-write
@@ -78,7 +78,7 @@ try_again:
goto flush_conflicting_write;
}
- *_page = page;
+ *_page = &folio->page;
_leave(" = 0");
return 0;
@@ -87,17 +87,17 @@ try_again:
*/
flush_conflicting_write:
_debug("flush conflict");
- ret = write_one_page(page);
+ ret = folio_write_one(folio);
if (ret < 0)
goto error;
- ret = lock_page_killable(page);
+ ret = folio_lock_killable(folio);
if (ret < 0)
goto error;
goto try_again;
error:
- put_page(page);
+ folio_put(folio);
_leave(" = %d", ret);
return ret;
}
@@ -107,24 +107,25 @@ error:
*/
int afs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct page *subpage, void *fsdata)
{
+ struct folio *folio = page_folio(subpage);
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
unsigned long priv;
- unsigned int f, from = pos & (thp_size(page) - 1);
+ unsigned int f, from = offset_in_folio(folio, pos);
unsigned int t, to = from + copied;
loff_t i_size, maybe_i_size;
_enter("{%llx:%llu},{%lx}",
- vnode->fid.vid, vnode->fid.vnode, page->index);
+ vnode->fid.vid, vnode->fid.vnode, folio_index(folio));
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
if (copied < len) {
copied = 0;
goto out;
}
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
if (copied == 0)
@@ -141,29 +142,29 @@ int afs_write_end(struct file *file, struct address_space *mapping,
write_sequnlock(&vnode->cb_lock);
}
- if (PagePrivate(page)) {
- priv = page_private(page);
- f = afs_page_dirty_from(page, priv);
- t = afs_page_dirty_to(page, priv);
+ if (folio_test_private(folio)) {
+ priv = (unsigned long)folio_get_private(folio);
+ f = afs_folio_dirty_from(folio, priv);
+ t = afs_folio_dirty_to(folio, priv);
if (from < f)
f = from;
if (to > t)
t = to;
- priv = afs_page_dirty(page, f, t);
- set_page_private(page, priv);
- trace_afs_page_dirty(vnode, tracepoint_string("dirty+"), page);
+ priv = afs_folio_dirty(folio, f, t);
+ folio_change_private(folio, (void *)priv);
+ trace_afs_folio_dirty(vnode, tracepoint_string("dirty+"), folio);
} else {
- priv = afs_page_dirty(page, from, to);
- attach_page_private(page, (void *)priv);
- trace_afs_page_dirty(vnode, tracepoint_string("dirty"), page);
+ priv = afs_folio_dirty(folio, from, to);
+ folio_attach_private(folio, (void *)priv);
+ trace_afs_folio_dirty(vnode, tracepoint_string("dirty"), folio);
}
- if (set_page_dirty(page))
- _debug("dirtied %lx", page->index);
+ if (folio_mark_dirty(folio))
+ _debug("dirtied %lx", folio_index(folio));
out:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return copied;
}
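
Worked example of the dirty-range merge in afs_write_end() above, again for a
4 KiB folio where the resolution shift is zero:

/*
 *   existing private range:  f = 0x100, t = 0x300
 *   this write:              from = 0x280, to = 0x500
 *   after the min/max merge: f = 0x100, t = 0x500
 *
 * afs_folio_dirty(folio, 0x100, 0x500) then re-packs [0x100, 0x500) into
 * folio->private as ((0x500 - 1) << 32) | 0x100 on a 64-bit build.
 */
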
@@ -174,40 +175,32 @@ static void afs_kill_pages(struct address_space *mapping,
loff_t start, loff_t len)
{
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct pagevec pv;
- unsigned int loop, psize;
+ struct folio *folio;
+ pgoff_t index = start / PAGE_SIZE;
+ pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
_enter("{%llx:%llu},%llx @%llx",
vnode->fid.vid, vnode->fid.vnode, len, start);
- pagevec_init(&pv);
-
do {
- _debug("kill %llx @%llx", len, start);
-
- pv.nr = find_get_pages_contig(mapping, start / PAGE_SIZE,
- PAGEVEC_SIZE, pv.pages);
- if (pv.nr == 0)
- break;
+ _debug("kill %lx (to %lx)", index, last);
- for (loop = 0; loop < pv.nr; loop++) {
- struct page *page = pv.pages[loop];
+ folio = filemap_get_folio(mapping, index);
+ if (!folio) {
+ next = index + 1;
+ continue;
+ }
- if (page->index * PAGE_SIZE >= start + len)
- break;
+ next = folio_next_index(folio);
- psize = thp_size(page);
- start += psize;
- len -= psize;
- ClearPageUptodate(page);
- end_page_writeback(page);
- lock_page(page);
- generic_error_remove_page(mapping, page);
- unlock_page(page);
- }
+ folio_clear_uptodate(folio);
+ folio_end_writeback(folio);
+ folio_lock(folio);
+ generic_error_remove_page(mapping, &folio->page);
+ folio_unlock(folio);
+ folio_put(folio);
- __pagevec_release(&pv);
- } while (len > 0);
+ } while (index = next, index <= last);
_leave("");
}
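
afs_kill_pages() above (and afs_redirty_pages() below) replace the pagevec
batching with a plain index walk: look up one folio, advance by however many
pages it spans, and step a single page over any hole. A skeleton of that loop,
illustrative only, with a hypothetical do_one() callback standing in for the
per-folio work:

static void walk_range(struct address_space *mapping, loff_t start,
		       loff_t len, void (*do_one)(struct folio *))
{
	pgoff_t index = start / PAGE_SIZE;
	pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
	struct folio *folio;

	do {
		folio = filemap_get_folio(mapping, index);
		if (!folio) {
			next = index + 1;	/* hole: try the next page */
			continue;
		}
		next = folio_next_index(folio);	/* skip the whole folio */
		do_one(folio);
		folio_put(folio);
	} while (index = next, index <= last);
}
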
@@ -220,37 +213,27 @@ static void afs_redirty_pages(struct writeback_control *wbc,
loff_t start, loff_t len)
{
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct pagevec pv;
- unsigned int loop, psize;
+ struct folio *folio;
+ pgoff_t index = start / PAGE_SIZE;
+ pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
_enter("{%llx:%llu},%llx @%llx",
vnode->fid.vid, vnode->fid.vnode, len, start);
- pagevec_init(&pv);
-
do {
_debug("redirty %llx @%llx", len, start);
- pv.nr = find_get_pages_contig(mapping, start / PAGE_SIZE,
- PAGEVEC_SIZE, pv.pages);
- if (pv.nr == 0)
- break;
-
- for (loop = 0; loop < pv.nr; loop++) {
- struct page *page = pv.pages[loop];
-
- if (page->index * PAGE_SIZE >= start + len)
- break;
-
- psize = thp_size(page);
- start += psize;
- len -= psize;
- redirty_page_for_writepage(wbc, page);
- end_page_writeback(page);
+ folio = filemap_get_folio(mapping, index);
+ if (!folio) {
+ next = index + 1;
+ continue;
}
- __pagevec_release(&pv);
- } while (len > 0);
+ next = index + folio_nr_pages(folio);
+ folio_redirty_for_writepage(wbc, folio);
+ folio_end_writeback(folio);
+ folio_put(folio);
+ } while (index = next, index <= last);
_leave("");
}
@@ -261,7 +244,7 @@ static void afs_redirty_pages(struct writeback_control *wbc,
static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len)
{
struct address_space *mapping = vnode->vfs_inode.i_mapping;
- struct page *page;
+ struct folio *folio;
pgoff_t end;
XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
@@ -272,15 +255,16 @@ static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsign
rcu_read_lock();
end = (start + len - 1) / PAGE_SIZE;
- xas_for_each(&xas, page, end) {
- if (!PageWriteback(page)) {
- kdebug("bad %x @%llx page %lx %lx", len, start, page->index, end);
- ASSERT(PageWriteback(page));
+ xas_for_each(&xas, folio, end) {
+ if (!folio_test_writeback(folio)) {
+ kdebug("bad %x @%llx page %lx %lx",
+ len, start, folio_index(folio), end);
+ ASSERT(folio_test_writeback(folio));
}
- trace_afs_page_dirty(vnode, tracepoint_string("clear"), page);
- detach_page_private(page);
- page_endio(page, true, 0);
+ trace_afs_folio_dirty(vnode, tracepoint_string("clear"), folio);
+ folio_detach_private(folio);
+ folio_end_writeback(folio);
}
rcu_read_unlock();
@@ -437,7 +421,7 @@ static void afs_extend_writeback(struct address_space *mapping,
unsigned int *_len)
{
struct pagevec pvec;
- struct page *page;
+ struct folio *folio;
unsigned long priv;
unsigned int psize, filler = 0;
unsigned int f, t;
@@ -456,43 +440,43 @@ static void afs_extend_writeback(struct address_space *mapping,
*/
rcu_read_lock();
- xas_for_each(&xas, page, ULONG_MAX) {
+ xas_for_each(&xas, folio, ULONG_MAX) {
stop = true;
- if (xas_retry(&xas, page))
+ if (xas_retry(&xas, folio))
continue;
- if (xa_is_value(page))
+ if (xa_is_value(folio))
break;
- if (page->index != index)
+ if (folio_index(folio) != index)
break;
- if (!page_cache_get_speculative(page)) {
+ if (!folio_try_get_rcu(folio)) {
xas_reset(&xas);
continue;
}
/* Has the page moved or been split? */
- if (unlikely(page != xas_reload(&xas))) {
- put_page(page);
+ if (unlikely(folio != xas_reload(&xas))) {
+ folio_put(folio);
break;
}
- if (!trylock_page(page)) {
- put_page(page);
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
break;
}
- if (!PageDirty(page) || PageWriteback(page)) {
- unlock_page(page);
- put_page(page);
+ if (!folio_test_dirty(folio) || folio_test_writeback(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
break;
}
- psize = thp_size(page);
- priv = page_private(page);
- f = afs_page_dirty_from(page, priv);
- t = afs_page_dirty_to(page, priv);
+ psize = folio_size(folio);
+ priv = (unsigned long)folio_get_private(folio);
+ f = afs_folio_dirty_from(folio, priv);
+ t = afs_folio_dirty_to(folio, priv);
if (f != 0 && !new_content) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
break;
}
@@ -503,8 +487,8 @@ static void afs_extend_writeback(struct address_space *mapping,
else if (t == psize || new_content)
stop = false;
- index += thp_nr_pages(page);
- if (!pagevec_add(&pvec, page))
+ index += folio_nr_pages(folio);
+ if (!pagevec_add(&pvec, &folio->page))
break;
if (stop)
break;
@@ -521,16 +505,16 @@ static void afs_extend_writeback(struct address_space *mapping,
break;
for (i = 0; i < pagevec_count(&pvec); i++) {
- page = pvec.pages[i];
- trace_afs_page_dirty(vnode, tracepoint_string("store+"), page);
+ folio = page_folio(pvec.pages[i]);
+ trace_afs_folio_dirty(vnode, tracepoint_string("store+"), folio);
- if (!clear_page_dirty_for_io(page))
+ if (!folio_clear_dirty_for_io(folio))
BUG();
- if (test_set_page_writeback(page))
+ if (folio_start_writeback(folio))
BUG();
- *_count -= thp_nr_pages(page);
- unlock_page(page);
+ *_count -= folio_nr_pages(folio);
+ folio_unlock(folio);
}
pagevec_release(&pvec);
@@ -544,10 +528,10 @@ static void afs_extend_writeback(struct address_space *mapping,
* Synchronously write back the locked page and any subsequent non-locked dirty
* pages.
*/
-static ssize_t afs_write_back_from_locked_page(struct address_space *mapping,
- struct writeback_control *wbc,
- struct page *page,
- loff_t start, loff_t end)
+static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct folio *folio,
+ loff_t start, loff_t end)
{
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
struct iov_iter iter;
@@ -558,22 +542,22 @@ static ssize_t afs_write_back_from_locked_page(struct address_space *mapping,
long count = wbc->nr_to_write;
int ret;
- _enter(",%lx,%llx-%llx", page->index, start, end);
+ _enter(",%lx,%llx-%llx", folio_index(folio), start, end);
- if (test_set_page_writeback(page))
+ if (folio_start_writeback(folio))
BUG();
- count -= thp_nr_pages(page);
+ count -= folio_nr_pages(folio);
/* Find all consecutive lockable dirty pages that have contiguous
* written regions, stopping when we find a page that is not
* immediately lockable, is not dirty or is missing, or we reach the
* end of the range.
*/
- priv = page_private(page);
- offset = afs_page_dirty_from(page, priv);
- to = afs_page_dirty_to(page, priv);
- trace_afs_page_dirty(vnode, tracepoint_string("store"), page);
+ priv = (unsigned long)folio_get_private(folio);
+ offset = afs_folio_dirty_from(folio, priv);
+ to = afs_folio_dirty_to(folio, priv);
+ trace_afs_folio_dirty(vnode, tracepoint_string("store"), folio);
len = to - offset;
start += offset;
@@ -586,7 +570,7 @@ static ssize_t afs_write_back_from_locked_page(struct address_space *mapping,
max_len = min_t(unsigned long long, max_len, i_size - start);
if (len < max_len &&
- (to == thp_size(page) || new_content))
+ (to == folio_size(folio) || new_content))
afs_extend_writeback(mapping, vnode, &count,
start, max_len, new_content, &len);
len = min_t(loff_t, len, max_len);
@@ -596,7 +580,7 @@ static ssize_t afs_write_back_from_locked_page(struct address_space *mapping,
* set; the first page is still locked at this point, but all the rest
* have been unlocked.
*/
- unlock_page(page);
+ folio_unlock(folio);
if (start < i_size) {
_debug("write back %x @%llx [%llx]", len, start, i_size);
@@ -657,16 +641,17 @@ static ssize_t afs_write_back_from_locked_page(struct address_space *mapping,
* write a page back to the server
* - the caller locked the page for us
*/
-int afs_writepage(struct page *page, struct writeback_control *wbc)
+int afs_writepage(struct page *subpage, struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(subpage);
ssize_t ret;
loff_t start;
- _enter("{%lx},", page->index);
+ _enter("{%lx},", folio_index(folio));
- start = page->index * PAGE_SIZE;
- ret = afs_write_back_from_locked_page(page->mapping, wbc, page,
- start, LLONG_MAX - start);
+ start = folio_index(folio) * PAGE_SIZE;
+ ret = afs_write_back_from_locked_folio(folio_mapping(folio), wbc,
+ folio, start, LLONG_MAX - start);
if (ret < 0) {
_leave(" = %zd", ret);
return ret;
@@ -683,7 +668,8 @@ static int afs_writepages_region(struct address_space *mapping,
struct writeback_control *wbc,
loff_t start, loff_t end, loff_t *_next)
{
- struct page *page;
+ struct folio *folio;
+ struct page *head_page;
ssize_t ret;
int n;
@@ -693,13 +679,14 @@ static int afs_writepages_region(struct address_space *mapping,
pgoff_t index = start / PAGE_SIZE;
n = find_get_pages_range_tag(mapping, &index, end / PAGE_SIZE,
- PAGECACHE_TAG_DIRTY, 1, &page);
+ PAGECACHE_TAG_DIRTY, 1, &head_page);
if (!n)
break;
- start = (loff_t)page->index * PAGE_SIZE; /* May regress with THPs */
+ folio = page_folio(head_page);
+ start = folio_pos(folio); /* May regress with THPs */
- _debug("wback %lx", page->index);
+ _debug("wback %lx", folio_index(folio));
/* At this point we hold neither the i_pages lock nor the
* page lock: the page may be truncated or invalidated
@@ -707,37 +694,38 @@ static int afs_writepages_region(struct address_space *mapping,
* back from swapper_space to tmpfs file mapping
*/
if (wbc->sync_mode != WB_SYNC_NONE) {
- ret = lock_page_killable(page);
+ ret = folio_lock_killable(folio);
if (ret < 0) {
- put_page(page);
+ folio_put(folio);
return ret;
}
} else {
- if (!trylock_page(page)) {
- put_page(page);
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
return 0;
}
}
- if (page->mapping != mapping || !PageDirty(page)) {
- start += thp_size(page);
- unlock_page(page);
- put_page(page);
+ if (folio_mapping(folio) != mapping ||
+ !folio_test_dirty(folio)) {
+ start += folio_size(folio);
+ folio_unlock(folio);
+ folio_put(folio);
continue;
}
- if (PageWriteback(page)) {
- unlock_page(page);
+ if (folio_test_writeback(folio)) {
+ folio_unlock(folio);
if (wbc->sync_mode != WB_SYNC_NONE)
- wait_on_page_writeback(page);
- put_page(page);
+ folio_wait_writeback(folio);
+ folio_put(folio);
continue;
}
- if (!clear_page_dirty_for_io(page))
+ if (!folio_clear_dirty_for_io(folio))
BUG();
- ret = afs_write_back_from_locked_page(mapping, wbc, page, start, end);
- put_page(page);
+ ret = afs_write_back_from_locked_folio(mapping, wbc, folio, start, end);
+ folio_put(folio);
if (ret < 0) {
_leave(" = %zd", ret);
return ret;
@@ -861,7 +849,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
*/
vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
{
- struct page *page = thp_head(vmf->page);
+ struct folio *folio = page_folio(vmf->page);
struct file *file = vmf->vma->vm_file;
struct inode *inode = file_inode(file);
struct afs_vnode *vnode = AFS_FS_I(inode);
@@ -869,7 +857,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
unsigned long priv;
vm_fault_t ret = VM_FAULT_RETRY;
- _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index);
+ _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, folio_index(folio));
afs_validate(vnode, af->key);
@@ -879,34 +867,34 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
* be modified. We then assume the entire page will need writing back.
*/
#ifdef CONFIG_AFS_FSCACHE
- if (PageFsCache(page) &&
- wait_on_page_fscache_killable(page) < 0)
+ if (folio_test_fscache(folio) &&
+ folio_wait_fscache_killable(folio) < 0)
goto out;
#endif
- if (wait_on_page_writeback_killable(page))
+ if (folio_wait_writeback_killable(folio))
goto out;
- if (lock_page_killable(page) < 0)
+ if (folio_lock_killable(folio) < 0)
goto out;
- /* We mustn't change page->private until writeback is complete as that
+ /* We mustn't change folio->private until writeback is complete as that
* details the portion of the page we need to write back and we might
* need to redirty the page if there's a problem.
*/
- if (wait_on_page_writeback_killable(page) < 0) {
- unlock_page(page);
+ if (folio_wait_writeback_killable(folio) < 0) {
+ folio_unlock(folio);
goto out;
}
- priv = afs_page_dirty(page, 0, thp_size(page));
- priv = afs_page_dirty_mmapped(priv);
- if (PagePrivate(page)) {
- set_page_private(page, priv);
- trace_afs_page_dirty(vnode, tracepoint_string("mkwrite+"), page);
+ priv = afs_folio_dirty(folio, 0, folio_size(folio));
+ priv = afs_folio_dirty_mmapped(priv);
+ if (folio_test_private(folio)) {
+ folio_change_private(folio, (void *)priv);
+ trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite+"), folio);
} else {
- attach_page_private(page, (void *)priv);
- trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"), page);
+ folio_attach_private(folio, (void *)priv);
+ trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite"), folio);
}
file_update_time(file);
@@ -947,38 +935,38 @@ void afs_prune_wb_keys(struct afs_vnode *vnode)
/*
* Clean up a page during invalidation.
*/
-int afs_launder_page(struct page *page)
+int afs_launder_page(struct page *subpage)
{
- struct address_space *mapping = page->mapping;
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
+ struct folio *folio = page_folio(subpage);
+ struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
struct iov_iter iter;
struct bio_vec bv[1];
unsigned long priv;
unsigned int f, t;
int ret = 0;
- _enter("{%lx}", page->index);
+ _enter("{%lx}", folio_index(folio));
- priv = page_private(page);
- if (clear_page_dirty_for_io(page)) {
+ priv = (unsigned long)folio_get_private(folio);
+ if (folio_clear_dirty_for_io(folio)) {
f = 0;
- t = thp_size(page);
- if (PagePrivate(page)) {
- f = afs_page_dirty_from(page, priv);
- t = afs_page_dirty_to(page, priv);
+ t = folio_size(folio);
+ if (folio_test_private(folio)) {
+ f = afs_folio_dirty_from(folio, priv);
+ t = afs_folio_dirty_to(folio, priv);
}
- bv[0].bv_page = page;
+ bv[0].bv_page = &folio->page;
bv[0].bv_offset = f;
bv[0].bv_len = t - f;
iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len);
- trace_afs_page_dirty(vnode, tracepoint_string("launder"), page);
- ret = afs_store_data(vnode, &iter, page_offset(page) + f, true);
+ trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio);
+ ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true);
}
- trace_afs_page_dirty(vnode, tracepoint_string("laundered"), page);
- detach_page_private(page);
- wait_on_page_fscache(page);
+ trace_afs_folio_dirty(vnode, tracepoint_string("laundered"), folio);
+ folio_detach_private(folio);
+ folio_wait_fscache(folio);
return ret;
}
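
afs_launder_page() above writes back only the dirty sub-range [f, t) of the
folio by wrapping a single bio_vec in an iov_iter. A minimal sketch of that
step, assuming the iov_iter_bvec() signature used in this tree
(iter, direction, bvec, nr_segs, count):

static void make_store_iter(struct iov_iter *iter, struct bio_vec *bv,
			    struct folio *folio, unsigned int f, unsigned int t)
{
	bv[0].bv_page   = &folio->page;	/* head page of the folio */
	bv[0].bv_offset = f;		/* first dirty byte */
	bv[0].bv_len    = t - f;	/* length of the dirty span */
	iov_iter_bvec(iter, WRITE, bv, 1, bv[0].bv_len);
}
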
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 2b35cba8ad62..fdc7d675b4b0 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -83,25 +83,18 @@ static s64 linux_to_yfs_time(const struct timespec64 *t)
return (u64)t->tv_sec * 10000000 + t->tv_nsec/100;
}
-static __be32 *xdr_encode_YFSStoreStatus_mode(__be32 *bp, mode_t mode)
-{
- struct yfs_xdr_YFSStoreStatus *x = (void *)bp;
-
- x->mask = htonl(AFS_SET_MODE);
- x->mode = htonl(mode & S_IALLUGO);
- x->mtime_client = u64_to_xdr(0);
- x->owner = u64_to_xdr(0);
- x->group = u64_to_xdr(0);
- return bp + xdr_size(x);
-}
-
-static __be32 *xdr_encode_YFSStoreStatus_mtime(__be32 *bp, const struct timespec64 *t)
+static __be32 *xdr_encode_YFSStoreStatus(__be32 *bp, mode_t *mode,
+ const struct timespec64 *t)
{
struct yfs_xdr_YFSStoreStatus *x = (void *)bp;
+ mode_t masked_mode = mode ? *mode & S_IALLUGO : 0;
s64 mtime = linux_to_yfs_time(t);
+ u32 mask = AFS_SET_MTIME;
- x->mask = htonl(AFS_SET_MTIME);
- x->mode = htonl(0);
+ mask |= mode ? AFS_SET_MODE : 0;
+
+ x->mask = htonl(mask);
+ x->mode = htonl(masked_mode);
x->mtime_client = u64_to_xdr(mtime);
x->owner = u64_to_xdr(0);
x->group = u64_to_xdr(0);
@@ -576,7 +569,7 @@ void yfs_fs_create_file(struct afs_operation *op)
bp = xdr_encode_u32(bp, 0); /* RPC flags */
bp = xdr_encode_YFSFid(bp, &dvp->fid);
bp = xdr_encode_name(bp, name);
- bp = xdr_encode_YFSStoreStatus_mode(bp, op->create.mode);
+ bp = xdr_encode_YFSStoreStatus(bp, &op->create.mode, &op->mtime);
bp = xdr_encode_u32(bp, yfs_LockNone); /* ViceLockType */
yfs_check_req(call, bp);
@@ -625,7 +618,7 @@ void yfs_fs_make_dir(struct afs_operation *op)
bp = xdr_encode_u32(bp, 0); /* RPC flags */
bp = xdr_encode_YFSFid(bp, &dvp->fid);
bp = xdr_encode_name(bp, name);
- bp = xdr_encode_YFSStoreStatus_mode(bp, op->create.mode);
+ bp = xdr_encode_YFSStoreStatus(bp, &op->create.mode, &op->mtime);
yfs_check_req(call, bp);
trace_afs_make_fs_call1(call, &dvp->fid, name);
@@ -946,6 +939,7 @@ void yfs_fs_symlink(struct afs_operation *op)
struct afs_vnode_param *dvp = &op->file[0];
struct afs_call *call;
size_t contents_sz;
+ mode_t mode = 0777;
__be32 *bp;
_enter("");
@@ -972,7 +966,7 @@ void yfs_fs_symlink(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &dvp->fid);
bp = xdr_encode_name(bp, name);
bp = xdr_encode_string(bp, op->create.symlink, contents_sz);
- bp = xdr_encode_YFSStoreStatus_mode(bp, S_IRWXUGO);
+ bp = xdr_encode_YFSStoreStatus(bp, &mode, &op->mtime);
yfs_check_req(call, bp);
trace_afs_make_fs_call1(call, &dvp->fid, name);
@@ -1103,7 +1097,7 @@ void yfs_fs_store_data(struct afs_operation *op)
bp = xdr_encode_u32(bp, YFSSTOREDATA64);
bp = xdr_encode_u32(bp, 0); /* RPC flags */
bp = xdr_encode_YFSFid(bp, &vp->fid);
- bp = xdr_encode_YFSStoreStatus_mtime(bp, &op->mtime);
+ bp = xdr_encode_YFSStoreStatus(bp, NULL, &op->mtime);
bp = xdr_encode_u64(bp, op->store.pos);
bp = xdr_encode_u64(bp, op->store.size);
bp = xdr_encode_u64(bp, op->store.i_size);
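
With the two single-purpose encoders merged, every caller of
xdr_encode_YFSStoreStatus() passes an mtime plus an optional mode pointer, and
the mask is built from what is present. The three call patterns, condensed from
the hunks above:

	/* create / mkdir: send both mode and mtime */
	bp = xdr_encode_YFSStoreStatus(bp, &op->create.mode, &op->mtime);

	/* symlink: fixed 0777 mode plus mtime */
	bp = xdr_encode_YFSStoreStatus(bp, &mode, &op->mtime);

	/* store-data: mtime only, so AFS_SET_MODE stays clear in the mask */
	bp = xdr_encode_YFSStoreStatus(bp, NULL, &op->mtime);
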
diff --git a/fs/aio.c b/fs/aio.c
index 51b08ab01dff..9c81cf611d65 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -659,8 +659,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
new_nr = (table ? table->nr : 1) * 4;
spin_unlock(&mm->ioctx_lock);
- table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
- new_nr, GFP_KERNEL);
+ table = kzalloc(struct_size(table, table, new_nr), GFP_KERNEL);
if (!table)
return -ENOMEM;
@@ -1417,7 +1416,7 @@ static void aio_remove_iocb(struct aio_kiocb *iocb)
spin_unlock_irqrestore(&ctx->ctx_lock, flags);
}
-static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
+static void aio_complete_rw(struct kiocb *kiocb, long res)
{
struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);
@@ -1437,7 +1436,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
}
iocb->ki_res.res = res;
- iocb->ki_res.res2 = res2;
+ iocb->ki_res.res2 = 0;
iocb_put(iocb);
}
@@ -1508,7 +1507,7 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
ret = -EINTR;
fallthrough;
default:
- req->ki_complete(req, ret, 0);
+ req->ki_complete(req, ret);
}
}
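
The aio hunk above converts an open-coded "header plus flexible array" size
into struct_size() from <linux/overflow.h>, which saturates rather than wraps
if the multiplication overflows. A small sketch with a hypothetical struct
shaped like the kioctx table:

struct table_like {
	unsigned nr;
	void *slot[];			/* flexible array member */
};

static struct table_like *alloc_table(unsigned int new_nr)
{
	struct table_like *t;

	/* Equivalent to kzalloc(sizeof(*t) + sizeof(void *) * new_nr, ...),
	 * but overflow-checked. */
	t = kzalloc(struct_size(t, slot, new_nr), GFP_KERNEL);
	return t;
}
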
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index a280156138ed..e0c3e33c4177 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -148,6 +148,35 @@ struct file *anon_inode_getfile(const char *name,
}
EXPORT_SYMBOL_GPL(anon_inode_getfile);
+/**
+ * anon_inode_getfile_secure - Like anon_inode_getfile(), but creates a new
+ * !S_PRIVATE anon inode rather than reusing the
+ * singleton anon inode, and calls the
+ * inode_init_security_anon() LSM hook. This
+ * allows the inode to have its own security
+ * context and lets the LSM enforce policy on
+ * the inode's creation.
+ *
+ * @name: [in] name of the "class" of the new file
+ * @fops: [in] file operations for the new file
+ * @priv: [in] private data for the new file (will be file's private_data)
+ * @flags: [in] flags
+ * @context_inode:
+ * [in] the logical relationship with the new inode (optional)
+ *
+ * The LSM may use @context_inode in inode_init_security_anon(), but a
+ * reference to it is not held. Returns the newly created file* or an error
+ * pointer. See the anon_inode_getfile() documentation for more information.
+ */
+struct file *anon_inode_getfile_secure(const char *name,
+ const struct file_operations *fops,
+ void *priv, int flags,
+ const struct inode *context_inode)
+{
+ return __anon_inode_getfile(name, fops, priv, flags,
+ context_inode, true);
+}
+
static int __anon_inode_getfd(const char *name,
const struct file_operations *fops,
void *priv, int flags,
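
A hedged sketch of how a caller might consume the new anon_inode_getfile_secure() helper; the fd handling, "[demo]" class name and demo_fops are assumptions for illustration and are not part of this patch:

#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/err.h>

static const struct file_operations demo_fops;	/* assumed defined elsewhere */

static int demo_create_fd(struct inode *context_inode, void *priv)
{
	struct file *file;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		return fd;

	/* The new inode gets its own security context via the LSM hook. */
	file = anon_inode_getfile_secure("[demo]", &demo_fops, priv,
					 O_RDWR, context_inode);
	if (IS_ERR(file)) {
		put_unused_fd(fd);
		return PTR_ERR(file);
	}

	fd_install(fd, file);
	return fd;
}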
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index a813b70f594e..f8c7f26f1fbb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -156,7 +156,7 @@ static int padzero(unsigned long elf_bss)
#define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) - (items))
#define STACK_ROUND(sp, items) \
(((unsigned long) (sp - items)) &~ 15UL)
-#define STACK_ALLOC(sp, len) ({ sp -= len ; sp; })
+#define STACK_ALLOC(sp, len) (sp -= len)
#endif
#ifndef ELF_BASE_PLATFORM
@@ -1074,20 +1074,26 @@ out_free_interp:
vaddr = elf_ppnt->p_vaddr;
/*
- * If we are loading ET_EXEC or we have already performed
- * the ET_DYN load_addr calculations, proceed normally.
+ * The first time through the loop, load_addr_set is false:
+ * layout will be calculated. Once set, use MAP_FIXED since
+ * we know we've already safely mapped the entire region with
+		 * MAP_FIXED_NOREPLACE in the once-per-binary logic that follows.
*/
- if (elf_ex->e_type == ET_EXEC || load_addr_set) {
+ if (load_addr_set) {
elf_flags |= MAP_FIXED;
+ } else if (elf_ex->e_type == ET_EXEC) {
+ /*
+ * This logic is run once for the first LOAD Program
+ * Header for ET_EXEC binaries. No special handling
+ * is needed.
+ */
+ elf_flags |= MAP_FIXED_NOREPLACE;
} else if (elf_ex->e_type == ET_DYN) {
/*
* This logic is run once for the first LOAD Program
* Header for ET_DYN binaries to calculate the
* randomization (load_bias) for all the LOAD
- * Program Headers, and to calculate the entire
- * size of the ELF mapping (total_size). (Note that
- * load_addr_set is set to true later once the
- * initial mapping is performed.)
+ * Program Headers.
*
* There are effectively two types of ET_DYN
* binaries: programs (i.e. PIE: ET_DYN with INTERP)
@@ -1108,7 +1114,7 @@ out_free_interp:
* Therefore, programs are loaded offset from
* ELF_ET_DYN_BASE and loaders are loaded into the
* independently randomized mmap region (0 load_bias
- * without MAP_FIXED).
+ * without MAP_FIXED nor MAP_FIXED_NOREPLACE).
*/
if (interpreter) {
load_bias = ELF_ET_DYN_BASE;
@@ -1117,7 +1123,7 @@ out_free_interp:
alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
if (alignment)
load_bias &= ~(alignment - 1);
- elf_flags |= MAP_FIXED;
+ elf_flags |= MAP_FIXED_NOREPLACE;
} else
load_bias = 0;
@@ -1129,7 +1135,14 @@ out_free_interp:
* is then page aligned.
*/
load_bias = ELF_PAGESTART(load_bias - vaddr);
+ }
+ /*
+ * Calculate the entire size of the ELF mapping (total_size).
+ * (Note that load_addr_set is set to true later once the
+ * initial mapping is performed.)
+ */
+ if (!load_addr_set) {
total_size = total_mapping_size(elf_phdata,
elf_ex->e_phnum);
if (!total_size) {
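
For reference, a small userspace sketch (an assumption-laden illustration, not from the patch) of the behavioural difference the hunks above rely on: MAP_FIXED silently replaces whatever is already mapped at the requested address, while MAP_FIXED_NOREPLACE fails with EEXIST, which is why the first, not-yet-mapped LOAD segment can use it safely:

#define _GNU_SOURCE		/* assumes a glibc recent enough to expose the flag */
#include <sys/mman.h>
#include <errno.h>
#include <stdio.h>

static void *map_exact(void *addr, size_t len)
{
	/* Fails instead of clobbering an existing mapping at addr. */
	void *p = mmap(addr, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0);

	if (p == MAP_FAILED && errno == EEXIST)
		fprintf(stderr, "range at %p is already mapped\n", addr);
	return p;
}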
@@ -1834,7 +1847,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
/*
* Allocate a structure for each thread.
*/
- for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) {
+ for (ct = &dump_task->signal->core_state->dumper; ct; ct = ct->next) {
t = kzalloc(offsetof(struct elf_thread_core_info,
notes[info->thread_notes]),
GFP_KERNEL);
@@ -2024,7 +2037,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
if (!elf_note_info_init(info))
return 0;
- for (ct = current->mm->core_state->dumper.next;
+ for (ct = current->signal->core_state->dumper.next;
ct; ct = ct->next) {
ets = kzalloc(sizeof(*ets), GFP_KERNEL);
if (!ets)
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 6d8fd6030cbb..c6f588dc4a9d 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1494,7 +1494,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
goto end_coredump;
- for (ct = current->mm->core_state->dumper.next;
+ for (ct = current->signal->core_state->dumper.next;
ct; ct = ct->next) {
tmp = elf_dump_thread_status(cprm->siginfo->si_signo,
ct->task, &thread_status_size);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index a3b830b8410a..444e9c89ff3e 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "block-group.h"
@@ -144,6 +145,7 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
*/
WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
kfree(cache->free_space_ctl);
+ kfree(cache->physical_map);
kfree(cache);
}
}
@@ -902,6 +904,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cluster->refill_lock);
btrfs_clear_treelog_bg(block_group);
+ btrfs_clear_data_reloc_bg(block_group);
path = btrfs_alloc_path();
if (!path) {
@@ -1484,6 +1487,21 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
}
+/*
+ * We want block groups with a low number of used bytes to be in the beginning
+ * of the list, so they will get reclaimed first.
+ */
+static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
+ const struct list_head *b)
+{
+ const struct btrfs_block_group *bg1, *bg2;
+
+ bg1 = list_entry(a, struct btrfs_block_group, bg_list);
+ bg2 = list_entry(b, struct btrfs_block_group, bg_list);
+
+ return bg1->used > bg2->used;
+}
+
void btrfs_reclaim_bgs_work(struct work_struct *work)
{
struct btrfs_fs_info *fs_info =
@@ -1508,6 +1526,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
}
spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * Sort happens under lock because we can't simply splice it and sort.
+ * The block groups might still be in use and reachable via bg_list,
+ * and their presence in the reclaim_bgs list must be preserved.
+ */
+ list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
while (!list_empty(&fs_info->reclaim_bgs)) {
u64 zone_unusable;
int ret = 0;
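
As a side note on the comparator added above: list_sort() places a after b whenever the callback returns a positive value, so returning bg1->used > bg2->used sorts the list in ascending order of used bytes and the least-used block groups end up at the head of reclaim_bgs. A minimal sketch with a hypothetical item type (not part of the patch):

#include <linux/types.h>
#include <linux/list.h>
#include <linux/list_sort.h>

struct demo_item {			/* hypothetical */
	u64 used;
	struct list_head list;
};

static int demo_cmp(void *priv, const struct list_head *a,
		    const struct list_head *b)
{
	const struct demo_item *ia = list_entry(a, struct demo_item, list);
	const struct demo_item *ib = list_entry(b, struct demo_item, list);

	/* Positive means "a sorts after b": least-used items come first. */
	return ia->used > ib->used;
}

/* Usage: list_sort(NULL, &head, demo_cmp); */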
@@ -1895,6 +1919,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
INIT_LIST_HEAD(&cache->discard_list);
INIT_LIST_HEAD(&cache->dirty_list);
INIT_LIST_HEAD(&cache->io_list);
+ INIT_LIST_HEAD(&cache->active_bg_list);
btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
atomic_set(&cache->frozen, 0);
mutex_init(&cache->free_space_lock);
@@ -2035,6 +2060,8 @@ static int read_one_block_group(struct btrfs_fs_info *info,
*/
if (btrfs_is_zoned(info)) {
btrfs_calc_zone_unusable(cache);
+ /* Should not have any excluded extents. Just in case, though. */
+ btrfs_free_excluded_extents(cache);
} else if (cache->length == cache->used) {
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
@@ -2062,15 +2089,18 @@ static int read_one_block_group(struct btrfs_fs_info *info,
link_block_group(cache);
set_avail_alloc_bits(info, cache->flags);
- if (btrfs_chunk_readonly(info, cache->start)) {
+ if (btrfs_chunk_writeable(info, cache->start)) {
+ if (cache->used == 0) {
+ ASSERT(list_empty(&cache->bg_list));
+ if (btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_discard_queue_work(&info->discard_ctl, cache);
+ else
+ btrfs_mark_bg_unused(cache);
+ }
+ } else {
inc_block_group_ro(cache, 1);
- } else if (cache->used == 0) {
- ASSERT(list_empty(&cache->bg_list));
- if (btrfs_test_opt(info, DISCARD_ASYNC))
- btrfs_discard_queue_work(&info->discard_ctl, cache);
- else
- btrfs_mark_bg_unused(cache);
}
+
return 0;
error:
btrfs_put_block_group(cache);
@@ -2438,6 +2468,12 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
return ERR_PTR(ret);
}
+ /*
+ * New block group is likely to be used soon. Try to activate it now.
+ * Failure is OK for now.
+ */
+ btrfs_zone_activate(cache);
+
ret = exclude_super_stripes(cache);
if (ret) {
/* We may have excluded something, so call this just in case */
@@ -2479,7 +2515,8 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
*/
trace_btrfs_add_block_group(fs_info, cache, 1);
btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
- cache->bytes_super, 0, &cache->space_info);
+ cache->bytes_super, cache->zone_unusable,
+ &cache->space_info);
btrfs_update_global_block_rsv(fs_info);
link_block_group(cache);
@@ -2594,7 +2631,9 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
if (!--cache->ro) {
if (btrfs_is_zoned(cache->fs_info)) {
/* Migrate zone_unusable bytes back */
- cache->zone_unusable = cache->alloc_offset - cache->used;
+ cache->zone_unusable =
+ (cache->alloc_offset - cache->used) +
+ (cache->length - cache->zone_capacity);
sinfo->bytes_zone_unusable += cache->zone_unusable;
sinfo->bytes_readonly -= cache->zone_unusable;
}
@@ -3143,7 +3182,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
}
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, int alloc)
+ u64 bytenr, u64 num_bytes, bool alloc)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_block_group *cache = NULL;
@@ -3380,36 +3419,17 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
*/
check_system_chunk(trans, flags);
- bg = btrfs_alloc_chunk(trans, flags);
+ bg = btrfs_create_chunk(trans, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
goto out;
}
- /*
- * If this is a system chunk allocation then stop right here and do not
- * add the chunk item to the chunk btree. This is to prevent a deadlock
- * because this system chunk allocation can be triggered while COWing
- * some extent buffer of the chunk btree and while holding a lock on a
- * parent extent buffer, in which case attempting to insert the chunk
- * item (or update the device item) would result in a deadlock on that
- * parent extent buffer. In this case defer the chunk btree updates to
- * the second phase of chunk allocation and keep our reservation until
- * the second phase completes.
- *
- * This is a rare case and can only be triggered by the very few cases
- * we have where we need to touch the chunk btree outside chunk allocation
- * and chunk removal. These cases are basically adding a device, removing
- * a device or resizing a device.
- */
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- return 0;
-
ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
/*
* Normally we are not expected to fail with -ENOSPC here, since we have
* previously reserved space in the system space_info and allocated one
- * new system chunk if necessary. However there are two exceptions:
+ * new system chunk if necessary. However there are three exceptions:
*
* 1) We may have enough free space in the system space_info but all the
* existing system block groups have a profile which can not be used
@@ -3435,13 +3455,20 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
* with enough free space got turned into RO mode by a running scrub,
* and in this case we have to allocate a new one and retry. We only
* need do this allocate and retry once, since we have a transaction
- * handle and scrub uses the commit root to search for block groups.
+ * handle and scrub uses the commit root to search for block groups;
+ *
+ * 3) We had one system block group with enough free space when we called
+ * check_system_chunk(), but after that, right before we tried to
+ * allocate the last extent buffer we needed, a discard operation came
+ * in and it temporarily removed the last free space entry from the
+ * block group (discard removes a free space entry, discards it, and
+ * then adds back the entry to the block group cache).
*/
if (ret == -ENOSPC) {
const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
struct btrfs_block_group *sys_bg;
- sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+ sys_bg = btrfs_create_chunk(trans, sys_flags);
if (IS_ERR(sys_bg)) {
ret = PTR_ERR(sys_bg);
btrfs_abort_transaction(trans, ret);
@@ -3519,7 +3546,15 @@ out:
* properly, either intentionally or as a bug. One example where this is
* done intentionally is fsync, as it does not reserve any transaction units
* and ends up allocating a variable number of metadata extents for log
- * tree extent buffers.
+ * tree extent buffers;
+ *
+ * 4) The task has reserved enough transaction units / metadata space, but right
+ * before it tries to allocate the last extent buffer it needs, a discard
+ * operation comes in and, temporarily, removes the last free space entry from
+ * the only metadata block group that had free space (discard starts by
+ * removing a free space entry from a block group, then does the discard
+ * operation and, once it's done, it adds back the free space entry to the
+ * block group).
*
* We also need this 2 phases setup when adding a device to a filesystem with
* a seed device - we must create new metadata and system chunks without adding
@@ -3537,14 +3572,14 @@ out:
* This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
* the system chunk array due to concurrent allocations") provides more details.
*
- * For allocation of system chunks, we defer the updates and insertions into the
- * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because
- * if the chunk allocation is triggered while COWing an extent buffer of the
- * chunk btree, we are holding a lock on the parent of that extent buffer and
- * doing the chunk btree updates and insertions can require locking that parent.
- * This is for the very few and rare cases where we update the chunk btree that
- * are not chunk allocation or chunk removal: adding a device, removing a device
- * or resizing a device.
+ * Allocation of system chunks does not happen through this function. A task that
+ * needs to update the chunk btree (the only btree that uses system chunks), must
+ * preallocate chunk space by calling either check_system_chunk() or
+ * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
+ * metadata chunk or when removing a chunk, while the latter is used before doing
+ * a modification to the chunk btree - use cases for the latter are adding,
+ * removing and resizing a device as well as relocation of a system chunk.
+ * See the comment below for more details.
*
* The reservation of system space, done through check_system_chunk(), as well
* as all the updates and insertions into the chunk btree must be done while
@@ -3581,11 +3616,27 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
if (trans->allocating_chunk)
return -ENOSPC;
/*
- * If we are removing a chunk, don't re-enter or we would deadlock.
- * System space reservation and system chunk allocation is done by the
- * chunk remove operation (btrfs_remove_chunk()).
+ * Allocation of system chunks can not happen through this path, as we
+ * could end up in a deadlock if we are allocating a data or metadata
+ * chunk and there is another task modifying the chunk btree.
+ *
+ * This is because while we are holding the chunk mutex, we will attempt
+ * to add the new chunk item to the chunk btree or update an existing
+ * device item in the chunk btree, while the other task that is modifying
+ * the chunk btree is attempting to COW an extent buffer while holding a
+ * lock on it and on its parent - if the COW operation triggers a system
+ * chunk allocation, then we can deadlock because we are holding the
+ * chunk mutex and we may need to access that extent buffer or its parent
+ * in order to add the chunk item or update a device item.
+ *
+ * Tasks that want to modify the chunk tree should reserve system space
+ * before updating the chunk btree, by calling either
+ * btrfs_reserve_chunk_metadata() or check_system_chunk().
+ * It's possible that after a task reserves the space, it still ends up
+ * here - this happens in the cases described above at do_chunk_alloc().
+ * The task will have to either retry or fail.
*/
- if (trans->removing_chunk)
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
return -ENOSPC;
space_info = btrfs_find_space_info(fs_info, flags);
@@ -3684,17 +3735,14 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
return num_dev;
}
-/*
- * Reserve space in the system space for allocating or removing a chunk
- */
-void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+static void reserve_chunk_space(struct btrfs_trans_handle *trans,
+ u64 bytes,
+ u64 type)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *info;
u64 left;
- u64 thresh;
int ret = 0;
- u64 num_devs;
/*
* Needed because we can end up allocating a system chunk and for an
@@ -3707,19 +3755,13 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
left = info->total_bytes - btrfs_space_info_used(info, true);
spin_unlock(&info->lock);
- num_devs = get_profile_num_devs(fs_info, type);
-
- /* num_devs device items to update and 1 chunk item to add or remove */
- thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
- btrfs_calc_insert_metadata_size(fs_info, 1);
-
- if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
- left, thresh, type);
+ left, bytes, type);
btrfs_dump_space_info(fs_info, info, 0, 0);
}
- if (left < thresh) {
+ if (left < bytes) {
u64 flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *bg;
@@ -3728,21 +3770,20 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
* needing it, as we might not need to COW all nodes/leafs from
* the paths we visit in the chunk tree (they were already COWed
* or created in the current transaction for example).
- *
- * Also, if our caller is allocating a system chunk, do not
- * attempt to insert the chunk item in the chunk btree, as we
- * could deadlock on an extent buffer since our caller may be
- * COWing an extent buffer from the chunk btree.
*/
- bg = btrfs_alloc_chunk(trans, flags);
+ bg = btrfs_create_chunk(trans, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
- } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
+ } else {
/*
* If we fail to add the chunk item here, we end up
* trying again at phase 2 of chunk allocation, at
* btrfs_create_pending_block_groups(). So ignore
- * any error here.
+ * any error here. An ENOSPC here could happen, due to
+ * the cases described at do_chunk_alloc() - the system
+ * block group we just created was just turned into RO
+ * mode by a scrub for example, or a running discard
+ * temporarily removed its free space entries, etc.
*/
btrfs_chunk_alloc_add_chunk_item(trans, bg);
}
@@ -3751,12 +3792,61 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
if (!ret) {
ret = btrfs_block_rsv_add(fs_info->chunk_root,
&fs_info->chunk_block_rsv,
- thresh, BTRFS_RESERVE_NO_FLUSH);
+ bytes, BTRFS_RESERVE_NO_FLUSH);
if (!ret)
- trans->chunk_bytes_reserved += thresh;
+ trans->chunk_bytes_reserved += bytes;
}
}
+/*
+ * Reserve space in the system space for allocating or removing a chunk.
+ * The caller must be holding fs_info->chunk_mutex.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ const u64 num_devs = get_profile_num_devs(fs_info, type);
+ u64 bytes;
+
+ /* num_devs device items to update and 1 chunk item to add or remove. */
+ bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
+ btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ reserve_chunk_space(trans, bytes, type);
+}
+
+/*
+ * Reserve space in the system space, if needed, for doing a modification to the
+ * chunk btree.
+ *
+ * @trans: A transaction handle.
+ * @is_item_insertion: Indicate if the modification is for inserting a new item
+ * in the chunk btree or if it's for the deletion or update
+ * of an existing item.
+ *
+ * This is used in a context where we need to update the chunk btree outside
+ * block group allocation and removal, to avoid a deadlock with a concurrent
+ * task that is allocating a metadata or data block group and therefore needs to
+ * update the chunk btree while holding the chunk mutex. After the update to the
+ * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
+ */
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+ bool is_item_insertion)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u64 bytes;
+
+ if (is_item_insertion)
+ bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+ else
+ bytes = btrfs_calc_metadata_size(fs_info, 1);
+
+ mutex_lock(&fs_info->chunk_mutex);
+ reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
+ mutex_unlock(&fs_info->chunk_mutex);
+}
+
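
A hedged sketch of the calling pattern the comment above describes: reserve system space before touching the chunk btree, release it once the modification is done. The chunk btree update itself is a hypothetical placeholder, not a real btrfs helper:

/* Illustrative only; demo_insert_chunk_item() is a placeholder. */
static int demo_insert_chunk_item(struct btrfs_trans_handle *trans);

static int demo_update_chunk_btree(struct btrfs_trans_handle *trans)
{
	int ret;

	btrfs_reserve_chunk_metadata(trans, true);	/* item insertion */
	ret = demo_insert_chunk_item(trans);		/* hypothetical update */
	btrfs_trans_release_chunk_metadata(trans);

	return ret;
}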
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
struct btrfs_block_group *block_group;
@@ -3833,6 +3923,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
spin_unlock(&info->unused_bgs_lock);
+ spin_lock(&info->zone_active_bgs_lock);
+ while (!list_empty(&info->zone_active_bgs)) {
+ block_group = list_first_entry(&info->zone_active_bgs,
+ struct btrfs_block_group,
+ active_bg_list);
+ list_del_init(&block_group->active_bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->zone_active_bgs_lock);
+
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group,
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index c72a71efcb18..5878b7ce3b78 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -98,6 +98,7 @@ struct btrfs_block_group {
unsigned int to_copy:1;
unsigned int relocating_repair:1;
unsigned int chunk_item_inserted:1;
+ unsigned int zone_is_active:1;
int disk_cache_state;
@@ -202,7 +203,10 @@ struct btrfs_block_group {
*/
u64 alloc_offset;
u64 zone_unusable;
+ u64 zone_capacity;
u64 meta_write_pointer;
+ struct map_lookup *physical_map;
+ struct list_head active_bg_list;
};
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
@@ -280,7 +284,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, int alloc);
+ u64 bytenr, u64 num_bytes, bool alloc);
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
u64 ram_bytes, u64 num_bytes, int delalloc);
void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
@@ -289,6 +293,8 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
enum btrfs_chunk_alloc_enum force);
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+ bool is_item_insertion);
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 76ee1452c57b..ab2a4a52e0bb 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -138,17 +138,34 @@ struct btrfs_inode {
/* a local copy of root's last_log_commit */
int last_log_commit;
- /* total number of bytes pending delalloc, used by stat to calc the
- * real block usage of the file
- */
- u64 delalloc_bytes;
-
- /*
- * Total number of bytes pending delalloc that fall within a file
- * range that is either a hole or beyond EOF (and no prealloc extent
- * exists in the range). This is always <= delalloc_bytes.
- */
- u64 new_delalloc_bytes;
+ union {
+ /*
+ * Total number of bytes pending delalloc, used by stat to
+ * calculate the real block usage of the file. This is used
+ * only for files.
+ */
+ u64 delalloc_bytes;
+ /*
+ * The offset of the last dir item key that was logged.
+ * This is used only for directories.
+ */
+ u64 last_dir_item_offset;
+ };
+
+ union {
+ /*
+ * Total number of bytes pending delalloc that fall within a file
+ * range that is either a hole or beyond EOF (and no prealloc extent
+ * exists in the range). This is always <= delalloc_bytes and this
+ * is used only for files.
+ */
+ u64 new_delalloc_bytes;
+ /*
+ * The offset of the last dir index key that was logged.
+ * This is used only for directories.
+ */
+ u64 last_dir_index_offset;
+ };
/*
* total number of bytes pending defrag, used by stat to check whether
@@ -339,7 +356,12 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
struct btrfs_dio_private {
struct inode *inode;
- u64 logical_offset;
+
+ /*
+	 * Since DIO can use anonymous pages, we cannot use page_offset() to
+	 * grab the file offset, thus we need a dedicated member for the file offset.
+ */
+ u64 file_offset;
u64 disk_bytenr;
/* Used for bio::bi_size */
u32 bytes;
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 86816088927f..7e9f90fa0388 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -186,7 +186,6 @@ struct btrfsic_dev_state {
struct list_head collision_resolving_node; /* list node */
struct btrfsic_block dummy_block_for_bio_bh_flush;
u64 last_flush_gen;
- char name[BDEVNAME_SIZE];
};
struct btrfsic_block_hashtable {
@@ -403,7 +402,6 @@ static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
ds->bdev = NULL;
ds->state = NULL;
- ds->name[0] = '\0';
INIT_LIST_HEAD(&ds->collision_resolving_node);
ds->last_flush_gen = 0;
btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
@@ -756,10 +754,10 @@ static int btrfsic_process_superblock_dev_mirror(
superblock_tmp->mirror_num = 1 + superblock_mirror_num;
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
btrfs_info_in_rcu(fs_info,
- "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)",
+ "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)",
superblock_bdev,
rcu_str_deref(device->name), dev_bytenr,
- dev_state->name, dev_bytenr,
+ dev_state->bdev, dev_bytenr,
superblock_mirror_num);
list_add(&superblock_tmp->all_blocks_node,
&state->all_blocks_list);
@@ -938,9 +936,10 @@ continue_with_current_leaf_stack_frame:
if (disk_item_offset + sizeof(struct btrfs_item) >
sf->block_ctx->len) {
leaf_item_out_of_bounce_error:
- pr_info("btrfsic: leaf item out of bounce at logical %llu, dev %s\n",
+ pr_info(
+ "btrfsic: leaf item out of bounce at logical %llu, dev %pg\n",
sf->block_ctx->start,
- sf->block_ctx->dev->name);
+ sf->block_ctx->dev->bdev);
goto one_stack_frame_backwards;
}
btrfsic_read_from_block_data(sf->block_ctx,
@@ -1058,9 +1057,10 @@ continue_with_current_node_stack_frame:
(uintptr_t)nodehdr;
if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
sf->block_ctx->len) {
- pr_info("btrfsic: node item out of bounce at logical %llu, dev %s\n",
+ pr_info(
+ "btrfsic: node item out of bounce at logical %llu, dev %pg\n",
sf->block_ctx->start,
- sf->block_ctx->dev->name);
+ sf->block_ctx->dev->bdev);
goto one_stack_frame_backwards;
}
btrfsic_read_from_block_data(
@@ -1228,15 +1228,17 @@ static int btrfsic_create_link_to_next_block(
if (next_block->logical_bytenr != next_bytenr &&
!(!next_block->is_metadata &&
0 == next_block->logical_bytenr))
- pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
- next_bytenr, next_block_ctx->dev->name,
+ pr_info(
+"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
+ next_bytenr, next_block_ctx->dev->bdev,
next_block_ctx->dev_bytenr, *mirror_nump,
btrfsic_get_block_type(state,
next_block),
next_block->logical_bytenr);
else
- pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n",
- next_bytenr, next_block_ctx->dev->name,
+ pr_info(
+ "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ next_bytenr, next_block_ctx->dev->bdev,
next_block_ctx->dev_bytenr, *mirror_nump,
btrfsic_get_block_type(state,
next_block));
@@ -1324,8 +1326,8 @@ static int btrfsic_handle_extent_data(
if (file_extent_item_offset +
offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
block_ctx->len) {
- pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n",
- block_ctx->start, block_ctx->dev->name);
+ pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
+ block_ctx->start, block_ctx->dev->bdev);
return -1;
}
@@ -1344,8 +1346,8 @@ static int btrfsic_handle_extent_data(
if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
block_ctx->len) {
- pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n",
- block_ctx->start, block_ctx->dev->name);
+ pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
+ block_ctx->start, block_ctx->dev->bdev);
return -1;
}
btrfsic_read_from_block_data(block_ctx, &file_extent_item,
@@ -1421,9 +1423,10 @@ static int btrfsic_handle_extent_data(
next_block->logical_bytenr != next_bytenr &&
!(!next_block->is_metadata &&
0 == next_block->logical_bytenr)) {
- pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu).\n",
+ pr_info(
+"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n",
next_bytenr,
- next_block_ctx.dev->name,
+ next_block_ctx.dev->bdev,
next_block_ctx.dev_bytenr,
mirror_num,
next_block->logical_bytenr);
@@ -1455,7 +1458,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
struct btrfs_fs_info *fs_info = state->fs_info;
int ret;
u64 length;
- struct btrfs_bio *multi = NULL;
+ struct btrfs_io_context *multi = NULL;
struct btrfs_device *device;
length = len;
@@ -1561,7 +1564,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
struct bio *bio;
unsigned int j;
- bio = btrfs_io_bio_alloc(num_pages - i);
+ bio = btrfs_bio_alloc(num_pages - i);
bio_set_dev(bio, block_ctx->dev->bdev);
bio->bi_iter.bi_sector = dev_bytenr >> 9;
bio->bi_opf = REQ_OP_READ;
@@ -1577,8 +1580,8 @@ static int btrfsic_read_block(struct btrfsic_state *state,
return -1;
}
if (submit_bio_wait(bio)) {
- pr_info("btrfsic: read error at logical %llu dev %s!\n",
- block_ctx->start, block_ctx->dev->name);
+ pr_info("btrfsic: read error at logical %llu dev %pg!\n",
+ block_ctx->start, block_ctx->dev->bdev);
bio_put(bio);
return -1;
}
@@ -1602,33 +1605,35 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) {
const struct btrfsic_block_link *l;
- pr_info("%c-block @%llu (%s/%llu/%d)\n",
+ pr_info("%c-block @%llu (%pg/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num);
list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) {
- pr_info(" %c @%llu (%s/%llu/%d) refers %u* to %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
}
list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) {
- pr_info(" %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr,
l->block_ref_from->mirror_num);
}
@@ -1743,16 +1748,18 @@ again:
if (block->logical_bytenr != bytenr &&
!(!block->is_metadata &&
block->logical_bytenr == 0))
- pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
- bytenr, dev_state->name,
+ pr_info(
+"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
+ bytenr, dev_state->bdev,
dev_bytenr,
block->mirror_num,
btrfsic_get_block_type(state,
block),
block->logical_bytenr);
else
- pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
- bytenr, dev_state->name,
+ pr_info(
+ "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ bytenr, dev_state->bdev,
dev_bytenr, block->mirror_num,
btrfsic_get_block_type(state,
block));
@@ -1767,8 +1774,9 @@ again:
processed_len = state->datablock_size;
bytenr = block->logical_bytenr;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
- bytenr, dev_state->name, dev_bytenr,
+ pr_info(
+ "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ bytenr, dev_state->bdev, dev_bytenr,
block->mirror_num,
btrfsic_get_block_type(state, block));
}
@@ -1778,9 +1786,10 @@ again:
list_empty(&block->ref_to_list) ? ' ' : '!',
list_empty(&block->ref_from_list) ? ' ' : '!');
if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
- pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
+ pr_info(
+"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
btrfsic_get_block_type(state, block), bytenr,
- dev_state->name, dev_bytenr, block->mirror_num,
+ dev_state->bdev, dev_bytenr, block->mirror_num,
block->generation,
btrfs_disk_key_objectid(&block->disk_key),
block->disk_key.type,
@@ -1792,9 +1801,10 @@ again:
}
if (!block->is_iodone && !block->never_written) {
- pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
+ pr_info(
+"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
btrfsic_get_block_type(state, block), bytenr,
- dev_state->name, dev_bytenr, block->mirror_num,
+ dev_state->bdev, dev_bytenr, block->mirror_num,
block->generation,
btrfs_stack_header_generation(
(struct btrfs_header *)
@@ -1921,8 +1931,9 @@ again:
if (!is_metadata) {
processed_len = state->datablock_size;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("Written block (%s/%llu/?) !found in hash table, D.\n",
- dev_state->name, dev_bytenr);
+ pr_info(
+ "written block (%pg/%llu/?) !found in hash table, D\n",
+ dev_state->bdev, dev_bytenr);
if (!state->include_extent_data) {
/* ignore that written D block */
goto continue_loop;
@@ -1939,8 +1950,9 @@ again:
btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
dev_bytenr);
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("Written block @%llu (%s/%llu/?) !found in hash table, M.\n",
- bytenr, dev_state->name, dev_bytenr);
+ pr_info(
+ "written block @%llu (%pg/%llu/?) !found in hash table, M\n",
+ bytenr, dev_state->bdev, dev_bytenr);
}
block_ctx.dev = dev_state;
@@ -1995,9 +2007,9 @@ again:
block->next_in_same_bio = NULL;
}
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("New written %c-block @%llu (%s/%llu/%d)\n",
+ pr_info("new written %c-block @%llu (%pg/%llu/%d)\n",
is_metadata ? 'M' : 'D',
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num);
list_add(&block->all_blocks_node, &state->all_blocks_list);
btrfsic_block_hashtable_add(block, &state->block_hashtable);
@@ -2041,10 +2053,10 @@ static void btrfsic_bio_end_io(struct bio *bp)
if ((dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
+ pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n",
bp->bi_status,
btrfsic_get_block_type(dev_state->state, block),
- block->logical_bytenr, dev_state->name,
+ block->logical_bytenr, dev_state->bdev,
block->dev_bytenr, block->mirror_num);
next_block = block->next_in_same_bio;
block->iodone_w_error = iodone_w_error;
@@ -2052,8 +2064,8 @@ static void btrfsic_bio_end_io(struct bio *bp)
dev_state->last_flush_gen++;
if ((dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io() new %s flush_gen=%llu\n",
- dev_state->name,
+ pr_info("bio_end_io() new %pg flush_gen=%llu\n",
+ dev_state->bdev,
dev_state->last_flush_gen);
}
if (block->submit_bio_bh_rw & REQ_FUA)
@@ -2078,17 +2090,19 @@ static int btrfsic_process_written_superblock(
if (!(superblock->generation > state->max_superblock_generation ||
0 == state->max_superblock_generation)) {
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info("btrfsic: superblock @%llu (%s/%llu/%d) with old gen %llu <= %llu\n",
+ pr_info(
+ "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n",
superblock->logical_bytenr,
- superblock->dev_state->name,
+ superblock->dev_state->bdev,
superblock->dev_bytenr, superblock->mirror_num,
btrfs_super_generation(super_hdr),
state->max_superblock_generation);
} else {
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info("btrfsic: got new superblock @%llu (%s/%llu/%d) with new gen %llu > %llu\n",
+ pr_info(
+ "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n",
superblock->logical_bytenr,
- superblock->dev_state->name,
+ superblock->dev_state->bdev,
superblock->dev_bytenr, superblock->mirror_num,
btrfs_super_generation(super_hdr),
state->max_superblock_generation);
@@ -2232,38 +2246,42 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
*/
list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("rl=%d, %c @%llu (%s/%llu/%d) %u* refers to %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n",
recursion_level,
btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
if (l->block_ref_to->never_written) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is never written!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
} else if (!l->block_ref_to->is_iodone) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not yet iodone!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
} else if (l->block_ref_to->iodone_w_error) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which has write error!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
@@ -2273,10 +2291,11 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
l->parent_generation &&
BTRFSIC_GENERATION_UNKNOWN !=
l->block_ref_to->generation) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) with generation %llu != parent generation %llu!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num,
l->block_ref_to->generation,
@@ -2284,10 +2303,11 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
ret = -1;
} else if (l->block_ref_to->flush_gen >
l->block_ref_to->dev_state->last_flush_gen) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num, block->flush_gen,
l->block_ref_to->dev_state->last_flush_gen);
@@ -2324,15 +2344,16 @@ static int btrfsic_is_block_ref_by_superblock(
*/
list_for_each_entry(l, &block->ref_from_list, node_ref_from) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("rl=%d, %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
recursion_level,
btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr,
l->block_ref_from->mirror_num);
if (l->block_ref_from->is_superblock &&
@@ -2354,30 +2375,30 @@ static int btrfsic_is_block_ref_by_superblock(
static void btrfsic_print_add_link(const struct btrfsic_state *state,
const struct btrfsic_block_link *l)
{
- pr_info("Add %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n",
+ pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+ l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
}
static void btrfsic_print_rem_link(const struct btrfsic_state *state,
const struct btrfsic_block_link *l)
{
- pr_info("Rem %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n",
+ pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+ l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
}
@@ -2419,9 +2440,9 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
* This algorithm is recursive because the amount of used stack space
* is very small and the max recursion depth is limited.
*/
- indent_add = sprintf(buf, "%c-%llu(%s/%llu/%u)",
+ indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)",
btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num);
if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
printk("[...]\n");
@@ -2542,10 +2563,10 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(
block->never_written = never_written;
block->mirror_num = mirror_num;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("New %s%c-block @%llu (%s/%llu/%d)\n",
+ pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n",
additional_string,
btrfsic_get_block_type(state, block),
- block->logical_bytenr, dev_state->name,
+ block->logical_bytenr, dev_state->bdev,
block->dev_bytenr, mirror_num);
list_add(&block->all_blocks_node, &state->all_blocks_list);
btrfsic_block_hashtable_add(block, &state->block_hashtable);
@@ -2592,8 +2613,9 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
}
if (WARN_ON(!match)) {
- pr_info("btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%s, phys_bytenr=%llu)!\n",
- bytenr, dev_state->name, dev_bytenr);
+ pr_info(
+"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n",
+ bytenr, dev_state->bdev, dev_bytenr);
for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
ret = btrfsic_map_block(state, bytenr,
state->metablock_size,
@@ -2601,8 +2623,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
if (ret)
continue;
- pr_info("Read logical bytenr @%llu maps to (%s/%llu/%d)\n",
- bytenr, block_ctx.dev->name,
+ pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n",
+ bytenr, block_ctx.dev->bdev,
block_ctx.dev_bytenr, mirror_num);
}
}
@@ -2675,8 +2697,9 @@ static void __btrfsic_submit_bio(struct bio *bio)
if ((dev_state->state->print_mask &
(BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
BTRFSIC_PRINT_MASK_VERBOSE)))
- pr_info("btrfsic_submit_bio(%s) with FLUSH but dummy block already in use (ignored)!\n",
- dev_state->name);
+ pr_info(
+"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
+ dev_state->bdev);
} else {
struct btrfsic_block *const block =
&dev_state->dummy_block_for_bio_bh_flush;
@@ -2751,7 +2774,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
list_for_each_entry(device, dev_head, dev_list) {
struct btrfsic_dev_state *ds;
- const char *p;
if (!device->bdev || !device->name)
continue;
@@ -2763,10 +2785,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
}
ds->bdev = device->bdev;
ds->state = state;
- bdevname(ds->bdev, ds->name);
- ds->name[BDEVNAME_SIZE - 1] = '\0';
- p = kbasename(ds->name);
- strlcpy(ds->name, p, sizeof(ds->name));
btrfsic_dev_state_hashtable_add(ds,
&btrfsic_dev_state_hashtable);
}
@@ -2844,9 +2862,10 @@ void btrfsic_unmount(struct btrfs_fs_devices *fs_devices)
if (b_all->is_iodone || b_all->never_written)
btrfsic_block_free(b_all);
else
- pr_info("btrfs: attempt to free %c-block @%llu (%s/%llu/%d) on umount which is not yet iodone!\n",
+ pr_info(
+"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num);
}
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 0913ee50e6c3..32da97c3c19d 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -9,6 +9,7 @@
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
+#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
@@ -28,6 +29,7 @@
#include "compression.h"
#include "extent_io.h"
#include "extent_map.h"
+#include "subpage.h"
#include "zoned.h"
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
@@ -180,9 +182,9 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
if (memcmp(&csum, cb_sum, csum_size) != 0) {
btrfs_print_data_csum_error(inode, disk_start,
csum, cb_sum, cb->mirror_num);
- if (btrfs_io_bio(bio)->device)
+ if (btrfs_bio(bio)->device)
btrfs_dev_stat_inc_and_print(
- btrfs_io_bio(bio)->device,
+ btrfs_bio(bio)->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
return -EIO;
}
@@ -193,6 +195,87 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
return 0;
}
+/*
+ * Reduce bio and io accounting for a compressed_bio with its corresponding bio.
+ *
+ * Return true if there is no pending bio nor io.
+ * Return false otherwise.
+ */
+static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+ unsigned int bi_size = 0;
+ bool last_io = false;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ /*
+ * At endio time, bi_iter.bi_size doesn't represent the real bio size.
+ * Thus here we have to iterate through all segments to grab correct
+ * bio size.
+ */
+ bio_for_each_segment_all(bvec, bio, iter_all)
+ bi_size += bvec->bv_len;
+
+ if (bio->bi_status)
+ cb->errors = 1;
+
+ ASSERT(bi_size && bi_size <= cb->compressed_len);
+ last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
+ &cb->pending_sectors);
+ /*
+ * Here we must wake up the possible error handler after all other
+ * operations on @cb finished, or we can race with
+ * finish_compressed_bio_*() which may free @cb.
+ */
+ wake_up_var(cb);
+
+ return last_io;
+}
+
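
To illustrate the accounting model introduced by dec_and_test_compressed_bio() above, here is a minimal sketch (hypothetical type, not part of the patch) of counting completions in sectors so the control structure is torn down by whichever bio finishes last, regardless of how the compressed range was split across bios:

#include <linux/refcount.h>

struct demo_cb {			/* hypothetical */
	refcount_t pending_sectors;
};

static bool demo_bio_done(struct demo_cb *cb, unsigned int bytes,
			  unsigned int sectorsize_bits)
{
	/* True only when the final pending sector of the range completed. */
	return refcount_sub_and_test(bytes >> sectorsize_bits,
				     &cb->pending_sectors);
}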
+static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio)
+{
+ unsigned int index;
+ struct page *page;
+
+ /* Release the compressed pages */
+ for (index = 0; index < cb->nr_pages; index++) {
+ page = cb->compressed_pages[index];
+ page->mapping = NULL;
+ put_page(page);
+ }
+
+ /* Do io completion on the original bio */
+ if (cb->errors) {
+ bio_io_error(cb->orig_bio);
+ } else {
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ ASSERT(bio);
+ ASSERT(!bio->bi_status);
+ /*
+ * We have verified the checksum already, set page checked so
+ * the end_io handlers know about it
+ */
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
+ bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
+ u64 bvec_start = page_offset(bvec->bv_page) +
+ bvec->bv_offset;
+
+ btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb),
+ bvec->bv_page, bvec_start,
+ bvec->bv_len);
+ }
+
+ bio_endio(cb->orig_bio);
+ }
+
+ /* Finally free the cb struct */
+ kfree(cb->compressed_pages);
+ kfree(cb);
+}
+
/* when we finish reading compressed pages from the disk, we
* decompress them and then run the bio end_io routines on the
* decompressed pages (in the inode address space).
@@ -207,25 +290,17 @@ static void end_compressed_bio_read(struct bio *bio)
{
struct compressed_bio *cb = bio->bi_private;
struct inode *inode;
- struct page *page;
- unsigned int index;
- unsigned int mirror = btrfs_io_bio(bio)->mirror_num;
+ unsigned int mirror = btrfs_bio(bio)->mirror_num;
int ret = 0;
- if (bio->bi_status)
- cb->errors = 1;
-
- /* if there are more bios still pending for this compressed
- * extent, just exit
- */
- if (!refcount_dec_and_test(&cb->pending_bios))
+ if (!dec_and_test_compressed_bio(cb, bio))
goto out;
/*
* Record the correct mirror_num in cb->orig_bio so that
* read-repair can work properly.
*/
- btrfs_io_bio(cb->orig_bio)->mirror_num = mirror;
+ btrfs_bio(cb->orig_bio)->mirror_num = mirror;
cb->mirror_num = mirror;
/*
@@ -249,36 +324,7 @@ static void end_compressed_bio_read(struct bio *bio)
csum_failed:
if (ret)
cb->errors = 1;
-
- /* release the compressed pages */
- index = 0;
- for (index = 0; index < cb->nr_pages; index++) {
- page = cb->compressed_pages[index];
- page->mapping = NULL;
- put_page(page);
- }
-
- /* do io completion on the original bio */
- if (cb->errors) {
- bio_io_error(cb->orig_bio);
- } else {
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- /*
- * we have verified the checksum already, set page
- * checked so the end_io handlers know about it
- */
- ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, cb->orig_bio, iter_all)
- SetPageChecked(bvec->bv_page);
-
- bio_endio(cb->orig_bio);
- }
-
- /* finally free the cb struct */
- kfree(cb->compressed_pages);
- kfree(cb);
+ finish_compressed_bio_read(cb, bio);
out:
bio_put(bio);
}
@@ -290,6 +336,7 @@ out:
static noinline void end_compressed_writeback(struct inode *inode,
const struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long index = cb->start >> PAGE_SHIFT;
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct page *pages[16];
@@ -312,7 +359,8 @@ static noinline void end_compressed_writeback(struct inode *inode,
for (i = 0; i < ret; i++) {
if (cb->errors)
SetPageError(pages[i]);
- end_page_writeback(pages[i]);
+ btrfs_page_clamp_clear_writeback(fs_info, pages[i],
+ cb->start, cb->len);
put_page(pages[i]);
}
nr_pages -= ret;
@@ -321,60 +369,127 @@ static noinline void end_compressed_writeback(struct inode *inode,
/* the inode may be gone now */
}
-/*
- * do the cleanup once all the compressed pages hit the disk.
- * This will clear writeback on the file pages and free the compressed
- * pages.
- *
- * This also calls the writeback end hooks for the file pages so that
- * metadata and checksums can be updated in the file.
- */
-static void end_compressed_bio_write(struct bio *bio)
+static void finish_compressed_bio_write(struct compressed_bio *cb)
{
- struct compressed_bio *cb = bio->bi_private;
- struct inode *inode;
- struct page *page;
+ struct inode *inode = cb->inode;
unsigned int index;
- if (bio->bi_status)
- cb->errors = 1;
-
- /* if there are more bios still pending for this compressed
- * extent, just exit
- */
- if (!refcount_dec_and_test(&cb->pending_bios))
- goto out;
-
- /* ok, we're the last bio for this extent, step one is to
- * call back into the FS and do all the end_io operations
+ /*
+ * Ok, we're the last bio for this extent, step one is to call back
+ * into the FS and do all the end_io operations.
*/
- inode = cb->inode;
- btrfs_record_physical_zoned(inode, cb->start, bio);
btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
cb->start, cb->start + cb->len - 1,
!cb->errors);
end_compressed_writeback(inode, cb);
- /* note, our inode could be gone now */
+ /* Note, our inode could be gone now */
/*
- * release the compressed pages, these came from alloc_page and
+ * Release the compressed pages, these came from alloc_page and
* are not attached to the inode at all
*/
- index = 0;
for (index = 0; index < cb->nr_pages; index++) {
- page = cb->compressed_pages[index];
+ struct page *page = cb->compressed_pages[index];
+
page->mapping = NULL;
put_page(page);
}
- /* finally free the cb struct */
+ /* Finally free the cb struct */
kfree(cb->compressed_pages);
kfree(cb);
+}
+
+/*
+ * Do the cleanup once all the compressed pages hit the disk. This will clear
+ * writeback on the file pages and free the compressed pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that metadata
+ * and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio)
+{
+ struct compressed_bio *cb = bio->bi_private;
+
+ if (!dec_and_test_compressed_bio(cb, bio))
+ goto out;
+
+ btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+
+ finish_compressed_bio_write(cb);
out:
bio_put(bio);
}
+static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
+ struct compressed_bio *cb,
+ struct bio *bio, int mirror_num)
+{
+ blk_status_t ret;
+
+ ASSERT(bio->bi_iter.bi_size);
+ ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
+ if (ret)
+ return ret;
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ return ret;
+}
+
+/*
+ * Allocate a compressed_bio, which will be used to read/write on-disk
+ * (aka, compressed) data.
+ *
+ * @cb: The compressed_bio structure, which records all the needed
+ * information to bind the compressed data to the uncompressed
+ * page cache.
+ * @disk_bytenr:	The logical bytenr where the compressed data will be read
+ * from or written to.
+ * @endio_func: The endio function to call after the IO for compressed data
+ * is finished.
+ * @next_stripe_start: Return value for the logical bytenr where the next stripe starts.
+ * Let the caller know to only fill the bio up to the stripe
+ * boundary.
+ */
+static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
+ unsigned int opf, bio_end_io_t endio_func,
+ u64 *next_stripe_start)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+ struct btrfs_io_geometry geom;
+ struct extent_map *em;
+ struct bio *bio;
+ int ret;
+
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
+
+ bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+ bio->bi_opf = opf;
+ bio->bi_private = cb;
+ bio->bi_end_io = endio_func;
+
+ em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
+ if (IS_ERR(em)) {
+ bio_put(bio);
+ return ERR_CAST(em);
+ }
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+ bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev);
+
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom);
+ free_extent_map(em);
+ if (ret < 0) {
+ bio_put(bio);
+ return ERR_PTR(ret);
+ }
+ *next_stripe_start = disk_bytenr + geom.len;
+
+ return bio;
+}
+
/*
* worker function to build and submit bios for previously compressed pages.
* The corresponding pages in the inode should be marked for writeback
@@ -395,20 +510,19 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = NULL;
struct compressed_bio *cb;
- unsigned long bytes_left;
- int pg_index = 0;
- struct page *page;
- u64 first_byte = disk_start;
+ u64 cur_disk_bytenr = disk_start;
+ u64 next_stripe_start;
blk_status_t ret;
int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
const bool use_append = btrfs_use_zone_append(inode, disk_start);
const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
- WARN_ON(!PAGE_ALIGNED(start));
+ ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+ IS_ALIGNED(len, fs_info->sectorsize));
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
if (!cb)
return BLK_STS_RESOURCE;
- refcount_set(&cb->pending_bios, 0);
+ refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
cb->errors = 0;
cb->inode = &inode->vfs_inode;
cb->start = start;
@@ -419,118 +533,100 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
cb->orig_bio = NULL;
cb->nr_pages = nr_pages;
- bio = btrfs_bio_alloc(first_byte);
- bio->bi_opf = bio_op | write_flags;
- bio->bi_private = cb;
- bio->bi_end_io = end_compressed_bio_write;
-
- if (use_append) {
- struct btrfs_device *device;
-
- device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE);
- if (IS_ERR(device)) {
- kfree(cb);
- bio_put(bio);
- return BLK_STS_NOTSUPP;
+ while (cur_disk_bytenr < disk_start + compressed_len) {
+ u64 offset = cur_disk_bytenr - disk_start;
+ unsigned int index = offset >> PAGE_SHIFT;
+ unsigned int real_size;
+ unsigned int added;
+ struct page *page = compressed_pages[index];
+ bool submit = false;
+
+ /* Allocate a new bio if the previous one was submitted or none exists yet */
+ if (!bio) {
+ bio = alloc_compressed_bio(cb, cur_disk_bytenr,
+ bio_op | write_flags, end_compressed_bio_write,
+ &next_stripe_start);
+ if (IS_ERR(bio)) {
+ ret = errno_to_blk_status(PTR_ERR(bio));
+ bio = NULL;
+ goto finish_cb;
+ }
}
-
- bio_set_dev(bio, device->bdev);
- }
-
- if (blkcg_css) {
- bio->bi_opf |= REQ_CGROUP_PUNT;
- kthread_associate_blkcg(blkcg_css);
- }
- refcount_set(&cb->pending_bios, 1);
-
- /* create and submit bios for the compressed pages */
- bytes_left = compressed_len;
- for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
- int submit = 0;
- int len = 0;
-
- page = compressed_pages[pg_index];
- page->mapping = inode->vfs_inode.i_mapping;
- if (bio->bi_iter.bi_size)
- submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
- 0);
-
/*
- * Page can only be added to bio if the current bio fits in
- * stripe.
+ * We should never reach next_stripe_start, as we submit the bio
+ * immediately once we reach the stripe boundary.
*/
- if (!submit) {
- if (pg_index == 0 && use_append)
- len = bio_add_zone_append_page(bio, page,
- PAGE_SIZE, 0);
- else
- len = bio_add_page(bio, page, PAGE_SIZE, 0);
- }
-
- page->mapping = NULL;
- if (submit || len < PAGE_SIZE) {
- /*
- * inc the count before we submit the bio so
- * we know the end IO handler won't happen before
- * we inc the count. Otherwise, the cb might get
- * freed before we're done setting it up
- */
- refcount_inc(&cb->pending_bios);
- ret = btrfs_bio_wq_end_io(fs_info, bio,
- BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
+ ASSERT(cur_disk_bytenr != next_stripe_start);
+ /*
+ * We have various limits on the real write size:
+ * - stripe boundary
+ * - page boundary
+ * - compressed length boundary
+ */
+ real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr);
+ real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+ real_size = min_t(u64, real_size, compressed_len - offset);
+ ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
+
+ if (use_append)
+ added = bio_add_zone_append_page(bio, page, real_size,
+ offset_in_page(offset));
+ else
+ added = bio_add_page(bio, page, real_size,
+ offset_in_page(offset));
+ /* Bio is full (e.g. reached the zone append size limit), need to submit */
+ if (added == 0)
+ submit = true;
+
+ cur_disk_bytenr += added;
+ /* Reached stripe boundary */
+ if (cur_disk_bytenr == next_stripe_start)
+ submit = true;
+
+ /* Finished the range */
+ if (cur_disk_bytenr == disk_start + compressed_len)
+ submit = true;
+
+ if (submit) {
if (!skip_sum) {
ret = btrfs_csum_one_bio(inode, bio, start, 1);
- BUG_ON(ret); /* -ENOMEM */
- }
-
- ret = btrfs_map_bio(fs_info, bio, 0);
- if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
+ if (ret)
+ goto finish_cb;
}
- bio = btrfs_bio_alloc(first_byte);
- bio->bi_opf = bio_op | write_flags;
- bio->bi_private = cb;
- bio->bi_end_io = end_compressed_bio_write;
- if (blkcg_css)
- bio->bi_opf |= REQ_CGROUP_PUNT;
- /*
- * Use bio_add_page() to ensure the bio has at least one
- * page.
- */
- bio_add_page(bio, page, PAGE_SIZE, 0);
+ ret = submit_compressed_bio(fs_info, cb, bio, 0);
+ if (ret)
+ goto finish_cb;
+ bio = NULL;
}
- if (bytes_left < PAGE_SIZE) {
- btrfs_info(fs_info,
- "bytes left %lu compress len %u nr %u",
- bytes_left, cb->compressed_len, cb->nr_pages);
- }
- bytes_left -= PAGE_SIZE;
- first_byte += PAGE_SIZE;
cond_resched();
}
+ if (blkcg_css)
+ kthread_associate_blkcg(NULL);
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
-
- if (!skip_sum) {
- ret = btrfs_csum_one_bio(inode, bio, start, 1);
- BUG_ON(ret); /* -ENOMEM */
- }
+ return 0;
- ret = btrfs_map_bio(fs_info, bio, 0);
- if (ret) {
+finish_cb:
+ if (bio) {
bio->bi_status = ret;
bio_endio(bio);
}
+ /* Last byte of @cb is submitted, endio will free @cb */
+ if (cur_disk_bytenr == disk_start + compressed_len)
+ return ret;
- if (blkcg_css)
- kthread_associate_blkcg(NULL);
-
- return 0;
+ wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
+ (disk_start + compressed_len - cur_disk_bytenr) >>
+ fs_info->sectorsize_bits);
+ /*
+ * Even with the previous bio ended, we should still have IO not yet
+ * submitted, thus we need to finish @cb manually.
+ */
+ ASSERT(refcount_read(&cb->pending_sectors));
+ /* Now we are the only one referring to @cb, so we can finish it safely. */
+ finish_compressed_bio_write(cb);
+ return ret;
}
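To make the wait condition in the finish_cb error path above concrete (illustrative numbers only): with a 4K sector size, disk_start == 0 and compressed_len == 128K, a failure right after the first 64K bio was submitted leaves cur_disk_bytenr == 64K. pending_sectors started at 32, the submitted bio's endio drops it by 16, and the submitter waits until it equals (0 + 128K - 64K) >> sectorsize_bits == 16, i.e. until only the never-submitted sectors remain accounted. At that point no endio handler can still reference @cb and finish_compressed_bio_write() may run safely.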
static u64 bio_end_offset(struct bio *bio)
@@ -540,25 +636,33 @@ static u64 bio_end_offset(struct bio *bio)
return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
}
+/*
+ * Add extra pages in the same compressed file extent so that we don't need to
+ * re-read the same extent again and again.
+ *
+ * NOTE: this won't work well for subpage, as for a subpage read we lock
+ * the full page and then submit a bio for each compressed/regular extent.
+ *
+ * This means that if several sectors in the same page point to the same
+ * on-disk compressed data, we will re-read the same extent many times, and
+ * this function can only help for the next page.
+ */
static noinline int add_ra_bio_pages(struct inode *inode,
u64 compressed_end,
struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long end_index;
- unsigned long pg_index;
- u64 last_offset;
+ u64 cur = bio_end_offset(cb->orig_bio);
u64 isize = i_size_read(inode);
int ret;
struct page *page;
- unsigned long nr_pages = 0;
struct extent_map *em;
struct address_space *mapping = inode->i_mapping;
struct extent_map_tree *em_tree;
struct extent_io_tree *tree;
- u64 end;
- int misses = 0;
+ int sectors_missed = 0;
- last_offset = bio_end_offset(cb->orig_bio);
em_tree = &BTRFS_I(inode)->extent_tree;
tree = &BTRFS_I(inode)->io_tree;
@@ -577,18 +681,29 @@ static noinline int add_ra_bio_pages(struct inode *inode,
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
- while (last_offset < compressed_end) {
- pg_index = last_offset >> PAGE_SHIFT;
+ while (cur < compressed_end) {
+ u64 page_end;
+ u64 pg_index = cur >> PAGE_SHIFT;
+ u32 add_size;
if (pg_index > end_index)
break;
page = xa_load(&mapping->i_pages, pg_index);
if (page && !xa_is_value(page)) {
- misses++;
- if (misses > 4)
+ sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >>
+ fs_info->sectorsize_bits;
+
+ /* Beyond threshold, no need to continue */
+ if (sectors_missed > 4)
break;
- goto next;
+
+ /*
+ * Jump to the next page start, as we already have a page for
+ * the current offset.
+ */
+ cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+ continue;
}
page = __page_cache_alloc(mapping_gfp_constraint(mapping,
@@ -598,14 +713,11 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
put_page(page);
- goto next;
+ /* There is already a page, skip to the start of the next page */
+ cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+ continue;
}
- /*
- * at this point, we have a locked page in the page cache
- * for these bytes in the file. But, we have to make
- * sure they map to this compressed extent on disk.
- */
ret = set_page_extent_mapped(page);
if (ret < 0) {
unlock_page(page);
@@ -613,18 +725,22 @@ static noinline int add_ra_bio_pages(struct inode *inode,
break;
}
- end = last_offset + PAGE_SIZE - 1;
- lock_extent(tree, last_offset, end);
+ page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
+ lock_extent(tree, cur, page_end);
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, last_offset,
- PAGE_SIZE);
+ em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
read_unlock(&em_tree->lock);
- if (!em || last_offset < em->start ||
- (last_offset + PAGE_SIZE > extent_map_end(em)) ||
+ /*
+ * At this point, we have a locked page in the page cache for
+ * these bytes in the file. But, we have to make sure they map
+ * to this compressed extent on disk.
+ */
+ if (!em || cur < em->start ||
+ (cur + fs_info->sectorsize > extent_map_end(em)) ||
(em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
- unlock_extent(tree, last_offset, end);
+ unlock_extent(tree, cur, page_end);
unlock_page(page);
put_page(page);
break;
@@ -642,20 +758,23 @@ static noinline int add_ra_bio_pages(struct inode *inode,
}
}
- ret = bio_add_page(cb->orig_bio, page,
- PAGE_SIZE, 0);
-
- if (ret == PAGE_SIZE) {
- nr_pages++;
- put_page(page);
- } else {
- unlock_extent(tree, last_offset, end);
+ add_size = min(em->start + em->len, page_end + 1) - cur;
+ ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
+ if (ret != add_size) {
+ unlock_extent(tree, cur, page_end);
unlock_page(page);
put_page(page);
break;
}
-next:
- last_offset += PAGE_SIZE;
+ /*
+ * If it's subpage, we also need to increase its
+ * subpage::readers number, as at endio time we will decrease
+ * subpage::readers and unlock the page.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE)
+ btrfs_subpage_start_reader(fs_info, page, cur, add_size);
+ put_page(page);
+ cur += add_size;
}
return 0;
}
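The miss accounting above is per-sector so the readahead cutoff behaves sensibly for subpage as well. With illustrative numbers: on a 64K-page / 4K-sector setup, a single already-cached, page-aligned page adds (64K - 0) >> 12 == 16 to sectors_missed, immediately exceeding the threshold of 4 and stopping the readahead; on a 4K-page system each cached page adds only 1, so up to four already-cached pages are tolerated before giving up.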
@@ -680,9 +799,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
unsigned int compressed_len;
unsigned int nr_pages;
unsigned int pg_index;
- struct page *page;
- struct bio *comp_bio;
- u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
+ struct bio *comp_bio = NULL;
+ const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 cur_disk_byte = disk_bytenr;
+ u64 next_stripe_start;
u64 file_offset;
u64 em_len;
u64 em_start;
@@ -709,7 +829,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
if (!cb)
goto out;
- refcount_set(&cb->pending_bios, 0);
+ refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
cb->errors = 0;
cb->inode = inode;
cb->mirror_num = mirror_num;
@@ -749,86 +869,74 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
/* include any pages we added in add_ra-bio_pages */
cb->len = bio->bi_iter.bi_size;
- comp_bio = btrfs_bio_alloc(cur_disk_byte);
- comp_bio->bi_opf = REQ_OP_READ;
- comp_bio->bi_private = cb;
- comp_bio->bi_end_io = end_compressed_bio_read;
- refcount_set(&cb->pending_bios, 1);
-
- for (pg_index = 0; pg_index < nr_pages; pg_index++) {
- u32 pg_len = PAGE_SIZE;
- int submit = 0;
+ while (cur_disk_byte < disk_bytenr + compressed_len) {
+ u64 offset = cur_disk_byte - disk_bytenr;
+ unsigned int index = offset >> PAGE_SHIFT;
+ unsigned int real_size;
+ unsigned int added;
+ struct page *page = cb->compressed_pages[index];
+ bool submit = false;
+
+ /* Allocate a new bio if the previous one was submitted or none exists yet */
+ if (!comp_bio) {
+ comp_bio = alloc_compressed_bio(cb, cur_disk_byte,
+ REQ_OP_READ, end_compressed_bio_read,
+ &next_stripe_start);
+ if (IS_ERR(comp_bio)) {
+ ret = errno_to_blk_status(PTR_ERR(comp_bio));
+ comp_bio = NULL;
+ goto finish_cb;
+ }
+ }
+ /*
+ * We should never reach next_stripe_start, as we submit comp_bio
+ * immediately once we reach the stripe boundary.
+ */
+ ASSERT(cur_disk_byte != next_stripe_start);
+ /*
+ * We have various limits on the real read size:
+ * - stripe boundary
+ * - page boundary
+ * - compressed length boundary
+ */
+ real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte);
+ real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+ real_size = min_t(u64, real_size, compressed_len - offset);
+ ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
+ added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset));
/*
- * To handle subpage case, we need to make sure the bio only
- * covers the range we need.
- *
- * If we're at the last page, truncate the length to only cover
- * the remaining part.
+ * The maximum compressed extent size is smaller than the bio size
+ * limit, thus bio_add_page() should always succeed.
*/
- if (pg_index == nr_pages - 1)
- pg_len = min_t(u32, PAGE_SIZE,
- compressed_len - pg_index * PAGE_SIZE);
+ ASSERT(added == real_size);
+ cur_disk_byte += added;
- page = cb->compressed_pages[pg_index];
- page->mapping = inode->i_mapping;
- page->index = em_start >> PAGE_SHIFT;
+ /* Reached stripe boundary, need to submit */
+ if (cur_disk_byte == next_stripe_start)
+ submit = true;
- if (comp_bio->bi_iter.bi_size)
- submit = btrfs_bio_fits_in_stripe(page, pg_len,
- comp_bio, 0);
+ /* Finished the range, need to submit */
+ if (cur_disk_byte == disk_bytenr + compressed_len)
+ submit = true;
- page->mapping = NULL;
- if (submit || bio_add_page(comp_bio, page, pg_len, 0) < pg_len) {
+ if (submit) {
unsigned int nr_sectors;
- ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
- BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
-
- /*
- * inc the count before we submit the bio so
- * we know the end IO handler won't happen before
- * we inc the count. Otherwise, the cb might get
- * freed before we're done setting it up
- */
- refcount_inc(&cb->pending_bios);
-
ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ goto finish_cb;
nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
fs_info->sectorsize);
sums += fs_info->csum_size * nr_sectors;
- ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
- if (ret) {
- comp_bio->bi_status = ret;
- bio_endio(comp_bio);
- }
-
- comp_bio = btrfs_bio_alloc(cur_disk_byte);
- comp_bio->bi_opf = REQ_OP_READ;
- comp_bio->bi_private = cb;
- comp_bio->bi_end_io = end_compressed_bio_read;
-
- bio_add_page(comp_bio, page, pg_len, 0);
+ ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num);
+ if (ret)
+ goto finish_cb;
+ comp_bio = NULL;
}
- cur_disk_byte += pg_len;
}
-
- ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
-
- ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
- BUG_ON(ret); /* -ENOMEM */
-
- ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
- if (ret) {
- comp_bio->bi_status = ret;
- bio_endio(comp_bio);
- }
-
return 0;
fail2:
@@ -843,6 +951,26 @@ fail1:
out:
free_extent_map(em);
return ret;
+finish_cb:
+ if (comp_bio) {
+ comp_bio->bi_status = ret;
+ bio_endio(comp_bio);
+ }
+ /* All bytes of @cb are submitted, endio will free @cb */
+ if (cur_disk_byte == disk_bytenr + compressed_len)
+ return ret;
+
+ wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
+ (disk_bytenr + compressed_len - cur_disk_byte) >>
+ fs_info->sectorsize_bits);
+ /*
+ * Even with the previous bio ended, we should still have IO not yet
+ * submitted, thus we need to finish @cb manually.
+ */
+ ASSERT(refcount_read(&cb->pending_sectors));
+ /* Now we are the only one referring to @cb, so we can finish it safely. */
+ finish_compressed_bio_read(cb, NULL);
+ return ret;
}
/*
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 399be0b435bf..56eef0821e3e 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -28,8 +28,8 @@ struct btrfs_inode;
#define BTRFS_ZLIB_DEFAULT_LEVEL 3
struct compressed_bio {
- /* number of bios pending for this compressed extent */
- refcount_t pending_bios;
+ /* Number of sectors with unfinished IO (not yet submitted or not yet completed) */
+ refcount_t pending_sectors;
/* Number of compressed pages in the array */
unsigned int nr_pages;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 84627cbd5b5b..c3983bdaf4b8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
+#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -395,7 +396,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (*cow_ret == buf)
unlock_orig = 1;
- btrfs_assert_tree_locked(buf);
+ btrfs_assert_tree_write_locked(buf);
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
@@ -2487,7 +2488,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
int ret;
BUG_ON(!path->nodes[level]);
- btrfs_assert_tree_locked(path->nodes[level]);
+ btrfs_assert_tree_write_locked(path->nodes[level]);
lower = path->nodes[level];
nritems = btrfs_header_nritems(lower);
BUG_ON(slot > nritems);
@@ -2827,7 +2828,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (slot >= btrfs_header_nritems(upper) - 1)
return 1;
- btrfs_assert_tree_locked(path->nodes[1]);
+ btrfs_assert_tree_write_locked(path->nodes[1]);
right = btrfs_read_node_slot(upper, slot + 1);
/*
@@ -3065,7 +3066,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
if (right_nritems == 0)
return 1;
- btrfs_assert_tree_locked(path->nodes[1]);
+ btrfs_assert_tree_write_locked(path->nodes[1]);
left = btrfs_read_node_slot(path->nodes[1], slot - 1);
/*
@@ -3581,40 +3582,6 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
}
/*
- * This function duplicate a item, giving 'new_key' to the new item.
- * It guarantees both items live in the same tree leaf and the new item
- * is contiguous with the original item.
- *
- * This allows us to split file extent in place, keeping a lock on the
- * leaf the entire time.
- */
-int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const struct btrfs_key *new_key)
-{
- struct extent_buffer *leaf;
- int ret;
- u32 item_size;
-
- leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
- ret = setup_leaf_for_split(trans, root, path,
- item_size + sizeof(struct btrfs_item));
- if (ret)
- return ret;
-
- path->slots[0]++;
- setup_items_for_insert(root, path, new_key, &item_size, 1);
- leaf = path->nodes[0];
- memcpy_extent_buffer(leaf,
- btrfs_item_ptr_offset(leaf, path->slots[0]),
- btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
- item_size);
- return 0;
-}
-
-/*
* make the item pointed to by the path smaller. new_size indicates
* how small to make it, and from_end tells us if we just chop bytes
* off the end of the item or if we shift the item to chop bytes off
@@ -3785,13 +3752,10 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
*
* @root: root we are inserting items to
* @path: points to the leaf/slot where we are going to insert new items
- * @cpu_key: array of keys for items to be inserted
- * @data_size: size of the body of each item we are going to insert
- * @nr: size of @cpu_key/@data_size arrays
+ * @batch: information about the batch of items to insert
*/
-void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr)
+static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+ const struct btrfs_item_batch *batch)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_item *item;
@@ -3803,14 +3767,14 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
int slot;
struct btrfs_map_token token;
u32 total_size;
- u32 total_data = 0;
-
- for (i = 0; i < nr; i++)
- total_data += data_size[i];
- total_size = total_data + (nr * sizeof(struct btrfs_item));
+ /*
+ * Before anything else, update keys in the parent and other ancestors
+ * if needed, then release the write locks on them, so that other tasks
+ * can use them while we modify the leaf.
+ */
if (path->slots[0] == 0) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+ btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
fixup_low_keys(path, &disk_key, 1);
}
btrfs_unlock_up_safe(path, 1);
@@ -3820,6 +3784,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(leaf);
+ total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
if (btrfs_leaf_free_space(leaf) < total_size) {
btrfs_print_leaf(leaf);
@@ -3849,31 +3814,32 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
item = btrfs_item_nr(i);
ioff = btrfs_token_item_offset(&token, item);
btrfs_set_token_item_offset(&token, item,
- ioff - total_data);
+ ioff - batch->total_data_size);
}
/* shift the items */
- memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
btrfs_item_nr_offset(slot),
(nritems - slot) * sizeof(struct btrfs_item));
/* shift the data */
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
- data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
- data_end, old_data - data_end);
+ data_end - batch->total_data_size,
+ BTRFS_LEAF_DATA_OFFSET + data_end,
+ old_data - data_end);
data_end = old_data;
}
/* setup the item for the new data */
- for (i = 0; i < nr; i++) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
+ for (i = 0; i < batch->nr; i++) {
+ btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
btrfs_set_item_key(leaf, &disk_key, slot + i);
item = btrfs_item_nr(slot + i);
- data_end -= data_size[i];
+ data_end -= batch->data_sizes[i];
btrfs_set_token_item_offset(&token, item, data_end);
- btrfs_set_token_item_size(&token, item, data_size[i]);
+ btrfs_set_token_item_size(&token, item, batch->data_sizes[i]);
}
- btrfs_set_header_nritems(leaf, nritems + nr);
+ btrfs_set_header_nritems(leaf, nritems + batch->nr);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
@@ -3883,26 +3849,43 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
}
/*
+ * Insert a new item into a leaf.
+ *
+ * @root: The root of the btree.
+ * @path: A path pointing to the target leaf and slot.
+ * @key: The key of the new item.
+ * @data_size: The size of the data associated with the new key.
+ */
+void btrfs_setup_item_for_insert(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key,
+ u32 data_size)
+{
+ struct btrfs_item_batch batch;
+
+ batch.keys = key;
+ batch.data_sizes = &data_size;
+ batch.total_data_size = data_size;
+ batch.nr = 1;
+
+ setup_items_for_insert(root, path, &batch);
+}
+
+/*
* Given a key and some data, insert items into the tree.
* This does all the path init required, making room in the tree if needed.
*/
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr)
+ const struct btrfs_item_batch *batch)
{
int ret = 0;
int slot;
- int i;
- u32 total_size = 0;
- u32 total_data = 0;
-
- for (i = 0; i < nr; i++)
- total_data += data_size[i];
+ u32 total_size;
- total_size = total_data + (nr * sizeof(struct btrfs_item));
- ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+ total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
+ ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
if (ret == 0)
return -EEXIST;
if (ret < 0)
@@ -3911,7 +3894,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
slot = path->slots[0];
BUG_ON(slot < 0);
- setup_items_for_insert(root, path, cpu_key, data_size, nr);
+ setup_items_for_insert(root, path, batch);
return 0;
}
@@ -3943,6 +3926,40 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
/*
+ * This function duplicates an item, giving 'new_key' to the new item.
+ * It guarantees both items live in the same tree leaf and the new item is
+ * contiguous with the original item.
+ *
+ * This allows us to split a file extent in place, keeping a lock on the leaf
+ * the entire time.
+ */
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *new_key)
+{
+ struct extent_buffer *leaf;
+ int ret;
+ u32 item_size;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ret = setup_leaf_for_split(trans, root, path,
+ item_size + sizeof(struct btrfs_item));
+ if (ret)
+ return ret;
+
+ path->slots[0]++;
+ btrfs_setup_item_for_insert(root, path, new_key, item_size);
+ leaf = path->nodes[0];
+ memcpy_extent_buffer(leaf,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
+ item_size);
+ return 0;
+}
+
+/*
* delete the pointer from a given node.
*
* the tree should have been previously balanced so the deletion does not
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c0cebcf745ce..7553e9dc5f93 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,6 +48,7 @@ extern struct kmem_cache *btrfs_free_space_cachep;
extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
struct btrfs_ordered_sum;
struct btrfs_ref;
+struct btrfs_bio;
#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
@@ -217,6 +218,9 @@ struct btrfs_root_backup {
u8 unused_8[10];
} __attribute__ ((__packed__));
+#define BTRFS_SUPER_INFO_OFFSET SZ_64K
+#define BTRFS_SUPER_INFO_SIZE 4096
+
/*
* the super block basically lists the main trees of the FS
* it currently lacks any block count etc etc
@@ -269,7 +273,11 @@ struct btrfs_super_block {
__le64 reserved[28];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
+
+ /* Padded to 4096 bytes */
+ u8 padding[565];
} __attribute__ ((__packed__));
+static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
/*
* Compat flags that we support. If any incompat flags are set other than the
@@ -899,6 +907,7 @@ struct btrfs_fs_info {
struct btrfs_workqueue *scrub_workers;
struct btrfs_workqueue *scrub_wr_completion_workers;
struct btrfs_workqueue *scrub_parity_workers;
+ struct btrfs_subpage_info *subpage_info;
struct btrfs_discard_ctl discard_ctl;
@@ -1017,6 +1026,16 @@ struct btrfs_fs_info {
spinlock_t treelog_bg_lock;
u64 treelog_bg;
+ /*
+ * Start of the dedicated data relocation block group, protected by
+ * relocation_bg_lock.
+ */
+ spinlock_t relocation_bg_lock;
+ u64 data_reloc_bg;
+
+ spinlock_t zone_active_bgs_lock;
+ struct list_head zone_active_bgs;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -2885,16 +2904,42 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
return btrfs_del_items(trans, root, path, path->slots[0], 1);
}
-void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr);
+/*
+ * Describes a batch of items to insert in a btree. This is used by
+ * btrfs_insert_empty_items().
+ */
+struct btrfs_item_batch {
+ /*
+ * Pointer to an array containing the keys of the items to insert (in
+ * sorted order).
+ */
+ const struct btrfs_key *keys;
+ /* Pointer to an array containing the data size for each item to insert. */
+ const u32 *data_sizes;
+ /*
+ * The sum of data sizes for all items. The caller can compute this while
+ * setting up the data_sizes array, so it ends up being more efficient
+ * than having btrfs_insert_empty_items() or setup_items_for_insert()
+ * compute it, as that would mean an extra loop over a potentially large
+ * array. In the case of setup_items_for_insert(), we would also be doing
+ * it while holding a write lock on a leaf, and often on upper level nodes
+ * too, unnecessarily increasing the size of a critical section.
+ */
+ u32 total_data_size;
+ /* Size of the keys and data_sizes arrays (number of items in the batch). */
+ int nr;
+};
+
+void btrfs_setup_item_for_insert(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key,
+ u32 data_size);
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, void *data, u32 data_size);
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr);
+ const struct btrfs_item_batch *batch);
static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -2902,7 +2947,14 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
const struct btrfs_key *key,
u32 data_size)
{
- return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
+ struct btrfs_item_batch batch;
+
+ batch.keys = key;
+ batch.data_sizes = &data_size;
+ batch.total_data_size = data_size;
+ batch.nr = 1;
+
+ return btrfs_insert_empty_items(trans, root, path, &batch);
}
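For illustration, a hypothetical caller inserting two items in one batch (the keys and sizes below are made up, and @trans, @root and @path are assumed to be set up as for any other insertion) would look roughly like:

	struct btrfs_item_batch batch;
	struct btrfs_key keys[2];
	u32 sizes[2] = { 16, 32 };	/* made-up per-item data sizes */
	int ret;

	/* Keys must be given in sorted order */
	keys[0].objectid = 256;
	keys[0].type = BTRFS_INODE_ITEM_KEY;
	keys[0].offset = 0;
	keys[1].objectid = 256;
	keys[1].type = BTRFS_INODE_REF_KEY;
	keys[1].offset = 256;

	batch.keys = keys;
	batch.data_sizes = sizes;
	batch.total_data_size = 16 + 32;	/* precomputed by the caller */
	batch.nr = 2;

	ret = btrfs_insert_empty_items(trans, root, path, &batch);
	/* On success, copy the item bodies via btrfs_item_ptr() and release the path */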
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
@@ -3129,8 +3181,9 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
-unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end);
+unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page,
+ u64 start, u64 end);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
@@ -3142,7 +3195,6 @@ void __btrfs_del_delalloc_inode(struct btrfs_root *root,
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *dir, struct btrfs_inode *inode,
const char *name, int name_len);
int btrfs_add_link(struct btrfs_trans_handle *trans,
@@ -3174,8 +3226,6 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
struct extent_state *other);
void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split);
-int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
- unsigned long bio_flags);
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
@@ -3242,9 +3292,9 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns,
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
int __pure btrfs_is_empty_uuid(u8 *uuid);
-int btrfs_defrag_file(struct inode *inode, struct file *file,
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
- u64 newer_than, unsigned long max_pages);
+ u64 newer_than, unsigned long max_to_defrag);
void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
@@ -3563,6 +3613,9 @@ do { \
(errno), fmt, ##args); \
} while (0)
+#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
+ &(fs_info)->fs_state)))
+
__printf(5, 6)
__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
@@ -3842,6 +3895,11 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
return fs_info->zoned != 0;
}
+static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
+{
+ return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
+}
+
/*
* We use page status Private2 to indicate there is an ordered extent with
* unfinished IO.
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 1e08eb2b27f0..e164766dcc38 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -679,19 +679,18 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_delayed_item *first_item)
{
- LIST_HEAD(batch);
+ LIST_HEAD(item_list);
struct btrfs_delayed_item *curr;
struct btrfs_delayed_item *next;
const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info);
+ struct btrfs_item_batch batch;
int total_size;
- int nitems;
char *ins_data = NULL;
- struct btrfs_key *ins_keys;
- u32 *ins_sizes;
int ret;
- list_add_tail(&first_item->tree_list, &batch);
- nitems = 1;
+ list_add_tail(&first_item->tree_list, &item_list);
+ batch.total_data_size = first_item->data_len;
+ batch.nr = 1;
total_size = first_item->data_len + sizeof(struct btrfs_item);
curr = first_item;
@@ -706,39 +705,43 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
if (total_size + next_size > max_size)
break;
- list_add_tail(&next->tree_list, &batch);
- nitems++;
+ list_add_tail(&next->tree_list, &item_list);
+ batch.nr++;
total_size += next_size;
+ batch.total_data_size += next->data_len;
curr = next;
}
- if (nitems == 1) {
- ins_keys = &first_item->key;
- ins_sizes = &first_item->data_len;
+ if (batch.nr == 1) {
+ batch.keys = &first_item->key;
+ batch.data_sizes = &first_item->data_len;
} else {
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
int i = 0;
- ins_data = kmalloc(nitems * sizeof(u32) +
- nitems * sizeof(struct btrfs_key), GFP_NOFS);
+ ins_data = kmalloc(batch.nr * sizeof(u32) +
+ batch.nr * sizeof(struct btrfs_key), GFP_NOFS);
if (!ins_data) {
ret = -ENOMEM;
goto out;
}
ins_sizes = (u32 *)ins_data;
- ins_keys = (struct btrfs_key *)(ins_data + nitems * sizeof(u32));
- list_for_each_entry(curr, &batch, tree_list) {
+ ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ list_for_each_entry(curr, &item_list, tree_list) {
ins_keys[i] = curr->key;
ins_sizes[i] = curr->data_len;
i++;
}
}
- ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes,
- nitems);
+ ret = btrfs_insert_empty_items(trans, root, path, &batch);
if (ret)
goto out;
- list_for_each_entry(curr, &batch, tree_list) {
+ list_for_each_entry(curr, &item_list, tree_list) {
char *data_ptr;
data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
@@ -754,7 +757,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
*/
btrfs_release_path(path);
- list_for_each_entry_safe(curr, next, &batch, tree_list) {
+ list_for_each_entry_safe(curr, next, &item_list, tree_list) {
list_del(&curr->tree_list);
btrfs_delayed_item_release_metadata(root, curr);
btrfs_release_delayed_item(curr);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ca848b183474..cca7e85e32dd 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -906,7 +906,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
u64 parent = generic_ref->parent;
u8 ref_type;
- is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID);
+ is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);
ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
BUG_ON(extent_op && extent_op->is_data);
@@ -921,8 +921,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- is_fstree(generic_ref->real_root) &&
- is_fstree(generic_ref->tree_ref.root) &&
!generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
@@ -938,14 +936,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
ref_type = BTRFS_TREE_BLOCK_REF_KEY;
init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
- generic_ref->tree_ref.root, action, ref_type);
- ref->root = generic_ref->tree_ref.root;
+ generic_ref->tree_ref.owning_root, action,
+ ref_type);
+ ref->root = generic_ref->tree_ref.owning_root;
ref->parent = parent;
ref->level = level;
init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
- generic_ref->tree_ref.root, 0, action, false,
- is_system);
+ generic_ref->tree_ref.owning_root, 0, action,
+ false, is_system);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -997,7 +996,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
u64 bytenr = generic_ref->bytenr;
u64 num_bytes = generic_ref->len;
u64 parent = generic_ref->parent;
- u64 ref_root = generic_ref->data_ref.ref_root;
+ u64 ref_root = generic_ref->data_ref.owning_root;
u64 owner = generic_ref->data_ref.ino;
u64 offset = generic_ref->data_ref.offset;
u8 ref_type;
@@ -1026,8 +1025,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- is_fstree(ref_root) &&
- is_fstree(generic_ref->real_root) &&
!generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index e22fba272e4f..91a3aabad150 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -186,8 +186,8 @@ enum btrfs_ref_type {
struct btrfs_data_ref {
/* For EXTENT_DATA_REF */
- /* Root which refers to this data extent */
- u64 ref_root;
+ /* Original root this data extent belongs to */
+ u64 owning_root;
/* Inode which refers to this data extent */
u64 ino;
@@ -210,11 +210,11 @@ struct btrfs_tree_ref {
int level;
/*
- * Root which refers to this tree block.
+ * Root which owns this tree block.
*
* For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
*/
- u64 root;
+ u64 owning_root;
/* For non-skinny metadata, no special member needed */
};
@@ -231,17 +231,10 @@ struct btrfs_ref {
*/
bool skip_qgroup;
- /*
- * Optional. For which root is this modification.
- * Mostly used for qgroup optimization.
- *
- * When unset, data/tree ref init code will populate it.
- * In certain cases, we're modifying reference for a different root.
- * E.g. COW fs tree blocks for balance.
- * In that case, tree_ref::root will be fs tree, but we're doing this
- * for reloc tree, then we should set @real_root to reloc tree.
- */
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ /* The root through which this modification was made. */
u64 real_root;
+#endif
u64 bytenr;
u64 len;
@@ -271,26 +264,40 @@ static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
}
static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref,
- int level, u64 root)
+ int level, u64 root, u64 mod_root, bool skip_qgroup)
{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
/* If @real_root not set, use @root as fallback */
- if (!generic_ref->real_root)
- generic_ref->real_root = root;
+ generic_ref->real_root = mod_root ?: root;
+#endif
generic_ref->tree_ref.level = level;
- generic_ref->tree_ref.root = root;
+ generic_ref->tree_ref.owning_root = root;
generic_ref->type = BTRFS_REF_METADATA;
+ if (skip_qgroup || !(is_fstree(root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
}
static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
- u64 ref_root, u64 ino, u64 offset)
+ u64 ref_root, u64 ino, u64 offset, u64 mod_root,
+ bool skip_qgroup)
{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
/* If @real_root not set, use @root as fallback */
- if (!generic_ref->real_root)
- generic_ref->real_root = ref_root;
- generic_ref->data_ref.ref_root = ref_root;
+ generic_ref->real_root = mod_root ?: ref_root;
+#endif
+ generic_ref->data_ref.owning_root = ref_root;
generic_ref->data_ref.ino = ino;
generic_ref->data_ref.offset = offset;
generic_ref->type = BTRFS_REF_DATA;
+ if (skip_qgroup || !(is_fstree(ref_root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
}
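Reading the new skip_qgroup decision with concrete cases (an illustrative interpretation based on the condition above and the removed @real_root comment): a ref whose owning root is an fs tree and whose modifying root is either 0 or another fs tree keeps qgroup accounting; a ref owned by a non-fs tree such as the chunk tree always skips it; and a block owned by an fs tree but COWed on behalf of a reloc tree during balance (mod_root set to the reloc tree, which is not an fs tree) also skips, matching the is_fstree() checks this patch moves out of delayed-ref.c.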
static inline struct btrfs_delayed_extent_op *
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index d029be40ea6f..c85a7d44da79 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -70,6 +70,7 @@ static int btrfs_dev_replace_kthread(void *data);
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID };
struct btrfs_key key;
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
@@ -100,8 +101,7 @@ no_valid_dev_replace_entry_found:
* We don't have a replace item or it's corrupted. If there is
* a replace target, fail the mount.
*/
- if (btrfs_find_device(fs_info->fs_devices,
- BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
+ if (btrfs_find_device(fs_info->fs_devices, &args)) {
btrfs_err(fs_info,
"found replace target device without a valid replace item");
ret = -EUCLEAN;
@@ -163,8 +163,7 @@ no_valid_dev_replace_entry_found:
* We don't have an active replace item but if there is a
* replace target, fail the mount.
*/
- if (btrfs_find_device(fs_info->fs_devices,
- BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
+ if (btrfs_find_device(fs_info->fs_devices, &args)) {
btrfs_err(fs_info,
"replace devid present without an active replace item");
ret = -EUCLEAN;
@@ -175,11 +174,10 @@ no_valid_dev_replace_entry_found:
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
- dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
- src_devid, NULL, NULL);
- dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
- BTRFS_DEV_REPLACE_DEVID,
- NULL, NULL);
+ dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args);
+ args.devid = src_devid;
+ dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args);
+
/*
* allow 'btrfs dev replace_cancel' if src/tgt device is
* missing
@@ -283,8 +281,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
}
- if (i_size_read(bdev->bd_inode) <
- btrfs_device_get_total_bytes(srcdev)) {
+ if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) {
btrfs_err(fs_info,
"target device is smaller than source device!");
ret = -EINVAL;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 355ea88d5c5f..59c3be8c1f4c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -683,7 +683,7 @@ err:
return ret;
}
-int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
+int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
struct page *page, u64 start, u64 end,
int mirror)
{
@@ -1036,7 +1036,7 @@ static int btree_set_page_dirty(struct page *page)
BUG_ON(!eb);
BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
BUG_ON(!atomic_read(&eb->refs));
- btrfs_assert_tree_locked(eb);
+ btrfs_assert_tree_write_locked(eb);
return __set_page_dirty_nobuffers(page);
}
ASSERT(PagePrivate(page) && page->private);
@@ -1061,7 +1061,7 @@ static int btree_set_page_dirty(struct page *page)
ASSERT(eb);
ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
ASSERT(atomic_read(&eb->refs));
- btrfs_assert_tree_locked(eb);
+ btrfs_assert_tree_write_locked(eb);
free_extent_buffer(eb);
cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
@@ -1125,7 +1125,7 @@ void btrfs_clean_tree_block(struct extent_buffer *buf)
struct btrfs_fs_info *fs_info = buf->fs_info;
if (btrfs_header_generation(buf) ==
fs_info->running_transaction->transid) {
- btrfs_assert_tree_locked(buf);
+ btrfs_assert_tree_write_locked(buf);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
@@ -1500,7 +1500,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
goto fail;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
- root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ !btrfs_is_data_reloc_root(root)) {
set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
btrfs_check_and_init_root_item(&root->root_item);
}
@@ -1644,6 +1644,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
+ kfree(fs_info->subpage_info);
kvfree(fs_info);
}
@@ -1953,8 +1954,7 @@ sleep:
wake_up_process(fs_info->cleaner_kthread);
mutex_unlock(&fs_info->transaction_kthread_mutex);
- if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
- &fs_info->fs_state)))
+ if (BTRFS_FS_ERROR(fs_info))
btrfs_cleanup_transaction(fs_info);
if (!kthread_should_stop() &&
(!btrfs_transaction_blocked(fs_info) ||
@@ -2592,8 +2592,7 @@ static int validate_super(struct btrfs_fs_info *fs_info,
/*
* For 4K page size, we only support 4K sector size.
- * For 64K page size, we support read-write for 64K sector size, and
- * read-only for 4K sector size.
+ * For 64K page size, we support 64K and 4K sector sizes.
*/
if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
(PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
@@ -2883,6 +2882,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
spin_lock_init(&fs_info->treelog_bg_lock);
+ spin_lock_init(&fs_info->zone_active_bgs_lock);
+ spin_lock_init(&fs_info->relocation_bg_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
mutex_init(&fs_info->reclaim_bgs_lock);
@@ -2896,6 +2897,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->unused_bgs);
INIT_LIST_HEAD(&fs_info->reclaim_bgs);
+ INIT_LIST_HEAD(&fs_info->zone_active_bgs);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&fs_info->allocated_roots);
INIT_LIST_HEAD(&fs_info->allocated_ebs);
@@ -3228,12 +3230,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
btrfs_init_btree_inode(fs_info);
- invalidate_bdev(fs_devices->latest_bdev);
+ invalidate_bdev(fs_devices->latest_dev->bdev);
/*
* Read super block and check the signature bytes only
*/
- disk_super = btrfs_read_dev_super(fs_devices->latest_bdev);
+ disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
if (IS_ERR(disk_super)) {
err = PTR_ERR(disk_super);
goto fail_alloc;
@@ -3392,12 +3394,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
- if (sectorsize != PAGE_SIZE) {
+ if (sectorsize < PAGE_SIZE) {
+ struct btrfs_subpage_info *subpage_info;
+
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
- }
- if (sectorsize != PAGE_SIZE) {
if (btrfs_super_incompat_flags(fs_info->super_copy) &
BTRFS_FEATURE_INCOMPAT_RAID56) {
btrfs_err(fs_info,
@@ -3406,6 +3408,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
err = -EINVAL;
goto fail_alloc;
}
+ subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
+ if (!subpage_info)
+ goto fail_alloc;
+ btrfs_init_subpage_info(subpage_info, sectorsize);
+ fs_info->subpage_info = subpage_info;
}
ret = btrfs_init_workqueues(fs_info, fs_devices);
@@ -3465,7 +3472,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
* below in btrfs_init_dev_replace().
*/
btrfs_free_extra_devids(fs_devices);
- if (!fs_devices->latest_bdev) {
+ if (!fs_devices->latest_dev->bdev) {
btrfs_err(fs_info, "failed to read devices");
goto fail_tree_roots;
}
@@ -3556,7 +3563,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_sysfs;
}
- if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
+ if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
+ !btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
"writable mount is not allowed due to too many missing devices");
goto fail_sysfs;
@@ -3740,7 +3748,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
else if (ret)
return ERR_PTR(ret);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
return ERR_PTR(-EINVAL);
page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
@@ -3881,7 +3889,9 @@ static int write_dev_supers(struct btrfs_device *device,
bio->bi_opf |= REQ_FUA;
btrfsic_submit_bio(bio);
- btrfs_advance_sb_log(device, i);
+
+ if (btrfs_advance_sb_log(device, i))
+ errors++;
}
return errors < i ? 0 : -1;
}
@@ -4221,7 +4231,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
drop_ref = true;
spin_unlock(&fs_info->fs_roots_radix_lock);
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
ASSERT(root->log_root == NULL);
if (root->reloc_root) {
btrfs_put_root(root->reloc_root);
@@ -4372,8 +4382,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info, "commit super ret %d", ret);
}
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
- test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
btrfs_error_commit_super(fs_info);
kthread_stop(fs_info->transaction_kthread);
@@ -4470,7 +4479,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
return;
#endif
- btrfs_assert_tree_locked(buf);
+ btrfs_assert_tree_write_locked(buf);
if (transid != fs_info->generation)
WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
buf->start, transid, fs_info->generation);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 0e7e9526b6a8..a2b5db4ba262 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -6,9 +6,6 @@
#ifndef BTRFS_DISK_IO_H
#define BTRFS_DISK_IO_H
-#define BTRFS_SUPER_INFO_OFFSET SZ_64K
-#define BTRFS_SUPER_INFO_SIZE 4096
-
#define BTRFS_SUPER_MIRROR_MAX 3
#define BTRFS_SUPER_MIRROR_SHIFT 12
@@ -81,7 +78,7 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
-int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
+int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
struct page *page, u64 start, u64 end,
int mirror);
blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0ab456cb4bf8..3fd736a02c1e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1266,7 +1266,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
return ret;
}
-static int do_discard_extent(struct btrfs_bio_stripe *stripe, u64 *bytes)
+static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes)
{
struct btrfs_device *dev = stripe->dev;
struct btrfs_fs_info *fs_info = dev->fs_info;
@@ -1313,22 +1313,21 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 discarded_bytes = 0;
u64 end = bytenr + num_bytes;
u64 cur = bytenr;
- struct btrfs_bio *bbio = NULL;
-
+ struct btrfs_io_context *bioc = NULL;
/*
- * Avoid races with device replace and make sure our bbio has devices
+ * Avoid races with device replace and make sure our bioc has devices
* associated to its stripes that don't go away while we are discarding.
*/
btrfs_bio_counter_inc_blocked(fs_info);
while (cur < end) {
- struct btrfs_bio_stripe *stripe;
+ struct btrfs_io_stripe *stripe;
int i;
num_bytes = end - cur;
/* Tell the block device(s) that the sectors can be discarded */
ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
- &num_bytes, &bbio, 0);
+ &num_bytes, &bioc, 0);
/*
* Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
* -EOPNOTSUPP. For any such error, @num_bytes is not updated,
@@ -1337,8 +1336,8 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
if (ret < 0)
goto out;
- stripe = bbio->stripes;
- for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+ stripe = bioc->stripes;
+ for (i = 0; i < bioc->num_stripes; i++, stripe++) {
u64 bytes;
struct btrfs_device *device = stripe->dev;
@@ -1361,7 +1360,7 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
* And since there are two loops, explicitly
* go to out to avoid confusion.
*/
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
goto out;
}
@@ -1372,7 +1371,7 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
*/
ret = 0;
}
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
cur += num_bytes;
}
out:
@@ -1397,7 +1396,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
generic_ref->action);
BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
- generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
+ generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
if (generic_ref->type == BTRFS_REF_METADATA)
ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
@@ -2376,7 +2375,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
out:
btrfs_free_path(path);
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
WARN_ON(ret > 0);
return ret;
}
@@ -2438,10 +2437,9 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
btrfs_init_generic_ref(&generic_ref, action, bytenr,
num_bytes, parent);
- generic_ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
- key.offset);
- generic_ref.skip_qgroup = for_reloc;
+ key.offset, root->root_key.objectid,
+ for_reloc);
if (inc)
ret = btrfs_inc_extent_ref(trans, &generic_ref);
else
@@ -2453,9 +2451,8 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
num_bytes = fs_info->nodesize;
btrfs_init_generic_ref(&generic_ref, action, bytenr,
num_bytes, parent);
- generic_ref.real_root = root->root_key.objectid;
- btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
- generic_ref.skip_qgroup = for_reloc;
+ btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
+ root->root_key.objectid, for_reloc);
if (inc)
ret = btrfs_inc_extent_ref(trans, &generic_ref);
else
@@ -3196,7 +3193,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto out;
}
- ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3289,7 +3286,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
buf->start, buf->len, parent);
btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
- root->root_key.objectid);
+ root->root_key.objectid, 0, false);
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
btrfs_ref_tree_mod(fs_info, &generic_ref);
@@ -3373,9 +3370,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
* tree, just update pinning info and exit early.
*/
if ((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+ ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
- ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
+ ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
/* unlocks the pinned mutex */
btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
ret = 0;
@@ -3386,9 +3383,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
}
if (!((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+ ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
- ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
+ ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
btrfs_ref_tree_mod(fs_info, ref);
return ret;
@@ -3476,7 +3473,9 @@ enum btrfs_extent_allocation_policy {
*/
struct find_free_extent_ctl {
/* Basic allocation info */
+ u64 ram_bytes;
u64 num_bytes;
+ u64 min_alloc_size;
u64 empty_size;
u64 flags;
int delalloc;
@@ -3495,6 +3494,9 @@ struct find_free_extent_ctl {
/* Allocation is called for tree-log */
bool for_treelog;
+ /* Allocation is called for data relocation */
+ bool for_data_reloc;
+
/* RAID index, converted from flags */
int index;
@@ -3756,8 +3758,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
u64 avail;
u64 bytenr = block_group->start;
u64 log_bytenr;
+ u64 data_reloc_bytenr;
int ret = 0;
- bool skip;
+ bool skip = false;
ASSERT(btrfs_is_zoned(block_group->fs_info));
@@ -3767,19 +3770,49 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
*/
spin_lock(&fs_info->treelog_bg_lock);
log_bytenr = fs_info->treelog_bg;
- skip = log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
- (!ffe_ctl->for_treelog && bytenr == log_bytenr));
+ if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
+ (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
+ skip = true;
spin_unlock(&fs_info->treelog_bg_lock);
if (skip)
return 1;
+ /*
+ * Do not allow non-relocation blocks in the dedicated relocation block
+ * group, and vice versa.
+ */
+ spin_lock(&fs_info->relocation_bg_lock);
+ data_reloc_bytenr = fs_info->data_reloc_bg;
+ if (data_reloc_bytenr &&
+ ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
+ (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
+ skip = true;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ if (skip)
+ return 1;
+ /* Check RO and no space case before trying to activate it */
+ spin_lock(&block_group->lock);
+ if (block_group->ro ||
+ block_group->alloc_offset == block_group->zone_capacity) {
+ spin_unlock(&block_group->lock);
+ return 1;
+ }
+ spin_unlock(&block_group->lock);
+
+ if (!btrfs_zone_activate(block_group))
+ return 1;
+
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
spin_lock(&fs_info->treelog_bg_lock);
+ spin_lock(&fs_info->relocation_bg_lock);
ASSERT(!ffe_ctl->for_treelog ||
block_group->start == fs_info->treelog_bg ||
fs_info->treelog_bg == 0);
+ ASSERT(!ffe_ctl->for_data_reloc ||
+ block_group->start == fs_info->data_reloc_bg ||
+ fs_info->data_reloc_bg == 0);
if (block_group->ro) {
ret = 1;
@@ -3796,7 +3829,18 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
goto out;
}
- avail = block_group->length - block_group->alloc_offset;
+ /*
+ * Do not allow currently used block group to be the data relocation
+ * dedicated block group.
+ */
+ if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
+ (block_group->used || block_group->reserved)) {
+ ret = 1;
+ goto out;
+ }
+
+ WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
+ avail = block_group->zone_capacity - block_group->alloc_offset;
if (avail < num_bytes) {
if (ffe_ctl->max_extent_size < avail) {
/*
@@ -3813,6 +3857,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
fs_info->treelog_bg = block_group->start;
+ if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg)
+ fs_info->data_reloc_bg = block_group->start;
+
ffe_ctl->found_offset = start + block_group->alloc_offset;
block_group->alloc_offset += num_bytes;
spin_lock(&ctl->tree_lock);
@@ -3829,6 +3876,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
out:
if (ret && ffe_ctl->for_treelog)
fs_info->treelog_bg = 0;
+ if (ret && ffe_ctl->for_data_reloc)
+ fs_info->data_reloc_bg = 0;
+ spin_unlock(&fs_info->relocation_bg_lock);
spin_unlock(&fs_info->treelog_bg_lock);
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
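/*
 * Editor's note (not part of the patch): derived from the lock/unlock
 * sequences above, the lock nesting order in do_allocation_zoned() is now,
 * outermost first:
 *
 *	spin_lock(&space_info->lock);
 *	spin_lock(&block_group->lock);
 *	spin_lock(&fs_info->treelog_bg_lock);
 *	spin_lock(&fs_info->relocation_bg_lock);
 */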
@@ -3932,18 +3982,30 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
ffe_ctl->orig_have_caching_bg = true;
- if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
- ffe_ctl->have_caching_bg)
- return 1;
-
- if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
- return 1;
-
if (ins->objectid) {
found_extent(ffe_ctl, ins);
return 0;
}
+ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
+ !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->index)) {
+ /*
+ * If we have enough free space left in an already active block
+ * group and we can't activate any other zone now, retry the
+ * active ones with a smaller allocation size. Returning early
+ * from here will tell btrfs_reserve_extent() to halve the
+ * size.
+ */
+ return -ENOSPC;
+ }
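/*
 * Editor's sketch (not part of the patch): the "halve the size" behaviour
 * referred to above lives in the -ENOSPC retry path of btrfs_reserve_extent(),
 * roughly as follows, assuming that retry path is otherwise unchanged by this
 * series:
 *
 *	} else if (ret == -ENOSPC) {
 *		if (!final_tried && ins->offset) {
 *			num_bytes = min(num_bytes >> 1, ins->offset);
 *			num_bytes = round_down(num_bytes, fs_info->sectorsize);
 *			num_bytes = max(num_bytes, min_alloc_size);
 *			ram_bytes = num_bytes;
 *			if (num_bytes == min_alloc_size)
 *				final_tried = true;
 *			goto again;
 *		}
 *	}
 */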
+
+ if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
+ return 1;
+
+ ffe_ctl->index++;
+ if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
+ return 1;
+
/*
* LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
* caching kthreads as we move along
@@ -4085,6 +4147,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
ffe_ctl->hint_byte = fs_info->treelog_bg;
spin_unlock(&fs_info->treelog_bg_lock);
}
+ if (ffe_ctl->for_data_reloc) {
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg)
+ ffe_ctl->hint_byte = fs_info->data_reloc_bg;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ }
return 0;
default:
BUG();
@@ -4117,65 +4185,62 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
* |- If not found, re-iterate all block groups
*/
static noinline int find_free_extent(struct btrfs_root *root,
- u64 ram_bytes, u64 num_bytes, u64 empty_size,
- u64 hint_byte_orig, struct btrfs_key *ins,
- u64 flags, int delalloc)
+ struct btrfs_key *ins,
+ struct find_free_extent_ctl *ffe_ctl)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
int cache_block_group_error = 0;
struct btrfs_block_group *block_group = NULL;
- struct find_free_extent_ctl ffe_ctl = {0};
struct btrfs_space_info *space_info;
bool full_search = false;
- bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
- WARN_ON(num_bytes < fs_info->sectorsize);
-
- ffe_ctl.num_bytes = num_bytes;
- ffe_ctl.empty_size = empty_size;
- ffe_ctl.flags = flags;
- ffe_ctl.search_start = 0;
- ffe_ctl.delalloc = delalloc;
- ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
- ffe_ctl.have_caching_bg = false;
- ffe_ctl.orig_have_caching_bg = false;
- ffe_ctl.found_offset = 0;
- ffe_ctl.hint_byte = hint_byte_orig;
- ffe_ctl.for_treelog = for_treelog;
- ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
+ WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
+ ffe_ctl->search_start = 0;
/* For clustered allocation */
- ffe_ctl.retry_clustered = false;
- ffe_ctl.retry_unclustered = false;
- ffe_ctl.last_ptr = NULL;
- ffe_ctl.use_cluster = true;
+ ffe_ctl->empty_cluster = 0;
+ ffe_ctl->last_ptr = NULL;
+ ffe_ctl->use_cluster = true;
+ ffe_ctl->have_caching_bg = false;
+ ffe_ctl->orig_have_caching_bg = false;
+ ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
+ ffe_ctl->loop = 0;
+ /* For clustered allocation */
+ ffe_ctl->retry_clustered = false;
+ ffe_ctl->retry_unclustered = false;
+ ffe_ctl->cached = 0;
+ ffe_ctl->max_extent_size = 0;
+ ffe_ctl->total_free_space = 0;
+ ffe_ctl->found_offset = 0;
+ ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
if (btrfs_is_zoned(fs_info))
- ffe_ctl.policy = BTRFS_EXTENT_ALLOC_ZONED;
+ ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
ins->offset = 0;
- trace_find_free_extent(root, num_bytes, empty_size, flags);
+ trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size,
+ ffe_ctl->flags);
- space_info = btrfs_find_space_info(fs_info, flags);
+ space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
if (!space_info) {
- btrfs_err(fs_info, "No space info for %llu", flags);
+ btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
return -ENOSPC;
}
- ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins);
+ ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
if (ret < 0)
return ret;
- ffe_ctl.search_start = max(ffe_ctl.search_start,
- first_logical_byte(fs_info, 0));
- ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte);
- if (ffe_ctl.search_start == ffe_ctl.hint_byte) {
+ ffe_ctl->search_start = max(ffe_ctl->search_start,
+ first_logical_byte(fs_info, 0));
+ ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
+ if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
block_group = btrfs_lookup_block_group(fs_info,
- ffe_ctl.search_start);
+ ffe_ctl->search_start);
/*
* we don't want to use the block group if it doesn't match our
* allocation bits, or if its not cached.
@@ -4183,7 +4248,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
* However if we are re-searching with an ideal block group
* picked out then we don't care that the block group is cached.
*/
- if (block_group && block_group_bits(block_group, flags) &&
+ if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
block_group->cached != BTRFS_CACHE_NO) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
@@ -4197,9 +4262,10 @@ static noinline int find_free_extent(struct btrfs_root *root,
btrfs_put_block_group(block_group);
up_read(&space_info->groups_sem);
} else {
- ffe_ctl.index = btrfs_bg_flags_to_raid_index(
- block_group->flags);
- btrfs_lock_block_group(block_group, delalloc);
+ ffe_ctl->index = btrfs_bg_flags_to_raid_index(
+ block_group->flags);
+ btrfs_lock_block_group(block_group,
+ ffe_ctl->delalloc);
goto have_block_group;
}
} else if (block_group) {
@@ -4207,31 +4273,33 @@ static noinline int find_free_extent(struct btrfs_root *root,
}
}
search:
- ffe_ctl.have_caching_bg = false;
- if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
- ffe_ctl.index == 0)
+ ffe_ctl->have_caching_bg = false;
+ if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
+ ffe_ctl->index == 0)
full_search = true;
down_read(&space_info->groups_sem);
list_for_each_entry(block_group,
- &space_info->block_groups[ffe_ctl.index], list) {
+ &space_info->block_groups[ffe_ctl->index], list) {
struct btrfs_block_group *bg_ret;
/* If the block group is read-only, we can skip it entirely. */
if (unlikely(block_group->ro)) {
- if (for_treelog)
+ if (ffe_ctl->for_treelog)
btrfs_clear_treelog_bg(block_group);
+ if (ffe_ctl->for_data_reloc)
+ btrfs_clear_data_reloc_bg(block_group);
continue;
}
- btrfs_grab_block_group(block_group, delalloc);
- ffe_ctl.search_start = block_group->start;
+ btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
+ ffe_ctl->search_start = block_group->start;
/*
* this can happen if we end up cycling through all the
* raid types, but we want to make sure we only allocate
* for the proper type.
*/
- if (!block_group_bits(block_group, flags)) {
+ if (!block_group_bits(block_group, ffe_ctl->flags)) {
u64 extra = BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1_MASK |
BTRFS_BLOCK_GROUP_RAID56_MASK |
@@ -4242,7 +4310,7 @@ search:
* doesn't provide them, bail. This does allow us to
* fill raid0 from raid1.
*/
- if ((flags & extra) && !(block_group->flags & extra))
+ if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
goto loop;
/*
@@ -4250,14 +4318,14 @@ search:
* It's possible that we have MIXED_GROUP flag but no
* block group is mixed. Just skip such block group.
*/
- btrfs_release_block_group(block_group, delalloc);
+ btrfs_release_block_group(block_group, ffe_ctl->delalloc);
continue;
}
have_block_group:
- ffe_ctl.cached = btrfs_block_group_done(block_group);
- if (unlikely(!ffe_ctl.cached)) {
- ffe_ctl.have_caching_bg = true;
+ ffe_ctl->cached = btrfs_block_group_done(block_group);
+ if (unlikely(!ffe_ctl->cached)) {
+ ffe_ctl->have_caching_bg = true;
ret = btrfs_cache_block_group(block_group, 0);
/*
@@ -4280,10 +4348,11 @@ have_block_group:
goto loop;
bg_ret = NULL;
- ret = do_allocation(block_group, &ffe_ctl, &bg_ret);
+ ret = do_allocation(block_group, ffe_ctl, &bg_ret);
if (ret == 0) {
if (bg_ret && bg_ret != block_group) {
- btrfs_release_block_group(block_group, delalloc);
+ btrfs_release_block_group(block_group,
+ ffe_ctl->delalloc);
block_group = bg_ret;
}
} else if (ret == -EAGAIN) {
@@ -4293,46 +4362,49 @@ have_block_group:
}
/* Checks */
- ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
- fs_info->stripesize);
+ ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
+ fs_info->stripesize);
/* move on to the next group */
- if (ffe_ctl.search_start + num_bytes >
+ if (ffe_ctl->search_start + ffe_ctl->num_bytes >
block_group->start + block_group->length) {
btrfs_add_free_space_unused(block_group,
- ffe_ctl.found_offset, num_bytes);
+ ffe_ctl->found_offset,
+ ffe_ctl->num_bytes);
goto loop;
}
- if (ffe_ctl.found_offset < ffe_ctl.search_start)
+ if (ffe_ctl->found_offset < ffe_ctl->search_start)
btrfs_add_free_space_unused(block_group,
- ffe_ctl.found_offset,
- ffe_ctl.search_start - ffe_ctl.found_offset);
+ ffe_ctl->found_offset,
+ ffe_ctl->search_start - ffe_ctl->found_offset);
- ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
- num_bytes, delalloc);
+ ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
+ ffe_ctl->num_bytes,
+ ffe_ctl->delalloc);
if (ret == -EAGAIN) {
btrfs_add_free_space_unused(block_group,
- ffe_ctl.found_offset, num_bytes);
+ ffe_ctl->found_offset,
+ ffe_ctl->num_bytes);
goto loop;
}
btrfs_inc_block_group_reservations(block_group);
/* we are all good, lets return */
- ins->objectid = ffe_ctl.search_start;
- ins->offset = num_bytes;
+ ins->objectid = ffe_ctl->search_start;
+ ins->offset = ffe_ctl->num_bytes;
- trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
- num_bytes);
- btrfs_release_block_group(block_group, delalloc);
+ trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start,
+ ffe_ctl->num_bytes);
+ btrfs_release_block_group(block_group, ffe_ctl->delalloc);
break;
loop:
- release_block_group(block_group, &ffe_ctl, delalloc);
+ release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
cond_resched();
}
up_read(&space_info->groups_sem);
- ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search);
+ ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
if (ret > 0)
goto search;
@@ -4341,12 +4413,12 @@ loop:
* Use ffe_ctl->total_free_space as fallback if we can't find
* any contiguous hole.
*/
- if (!ffe_ctl.max_extent_size)
- ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
+ if (!ffe_ctl->max_extent_size)
+ ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
spin_lock(&space_info->lock);
- space_info->max_extent_size = ffe_ctl.max_extent_size;
+ space_info->max_extent_size = ffe_ctl->max_extent_size;
spin_unlock(&space_info->lock);
- ins->offset = ffe_ctl.max_extent_size;
+ ins->offset = ffe_ctl->max_extent_size;
} else if (ret == -ENOSPC) {
ret = cache_block_group_error;
}
@@ -4404,16 +4476,28 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
struct btrfs_key *ins, int is_data, int delalloc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct find_free_extent_ctl ffe_ctl = {};
bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
flags = get_alloc_profile_by_root(root, is_data);
again:
WARN_ON(num_bytes < fs_info->sectorsize);
- ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
- hint_byte, ins, flags, delalloc);
+
+ ffe_ctl.ram_bytes = ram_bytes;
+ ffe_ctl.num_bytes = num_bytes;
+ ffe_ctl.min_alloc_size = min_alloc_size;
+ ffe_ctl.empty_size = empty_size;
+ ffe_ctl.flags = flags;
+ ffe_ctl.delalloc = delalloc;
+ ffe_ctl.hint_byte = hint_byte;
+ ffe_ctl.for_treelog = for_treelog;
+ ffe_ctl.for_data_reloc = for_data_reloc;
+
+ ret = find_free_extent(root, ins, &ffe_ctl);
if (!ret && !is_data) {
btrfs_dec_block_group_reservations(fs_info, ins->objectid);
} else if (ret == -ENOSPC) {
@@ -4431,8 +4515,8 @@ again:
sinfo = btrfs_find_space_info(fs_info, flags);
btrfs_err(fs_info,
- "allocation failed flags %llu, wanted %llu tree-log %d",
- flags, num_bytes, for_treelog);
+ "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
+ flags, num_bytes, for_treelog, for_data_reloc);
if (sinfo)
btrfs_dump_space_info(fs_info, sinfo,
num_bytes, 1);
@@ -4543,7 +4627,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
+ ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, true);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
@@ -4632,7 +4716,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
return ret;
ret = btrfs_update_block_group(trans, extent_key.objectid,
- fs_info->nodesize, 1);
+ fs_info->nodesize, true);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
extent_key.objectid, extent_key.offset);
@@ -4655,7 +4739,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
ins->objectid, ins->offset, 0);
- btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
+ btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
+ offset, 0, false);
btrfs_ref_tree_mod(root->fs_info, &generic_ref);
return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
@@ -4847,8 +4932,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
ins.objectid, ins.offset, parent);
- generic_ref.real_root = root->root_key.objectid;
- btrfs_init_tree_ref(&generic_ref, level, root_objectid);
+ btrfs_init_tree_ref(&generic_ref, level, root_objectid,
+ root->root_key.objectid, false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
if (ret)
@@ -5265,7 +5350,8 @@ skip:
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
fs_info->nodesize, parent);
- btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
+ btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
+ 0, false);
ret = btrfs_free_extent(trans, &ref);
if (ret)
goto out_unlock;
@@ -5750,13 +5836,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- btrfs_assert_tree_locked(parent);
+ btrfs_assert_tree_write_locked(parent);
parent_level = btrfs_header_level(parent);
atomic_inc(&parent->refs);
path->nodes[parent_level] = parent;
path->slots[parent_level] = btrfs_header_nritems(parent);
- btrfs_assert_tree_locked(node);
+ btrfs_assert_tree_write_locked(node);
level = btrfs_header_level(node);
path->nodes[level] = node;
path->slots[level] = 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index aaddd7225348..4e03a6d3aa32 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -241,7 +241,7 @@ int __init extent_io_init(void)
return -ENOMEM;
if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
- offsetof(struct btrfs_io_bio, bio),
+ offsetof(struct btrfs_bio, bio),
BIOSET_NEED_BVECS))
goto free_buffer_cache;
@@ -1975,10 +1975,18 @@ static noinline int lock_delalloc_pages(struct inode *inode,
/*
* Find and lock a contiguous range of bytes in the file marked as delalloc, no
- * more than @max_bytes. @Start and @end are used to return the range,
+ * more than @max_bytes.
*
- * Return: true if we find something
- * false if nothing was in the tree
+ * @start: The original start bytenr to search.
+ * Will store the extent range start bytenr.
+ * @end: The original end bytenr of the search range
+ * Will store the extent range end bytenr.
+ *
+ * Return true if we find a delalloc range which starts inside the original
+ * range, and @start/@end will store the delalloc range start/end.
+ *
+ * Return false if we can't find any delalloc range which starts inside the
+ * original range, and @start/@end will be the non-delalloc range start/end.
*/
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
@@ -1986,6 +1994,8 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
u64 *end)
{
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ const u64 orig_start = *start;
+ const u64 orig_end = *end;
u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end;
@@ -1994,15 +2004,23 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
int ret;
int loops = 0;
+ /* Caller should pass a valid @end to indicate the search range end */
+ ASSERT(orig_end > orig_start);
+
+ /* The range should at least cover part of the page */
+ ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE ||
+ orig_end <= page_offset(locked_page)));
again:
/* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
delalloc_end = 0;
found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
max_bytes, &cached_state);
- if (!found || delalloc_end <= *start) {
+ if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
*start = delalloc_start;
- *end = delalloc_end;
+
+ /* @delalloc_end can be -1, never go beyond @orig_end */
+ *end = min(delalloc_end, orig_end);
free_extent_state(cached_state);
return false;
}
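/*
 * Editor's sketch (not part of the patch): a minimal caller of the updated
 * interface, mirroring writepage_delalloc() later in this patch; the local
 * variable names are illustrative only:
 *
 *	u64 delalloc_start = page_offset(page);
 *	u64 delalloc_end = page_offset(page) + PAGE_SIZE - 1;
 *
 *	if (find_lock_delalloc_range(&inode->vfs_inode, page,
 *				     &delalloc_start, &delalloc_end)) {
 *		... [delalloc_start, delalloc_end] is now a locked delalloc
 *		    range that starts inside the original search range ...
 *	}
 */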
@@ -2282,15 +2300,15 @@ int free_io_failure(struct extent_io_tree *failure_tree,
* currently, there can be no more than two copies of every data bit. thus,
* exactly one rewrite is required.
*/
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num)
+static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+ u64 length, u64 logical, struct page *page,
+ unsigned int pg_offset, int mirror_num)
{
struct bio *bio;
struct btrfs_device *dev;
u64 map_length = 0;
u64 sector;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
int ret;
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
@@ -2299,12 +2317,12 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
if (btrfs_is_zoned(fs_info))
return btrfs_repair_one_zone(fs_info, logical);
- bio = btrfs_io_bio_alloc(1);
+ bio = btrfs_bio_alloc(1);
bio->bi_iter.bi_size = 0;
map_length = length;
/*
- * Avoid races with device replace and make sure our bbio has devices
+ * Avoid races with device replace and make sure our bioc has devices
* associated to its stripes that don't go away while we are doing the
* read repair operation.
*/
@@ -2317,28 +2335,28 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
* stripe's dev and sector.
*/
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
- &map_length, &bbio, 0);
+ &map_length, &bioc, 0);
if (ret) {
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
}
- ASSERT(bbio->mirror_num == 1);
+ ASSERT(bioc->mirror_num == 1);
} else {
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
- &map_length, &bbio, mirror_num);
+ &map_length, &bioc, mirror_num);
if (ret) {
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
}
- BUG_ON(mirror_num != bbio->mirror_num);
+ BUG_ON(mirror_num != bioc->mirror_num);
}
- sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
+ sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
bio->bi_iter.bi_sector = sector;
- dev = bbio->stripes[bbio->mirror_num - 1].dev;
- btrfs_put_bbio(bbio);
+ dev = bioc->stripes[bioc->mirror_num - 1].dev;
+ btrfs_put_bioc(bioc);
if (!dev || !dev->bdev ||
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
btrfs_bio_counter_dec(fs_info);
@@ -2618,10 +2636,10 @@ int btrfs_repair_one_sector(struct inode *inode,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
+ struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio);
const int icsum = bio_offset >> fs_info->sectorsize_bits;
struct bio *repair_bio;
- struct btrfs_io_bio *repair_io_bio;
+ struct btrfs_bio *repair_bbio;
blk_status_t status;
btrfs_debug(fs_info,
@@ -2639,24 +2657,23 @@ int btrfs_repair_one_sector(struct inode *inode,
return -EIO;
}
- repair_bio = btrfs_io_bio_alloc(1);
- repair_io_bio = btrfs_io_bio(repair_bio);
+ repair_bio = btrfs_bio_alloc(1);
+ repair_bbio = btrfs_bio(repair_bio);
repair_bio->bi_opf = REQ_OP_READ;
repair_bio->bi_end_io = failed_bio->bi_end_io;
repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
repair_bio->bi_private = failed_bio->bi_private;
- if (failed_io_bio->csum) {
+ if (failed_bbio->csum) {
const u32 csum_size = fs_info->csum_size;
- repair_io_bio->csum = repair_io_bio->csum_inline;
- memcpy(repair_io_bio->csum,
- failed_io_bio->csum + csum_size * icsum, csum_size);
+ repair_bbio->csum = repair_bbio->csum_inline;
+ memcpy(repair_bbio->csum,
+ failed_bbio->csum + csum_size * icsum, csum_size);
}
bio_add_page(repair_bio, page, failrec->len, pgoff);
- repair_io_bio->logical = failrec->start;
- repair_io_bio->iter = repair_bio->bi_iter;
+ repair_bbio->iter = repair_bio->bi_iter;
btrfs_debug(btrfs_sb(inode->i_sb),
"repair read error: submitting new read to mirror %d",
@@ -2976,7 +2993,7 @@ static struct extent_buffer *find_extent_buffer_readpage(
static void end_bio_extent_readpage(struct bio *bio)
{
struct bio_vec *bvec;
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct btrfs_bio *bbio = btrfs_bio(bio);
struct extent_io_tree *tree, *failure_tree;
struct processed_extent processed = { 0 };
/*
@@ -3003,7 +3020,7 @@ static void end_bio_extent_readpage(struct bio *bio)
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
bio->bi_iter.bi_sector, bio->bi_status,
- io_bio->mirror_num);
+ bbio->mirror_num);
tree = &BTRFS_I(inode)->io_tree;
failure_tree = &BTRFS_I(inode)->io_failure_tree;
@@ -3028,14 +3045,14 @@ static void end_bio_extent_readpage(struct bio *bio)
end = start + bvec->bv_len - 1;
len = bvec->bv_len;
- mirror = io_bio->mirror_num;
+ mirror = bbio->mirror_num;
if (likely(uptodate)) {
if (is_data_inode(inode)) {
- error_bitmap = btrfs_verify_data_csum(io_bio,
+ error_bitmap = btrfs_verify_data_csum(bbio,
bio_offset, page, start, end);
ret = error_bitmap;
} else {
- ret = btrfs_validate_metadata_buffer(io_bio,
+ ret = btrfs_validate_metadata_buffer(bbio,
page, start, end, mirror);
}
if (ret)
@@ -3106,7 +3123,7 @@ readpage_ok:
}
/* Release the last extent */
endio_readpage_release_extent(&processed, NULL, 0, 0, false);
- btrfs_io_bio_free_csum(io_bio);
+ btrfs_bio_free_csum(bbio);
bio_put(bio);
}
@@ -3115,53 +3132,43 @@ readpage_ok:
* new bio by bio_alloc_bioset as it does not initialize the bytes outside of
* 'bio' because use of __GFP_ZERO is not supported.
*/
-static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
+static inline void btrfs_bio_init(struct btrfs_bio *bbio)
{
- memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
+ memset(bbio, 0, offsetof(struct btrfs_bio, bio));
}
/*
- * The following helpers allocate a bio. As it's backed by a bioset, it'll
- * never fail. We're returning a bio right now but you can call btrfs_io_bio
- * for the appropriate container_of magic
+ * Allocate a btrfs_bio, with @nr_iovecs as maximum number of iovecs.
+ *
+ * The bio allocation is backed by bioset and does not fail.
*/
-struct bio *btrfs_bio_alloc(u64 first_byte)
+struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
{
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &btrfs_bioset);
- bio->bi_iter.bi_sector = first_byte >> 9;
- btrfs_io_bio_init(btrfs_io_bio(bio));
+ ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
+ bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
+ btrfs_bio_init(btrfs_bio(bio));
return bio;
}
struct bio *btrfs_bio_clone(struct bio *bio)
{
- struct btrfs_io_bio *btrfs_bio;
+ struct btrfs_bio *bbio;
struct bio *new;
/* Bio allocation backed by a bioset does not fail */
new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
- btrfs_bio = btrfs_io_bio(new);
- btrfs_io_bio_init(btrfs_bio);
- btrfs_bio->iter = bio->bi_iter;
+ bbio = btrfs_bio(new);
+ btrfs_bio_init(bbio);
+ bbio->iter = bio->bi_iter;
return new;
}
-struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
-{
- struct bio *bio;
-
- /* Bio allocation backed by a bioset does not fail */
- bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
- btrfs_io_bio_init(btrfs_io_bio(bio));
- return bio;
-}
-
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
{
struct bio *bio;
- struct btrfs_io_bio *btrfs_bio;
+ struct btrfs_bio *bbio;
ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
@@ -3169,11 +3176,11 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
ASSERT(bio);
- btrfs_bio = btrfs_io_bio(bio);
- btrfs_io_bio_init(btrfs_bio);
+ bbio = btrfs_bio(bio);
+ btrfs_bio_init(bbio);
bio_trim(bio, offset >> 9, size >> 9);
- btrfs_bio->iter = bio->bi_iter;
+ bbio->iter = bio->bi_iter;
return bio;
}
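/*
 * Editor's note (not part of the patch): with the allocator no longer taking
 * a byte offset, callers now set the starting sector themselves, as
 * alloc_new_bio() below does:
 *
 *	bio = btrfs_bio_alloc(BIO_MAX_VECS);
 *	bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
 */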
@@ -3307,14 +3314,15 @@ static int alloc_new_bio(struct btrfs_inode *inode,
struct bio *bio;
int ret;
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
/*
* For compressed page range, its disk_bytenr is always @disk_bytenr
* passed in, no matter if we have added any range into previous bio.
*/
if (bio_flags & EXTENT_BIO_COMPRESSED)
- bio = btrfs_bio_alloc(disk_bytenr);
+ bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
else
- bio = btrfs_bio_alloc(disk_bytenr + offset);
+ bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
bio_ctrl->bio = bio;
bio_ctrl->bio_flags = bio_flags;
bio->bi_end_io = end_io_func;
@@ -3327,7 +3335,7 @@ static int alloc_new_bio(struct btrfs_inode *inode,
if (wbc) {
struct block_device *bdev;
- bdev = fs_info->fs_devices->latest_bdev;
+ bdev = fs_info->fs_devices->latest_dev->bdev;
bio_set_dev(bio, bdev);
wbc_init_bio(wbc, bio);
}
@@ -3341,7 +3349,7 @@ static int alloc_new_bio(struct btrfs_inode *inode,
goto error;
}
- btrfs_io_bio(bio)->device = device;
+ btrfs_bio(bio)->device = device;
}
return 0;
error:
@@ -3599,6 +3607,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
bool force_bio_submit = false;
u64 disk_bytenr;
+ ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
struct extent_state *cached = NULL;
@@ -3777,17 +3786,18 @@ static void update_nr_written(struct writeback_control *wbc,
*/
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
struct page *page, struct writeback_control *wbc,
- u64 delalloc_start, unsigned long *nr_written)
+ unsigned long *nr_written)
{
- u64 page_end = delalloc_start + PAGE_SIZE - 1;
- bool found;
+ const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
+ u64 delalloc_start = page_offset(page);
u64 delalloc_to_write = 0;
- u64 delalloc_end = 0;
int ret;
int page_started = 0;
+ while (delalloc_start < page_end) {
+ u64 delalloc_end = page_end;
+ bool found;
- while (delalloc_end < page_end) {
found = find_lock_delalloc_range(&inode->vfs_inode, page,
&delalloc_start,
&delalloc_end);
@@ -3854,12 +3864,11 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
struct page *page, u64 *start, u64 *end)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage_info *spi = fs_info->subpage_info;
u64 orig_start = *start;
/* Declare as unsigned long so we can use bitmap ops */
- unsigned long dirty_bitmap;
unsigned long flags;
- int nbits = (orig_start - page_offset(page)) >> fs_info->sectorsize_bits;
- int range_start_bit = nbits;
+ int range_start_bit;
int range_end_bit;
/*
@@ -3872,13 +3881,18 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
return;
}
+ range_start_bit = spi->dirty_offset +
+ (offset_in_page(orig_start) >> fs_info->sectorsize_bits);
+
/* We should have the page locked, but just in case */
spin_lock_irqsave(&subpage->lock, flags);
- dirty_bitmap = subpage->dirty_bitmap;
+ bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
+ spi->dirty_offset + spi->bitmap_nr_bits);
spin_unlock_irqrestore(&subpage->lock, flags);
- bitmap_next_set_region(&dirty_bitmap, &range_start_bit, &range_end_bit,
- BTRFS_SUBPAGE_BITMAP_SIZE);
+ range_start_bit -= spi->dirty_offset;
+ range_end_bit -= spi->dirty_offset;
+
*start = page_offset(page) + range_start_bit * fs_info->sectorsize;
*end = page_offset(page) + range_end_bit * fs_info->sectorsize;
}
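/*
 * Editor's note (not part of the patch): with the unified subpage bitmap, the
 * dirty bit for a file offset lives at
 *
 *	spi->dirty_offset + (offset_in_page(offset) >> fs_info->sectorsize_bits)
 *
 * which is why spi->dirty_offset is subtracted back out of the result of
 * bitmap_next_set_region() before the bits are converted to byte offsets.
 */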
@@ -4054,8 +4068,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
struct extent_page_data *epd)
{
struct inode *inode = page->mapping->host;
- u64 start = page_offset(page);
- u64 page_end = start + PAGE_SIZE - 1;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u64 page_start = page_offset(page);
+ const u64 page_end = page_start + PAGE_SIZE - 1;
int ret;
int nr = 0;
size_t pg_offset;
@@ -4090,8 +4105,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
}
if (!epd->extent_locked) {
- ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
- &nr_written);
+ ret = writepage_delalloc(BTRFS_I(inode), page, wbc, &nr_written);
if (ret == 1)
return 0;
if (ret)
@@ -4141,8 +4155,20 @@ done:
* capable of that.
*/
if (PageError(page))
- end_extent_writepage(page, ret, start, page_end);
- unlock_page(page);
+ end_extent_writepage(page, ret, page_start, page_end);
+ if (epd->extent_locked) {
+ /*
+ * If epd->extent_locked, it's from extent_write_locked_range(),
+ * the page can either be locked by lock_page() or
+ * process_one_page().
+ * Let btrfs_page_unlock_writer() handle both cases.
+ */
+ ASSERT(wbc);
+ btrfs_page_unlock_writer(fs_info, page, wbc->range_start,
+ wbc->range_end + 1 - wbc->range_start);
+ } else {
+ unlock_page(page);
+ }
ASSERT(ret <= 0);
return ret;
}
@@ -4155,6 +4181,9 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
static void end_extent_buffer_writeback(struct extent_buffer *eb)
{
+ if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags))
+ btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len);
+
clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
smp_mb__after_atomic();
wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
@@ -4602,12 +4631,11 @@ static int submit_eb_subpage(struct page *page,
int submitted = 0;
u64 page_start = page_offset(page);
int bit_start = 0;
- const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE;
int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
int ret;
/* Lock and write each dirty extent buffers in the range */
- while (bit_start < nbits) {
+ while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
struct extent_buffer *eb;
unsigned long flags;
@@ -4623,7 +4651,8 @@ static int submit_eb_subpage(struct page *page,
break;
}
spin_lock_irqsave(&subpage->lock, flags);
- if (!((1 << bit_start) & subpage->dirty_bitmap)) {
+ if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
+ subpage->bitmaps)) {
spin_unlock_irqrestore(&subpage->lock, flags);
spin_unlock(&page->mapping->private_lock);
bit_start++;
@@ -4756,8 +4785,13 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
free_extent_buffer(eb);
return ret;
}
- if (cache)
+ if (cache) {
+ /* Implies write in zoned mode */
btrfs_put_block_group(cache);
+ /* Mark the last eb in a block group */
+ if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity)
+ set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags);
+ }
ret = write_one_eb(eb, wbc, epd);
free_extent_buffer(eb);
if (ret < 0)
@@ -4873,7 +4907,7 @@ retry:
* extent io tree. Thus we don't want to submit such wild eb
* if the fs already has error.
*/
- if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (!BTRFS_FS_ERROR(fs_info)) {
ret = flush_write_bio(&epd);
} else {
ret = -EROFS;
@@ -5069,23 +5103,28 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc)
return ret;
}
-int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
- int mode)
+/*
+ * Submit the pages in the range to bio for call sites which delalloc range has
+ * already been ran (aka, ordered extent inserted) and all pages are still
+ * locked.
+ */
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
{
+ bool found_error = false;
+ int first_error = 0;
int ret = 0;
struct address_space *mapping = inode->i_mapping;
struct page *page;
- unsigned long nr_pages = (end - start + PAGE_SIZE) >>
- PAGE_SHIFT;
-
+ u64 cur = start;
+ unsigned long nr_pages;
+ const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
struct extent_page_data epd = {
.bio_ctrl = { 0 },
.extent_locked = 1,
- .sync_io = mode == WB_SYNC_ALL,
+ .sync_io = 1,
};
struct writeback_control wbc_writepages = {
- .sync_mode = mode,
- .nr_to_write = nr_pages * 2,
+ .sync_mode = WB_SYNC_ALL,
.range_start = start,
.range_end = end + 1,
/* We're called from an async helper function */
@@ -5093,33 +5132,51 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
.no_cgroup_owner = 1,
};
+ ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
+ nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >>
+ PAGE_SHIFT;
+ wbc_writepages.nr_to_write = nr_pages * 2;
+
wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
- while (start <= end) {
- page = find_get_page(mapping, start >> PAGE_SHIFT);
- if (clear_page_dirty_for_io(page))
- ret = __extent_writepage(page, &wbc_writepages, &epd);
- else {
- btrfs_writepage_endio_finish_ordered(BTRFS_I(inode),
- page, start, start + PAGE_SIZE - 1, true);
- unlock_page(page);
+ while (cur <= end) {
+ u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
+
+ page = find_get_page(mapping, cur >> PAGE_SHIFT);
+ /*
+ * All pages in the range are locked since
+ * btrfs_run_delalloc_range(), thus there is no way to clear
+ * the page dirty flag.
+ */
+ ASSERT(PageLocked(page));
+ ASSERT(PageDirty(page));
+ clear_page_dirty_for_io(page);
+ ret = __extent_writepage(page, &wbc_writepages, &epd);
+ ASSERT(ret <= 0);
+ if (ret < 0) {
+ found_error = true;
+ first_error = ret;
}
put_page(page);
- start += PAGE_SIZE;
+ cur = cur_end + 1;
}
- ASSERT(ret <= 0);
- if (ret == 0)
+ if (!found_error)
ret = flush_write_bio(&epd);
else
end_write_bio(&epd, ret);
wbc_detach_inode(&wbc_writepages);
+ if (found_error)
+ return first_error;
return ret;
}
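/*
 * Editor's note (not part of the patch): since the mode argument is gone from
 * the prototype (see the extent_io.h hunk below), call sites shrink roughly
 * like this hypothetical hunk; the actual callers are updated elsewhere in
 * the series:
 *
 *	- extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL);
 *	+ extent_write_locked_range(&inode->vfs_inode, start, end);
 */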
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct inode *inode = mapping->host;
+ const bool data_reloc = btrfs_is_data_reloc_root(BTRFS_I(inode)->root);
+ const bool zoned = btrfs_is_zoned(BTRFS_I(inode)->root->fs_info);
int ret = 0;
struct extent_page_data epd = {
.bio_ctrl = { 0 },
@@ -5127,7 +5184,15 @@ int extent_writepages(struct address_space *mapping,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
+ /*
+ * Allow only a single thread to do the reloc work in zoned mode to
+ * protect the write pointer updates.
+ */
+ if (data_reloc && zoned)
+ btrfs_inode_lock(inode, 0);
ret = extent_write_cache_pages(mapping, wbc, &epd);
+ if (data_reloc && zoned)
+ btrfs_inode_unlock(inode, 0);
ASSERT(ret <= 0);
if (ret < 0) {
end_write_bio(&epd, ret);
@@ -6137,13 +6202,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
* page, but it may change in the future for 16K page size
* support, so we still preallocate the memory in the loop.
*/
- ret = btrfs_alloc_subpage(fs_info, &prealloc,
- BTRFS_SUBPAGE_METADATA);
- if (ret < 0) {
- unlock_page(p);
- put_page(p);
- exists = ERR_PTR(ret);
- goto free_eb;
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
+ if (IS_ERR(prealloc)) {
+ ret = PTR_ERR(prealloc);
+ unlock_page(p);
+ put_page(p);
+ exists = ERR_PTR(ret);
+ goto free_eb;
+ }
}
spin_lock(&mapping->private_lock);
@@ -7167,32 +7234,41 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
}
}
+#define GANG_LOOKUP_SIZE 16
static struct extent_buffer *get_next_extent_buffer(
struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
{
- struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE];
+ struct extent_buffer *gang[GANG_LOOKUP_SIZE];
struct extent_buffer *found = NULL;
u64 page_start = page_offset(page);
- int ret;
- int i;
+ u64 cur = page_start;
ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
- ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE);
lockdep_assert_held(&fs_info->buffer_lock);
- ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang,
- bytenr >> fs_info->sectorsize_bits,
- PAGE_SIZE / fs_info->nodesize);
- for (i = 0; i < ret; i++) {
- /* Already beyond page end */
- if (gang[i]->start >= page_start + PAGE_SIZE)
- break;
- /* Found one */
- if (gang[i]->start >= bytenr) {
- found = gang[i];
- break;
+ while (cur < page_start + PAGE_SIZE) {
+ int ret;
+ int i;
+
+ ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
+ (void **)gang, cur >> fs_info->sectorsize_bits,
+ min_t(unsigned int, GANG_LOOKUP_SIZE,
+ PAGE_SIZE / fs_info->nodesize));
+ if (ret == 0)
+ goto out;
+ for (i = 0; i < ret; i++) {
+ /* Already beyond page end */
+ if (gang[i]->start >= page_start + PAGE_SIZE)
+ goto out;
+ /* Found one */
+ if (gang[i]->start >= bytenr) {
+ found = gang[i];
+ goto out;
+ }
}
+ cur = gang[ret - 1]->start + gang[ret - 1]->len;
}
+out:
return found;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 53abdc280451..0399cf8e3c32 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -32,6 +32,7 @@ enum {
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
EXTENT_BUFFER_NO_CHECK,
+ EXTENT_BUFFER_ZONE_FINISH,
};
/* these are flags for __process_pages_contig */
@@ -183,8 +184,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
struct btrfs_bio_ctrl *bio_ctrl,
unsigned int read_flags, u64 *prev_em_start);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
-int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
- int mode);
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end);
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
@@ -277,14 +277,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 bits_to_clear, unsigned long page_ops);
-struct bio *btrfs_bio_alloc(u64 first_byte);
-struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
+struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 4a8e02f7b6c7..5a36add21305 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -360,7 +360,7 @@ static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
int i;
for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_bio_stripe *stripe = &map->stripes[i];
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
set_extent_bits_nowait(&device->alloc_state, stripe->physical,
@@ -375,7 +375,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
int i;
for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_bio_stripe *stripe = &map->stripes[i];
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
__clear_extent_bit(&device->alloc_state, stripe->physical,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 0b9401a5afd3..d1cbb64a78f3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -358,7 +358,7 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode,
* @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return
* checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If
* NULL, the checksum buffer is allocated and returned in
- * btrfs_io_bio(bio)->csum instead.
+ * btrfs_bio(bio)->csum instead.
*
* Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
*/
@@ -397,19 +397,18 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
return BLK_STS_RESOURCE;
if (!dst) {
- struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
+ struct btrfs_bio *bbio = btrfs_bio(bio);
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
- btrfs_bio->csum = kmalloc_array(nblocks, csum_size,
- GFP_NOFS);
- if (!btrfs_bio->csum) {
+ bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS);
+ if (!bbio->csum) {
btrfs_free_path(path);
return BLK_STS_RESOURCE;
}
} else {
- btrfs_bio->csum = btrfs_bio->csum_inline;
+ bbio->csum = bbio->csum_inline;
}
- csum = btrfs_bio->csum;
+ csum = bbio->csum;
} else {
csum = dst;
}
@@ -709,12 +708,12 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
index = 0;
}
- data = kmap_atomic(bvec.bv_page);
- crypto_shash_digest(shash, data + bvec.bv_offset
- + (i * fs_info->sectorsize),
+ data = bvec_kmap_local(&bvec);
+ crypto_shash_digest(shash,
+ data + (i * fs_info->sectorsize),
fs_info->sectorsize,
sums->sums + index);
- kunmap_atomic(data);
+ kunmap_local(data);
index += fs_info->csum_size;
offset += fs_info->sectorsize;
this_sum_bytes += fs_info->sectorsize;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a1762363f61f..11204dbbe053 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -437,9 +437,15 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
/*
* unlocks pages after btrfs_file_write is done with them
*/
-static void btrfs_drop_pages(struct page **pages, size_t num_pages)
+static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
+ struct page **pages, size_t num_pages,
+ u64 pos, u64 copied)
{
size_t i;
+ u64 block_start = round_down(pos, fs_info->sectorsize);
+ u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
+
+ ASSERT(block_len <= U32_MAX);
for (i = 0; i < num_pages; i++) {
/* page checked is some magic around finding pages that
* have been modified without going through btrfs_set_page_dirty
@@ -447,7 +453,8 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
* accessed as prepare_pages should have marked them accessed
* in prepare_pages via find_or_create_page()
*/
- ClearPageChecked(pages[i]);
+ btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
+ block_len);
unlock_page(pages[i]);
put_page(pages[i]);
}
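/*
 * Editor's example (not part of the patch): with a 4K sector size, pos = 5000
 * and copied = 3000 give block_start = 4096 and block_len = 8192 - 4096 =
 * 4096, so the clamp helpers touch exactly the sectors the copy dirtied.
 */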
@@ -504,7 +511,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
struct page *p = pages[i];
btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
- ClearPageChecked(p);
+ btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
}
@@ -869,7 +876,8 @@ next_slot:
btrfs_init_data_ref(&ref,
root->root_key.objectid,
new_key.objectid,
- args->start - extent_offset);
+ args->start - extent_offset,
+ 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
}
@@ -955,7 +963,8 @@ delete_extent_item:
btrfs_init_data_ref(&ref,
root->root_key.objectid,
key.objectid,
- key.offset - extent_offset);
+ key.offset - extent_offset, 0,
+ false);
ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
args->bytes_found += extent_end - key.offset;
@@ -1020,8 +1029,7 @@ delete_extent_item:
if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
path->slots[0]++;
}
- setup_items_for_insert(root, path, &key,
- &args->extent_item_size, 1);
+ btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size);
args->extent_inserted = true;
}
@@ -1232,7 +1240,7 @@ again:
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
num_bytes, 0);
btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
- orig_offset);
+ orig_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1257,7 +1265,8 @@ again:
other_end = 0;
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
num_bytes, 0);
- btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
+ btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
+ 0, false);
if (extent_mergeable(leaf, path->slots[0] + 1,
ino, bytenr, orig_offset,
&other_start, &other_end)) {
@@ -1709,7 +1718,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
* Fault pages before locking them in prepare_pages
* to avoid recursive lock
*/
- if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
ret = -EFAULT;
break;
}
@@ -1844,7 +1853,7 @@ again:
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
if (ret) {
- btrfs_drop_pages(pages, num_pages);
+ btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
break;
}
@@ -1852,7 +1861,7 @@ again:
if (only_release_metadata)
btrfs_check_nocow_unlock(BTRFS_I(inode));
- btrfs_drop_pages(pages, num_pages);
+ btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
cond_resched();
@@ -1903,16 +1912,17 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
+ const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
loff_t pos;
ssize_t written = 0;
ssize_t written_buffered;
+ size_t prev_left = 0;
loff_t endbyte;
ssize_t err;
unsigned int ilock_flags = 0;
- struct iomap_dio *dio = NULL;
if (iocb->ki_flags & IOCB_NOWAIT)
ilock_flags |= BTRFS_ILOCK_TRY;
@@ -1955,23 +1965,80 @@ relock:
goto buffered;
}
- dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- 0);
+ /*
+ * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
+ * calls generic_write_sync() (through iomap_dio_complete()), because
+ * that results in calling fsync (btrfs_sync_file()) which will try to
+ * lock the inode in exclusive/write mode.
+ */
+ if (is_sync_write)
+ iocb->ki_flags &= ~IOCB_DSYNC;
- btrfs_inode_unlock(inode, ilock_flags);
+ /*
+ * The iov_iter can be mapped to the same file range we are writing to.
+ * If that's the case, then we will deadlock in the iomap code, because
+ * it first calls our callback btrfs_dio_iomap_begin(), which will create
+ * an ordered extent, and after that it will fault in the pages that the
+ * iov_iter refers to. During the fault in we end up in the readahead
+ * pages code (starting at btrfs_readahead()), which will lock the range,
+ * find that ordered extent and then wait for it to complete (at
+ * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
+ * obviously the ordered extent can never complete, as we have not yet
+ * submitted the respective bio(s). This always happens when the buffer is
+ * memory mapped to the same file range, since the iomap DIO code always
+ * invalidates pages in the target file range (after starting and waiting
+ * for any writeback).
+ *
+ * So here we disable page faults in the iov_iter and then retry if we
+ * got -EFAULT, faulting in the pages before the retry.
+ */
+again:
+ from->nofault = true;
+ err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ IOMAP_DIO_PARTIAL, written);
+ from->nofault = false;
- if (IS_ERR_OR_NULL(dio)) {
- err = PTR_ERR_OR_ZERO(dio);
- if (err < 0 && err != -ENOTBLK)
- goto out;
- } else {
- written = iomap_dio_complete(dio);
+ /* No increment (+=) because iomap returns a cumulative value. */
+ if (err > 0)
+ written = err;
+
+ if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
+ const size_t left = iov_iter_count(from);
+ /*
+ * We have more data left to write. Try to fault in as many as
+ * possible of the remainder pages and retry. We do this without
+ * releasing and locking again the inode, to prevent races with
+ * truncate.
+ *
+ * Also, in case the iov refers to pages in the file range of the
+ * file we want to write to (due to a mmap), we could enter an
+ * infinite loop if we retry after faulting the pages in, since
+ * iomap will invalidate any pages in the range early on, before
+ * it tries to fault in the pages of the iov. So we keep track of
+ * how much was left of iov in the previous EFAULT and fallback
+ * to buffered IO in case we haven't made any progress.
+ */
+ if (left == prev_left) {
+ err = -ENOTBLK;
+ } else {
+ fault_in_iov_iter_readable(from, left);
+ prev_left = left;
+ goto again;
+ }
}
- if (written < 0 || !iov_iter_count(from)) {
- err = written;
+ btrfs_inode_unlock(inode, ilock_flags);
+
+ /*
+ * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
+ * the fsync (call generic_write_sync()).
+ */
+ if (is_sync_write)
+ iocb->ki_flags |= IOCB_DSYNC;
+
+ /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */
+ if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
goto out;
- }
buffered:
pos = iocb->ki_pos;
@@ -1996,7 +2063,7 @@ buffered:
invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
endbyte >> PAGE_SHIFT);
out:
- return written ? written : err;
+ return err < 0 ? err : written;
}
static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
@@ -2012,7 +2079,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
* have opened a file as writable, we have to stop this write operation
* to ensure consistency.
*/
- if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state))
+ if (BTRFS_FS_ERROR(inode->root->fs_info))
return -EROFS;
if (!(iocb->ki_flags & IOCB_DIRECT) &&
@@ -2620,7 +2687,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
extent_info->disk_len, 0);
ref_offset = extent_info->file_offset - extent_info->data_offset;
btrfs_init_data_ref(&ref, root->root_key.objectid,
- btrfs_ino(inode), ref_offset);
+ btrfs_ino(inode), ref_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
}
@@ -3650,6 +3717,8 @@ static int check_direct_read(struct btrfs_fs_info *fs_info,
static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
+ size_t prev_left = 0;
+ ssize_t read = 0;
ssize_t ret;
if (fsverity_active(inode))
@@ -3659,9 +3728,57 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
return 0;
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
- ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
+again:
+ /*
+ * This is similar to what we do for direct IO writes, see the comment
+ * at btrfs_direct_write(), but we also disable page faults in addition
+ * to disabling them only at the iov_iter level. This is because when
+ * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
+ * which can still trigger page fault-ins despite having set ->nofault
+ * to true on our 'to' iov_iter.
+ *
+ * The difference to direct IO writes is that we deadlock when trying
+ * to lock the extent range in the inode's tree during the page reads
+ * triggered by the fault in (while for writes it is due to waiting for
+ * our own ordered extent). This is because for direct IO reads,
+ * btrfs_dio_iomap_begin() returns with the extent range locked, which
+ * is only unlocked in the endio callback (end_bio_extent_readpage()).
+ */
+ pagefault_disable();
+ to->nofault = true;
+ ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ IOMAP_DIO_PARTIAL, read);
+ to->nofault = false;
+ pagefault_enable();
+
+ /* No increment (+=) because iomap returns a cumulative value. */
+ if (ret > 0)
+ read = ret;
+
+ if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
+ const size_t left = iov_iter_count(to);
+
+ if (left == prev_left) {
+ /*
+ * We didn't make any progress since the last attempt, so
+ * fall back to a buffered read for the remainder of the
+ * range. This is just to avoid any possibility of looping
+ * for too long.
+ */
+ ret = read;
+ } else {
+ /*
+ * We made some progress since the last retry or this is
+ * the first time we are retrying. Fault in as many pages
+ * as possible and retry.
+ */
+ fault_in_iov_iter_writeable(to, left);
+ prev_left = left;
+ goto again;
+ }
+ }
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
- return ret;
+ return ret < 0 ? ret : read;
}
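
One detail of the hunk above that is easy to misread is the "read = ret" assignment: with IOMAP_DIO_PARTIAL and the extra "read" argument, the return value is treated as a running total across resumed calls rather than a per-call delta. The toy userspace model below (invented helper, not the iomap API) shows why accumulating with "+=" would double-count earlier progress.

#include <assert.h>
#include <stddef.h>

static size_t total_done;	/* progress remembered across resumed calls */

/* Stand-in for a resumable partial-IO helper that reports a running total. */
static size_t resumable_io(size_t chunk)
{
	total_done += chunk;
	return total_done;	/* cumulative, not the amount done in this call */
}

int main(void)
{
	size_t read = 0;

	read = resumable_io(4096);	/* read == 4096 */
	read = resumable_io(8192);	/* read == 12288; '+=' here would yield 16384 */
	assert(read == 12288);
	return 0;
}
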
static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index da0eee7c9e5f..f3fee88c8ee0 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -22,6 +22,7 @@
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
+#include "subpage.h"
#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG SZ_64K
@@ -411,7 +412,10 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
for (i = 0; i < io_ctl->num_pages; i++) {
if (io_ctl->pages[i]) {
- ClearPageChecked(io_ctl->pages[i]);
+ btrfs_page_clear_checked(io_ctl->fs_info,
+ io_ctl->pages[i],
+ page_offset(io_ctl->pages[i]),
+ PAGE_SIZE);
unlock_page(io_ctl->pages[i]);
put_page(io_ctl->pages[i]);
}
@@ -2539,10 +2543,16 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
+ bool initial = (size == block_group->length);
+ u64 reclaimable_unusable;
+
+ WARN_ON(!initial && offset + size > block_group->zone_capacity);
spin_lock(&ctl->tree_lock);
if (!used)
to_free = size;
+ else if (initial)
+ to_free = block_group->zone_capacity;
else if (offset >= block_group->alloc_offset)
to_free = size;
else if (offset + size <= block_group->alloc_offset)
@@ -2565,12 +2575,15 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
spin_unlock(&block_group->lock);
}
+ reclaimable_unusable = block_group->zone_unusable -
+ (block_group->length - block_group->zone_capacity);
/* All the region is now unusable. Mark it as unused and reclaim */
if (block_group->zone_unusable == block_group->length) {
btrfs_mark_bg_unused(block_group);
} else if (bg_reclaim_threshold &&
- block_group->zone_unusable >=
- div_factor_fine(block_group->length, bg_reclaim_threshold)) {
+ reclaimable_unusable >=
+ div_factor_fine(block_group->zone_capacity,
+ bg_reclaim_threshold)) {
btrfs_mark_bg_to_reclaim(block_group);
}
@@ -2754,8 +2767,9 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
* out the free space after the allocation offset.
*/
if (btrfs_is_zoned(fs_info)) {
- btrfs_info(fs_info, "free space %llu",
- block_group->length - block_group->alloc_offset);
+ btrfs_info(fs_info, "free space %llu active %d",
+ block_group->zone_capacity - block_group->alloc_offset,
+ block_group->zone_is_active);
return;
}
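
For the reclaim check in the free-space hunk above, only the unusable bytes that fall inside the zone's usable capacity are counted, and they are compared against a percentage of zone_capacity rather than of the whole block group length. A small standalone arithmetic sketch (userspace, with invented example values) of that comparison:

#include <stdio.h>
#include <stdint.h>

/* Same meaning as the kernel helper: num * factor / 100. */
static uint64_t div_factor_fine(uint64_t num, uint32_t factor)
{
	return num * factor / 100;
}

int main(void)
{
	const uint64_t length        = 256ULL << 20;	/* block group size */
	const uint64_t zone_capacity = 192ULL << 20;	/* usable part of the zone */
	const uint64_t zone_unusable = 224ULL << 20;	/* includes the capped tail */
	const uint32_t threshold     = 75;		/* bg_reclaim_threshold (%) */

	/* Bytes beyond zone_capacity are always unusable; don't count them. */
	uint64_t reclaimable_unusable = zone_unusable - (length - zone_capacity);
	uint64_t limit = div_factor_fine(zone_capacity, threshold);

	if (reclaimable_unusable >= limit)
		printf("mark block group for reclaim (%llu >= %llu)\n",
		       (unsigned long long)reclaimable_unusable,
		       (unsigned long long)limit);
	else
		printf("keep block group\n");
	return 0;
}
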
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7c096ab9bb5e..b8c911a4a320 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6,6 +6,7 @@
#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
+#include <linux/blk-cgroup.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -456,11 +457,10 @@ struct async_chunk {
struct list_head extents;
struct cgroup_subsys_state *blkcg_css;
struct btrfs_work work;
- atomic_t *pending;
+ struct async_cow *async_cow;
};
struct async_cow {
- /* Number of chunks in flight; must be first in the structure */
atomic_t num_chunks;
struct async_chunk chunks[];
};
@@ -491,9 +491,6 @@ static noinline int add_async_extent(struct async_chunk *cow,
*/
static inline bool inode_can_compress(struct btrfs_inode *inode)
{
- /* Subpage doesn't support compression yet */
- if (inode->root->fs_info->sectorsize < PAGE_SIZE)
- return false;
if (inode->flags & BTRFS_INODE_NODATACOW ||
inode->flags & BTRFS_INODE_NODATASUM)
return false;
@@ -515,6 +512,38 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
btrfs_ino(inode));
return 0;
}
+ /*
+ * Special check for subpage.
+ *
+ * We lock the full page and then run each delalloc range in the page;
+ * for the following case we will hit a subpage-specific corner case:
+ *
+ * 0 32K 64K
+ * | |///////| |///////|
+ * \- A \- B
+ *
+ * In the above case, both range A and range B will try to unlock the
+ * full page [0, 64K); whichever finishes later will find the page
+ * already unlocked, triggering various page lock requirement BUG_ON()s.
+ *
+ * So here we add an artificial limit: subpage compression is only
+ * allowed if the range is fully page aligned.
+ *
+ * In theory we only need to ensure the first page is fully covered, but
+ * the trailing partial page would stay locked until the full compression
+ * finishes, delaying writes to the other ranges.
+ *
+ * TODO: Make btrfs_run_delalloc_range() lock all delalloc ranges first,
+ * to prevent any submitted async extent from unlocking the full page.
+ * That way we can ensure, for the subpage case, that only the last
+ * async_cow will unlock the full page.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ if (!IS_ALIGNED(start, PAGE_SIZE) ||
+ !IS_ALIGNED(end + 1, PAGE_SIZE))
+ return 0;
+ }
+
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
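
To illustrate the subpage rule added in the hunk above: a delalloc range is only allowed into compression when both its start and its end + 1 sit on page boundaries, so two ranges can never race to unlock the same partially covered page. A minimal userspace sketch of the check, assuming 64K pages (the constant and helper name are made up):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_PAGE_SIZE 65536u	/* e.g. a 64K page with a 4K sectorsize */

static bool subpage_range_can_compress(uint64_t start, uint64_t end)
{
	return (start % EXAMPLE_PAGE_SIZE == 0) &&
	       ((end + 1) % EXAMPLE_PAGE_SIZE == 0);
}

int main(void)
{
	/* Range A from the diagram above: [0, 32K) inside a 64K page -> rejected. */
	printf("range A allowed: %d\n", subpage_range_can_compress(0, 32767));
	/* A fully page-aligned range [0, 128K) -> allowed. */
	printf("aligned range allowed: %d\n", subpage_range_can_compress(0, 131071));
	return 0;
}
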
@@ -616,13 +645,24 @@ again:
total_compressed = actual_end - start;
/*
- * skip compression for a small file range(<=blocksize) that
+ * Skip compression for a small file range (<= blocksize) that
* isn't an inline extent, since it doesn't save disk space at all.
*/
if (total_compressed <= blocksize &&
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
goto cleanup_and_bail_uncompressed;
+ /*
+ * For subpage case, we require full page alignment for the sector
+ * aligned range.
+ * Thus we must also check against @actual_end, not just @end.
+ */
+ if (blocksize < PAGE_SIZE) {
+ if (!IS_ALIGNED(start, PAGE_SIZE) ||
+ !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE))
+ goto cleanup_and_bail_uncompressed;
+ }
+
total_compressed = min_t(unsigned long, total_compressed,
BTRFS_MAX_UNCOMPRESSED);
total_in = 0;
@@ -760,7 +800,7 @@ cont:
* win, compare the page count read with the blocks on disk,
* compression must free at least one sector size
*/
- total_in = ALIGN(total_in, PAGE_SIZE);
+ total_in = round_up(total_in, fs_info->sectorsize);
if (total_compressed + blocksize <= total_in) {
compressed_extents++;
@@ -841,166 +881,148 @@ static void free_async_extent_pages(struct async_extent *async_extent)
async_extent->pages = NULL;
}
-/*
- * phase two of compressed writeback. This is the ordered portion
- * of the code, which only gets called in the order the work was
- * queued. We walk all the async extents created by compress_file_range
- * and send them down to the disk.
- */
-static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
+static int submit_uncompressed_range(struct btrfs_inode *inode,
+ struct async_extent *async_extent,
+ struct page *locked_page)
{
- struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct async_extent *async_extent;
- u64 alloc_hint = 0;
- struct btrfs_key ins;
- struct extent_map *em;
- struct btrfs_root *root = inode->root;
- struct extent_io_tree *io_tree = &inode->io_tree;
- int ret = 0;
-
-again:
- while (!list_empty(&async_chunk->extents)) {
- async_extent = list_entry(async_chunk->extents.next,
- struct async_extent, list);
- list_del(&async_extent->list);
-
-retry:
- lock_extent(io_tree, async_extent->start,
- async_extent->start + async_extent->ram_size - 1);
- /* did the compression code fall back to uncompressed IO? */
- if (!async_extent->pages) {
- int page_started = 0;
- unsigned long nr_written = 0;
+ u64 start = async_extent->start;
+ u64 end = async_extent->start + async_extent->ram_size - 1;
+ unsigned long nr_written = 0;
+ int page_started = 0;
+ int ret;
- /* allocate blocks */
- ret = cow_file_range(inode, async_chunk->locked_page,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- &page_started, &nr_written, 0);
+ /*
+ * Call cow_file_range() to run the delalloc range directly, since we
+ * won't go to NOCOW or async path again.
+ *
+ * Also, we call cow_file_range() with @unlock_page == 0, so that we
+ * can submit the pages directly without interruption.
+ */
+ ret = cow_file_range(inode, locked_page, start, end, &page_started,
+ &nr_written, 0);
+ /* Inline extent inserted, page gets unlocked and everything is done */
+ if (page_started) {
+ ret = 0;
+ goto out;
+ }
+ if (ret < 0) {
+ if (locked_page)
+ unlock_page(locked_page);
+ goto out;
+ }
- /* JDM XXX */
+ ret = extent_write_locked_range(&inode->vfs_inode, start, end);
+ /* All pages will be unlocked, including @locked_page */
+out:
+ kfree(async_extent);
+ return ret;
+}
- /*
- * if page_started, cow_file_range inserted an
- * inline extent and took care of all the unlocking
- * and IO for us. Otherwise, we need to submit
- * all those pages down to the drive.
- */
- if (!page_started && !ret)
- extent_write_locked_range(&inode->vfs_inode,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- WB_SYNC_ALL);
- else if (ret && async_chunk->locked_page)
- unlock_page(async_chunk->locked_page);
- kfree(async_extent);
- cond_resched();
- continue;
- }
+static int submit_one_async_extent(struct btrfs_inode *inode,
+ struct async_chunk *async_chunk,
+ struct async_extent *async_extent,
+ u64 *alloc_hint)
+{
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key ins;
+ struct page *locked_page = NULL;
+ struct extent_map *em;
+ int ret = 0;
+ u64 start = async_extent->start;
+ u64 end = async_extent->start + async_extent->ram_size - 1;
- ret = btrfs_reserve_extent(root, async_extent->ram_size,
- async_extent->compressed_size,
- async_extent->compressed_size,
- 0, alloc_hint, &ins, 1, 1);
- if (ret) {
- free_async_extent_pages(async_extent);
+ /*
+ * If async_chunk->locked_page is in the async_extent range, we need to
+ * handle it.
+ */
+ if (async_chunk->locked_page) {
+ u64 locked_page_start = page_offset(async_chunk->locked_page);
+ u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
- if (ret == -ENOSPC) {
- unlock_extent(io_tree, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1);
+ if (!(start >= locked_page_end || end <= locked_page_start))
+ locked_page = async_chunk->locked_page;
+ }
+ lock_extent(io_tree, start, end);
- /*
- * we need to redirty the pages if we decide to
- * fallback to uncompressed IO, otherwise we
- * will not submit these pages down to lower
- * layers.
- */
- extent_range_redirty_for_io(&inode->vfs_inode,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1);
+ /* We have fallen back to an uncompressed write */
+ if (!async_extent->pages)
+ return submit_uncompressed_range(inode, async_extent, locked_page);
- goto retry;
- }
- goto out_free;
- }
+ ret = btrfs_reserve_extent(root, async_extent->ram_size,
+ async_extent->compressed_size,
+ async_extent->compressed_size,
+ 0, *alloc_hint, &ins, 1, 1);
+ if (ret) {
+ free_async_extent_pages(async_extent);
/*
- * here we're doing allocation and writeback of the
- * compressed pages
+ * Here we used to retry for ENOSPC by going back to the
+ * non-compressed path. But if we can't reserve space even for
+ * the compressed size, there is no way reserving the larger
+ * uncompressed size could succeed, so go straight to the
+ * error path.
*/
- em = create_io_em(inode, async_extent->start,
- async_extent->ram_size, /* len */
- async_extent->start, /* orig_start */
- ins.objectid, /* block_start */
- ins.offset, /* block_len */
- ins.offset, /* orig_block_len */
- async_extent->ram_size, /* ram_bytes */
- async_extent->compress_type,
- BTRFS_ORDERED_COMPRESSED);
- if (IS_ERR(em))
- /* ret value is not necessary due to void function */
- goto out_free_reserve;
- free_extent_map(em);
-
- ret = btrfs_add_ordered_extent_compress(inode,
- async_extent->start,
- ins.objectid,
- async_extent->ram_size,
- ins.offset,
- async_extent->compress_type);
- if (ret) {
- btrfs_drop_extent_cache(inode, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1, 0);
- goto out_free_reserve;
- }
- btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ goto out_free;
+ }
+
+ /* Here we're doing allocation and writeback of the compressed pages */
+ em = create_io_em(inode, start,
+ async_extent->ram_size, /* len */
+ start, /* orig_start */
+ ins.objectid, /* block_start */
+ ins.offset, /* block_len */
+ ins.offset, /* orig_block_len */
+ async_extent->ram_size, /* ram_bytes */
+ async_extent->compress_type,
+ BTRFS_ORDERED_COMPRESSED);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_free_reserve;
+ }
+ free_extent_map(em);
- /*
- * clear dirty, set writeback and unlock the pages.
- */
- extent_clear_unlock_delalloc(inode, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
- PAGE_UNLOCK | PAGE_START_WRITEBACK);
- if (btrfs_submit_compressed_write(inode, async_extent->start,
- async_extent->ram_size,
- ins.objectid,
- ins.offset, async_extent->pages,
- async_extent->nr_pages,
- async_chunk->write_flags,
- async_chunk->blkcg_css)) {
- struct page *p = async_extent->pages[0];
- const u64 start = async_extent->start;
- const u64 end = start + async_extent->ram_size - 1;
-
- p->mapping = inode->vfs_inode.i_mapping;
- btrfs_writepage_endio_finish_ordered(inode, p, start,
- end, false);
-
- p->mapping = NULL;
- extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
- PAGE_END_WRITEBACK |
- PAGE_SET_ERROR);
- free_async_extent_pages(async_extent);
- }
- alloc_hint = ins.objectid + ins.offset;
- kfree(async_extent);
- cond_resched();
+ ret = btrfs_add_ordered_extent_compress(inode, start, /* file_offset */
+ ins.objectid, /* disk_bytenr */
+ async_extent->ram_size, /* num_bytes */
+ ins.offset, /* disk_num_bytes */
+ async_extent->compress_type);
+ if (ret) {
+ btrfs_drop_extent_cache(inode, start, end, 0);
+ goto out_free_reserve;
}
- return;
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+
+ /* Clear dirty, set writeback and unlock the pages. */
+ extent_clear_unlock_delalloc(inode, start, end,
+ NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK);
+ if (btrfs_submit_compressed_write(inode, start, /* file_offset */
+ async_extent->ram_size, /* num_bytes */
+ ins.objectid, /* disk_bytenr */
+ ins.offset, /* compressed_len */
+ async_extent->pages, /* compressed_pages */
+ async_extent->nr_pages,
+ async_chunk->write_flags,
+ async_chunk->blkcg_css)) {
+ const u64 start = async_extent->start;
+ const u64 end = start + async_extent->ram_size - 1;
+
+ btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0);
+
+ extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
+ PAGE_END_WRITEBACK | PAGE_SET_ERROR);
+ free_async_extent_pages(async_extent);
+ }
+ *alloc_hint = ins.objectid + ins.offset;
+ kfree(async_extent);
+ return ret;
+
out_free_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
- extent_clear_unlock_delalloc(inode, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
+ extent_clear_unlock_delalloc(inode, start, end,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
@@ -1008,7 +1030,39 @@ out_free:
PAGE_END_WRITEBACK | PAGE_SET_ERROR);
free_async_extent_pages(async_extent);
kfree(async_extent);
- goto again;
+ return ret;
+}
+
+/*
+ * Phase two of compressed writeback. This is the ordered portion of the code,
+ * which only gets called in the order the work was queued. We walk all the
+ * async extents created by compress_file_range and send them down to the disk.
+ */
+static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
+{
+ struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct async_extent *async_extent;
+ u64 alloc_hint = 0;
+ int ret = 0;
+
+ while (!list_empty(&async_chunk->extents)) {
+ u64 extent_start;
+ u64 ram_size;
+
+ async_extent = list_entry(async_chunk->extents.next,
+ struct async_extent, list);
+ list_del(&async_extent->list);
+ extent_start = async_extent->start;
+ ram_size = async_extent->ram_size;
+
+ ret = submit_one_async_extent(inode, async_chunk, async_extent,
+ &alloc_hint);
+ btrfs_debug(fs_info,
+"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
+ inode->root->root_key.objectid,
+ btrfs_ino(inode), extent_start, ram_size, ret);
+ }
}
static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
@@ -1151,7 +1205,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* fails during the stage where it updates the bytenr of file extent
* items.
*/
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
min_alloc_size = num_bytes;
else
min_alloc_size = fs_info->sectorsize;
@@ -1187,8 +1241,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
if (ret)
goto out_drop_extent_cache;
- if (root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ if (btrfs_is_data_reloc_root(root)) {
ret = btrfs_reloc_clone_csums(inode, start,
cur_alloc_size);
/*
@@ -1326,18 +1379,17 @@ static noinline void async_cow_submit(struct btrfs_work *work)
static noinline void async_cow_free(struct btrfs_work *work)
{
struct async_chunk *async_chunk;
+ struct async_cow *async_cow;
async_chunk = container_of(work, struct async_chunk, work);
if (async_chunk->inode)
btrfs_add_delayed_iput(async_chunk->inode);
if (async_chunk->blkcg_css)
css_put(async_chunk->blkcg_css);
- /*
- * Since the pointer to 'pending' is at the beginning of the array of
- * async_chunk's, freeing it ensures the whole array has been freed.
- */
- if (atomic_dec_and_test(async_chunk->pending))
- kvfree(async_chunk->pending);
+
+ async_cow = async_chunk->async_cow;
+ if (atomic_dec_and_test(&async_cow->num_chunks))
+ kvfree(async_cow);
}
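
The lifetime rule in the hunk above can be modelled in isolation: each chunk now carries a pointer back to its parent container, and the container (including the embedded chunk array) is freed when the last chunk's work completes. A self-contained userspace sketch with invented structure names follows; it only illustrates the counting, not the kernel types.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct cow_ctx;

struct chunk {
	struct cow_ctx *parent;	/* replaces the old "pointer to the first member" trick */
	int id;
};

struct cow_ctx {
	atomic_int num_chunks;
	struct chunk chunks[];	/* flexible array, allocated together with the header */
};

static void chunk_done(struct chunk *c)
{
	struct cow_ctx *ctx = c->parent;

	printf("chunk %d done\n", c->id);
	/* The worker that drops the count to zero frees the whole allocation. */
	if (atomic_fetch_sub(&ctx->num_chunks, 1) == 1)
		free(ctx);
}

int main(void)
{
	const int nr = 3;
	struct cow_ctx *ctx = malloc(sizeof(*ctx) + nr * sizeof(struct chunk));
	int i;

	if (!ctx)
		return 1;
	atomic_init(&ctx->num_chunks, nr);
	for (i = 0; i < nr; i++)
		ctx->chunks[i] = (struct chunk){ .parent = ctx, .id = i };
	for (i = 0; i < nr; i++)
		chunk_done(&ctx->chunks[i]);
	return 0;
}
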
static int cow_file_range_async(struct btrfs_inode *inode,
@@ -1398,7 +1450,7 @@ static int cow_file_range_async(struct btrfs_inode *inode,
* lightweight reference for the callback lifetime
*/
ihold(&inode->vfs_inode);
- async_chunk[i].pending = &ctx->num_chunks;
+ async_chunk[i].async_cow = ctx;
async_chunk[i].inode = &inode->vfs_inode;
async_chunk[i].start = start;
async_chunk[i].end = cur_end;
@@ -1471,7 +1523,7 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
__set_page_dirty_nobuffers(locked_page);
account_page_redirty(locked_page);
- extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL);
+ extent_write_locked_range(&inode->vfs_inode, start, end);
*page_started = 1;
return 0;
@@ -1504,8 +1556,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
int *page_started, unsigned long *nr_written)
{
const bool is_space_ino = btrfs_is_free_space_inode(inode);
- const bool is_reloc_ino = (inode->root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID);
+ const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
const u64 range_bytes = end + 1 - start;
struct extent_io_tree *io_tree = &inode->io_tree;
u64 range_start = start;
@@ -1867,8 +1918,7 @@ out_check:
btrfs_dec_nocow_writers(fs_info, disk_bytenr);
nocow = false;
- if (root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
/*
* Error handled later, as we must prevent
* extent_clear_unlock_delalloc() in error handler
@@ -1947,8 +1997,23 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
int ret;
const bool zoned = btrfs_is_zoned(inode->root->fs_info);
+ /*
+ * The range must cover part of the @locked_page, or the returned
+ * @page_started can confuse the caller.
+ */
+ ASSERT(!(end <= page_offset(locked_page) ||
+ start >= page_offset(locked_page) + PAGE_SIZE));
+
if (should_nocow(inode, start, end)) {
- ASSERT(!zoned);
+ /*
+ * Normally on a zoned device we're only doing COW writes, but
+ * in case of relocation on a zoned filesystem we have taken
+ * the precaution of only writing sequentially. It's safe
+ * to use run_delalloc_nocow() here, like for regular
+ * preallocated inodes.
+ */
+ ASSERT(!zoned ||
+ (zoned && btrfs_is_data_reloc_root(inode->root)));
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, nr_written);
} else if (!inode_can_compress(inode) ||
@@ -2207,7 +2272,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
if (btrfs_is_testing(fs_info))
return;
- if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ if (!btrfs_is_data_reloc_root(root) &&
do_list && !(state->state & EXTENT_NORESERVE) &&
(*bits & EXTENT_CLEAR_DATA_RESV))
btrfs_free_reserved_data_space_noquota(fs_info, len);
@@ -2235,48 +2300,6 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
}
/*
- * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit
- * in a chunk's stripe. This function ensures that bios do not span a
- * stripe/chunk
- *
- * @page - The page we are about to add to the bio
- * @size - size we want to add to the bio
- * @bio - bio we want to ensure is smaller than a stripe
- * @bio_flags - flags of the bio
- *
- * return 1 if page cannot be added to the bio
- * return 0 if page can be added to the bio
- * return error otherwise
- */
-int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
- unsigned long bio_flags)
-{
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- u64 logical = bio->bi_iter.bi_sector << 9;
- u32 bio_len = bio->bi_iter.bi_size;
- struct extent_map *em;
- int ret = 0;
- struct btrfs_io_geometry geom;
-
- if (bio_flags & EXTENT_BIO_COMPRESSED)
- return 0;
-
- em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
- if (IS_ERR(em))
- return PTR_ERR(em);
- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom);
- if (ret < 0)
- goto out;
-
- if (geom.len < bio_len + size)
- ret = 1;
-out:
- free_extent_map(em);
- return ret;
-}
-
-/*
* in order to insert checksums into the metadata in large chunks,
* we wait until bio submission time. All the pages in the bio are
* checksummed and sums are attached onto the ordered extent record.
@@ -2532,7 +2555,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
goto mapit;
} else if (async && !skip_sum) {
/* csum items have already been cloned */
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
goto mapit;
/* we're doing a write, do the async checksumming */
ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
@@ -2765,7 +2788,7 @@ out_page:
clear_page_dirty_for_io(page);
SetPageError(page);
}
- ClearPageChecked(page);
+ btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE);
unlock_page(page);
put_page(page);
kfree(fixup);
@@ -2820,7 +2843,7 @@ int btrfs_writepage_cow_fixup(struct page *page)
* page->mapping outside of the page lock.
*/
ihold(inode);
- SetPageChecked(page);
+ btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
get_page(page);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
@@ -3011,8 +3034,12 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- if (ordered_extent->bdev)
+ /* A valid bdev implies a write on a sequential zone */
+ if (ordered_extent->bdev) {
btrfs_rewrite_logical_zoned(ordered_extent);
+ btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
+ }
btrfs_free_io_failure_record(inode, start, end);
@@ -3209,7 +3236,7 @@ void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
*
* The length of such check is always one sector size.
*/
-static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
+static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
u32 bio_offset, struct page *page, u32 pgoff,
u64 start)
{
@@ -3225,7 +3252,7 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
ASSERT(pgoff + len <= PAGE_SIZE);
offset_sectors = bio_offset >> fs_info->sectorsize_bits;
- csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size;
+ csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size;
kaddr = kmap_atomic(page);
shash->tfm = fs_info->csum_shash;
@@ -3239,9 +3266,9 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
return 0;
zeroit:
btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
- io_bio->mirror_num);
- if (io_bio->device)
- btrfs_dev_stat_inc_and_print(io_bio->device,
+ bbio->mirror_num);
+ if (bbio->device)
+ btrfs_dev_stat_inc_and_print(bbio->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
memset(kaddr + pgoff, 1, len);
flush_dcache_page(page);
@@ -3261,33 +3288,29 @@ zeroit:
* Return a bitmap where bit set means a csum mismatch, and bit not set means
* csum match.
*/
-unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end)
+unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page,
+ u64 start, u64 end)
{
struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
const u32 sectorsize = root->fs_info->sectorsize;
u32 pg_off;
unsigned int result = 0;
- if (PageChecked(page)) {
- ClearPageChecked(page);
+ if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) {
+ btrfs_page_clear_checked(fs_info, page, start, end + 1 - start);
return 0;
}
/*
- * For subpage case, above PageChecked is not safe as it's not subpage
- * compatible.
- * But for now only cow fixup and compressed read utilize PageChecked
- * flag, while in this context we can easily use io_bio->csum to
- * determine if we really need to do csum verification.
- *
- * So for now, just exit if io_bio->csum is NULL, as it means it's
- * compressed read, and its compressed data csum has already been
- * verified.
+ * This only happens for NODATASUM or compressed read.
+ * Normally this should be covered by the above check for compressed read
+ * or the next check for NODATASUM. Just do a quicker exit here.
*/
- if (io_bio->csum == NULL)
+ if (bbio->csum == NULL)
return 0;
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
@@ -3304,7 +3327,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
u64 file_offset = pg_off + page_offset(page);
int ret;
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ if (btrfs_is_data_reloc_root(root) &&
test_range_bit(io_tree, file_offset,
file_offset + sectorsize - 1,
EXTENT_NODATASUM, 1, NULL)) {
@@ -3314,7 +3337,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
EXTENT_NODATASUM);
continue;
}
- ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
+ ret = check_data_csum(inode, bbio, bio_offset, page, pg_off,
page_offset(page) + pg_off);
if (ret < 0) {
const int nr_bit = (pg_off - offset_in_page(start)) >>
@@ -4005,7 +4028,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
* without delay
*/
if (!btrfs_is_free_space_inode(inode)
- && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+ && !btrfs_is_data_reloc_root(root)
&& !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
btrfs_update_root_times(trans, root);
@@ -4035,11 +4058,11 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
* also drops the back refs in the inode to the directory
*/
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
const char *name, int name_len)
{
+ struct btrfs_root *root = dir->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
int ret = 0;
@@ -4099,19 +4122,9 @@ skip_backref:
goto err;
}
- ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
- dir_ino);
- if (ret != 0 && ret != -ENOENT) {
- btrfs_abort_transaction(trans, ret);
- goto err;
- }
-
- ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
- index);
- if (ret == -ENOENT)
- ret = 0;
- else if (ret)
- btrfs_abort_transaction(trans, ret);
+ btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
+ dir_ino);
+ btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
/*
* If we have a pending delayed iput we could end up with the final iput
@@ -4139,15 +4152,14 @@ out:
}
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *dir, struct btrfs_inode *inode,
const char *name, int name_len)
{
int ret;
- ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+ ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len);
if (!ret) {
drop_nlink(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode->root, inode);
}
return ret;
}
@@ -4176,7 +4188,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
{
- struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
struct inode *inode = d_inode(dentry);
int ret;
@@ -4188,7 +4199,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
0);
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
BTRFS_I(d_inode(dentry)), dentry->d_name.name,
dentry->d_name.len);
if (ret)
@@ -4202,7 +4213,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
out:
btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(root->fs_info);
+ btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
return ret;
}
@@ -4369,7 +4380,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root)
struct inode *inode;
u64 objectid = 0;
- if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (!BTRFS_FS_ERROR(fs_info))
WARN_ON(btrfs_root_refs(&root->root_item) != 0);
spin_lock(&root->inode_lock);
@@ -4553,7 +4564,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
int err = 0;
- struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
u64 last_unlink_trans;
@@ -4578,7 +4588,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
/* now the directory is empty */
- err = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ err = btrfs_unlink_inode(trans, BTRFS_I(dir),
BTRFS_I(d_inode(dentry)), dentry->d_name.name,
dentry->d_name.len);
if (!err) {
@@ -4599,7 +4609,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
}
out:
btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(root->fs_info);
+ btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
return err;
}
@@ -4908,9 +4918,9 @@ delete:
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
extent_start, extent_num_bytes, 0);
- ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- ino, extent_offset);
+ ino, extent_offset,
+ root->root_key.objectid, false);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -5106,7 +5116,8 @@ again:
len);
flush_dcache_page(page);
}
- ClearPageChecked(page);
+ btrfs_page_clear_checked(fs_info, page, block_start,
+ block_end + 1 - block_start);
btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
@@ -6436,7 +6447,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode_ref *ref;
struct btrfs_key key[2];
u32 sizes[2];
- int nitems = name ? 2 : 1;
+ struct btrfs_item_batch batch;
unsigned long ptr;
unsigned int nofs_flag;
int ret;
@@ -6528,7 +6539,11 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
goto fail;
}
- ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
+ batch.keys = &key[0];
+ batch.data_sizes = &sizes[0];
+ batch.total_data_size = sizes[0] + (name ? sizes[1] : 0);
+ batch.nr = name ? 2 : 1;
+ ret = btrfs_insert_empty_items(trans, root, path, &batch);
if (ret != 0)
goto fail_unlock;
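
For context on the hunk above: btrfs_insert_empty_items() now takes a single batch descriptor instead of separate key/size/count arguments, so the caller precomputes the total payload size. The sketch below mirrors only that calling shape with invented userspace types; the real struct btrfs_item_batch is defined elsewhere in the btrfs headers, and the key types and sizes here are arbitrary example values.

#include <stdint.h>
#include <stdio.h>

/* Invented stand-ins; only the shape of the batch descriptor is mirrored. */
struct item_key { uint64_t objectid; uint8_t type; uint64_t offset; };

struct item_batch {
	const struct item_key *keys;
	const uint32_t *data_sizes;
	uint32_t total_data_size;	/* must equal the sum of data_sizes[] */
	int nr;
};

static void insert_empty_items(const struct item_batch *batch)
{
	printf("reserving %u bytes of item data for %d items\n",
	       batch->total_data_size, batch->nr);
}

int main(void)
{
	struct item_key keys[2] = {
		{ .objectid = 257, .type = 1, .offset = 0 },
		{ .objectid = 257, .type = 2, .offset = 256 },
	};
	uint32_t sizes[2] = { 160, 20 };
	struct item_batch batch = {
		.keys = keys,
		.data_sizes = sizes,
		.total_data_size = sizes[0] + sizes[1],
		.nr = 2,
	};

	insert_empty_items(&batch);
	return 0;
}
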
@@ -7962,7 +7977,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
iomap->type = IOMAP_MAPPED;
}
iomap->offset = start;
- iomap->bdev = fs_info->fs_devices->latest_bdev;
+ iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
iomap->length = len;
if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start))
@@ -8039,13 +8054,13 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) {
__endio_write_update_ordered(BTRFS_I(dip->inode),
- dip->logical_offset,
+ dip->file_offset,
dip->bytes,
!dip->dio_bio->bi_status);
} else {
unlock_extent(&BTRFS_I(dip->inode)->io_tree,
- dip->logical_offset,
- dip->logical_offset + dip->bytes - 1);
+ dip->file_offset,
+ dip->file_offset + dip->bytes - 1);
}
bio_endio(dip->dio_bio);
@@ -8073,10 +8088,11 @@ static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio,
return ret;
}
-static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
- struct btrfs_io_bio *io_bio,
+static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
+ struct btrfs_bio *bbio,
const bool uptodate)
{
+ struct inode *inode = dip->inode;
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
@@ -8084,11 +8100,12 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
struct bio_vec bvec;
struct bvec_iter iter;
- u64 start = io_bio->logical;
+ const u64 orig_file_offset = dip->file_offset;
+ u64 start = orig_file_offset;
u32 bio_offset = 0;
blk_status_t err = BLK_STS_OK;
- __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) {
+ __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) {
unsigned int i, nr_sectors, pgoff;
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
@@ -8096,7 +8113,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
for (i = 0; i < nr_sectors; i++) {
ASSERT(pgoff < PAGE_SIZE);
if (uptodate &&
- (!csum || !check_data_csum(inode, io_bio,
+ (!csum || !check_data_csum(inode, bbio,
bio_offset, bvec.bv_page,
pgoff, start))) {
clean_io_failure(fs_info, failure_tree, io_tree,
@@ -8106,12 +8123,12 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
} else {
int ret;
- ASSERT((start - io_bio->logical) < UINT_MAX);
+ ASSERT((start - orig_file_offset) < UINT_MAX);
ret = btrfs_repair_one_sector(inode,
- &io_bio->bio,
- start - io_bio->logical,
+ &bbio->bio,
+ start - orig_file_offset,
bvec.bv_page, pgoff,
- start, io_bio->mirror_num,
+ start, bbio->mirror_num,
submit_dio_repair_bio);
if (ret)
err = errno_to_blk_status(ret);
@@ -8152,15 +8169,13 @@ static void btrfs_end_dio_bio(struct bio *bio)
bio->bi_opf, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, err);
- if (bio_op(bio) == REQ_OP_READ) {
- err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio),
- !err);
- }
+ if (bio_op(bio) == REQ_OP_READ)
+ err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err);
if (err)
dip->dio_bio->bi_status = err;
- btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio);
+ btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio);
bio_put(bio);
btrfs_dio_private_put(dip);
@@ -8202,10 +8217,10 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
} else {
u64 csum_offset;
- csum_offset = file_offset - dip->logical_offset;
+ csum_offset = file_offset - dip->file_offset;
csum_offset >>= fs_info->sectorsize_bits;
csum_offset *= fs_info->csum_size;
- btrfs_io_bio(bio)->csum = dip->csums + csum_offset;
+ btrfs_bio(bio)->csum = dip->csums + csum_offset;
}
map:
ret = btrfs_map_bio(fs_info, bio, 0);
@@ -8240,7 +8255,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
return NULL;
dip->inode = inode;
- dip->logical_offset = file_offset;
+ dip->file_offset = file_offset;
dip->bytes = dio_bio->bi_iter.bi_size;
dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9;
dip->dio_bio = dio_bio;
@@ -8248,7 +8263,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
return dip;
}
-static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
+static void btrfs_submit_direct(const struct iomap_iter *iter,
struct bio *dio_bio, loff_t file_offset)
{
struct inode *inode = iter->inode;
@@ -8278,7 +8293,7 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
}
dio_bio->bi_status = BLK_STS_RESOURCE;
bio_endio(dio_bio);
- return BLK_QC_T_NONE;
+ return;
}
if (!write) {
@@ -8321,7 +8336,6 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
- btrfs_io_bio(bio)->logical = file_offset;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
status = extract_ordered_extent(BTRFS_I(inode), bio,
@@ -8372,15 +8386,13 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
free_extent_map(em);
} while (submit_len > 0);
- return BLK_QC_T_NONE;
+ return;
out_err_em:
free_extent_map(em);
out_err:
dip->dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
-
- return BLK_QC_T_NONE;
}
const struct iomap_ops btrfs_dio_iomap_ops = {
@@ -8697,9 +8709,9 @@ next:
* did something wrong.
*/
ASSERT(!PageOrdered(page));
+ btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE);
if (!inode_evicting)
__btrfs_releasepage(page, GFP_NOFS);
- ClearPageChecked(page);
clear_page_extent_mapped(page);
}
@@ -8843,7 +8855,7 @@ again:
memzero_page(page, zero_start, PAGE_SIZE - zero_start);
flush_dcache_page(page);
}
- ClearPageChecked(page);
+ btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
@@ -9153,8 +9165,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
WARN_ON(inode->block_rsv.reserved);
WARN_ON(inode->block_rsv.size);
WARN_ON(inode->outstanding_extents);
- WARN_ON(inode->delalloc_bytes);
- WARN_ON(inode->new_delalloc_bytes);
+ if (!S_ISDIR(vfs_inode->i_mode)) {
+ WARN_ON(inode->delalloc_bytes);
+ WARN_ON(inode->new_delalloc_bytes);
+ }
WARN_ON(inode->csum_bytes);
WARN_ON(inode->defrag_bytes);
@@ -9451,7 +9465,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else { /* src is an inode */
- ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
old_dentry->d_name.name,
old_dentry->d_name.len);
@@ -9467,7 +9481,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
} else { /* dest is an inode */
- ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
new_dentry->d_name.name,
new_dentry->d_name.len);
@@ -9742,7 +9756,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
*/
btrfs_pin_log_trans(root);
log_pinned = true;
- ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
old_dentry->d_name.name,
old_dentry->d_name.len);
@@ -9762,7 +9776,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
BUG_ON(new_inode->i_nlink == 0);
} else {
- ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(d_inode(new_dentry)),
new_dentry->d_name.name,
new_dentry->d_name.len);
@@ -9980,7 +9994,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_conte
};
struct btrfs_fs_info *fs_info = root->fs_info;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
@@ -9999,7 +10013,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
struct list_head splice;
int ret;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
INIT_LIST_HEAD(&splice);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cc61813213d8..fb8cc9642ac4 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -48,6 +48,7 @@
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
+#include "subpage.h"
#ifdef CONFIG_64BIT
/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -81,7 +82,8 @@ struct btrfs_ioctl_send_args_32 {
compat_uptr_t clone_sources; /* in */
__u64 parent_root; /* in */
__u64 flags; /* in */
- __u64 reserved[4]; /* in */
+ __u32 version; /* in */
+ __u8 reserved[28]; /* in */
} __attribute__ ((__packed__));
#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
@@ -985,129 +987,32 @@ out:
return ret;
}
-/*
- * When we're defragging a range, we don't want to kick it off again
- * if it is really just waiting for delalloc to send it down.
- * If we find a nice big extent or delalloc range for the bytes in the
- * file you want to defrag, we return 0 to let you know to skip this
- * part of the file
- */
-static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
-{
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map *em = NULL;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- u64 end;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE);
- read_unlock(&em_tree->lock);
-
- if (em) {
- end = extent_map_end(em);
- free_extent_map(em);
- if (end - offset > thresh)
- return 0;
- }
- /* if we already have a nice delalloc here, just stop */
- thresh /= 2;
- end = count_range_bits(io_tree, &offset, offset + thresh,
- thresh, EXTENT_DELALLOC, 1);
- if (end >= thresh)
- return 0;
- return 1;
-}
-
-/*
- * helper function to walk through a file and find extents
- * newer than a specific transid, and smaller than thresh.
- *
- * This is used by the defragging code to find new and small
- * extents
- */
-static int find_new_extents(struct btrfs_root *root,
- struct inode *inode, u64 newer_than,
- u64 *off, u32 thresh)
-{
- struct btrfs_path *path;
- struct btrfs_key min_key;
- struct extent_buffer *leaf;
- struct btrfs_file_extent_item *extent;
- int type;
- int ret;
- u64 ino = btrfs_ino(BTRFS_I(inode));
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- min_key.objectid = ino;
- min_key.type = BTRFS_EXTENT_DATA_KEY;
- min_key.offset = *off;
-
- while (1) {
- ret = btrfs_search_forward(root, &min_key, path, newer_than);
- if (ret != 0)
- goto none;
-process_slot:
- if (min_key.objectid != ino)
- goto none;
- if (min_key.type != BTRFS_EXTENT_DATA_KEY)
- goto none;
-
- leaf = path->nodes[0];
- extent = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- type = btrfs_file_extent_type(leaf, extent);
- if (type == BTRFS_FILE_EXTENT_REG &&
- btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
- check_defrag_in_cache(inode, min_key.offset, thresh)) {
- *off = min_key.offset;
- btrfs_free_path(path);
- return 0;
- }
-
- path->slots[0]++;
- if (path->slots[0] < btrfs_header_nritems(leaf)) {
- btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
- goto process_slot;
- }
-
- if (min_key.offset == (u64)-1)
- goto none;
-
- min_key.offset++;
- btrfs_release_path(path);
- }
-none:
- btrfs_free_path(path);
- return -ENOENT;
-}
-
-static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
+static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
+ bool locked)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em;
- u64 len = PAGE_SIZE;
+ const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
/*
* hopefully we have this extent in the tree already, try without
* the full extent lock
*/
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
+ em = lookup_extent_mapping(em_tree, start, sectorsize);
read_unlock(&em_tree->lock);
if (!em) {
struct extent_state *cached = NULL;
- u64 end = start + len - 1;
+ u64 end = start + sectorsize - 1;
/* get the big lock and read metadata off disk */
- lock_extent_bits(io_tree, start, end, &cached);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
- unlock_extent_cached(io_tree, start, end, &cached);
+ if (!locked)
+ lock_extent_bits(io_tree, start, end, &cached);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize);
+ if (!locked)
+ unlock_extent_cached(io_tree, start, end, &cached);
if (IS_ERR(em))
return NULL;
@@ -1116,7 +1021,8 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
return em;
}
-static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
+static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
+ bool locked)
{
struct extent_map *next;
bool ret = true;
@@ -1125,7 +1031,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
if (em->start + em->len >= i_size_read(inode))
return false;
- next = defrag_lookup_extent(inode, em->start + em->len);
+ next = defrag_lookup_extent(inode, em->start + em->len, locked);
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
ret = false;
else if ((em->block_start + em->block_len == next->block_start) &&
@@ -1136,297 +1042,435 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
return ret;
}
-static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
- u64 *last_len, u64 *skip, u64 *defrag_end,
- int compress)
+/*
+ * Prepare one page to be defragged.
+ *
+ * This will ensure:
+ *
+ * - Returned page is locked and has been set up properly.
+ * - No ordered extent exists in the page.
+ * - The page is uptodate.
+ *
+ * NOTE: The caller should also wait for page writeback after the cluster
+ * is prepared; we do not wait for writeback on each individual page here.
+ */
+static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
+ pgoff_t index)
{
- struct extent_map *em;
- int ret = 1;
- bool next_mergeable = true;
- bool prev_mergeable = true;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ gfp_t mask = btrfs_alloc_write_mask(mapping);
+ u64 page_start = (u64)index << PAGE_SHIFT;
+ u64 page_end = page_start + PAGE_SIZE - 1;
+ struct extent_state *cached_state = NULL;
+ struct page *page;
+ int ret;
+
+again:
+ page = find_or_create_page(mapping, index, mask);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
/*
- * make sure that once we start defragging an extent, we keep on
- * defragging it
+ * Since we can defragment files opened read-only, we can encounter
+ * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
+ * can't do I/O using huge pages yet, so return an error for now.
+ * Filesystem transparent huge pages are typically only used for
+ * executables that explicitly enable them, so this isn't very
+ * restrictive.
*/
- if (start < *defrag_end)
- return 1;
+ if (PageCompound(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-ETXTBSY);
+ }
- *skip = 0;
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(ret);
+ }
- em = defrag_lookup_extent(inode, start);
- if (!em)
- return 0;
+ /* Wait for any existing ordered extent in the range */
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
- /* this will cover holes, and inline extents */
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- ret = 0;
- goto out;
- }
+ lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
+ unlock_extent_cached(&inode->io_tree, page_start, page_end,
+ &cached_state);
+ if (!ordered)
+ break;
- if (!*defrag_end)
- prev_mergeable = false;
+ unlock_page(page);
+ btrfs_start_ordered_extent(ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ lock_page(page);
+ /*
+ * We unlocked the page above, so we need to check if it was
+ * released or not.
+ */
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ }
- next_mergeable = defrag_check_next_extent(inode, em);
- /*
- * we hit a real extent, if it is big or the next extent is not a
- * real extent, don't bother defragging it
- */
- if (!compress && (*last_len == 0 || *last_len >= thresh) &&
- (em->len >= thresh || (!next_mergeable && !prev_mergeable)))
- ret = 0;
-out:
/*
- * last_len ends up being a counter of how many bytes we've defragged.
- * every time we choose not to defrag an extent, we reset *last_len
- * so that the next tiny extent will force a defrag.
- *
- * The end result of this is that tiny extents before a single big
- * extent will force at least part of that big extent to be defragged.
+ * Now the page range has no ordered extent any more. Read the page to
+ * make it uptodate.
*/
- if (ret) {
- *defrag_end = extent_map_end(em);
- } else {
- *last_len = 0;
- *skip = extent_map_end(em);
- *defrag_end = 0;
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
}
-
- free_extent_map(em);
- return ret;
+ return page;
}
+struct defrag_target_range {
+ struct list_head list;
+ u64 start;
+ u64 len;
+};
+
/*
- * it doesn't do much good to defrag one or two pages
- * at a time. This pulls in a nice chunk of pages
- * to COW and defrag.
- *
- * It also makes sure the delalloc code has enough
- * dirty data to avoid making new small extents as part
- * of the defrag
+ * Collect all valid target extents.
*
- * It's a good idea to start RA on this range
- * before calling this.
+ * @start: file offset to lookup
+ * @len: length to lookup
+ * @extent_thresh: file extent size threshold, any extent size >= this value
+ * will be ignored
+ * @newer_than: only defrag extents newer than this value
+ * @do_compress: whether the defrag is doing compression
+ * if true, @extent_thresh will be ignored and all regular
+ * file extents meeting @newer_than will be targets.
+ * @locked: whether the caller already holds the extent lock for the range
+ * @target_list: list of collected target file extents
*/
-static int cluster_pages_for_defrag(struct inode *inode,
- struct page **pages,
- unsigned long start_index,
- unsigned long num_pages)
+static int defrag_collect_targets(struct btrfs_inode *inode,
+ u64 start, u64 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ bool locked, struct list_head *target_list)
{
- unsigned long file_end;
- u64 isize = i_size_read(inode);
- u64 page_start;
- u64 page_end;
- u64 page_cnt;
- u64 start = (u64)start_index << PAGE_SHIFT;
- u64 search_start;
- int ret;
- int i;
- int i_done;
- struct btrfs_ordered_extent *ordered;
- struct extent_state *cached_state = NULL;
- struct extent_io_tree *tree;
- struct extent_changeset *data_reserved = NULL;
- gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ u64 cur = start;
+ int ret = 0;
- file_end = (isize - 1) >> PAGE_SHIFT;
- if (!isize || start_index > file_end)
- return 0;
+ while (cur < start + len) {
+ struct extent_map *em;
+ struct defrag_target_range *new;
+ bool next_mergeable = true;
+ u64 range_len;
- page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
+ em = defrag_lookup_extent(&inode->vfs_inode, cur, locked);
+ if (!em)
+ break;
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- start, page_cnt << PAGE_SHIFT);
- if (ret)
- return ret;
- i_done = 0;
- tree = &BTRFS_I(inode)->io_tree;
+ /* Skip hole/inline/preallocated extents */
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ goto next;
- /* step one, lock all the pages */
- for (i = 0; i < page_cnt; i++) {
- struct page *page;
-again:
- page = find_or_create_page(inode->i_mapping,
- start_index + i, mask);
- if (!page)
- break;
+ /* Skip older extent */
+ if (em->generation < newer_than)
+ goto next;
- ret = set_page_extent_mapped(page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- break;
+ /*
+ * For do_compress case, we want to compress all valid file
+ * extents, thus no @extent_thresh or mergeable check.
+ */
+ if (do_compress)
+ goto add;
+
+ /* Skip too large extent */
+ if (em->len >= extent_thresh)
+ goto next;
+
+ next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
+ locked);
+ if (!next_mergeable) {
+ struct defrag_target_range *last;
+
+ /* Empty target list, no way to merge with last entry */
+ if (list_empty(target_list))
+ goto next;
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ /* Not mergeable with last entry */
+ if (last->start + last->len != cur)
+ goto next;
+
+ /* Mergeable, fall through to add it to @target_list. */
}
- page_start = page_offset(page);
- page_end = page_start + PAGE_SIZE - 1;
- while (1) {
- lock_extent_bits(tree, page_start, page_end,
- &cached_state);
- ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode),
- page_start);
- unlock_extent_cached(tree, page_start, page_end,
- &cached_state);
- if (!ordered)
- break;
-
- unlock_page(page);
- btrfs_start_ordered_extent(ordered, 1);
- btrfs_put_ordered_extent(ordered);
- lock_page(page);
- /*
- * we unlocked the page above, so we need check if
- * it was released or not.
- */
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
- put_page(page);
- goto again;
+add:
+ range_len = min(extent_map_end(em), start + len) - cur;
+ /*
+ * This one is a good target, check if it can be merged into
+ * last range of the target list.
+ */
+ if (!list_empty(target_list)) {
+ struct defrag_target_range *last;
+
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ ASSERT(last->start + last->len <= cur);
+ if (last->start + last->len == cur) {
+ /* Mergeable, enlarge the last entry */
+ last->len += range_len;
+ goto next;
}
+ /* Fall through to allocate a new entry */
}
- if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
- ret = -EIO;
- break;
- }
+ /* Allocate new defrag_target_range */
+ new = kmalloc(sizeof(*new), GFP_NOFS);
+ if (!new) {
+ free_extent_map(em);
+ ret = -ENOMEM;
+ break;
}
+ new->start = cur;
+ new->len = range_len;
+ list_add_tail(&new->list, target_list);
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
- put_page(page);
- goto again;
+next:
+ cur = extent_map_end(em);
+ free_extent_map(em);
+ }
+ if (ret < 0) {
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+
+ list_for_each_entry_safe(entry, tmp, target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
}
+ }
+ return ret;
+}
+
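
The merging behaviour of defrag_collect_targets() above can be shown in isolation: a candidate range is normally appended to the target list, but when it starts exactly where the previous target ends, the previous entry is grown instead, so one defrag pass covers both. A standalone userspace sketch (array-based instead of a linked list, names invented):

#include <stdio.h>
#include <stdint.h>

struct target { uint64_t start, len; };

static size_t add_target(struct target *list, size_t n, uint64_t start, uint64_t len)
{
	if (n && list[n - 1].start + list[n - 1].len == start) {
		list[n - 1].len += len;		/* mergeable with the last entry */
		return n;
	}
	list[n] = (struct target){ start, len };
	return n + 1;
}

int main(void)
{
	struct target list[8];
	size_t n = 0;
	size_t i;

	n = add_target(list, n, 0, 16384);
	n = add_target(list, n, 16384, 4096);	/* contiguous: merged into [0, 20K) */
	n = add_target(list, n, 65536, 4096);	/* gap: becomes a new entry */

	for (i = 0; i < n; i++)
		printf("target %zu: start=%llu len=%llu\n", i,
		       (unsigned long long)list[i].start,
		       (unsigned long long)list[i].len);
	return 0;
}
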
+#define CLUSTER_SIZE (SZ_256K)
+
+/*
+ * Defrag one contiguous target range.
+ *
+ * @inode: target inode
+ * @target: target range to defrag
+ * @pages: locked pages covering the defrag range
+ * @nr_pages: number of locked pages
+ *
+ * Caller should ensure:
+ *
+ * - Pages are prepared
+ * Pages should be locked, no ordered extent in the pages range,
+ * no writeback.
+ *
+ * - Extent bits are locked
+ */
+static int defrag_one_locked_target(struct btrfs_inode *inode,
+ struct defrag_target_range *target,
+ struct page **pages, int nr_pages,
+ struct extent_state **cached_state)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_changeset *data_reserved = NULL;
+ const u64 start = target->start;
+ const u64 len = target->len;
+ unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
+ unsigned long start_index = start >> PAGE_SHIFT;
+ unsigned long first_index = page_index(pages[0]);
+ int ret = 0;
+ int i;
+
+ ASSERT(last_index - first_index + 1 <= nr_pages);
+
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
+ if (ret < 0)
+ return ret;
+ clear_extent_bit(&inode->io_tree, start, start + len - 1,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 0, 0, cached_state);
+ set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
- pages[i] = page;
- i_done++;
+ /* Update the page status */
+ for (i = start_index - first_index; i <= last_index - first_index; i++) {
+ ClearPageChecked(pages[i]);
+ btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
}
- if (!i_done || ret)
- goto out;
+ btrfs_delalloc_release_extents(inode, len);
+ extent_changeset_free(data_reserved);
- if (!(inode->i_sb->s_flags & SB_ACTIVE))
- goto out;
+ return ret;
+}
- /*
- * so now we have a nice long stream of locked
- * and up to date pages, lets wait on them
- */
- for (i = 0; i < i_done; i++)
- wait_on_page_writeback(pages[i]);
+static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
+ u32 extent_thresh, u64 newer_than, bool do_compress)
+{
+ struct extent_state *cached_state = NULL;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ struct page **pages;
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ u64 last_index = (start + len - 1) >> PAGE_SHIFT;
+ u64 start_index = start >> PAGE_SHIFT;
+ unsigned int nr_pages = last_index - start_index + 1;
+ int ret = 0;
+ int i;
- page_start = page_offset(pages[0]);
- page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE;
+ ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
+ ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
+
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
- lock_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
+ /* Prepare all pages */
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = defrag_prepare_one_page(inode, start_index + i);
+ if (IS_ERR(pages[i])) {
+ ret = PTR_ERR(pages[i]);
+ pages[i] = NULL;
+ goto free_pages;
+ }
+ }
+ for (i = 0; i < nr_pages; i++)
+ wait_on_page_writeback(pages[i]);
+ /* Lock the pages range */
+ lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
/*
- * When defragmenting we skip ranges that have holes or inline extents,
- * (check should_defrag_range()), to avoid unnecessary IO and wasting
- * space. At btrfs_defrag_file(), we check if a range should be defragged
- * before locking the inode and then, if it should, we trigger a sync
- * page cache readahead - we lock the inode only after that to avoid
- * blocking for too long other tasks that possibly want to operate on
- * other file ranges. But before we were able to get the inode lock,
- * some other task may have punched a hole in the range, or we may have
- * now an inline extent, in which case we should not defrag. So check
- * for that here, where we have the inode and the range locked, and bail
- * out if that happened.
+ * Now we have a consistent view of the extent map, re-check
+ * which ranges really need to be defragged.
+ *
+ * This time the extent range is already locked, so pass @locked = true
+ * to avoid relocking it and causing a deadlock.
*/
- search_start = page_start;
- while (search_start < page_end) {
- struct extent_map *em;
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, true,
+ &target_list);
+ if (ret < 0)
+ goto unlock_extent;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start,
- page_end - search_start);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out_unlock_range;
- }
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- free_extent_map(em);
- /* Ok, 0 means we did not defrag anything */
- ret = 0;
- goto out_unlock_range;
+ list_for_each_entry(entry, &target_list, list) {
+ ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
+ &cached_state);
+ if (ret < 0)
+ break;
+ }
+
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+unlock_extent:
+ unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
+free_pages:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i]) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
}
- search_start = extent_map_end(em);
- free_extent_map(em);
}
+ kfree(pages);
+ return ret;
+}
- clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
- page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 0, 0, &cached_state);
+static int defrag_one_cluster(struct btrfs_inode *inode,
+ struct file_ra_state *ra,
+ u64 start, u32 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ unsigned long *sectors_defragged,
+ unsigned long max_sectors)
+{
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ int ret;
- if (i_done != page_cnt) {
- spin_lock(&BTRFS_I(inode)->lock);
- btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
- spin_unlock(&BTRFS_I(inode)->lock);
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- start, (page_cnt - i_done) << PAGE_SHIFT, true);
- }
+ BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, false,
+ &target_list);
+ if (ret < 0)
+ goto out;
+ list_for_each_entry(entry, &target_list, list) {
+ u32 range_len = entry->len;
- set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
- &cached_state);
+ /* Reached the limit */
+ if (max_sectors && max_sectors == *sectors_defragged)
+ break;
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
+ if (max_sectors)
+ range_len = min_t(u32, range_len,
+ (max_sectors - *sectors_defragged) * sectorsize);
- for (i = 0; i < i_done; i++) {
- clear_page_dirty_for_io(pages[i]);
- ClearPageChecked(pages[i]);
- set_page_dirty(pages[i]);
- unlock_page(pages[i]);
- put_page(pages[i]);
+ if (ra)
+ page_cache_sync_readahead(inode->vfs_inode.i_mapping,
+ ra, NULL, entry->start >> PAGE_SHIFT,
+ ((entry->start + range_len - 1) >> PAGE_SHIFT) -
+ (entry->start >> PAGE_SHIFT) + 1);
+ /*
+ * We may not defrag any range here if holes were punched before
+ * we locked the pages.
+ * That's fine; it only affects the @sectors_defragged
+ * accounting.
+ */
+ ret = defrag_one_range(inode, entry->start, range_len,
+ extent_thresh, newer_than, do_compress);
+ if (ret < 0)
+ break;
+ *sectors_defragged += range_len;
}
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
- extent_changeset_free(data_reserved);
- return i_done;
-
-out_unlock_range:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
out:
- for (i = 0; i < i_done; i++) {
- unlock_page(pages[i]);
- put_page(pages[i]);
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
}
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- start, page_cnt << PAGE_SHIFT, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
- extent_changeset_free(data_reserved);
return ret;
-
}
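Before the entry point itself, note how the rewritten btrfs_defrag_file() below carves the requested range into 256KiB clusters whose ends are kept page aligned even when the sector-aligned start is not. A small standalone sketch of that arithmetic, using illustrative constants for a 64KiB-page / 4KiB-sector setup rather than the kernel's macros:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	16			/* 64KiB pages, e.g. a ppc64 config */
#define CLUSTER_SIZE	(256 * 1024ULL)

int main(void)
{
	uint64_t cur = 4096;			/* sector aligned, not page aligned */
	const uint64_t last_byte = (1024 * 1024ULL) - 1;

	while (cur < last_byte) {
		/* Same shape as the cluster_end computation in btrfs_defrag_file() */
		uint64_t cluster_end = (((cur >> PAGE_SHIFT) +
					 (CLUSTER_SIZE >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;

		if (cluster_end > last_byte)
			cluster_end = last_byte;
		printf("cluster [%llu, %llu]\n",
		       (unsigned long long)cur, (unsigned long long)cluster_end);
		cur = cluster_end + 1;
	}
	return 0;
}

The first cluster ends at 262143 (a page boundary) even though it starts at offset 4096, and every later cluster is exactly 256KiB and page aligned.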
-int btrfs_defrag_file(struct inode *inode, struct file *file,
+/*
+ * Entry point to file defragmentation.
+ *
+ * @inode: inode to be defragged
+ * @ra: readahead state (can be NULL)
+ * @range: defrag options including range and flags
+ * @newer_than: minimum transid to defrag
+ * @max_to_defrag: max number of sectors to be defragged; if 0, the whole inode
+ * will be defragged.
+ */
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct file_ra_state *ra = NULL;
- unsigned long last_index;
+ unsigned long sectors_defragged = 0;
u64 isize = i_size_read(inode);
- u64 last_len = 0;
- u64 skip = 0;
- u64 defrag_end = 0;
- u64 newer_off = range->start;
- unsigned long i;
- unsigned long ra_index = 0;
- int ret;
- int defrag_count = 0;
+ u64 cur;
+ u64 last_byte;
+ bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
+ bool ra_allocated = false;
int compress_type = BTRFS_COMPRESS_ZLIB;
+ int ret = 0;
u32 extent_thresh = range->extent_thresh;
- unsigned long max_cluster = SZ_256K >> PAGE_SHIFT;
- unsigned long cluster = max_cluster;
- u64 new_align = ~((u64)SZ_128K - 1);
- struct page **pages = NULL;
- bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
if (isize == 0)
return 0;
@@ -1444,172 +1488,87 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
if (extent_thresh == 0)
extent_thresh = SZ_256K;
+ if (range->start + range->len > range->start) {
+ /* Got a specific range */
+ last_byte = min(isize, range->start + range->len) - 1;
+ } else {
+ /* Defrag until file end */
+ last_byte = isize - 1;
+ }
+
/*
- * If we were not given a file, allocate a readahead context. As
+ * If we were not given a ra, allocate a readahead context. As
* readahead is just an optimization, defrag will work without it so
* we don't error out.
*/
- if (!file) {
+ if (!ra) {
+ ra_allocated = true;
ra = kzalloc(sizeof(*ra), GFP_KERNEL);
if (ra)
file_ra_state_init(ra, inode->i_mapping);
- } else {
- ra = &file->f_ra;
- }
-
- pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
- ret = -ENOMEM;
- goto out_ra;
- }
-
- /* find the last page to defrag */
- if (range->start + range->len > range->start) {
- last_index = min_t(u64, isize - 1,
- range->start + range->len - 1) >> PAGE_SHIFT;
- } else {
- last_index = (isize - 1) >> PAGE_SHIFT;
- }
-
- if (newer_than) {
- ret = find_new_extents(root, inode, newer_than,
- &newer_off, SZ_64K);
- if (!ret) {
- range->start = newer_off;
- /*
- * we always align our defrag to help keep
- * the extents in the file evenly spaced
- */
- i = (newer_off & new_align) >> PAGE_SHIFT;
- } else
- goto out_ra;
- } else {
- i = range->start >> PAGE_SHIFT;
}
- if (!max_to_defrag)
- max_to_defrag = last_index - i + 1;
-
- /*
- * make writeback starts from i, so the defrag range can be
- * written sequentially.
- */
- if (i < inode->i_mapping->writeback_index)
- inode->i_mapping->writeback_index = i;
-
- while (i <= last_index && defrag_count < max_to_defrag &&
- (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) {
- /*
- * make sure we stop running if someone unmounts
- * the FS
- */
- if (!(inode->i_sb->s_flags & SB_ACTIVE))
- break;
- if (btrfs_defrag_cancelled(fs_info)) {
- btrfs_debug(fs_info, "defrag_file cancelled");
- ret = -EAGAIN;
- goto error;
- }
+ /* Align the range */
+ cur = round_down(range->start, fs_info->sectorsize);
+ last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
- if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT,
- extent_thresh, &last_len, &skip,
- &defrag_end, do_compress)){
- unsigned long next;
- /*
- * the should_defrag function tells us how much to skip
- * bump our counter by the suggested amount
- */
- next = DIV_ROUND_UP(skip, PAGE_SIZE);
- i = max(i + 1, next);
- continue;
- }
+ while (cur < last_byte) {
+ u64 cluster_end;
- if (!newer_than) {
- cluster = (PAGE_ALIGN(defrag_end) >>
- PAGE_SHIFT) - i;
- cluster = min(cluster, max_cluster);
- } else {
- cluster = max_cluster;
- }
+ /* The cluster size 256K should always be page aligned */
+ BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
- if (i + cluster > ra_index) {
- ra_index = max(i, ra_index);
- if (ra)
- page_cache_sync_readahead(inode->i_mapping, ra,
- file, ra_index, cluster);
- ra_index += cluster;
- }
+ /* We want the cluster end at page boundary when possible */
+ cluster_end = (((cur >> PAGE_SHIFT) +
+ (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
+ cluster_end = min(cluster_end, last_byte);
btrfs_inode_lock(inode, 0);
if (IS_SWAPFILE(inode)) {
ret = -ETXTBSY;
- } else {
- if (do_compress)
- BTRFS_I(inode)->defrag_compress = compress_type;
- ret = cluster_pages_for_defrag(inode, pages, i, cluster);
+ btrfs_inode_unlock(inode, 0);
+ break;
}
- if (ret < 0) {
+ if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
btrfs_inode_unlock(inode, 0);
- goto out_ra;
+ break;
}
-
- defrag_count += ret;
- balance_dirty_pages_ratelimited(inode->i_mapping);
+ if (do_compress)
+ BTRFS_I(inode)->defrag_compress = compress_type;
+ ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+ cluster_end + 1 - cur, extent_thresh,
+ newer_than, do_compress,
+ &sectors_defragged, max_to_defrag);
btrfs_inode_unlock(inode, 0);
-
- if (newer_than) {
- if (newer_off == (u64)-1)
- break;
-
- if (ret > 0)
- i += ret;
-
- newer_off = max(newer_off + 1,
- (u64)i << PAGE_SHIFT);
-
- ret = find_new_extents(root, inode, newer_than,
- &newer_off, SZ_64K);
- if (!ret) {
- range->start = newer_off;
- i = (newer_off & new_align) >> PAGE_SHIFT;
- } else {
- break;
- }
- } else {
- if (ret > 0) {
- i += ret;
- last_len += ret << PAGE_SHIFT;
- } else {
- i++;
- last_len = 0;
- }
- }
+ if (ret < 0)
+ break;
+ cur = cluster_end + 1;
}
- ret = defrag_count;
-error:
- if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
- filemap_flush(inode->i_mapping);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
+ if (ra_allocated)
+ kfree(ra);
+ if (sectors_defragged) {
+ /*
+ * We have defragged some sectors; for the compression case they
+ * need to be written back immediately.
+ */
+ if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
+ if (range->compress_type == BTRFS_COMPRESS_LZO)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+ else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+ ret = sectors_defragged;
}
-
- if (range->compress_type == BTRFS_COMPRESS_LZO) {
- btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
- } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) {
- btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
- }
-
-out_ra:
if (do_compress) {
btrfs_inode_lock(inode, 0);
BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
btrfs_inode_unlock(inode, 0);
}
- if (!file)
- kfree(ra);
- kfree(pages);
return ret;
}
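For reference, a minimal userspace sketch of exercising this entry point through the defrag-range ioctl, using the UAPI definitions from <linux/btrfs.h>; the file path and sizes are placeholders and error handling is reduced to the bare minimum:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>

int main(void)
{
	struct btrfs_ioctl_defrag_range_args range;
	int fd, ret;

	fd = open("/mnt/btrfs/file", O_RDWR);	/* placeholder path */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = (__u64)-1;			/* defrag the whole file */
	range.extent_thresh = 256 * 1024;	/* skip extents >= 256KiB */
	range.flags = BTRFS_DEFRAG_RANGE_START_IO;

	ret = ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);
	if (ret < 0)
		perror("BTRFS_IOC_DEFRAG_RANGE");

	close(fd);
	return ret < 0;
}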
@@ -1658,6 +1617,7 @@ static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info,
static noinline int btrfs_ioctl_resize(struct file *file,
void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 new_size;
@@ -1713,7 +1673,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
btrfs_info(fs_info, "resizing devid %llu", devid);
}
- device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ args.devid = devid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device) {
btrfs_info(fs_info, "resizer unable to find device %llu",
devid);
@@ -1730,7 +1691,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
}
if (!strcmp(sizestr, "max"))
- new_size = device->bdev->bd_inode->i_size;
+ new_size = bdev_nr_bytes(device->bdev);
else {
if (sizestr[0] == '-') {
mod = -1;
@@ -1771,7 +1732,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
ret = -EINVAL;
goto out_finish;
}
- if (new_size > device->bdev->bd_inode->i_size) {
+ if (new_size > bdev_nr_bytes(device->bdev)) {
ret = -EFBIG;
goto out_finish;
}
@@ -2261,9 +2222,8 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = sk->min_offset;
while (1) {
- ret = fault_in_pages_writeable(ubuf + sk_offset,
- *buf_size - sk_offset);
- if (ret)
+ ret = -EFAULT;
+ if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))
break;
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
@@ -3136,12 +3096,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
goto out;
}
- /* Subpage defrag will be supported in later commits */
- if (root->fs_info->sectorsize < PAGE_SIZE) {
- ret = -ENOTTY;
- goto out;
- }
-
switch (inode->i_mode & S_IFMT) {
case S_IFDIR:
if (!capable(CAP_SYS_ADMIN)) {
@@ -3176,7 +3130,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
/* the rest are all set to zero by kzalloc */
range.len = (u64)-1;
}
- ret = btrfs_defrag_file(file_inode(file), file,
+ ret = btrfs_defrag_file(file_inode(file), &file->f_ra,
&range, BTRFS_OLDEST_GENERATION, 0);
if (ret > 0)
ret = 0;
@@ -3220,6 +3174,7 @@ out:
static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args_v2 *vol_args;
@@ -3231,35 +3186,39 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
- goto err_drop;
+ goto out;
}
if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {
ret = -EOPNOTSUPP;
goto out;
}
+
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
- if (!(vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) &&
- strcmp("cancel", vol_args->name) == 0)
+ if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
+ args.devid = vol_args->devid;
+ } else if (!strcmp("cancel", vol_args->name)) {
cancel = true;
+ } else {
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+ if (ret)
+ goto out;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret)
- goto out;
- /* Exclusive operation is now claimed */
+ goto err_drop;
- if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
- ret = btrfs_rm_device(fs_info, NULL, vol_args->devid, &bdev, &mode);
- else
- ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode);
+ /* Exclusive operation is now claimed */
+ ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
btrfs_exclop_finish(fs_info);
@@ -3271,17 +3230,19 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
btrfs_info(fs_info, "device deleted: %s",
vol_args->name);
}
-out:
- kfree(vol_args);
err_drop:
mnt_drop_write_file(file);
if (bdev)
blkdev_put(bdev, mode);
+out:
+ btrfs_put_dev_args_from_path(&args);
+ kfree(vol_args);
return ret;
}
static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args *vol_args;
@@ -3293,32 +3254,38 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args)) {
- ret = PTR_ERR(vol_args);
- goto out_drop_write;
- }
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- cancel = (strcmp("cancel", vol_args->name) == 0);
+ if (!strcmp("cancel", vol_args->name)) {
+ cancel = true;
+ } else {
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+ if (ret)
+ goto out;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret == 0) {
- ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode);
+ ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
if (!ret)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
btrfs_exclop_finish(fs_info);
}
- kfree(vol_args);
-out_drop_write:
mnt_drop_write_file(file);
if (bdev)
blkdev_put(bdev, mode);
+out:
+ btrfs_put_dev_args_from_path(&args);
+ kfree(vol_args);
return ret;
}
@@ -3379,22 +3346,21 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_ioctl_dev_info_args *di_args;
struct btrfs_device *dev;
int ret = 0;
- char *s_uuid = NULL;
di_args = memdup_user(arg, sizeof(*di_args));
if (IS_ERR(di_args))
return PTR_ERR(di_args);
+ args.devid = di_args->devid;
if (!btrfs_is_empty_uuid(di_args->uuid))
- s_uuid = di_args->uuid;
+ args.uuid = di_args->uuid;
rcu_read_lock();
- dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
- NULL);
-
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!dev) {
ret = -ENODEV;
goto out;
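The resize, device-remove and dev-info ioctls above all converge on the same lookup-argument pattern. A condensed sketch of that pattern, based only on the calls visible in this patch (it is not a literal copy of any one ioctl, and the initializer behind BTRFS_DEV_LOOKUP_ARGS is defined in volumes.h; here it is treated simply as "no criteria set yet"):

/* Sketch of the device-lookup flow shared by the ioctls above */
static int lookup_device_example(struct btrfs_fs_info *fs_info,
				 u64 devid, const char *path)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_device *device;
	int ret = 0;

	if (devid) {
		/* Lookup by numeric device id, as in resize/dev_info */
		args.devid = devid;
	} else {
		/* Lookup by path fills devid/uuid from the device superblock */
		ret = btrfs_get_dev_args_from_path(fs_info, &args, path);
		if (ret)
			return ret;
	}

	device = btrfs_find_device(fs_info->fs_devices, &args);
	if (!device)
		ret = -ENODEV;

	/* Frees anything btrfs_get_dev_args_from_path() allocated */
	btrfs_put_dev_args_from_path(&args);
	return ret;
}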
@@ -4430,7 +4396,6 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_ioctl_quota_rescan_args qsa = {0};
- int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4441,9 +4406,9 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
}
if (copy_to_user(arg, &qsa, sizeof(qsa)))
- ret = -EFAULT;
+ return -EFAULT;
- return ret;
+ return 0;
}
static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index a2e1f1f5c6e3..bbc45534ae9a 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -96,11 +96,12 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
#ifdef CONFIG_BTRFS_DEBUG
-static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) {
- lockdep_assert_held(&eb->lock);
+static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)
+{
+ lockdep_assert_held_write(&eb->lock);
}
#else
-static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { }
+static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { }
#endif
void btrfs_unlock_up_safe(struct btrfs_path *path, int level);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 3dbe6eb5fda7..65cb0766e62d 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -32,19 +32,19 @@
* payload.
* One regular LZO compressed extent can have one or more segments.
* For inlined LZO compressed extent, only one segment is allowed.
- * One segment represents at most one page of uncompressed data.
+ * One segment represents at most one sector of uncompressed data.
*
* 2.1 Segment header
* Fixed size. LZO_LEN (4) bytes long, LE32.
* Records the total size of the segment (not including the header).
- * Segment header never crosses page boundary, thus it's possible to
- * have at most 3 padding zeros at the end of the page.
+ * Segment header never crosses sector boundary, thus it's possible to
+ * have at most 3 padding zeros at the end of the sector.
*
* 2.2 Data Payload
- * Variable size. Size up limit should be lzo1x_worst_compress(PAGE_SIZE)
- * which is 4419 for a 4KiB page.
+ * Variable size. The upper size limit is lzo1x_worst_compress(sectorsize),
+ * which is 4419 for a 4KiB sectorsize.
*
- * Example:
+ * Example with 4K sectorsize:
* Page 1:
* 0 0x2 0x4 0x6 0x8 0xa 0xc 0xe 0x10
* 0x0000 | Header | SegHdr 01 | Data payload 01 ... |
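The decompression side (shown further down) walks this layout segment by segment. A compact, hypothetical sketch of such a walk under the rule that a segment header never crosses a sector boundary; offsets are within the compressed stream, LZO_LEN is the 4-byte length field from lzo.c, and the actual payload decompression is elided:

#include <linux/kernel.h>
#include <asm/unaligned.h>

#define LZO_LEN	4	/* as in lzo.c */

/*
 * Hypothetical walker over the segmented LZO stream described above.
 * @buf points at the whole compressed extent, @sectorsize is the fs
 * sector size (4K in the example above).
 */
static void walk_lzo_segments(const u8 *buf, u32 sectorsize)
{
	/* First LZO_LEN bytes: total compressed size, header included */
	u32 total = get_unaligned_le32(buf);
	u32 cur = LZO_LEN;

	while (cur < total) {
		u32 seg_len;

		/* The writer pads to the sector end when < LZO_LEN bytes remain */
		if (sectorsize - (cur % sectorsize) < LZO_LEN) {
			cur = round_up(cur, sectorsize);
			continue;
		}
		seg_len = get_unaligned_le32(buf + cur);
		cur += LZO_LEN;
		/* ... decompress seg_len bytes at buf + cur into one sector ... */
		cur += seg_len;
	}
}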
@@ -112,170 +112,174 @@ static inline size_t read_compress_length(const char *buf)
return le32_to_cpu(dlen);
}
+/*
+ * Will do:
+ *
+ * - Write a segment header into the destination
+ * - Copy the compressed buffer into the destination
+ * - Make sure we have enough space in the last sector to fit a segment header.
+ * If not, we pad with at most LZO_LEN - 1 (i.e. 3) bytes of zeros.
+ *
+ * Will allocate new pages when needed.
+ */
+static int copy_compressed_data_to_page(char *compressed_data,
+ size_t compressed_size,
+ struct page **out_pages,
+ u32 *cur_out,
+ const u32 sectorsize)
+{
+ u32 sector_bytes_left;
+ u32 orig_out;
+ struct page *cur_page;
+ char *kaddr;
+
+ /*
+ * We never allow a segment header to cross a sector boundary; the
+ * previous run should have ensured we have enough space left inside
+ * the sector.
+ */
+ ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize);
+
+ cur_page = out_pages[*cur_out / PAGE_SIZE];
+ /* Allocate a new page */
+ if (!cur_page) {
+ cur_page = alloc_page(GFP_NOFS);
+ if (!cur_page)
+ return -ENOMEM;
+ out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ }
+
+ kaddr = kmap(cur_page);
+ write_compress_length(kaddr + offset_in_page(*cur_out),
+ compressed_size);
+ *cur_out += LZO_LEN;
+
+ orig_out = *cur_out;
+
+ /* Copy compressed data */
+ while (*cur_out - orig_out < compressed_size) {
+ u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize,
+ orig_out + compressed_size - *cur_out);
+
+ kunmap(cur_page);
+ cur_page = out_pages[*cur_out / PAGE_SIZE];
+ /* Allocate a new page */
+ if (!cur_page) {
+ cur_page = alloc_page(GFP_NOFS);
+ if (!cur_page)
+ return -ENOMEM;
+ out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ }
+ kaddr = kmap(cur_page);
+
+ memcpy(kaddr + offset_in_page(*cur_out),
+ compressed_data + *cur_out - orig_out, copy_len);
+
+ *cur_out += copy_len;
+ }
+
+ /*
+ * Check if we can fit the next segment header into the remaining space
+ * of the sector.
+ */
+ sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out;
+ if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0)
+ goto out;
+
+ /* The remaining space is not enough; pad it with zeros */
+ memset(kaddr + offset_in_page(*cur_out), 0,
+ sector_bytes_left);
+ *cur_out += sector_bytes_left;
+
+out:
+ kunmap(cur_page);
+ return 0;
+}
+
int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
u64 start, struct page **pages, unsigned long *out_pages,
unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize;
+ struct page *page_in = NULL;
+ char *sizes_ptr;
int ret = 0;
- char *data_in;
- char *cpage_out, *sizes_ptr;
- int nr_pages = 0;
- struct page *in_page = NULL;
- struct page *out_page = NULL;
- unsigned long bytes_left;
- unsigned long len = *total_out;
- unsigned long nr_dest_pages = *out_pages;
- const unsigned long max_out = nr_dest_pages * PAGE_SIZE;
- size_t in_len;
- size_t out_len;
- char *buf;
- unsigned long tot_in = 0;
- unsigned long tot_out = 0;
- unsigned long pg_bytes_left;
- unsigned long out_offset;
- unsigned long bytes;
+ /* Points to the file offset of input data */
+ u64 cur_in = start;
+ /* Points to the current output byte */
+ u32 cur_out = 0;
+ u32 len = *total_out;
*out_pages = 0;
*total_out = 0;
*total_in = 0;
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = kmap(in_page);
-
/*
- * store the size of all chunks of compressed data in
- * the first 4 bytes
+ * Skip the header for now; we will come back later and write the
+ * total compressed size.
*/
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- cpage_out = kmap(out_page);
- out_offset = LZO_LEN;
- tot_out = LZO_LEN;
- pages[0] = out_page;
- nr_pages = 1;
- pg_bytes_left = PAGE_SIZE - LZO_LEN;
-
- /* compress at most one page of data each time */
- in_len = min(len, PAGE_SIZE);
- while (tot_in < len) {
- ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
- &out_len, workspace->mem);
- if (ret != LZO_E_OK) {
- pr_debug("BTRFS: lzo in loop returned %d\n",
- ret);
+ cur_out += LZO_LEN;
+ while (cur_in < start + len) {
+ char *data_in;
+ const u32 sectorsize_mask = sectorsize - 1;
+ u32 sector_off = (cur_in - start) & sectorsize_mask;
+ u32 in_len;
+ size_t out_len;
+
+ /* Get the input page first */
+ if (!page_in) {
+ page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT);
+ ASSERT(page_in);
+ }
+
+ /* Compress at most one sector of data each time */
+ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
+ ASSERT(in_len);
+ data_in = kmap(page_in);
+ ret = lzo1x_1_compress(data_in +
+ offset_in_page(cur_in), in_len,
+ workspace->cbuf, &out_len,
+ workspace->mem);
+ kunmap(page_in);
+ if (ret < 0) {
+ pr_debug("BTRFS: lzo in loop returned %d\n", ret);
ret = -EIO;
goto out;
}
- /* store the size of this chunk of compressed data */
- write_compress_length(cpage_out + out_offset, out_len);
- tot_out += LZO_LEN;
- out_offset += LZO_LEN;
- pg_bytes_left -= LZO_LEN;
-
- tot_in += in_len;
- tot_out += out_len;
-
- /* copy bytes from the working buffer into the pages */
- buf = workspace->cbuf;
- while (out_len) {
- bytes = min_t(unsigned long, pg_bytes_left, out_len);
-
- memcpy(cpage_out + out_offset, buf, bytes);
-
- out_len -= bytes;
- pg_bytes_left -= bytes;
- buf += bytes;
- out_offset += bytes;
-
- /*
- * we need another page for writing out.
- *
- * Note if there's less than 4 bytes left, we just
- * skip to a new page.
- */
- if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
- pg_bytes_left == 0) {
- if (pg_bytes_left) {
- memset(cpage_out + out_offset, 0,
- pg_bytes_left);
- tot_out += pg_bytes_left;
- }
-
- /* we're done, don't allocate new page */
- if (out_len == 0 && tot_in >= len)
- break;
-
- kunmap(out_page);
- if (nr_pages == nr_dest_pages) {
- out_page = NULL;
- ret = -E2BIG;
- goto out;
- }
-
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- cpage_out = kmap(out_page);
- pages[nr_pages++] = out_page;
-
- pg_bytes_left = PAGE_SIZE;
- out_offset = 0;
- }
- }
+ ret = copy_compressed_data_to_page(workspace->cbuf, out_len,
+ pages, &cur_out, sectorsize);
+ if (ret < 0)
+ goto out;
+
+ cur_in += in_len;
- /* we're making it bigger, give up */
- if (tot_in > 8192 && tot_in < tot_out) {
+ /*
+ * Check if we are making the output bigger after compressing two
+ * sectors. If so, give up.
+ */
+ if (cur_in - start > sectorsize * 2 && cur_in - start < cur_out) {
ret = -E2BIG;
goto out;
}
- /* we're all done */
- if (tot_in >= len)
- break;
-
- if (tot_out > max_out)
- break;
-
- bytes_left = len - tot_in;
- kunmap(in_page);
- put_page(in_page);
-
- start += PAGE_SIZE;
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = kmap(in_page);
- in_len = min(bytes_left, PAGE_SIZE);
- }
-
- if (tot_out >= tot_in) {
- ret = -E2BIG;
- goto out;
+ /* Check if we have reached page boundary */
+ if (IS_ALIGNED(cur_in, PAGE_SIZE)) {
+ put_page(page_in);
+ page_in = NULL;
+ }
}
- /* store the size of all chunks of compressed data */
+ /* Store the size of all chunks of compressed data */
sizes_ptr = kmap_local_page(pages[0]);
- write_compress_length(sizes_ptr, tot_out);
+ write_compress_length(sizes_ptr, cur_out);
kunmap_local(sizes_ptr);
ret = 0;
- *total_out = tot_out;
- *total_in = tot_in;
+ *total_out = cur_out;
+ *total_in = cur_in - start;
out:
- *out_pages = nr_pages;
- if (out_page)
- kunmap(out_page);
-
- if (in_page) {
- kunmap(in_page);
- put_page(in_page);
- }
-
+ *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE);
return ret;
}
@@ -357,9 +361,10 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
ASSERT(cur_in / sectorsize ==
(cur_in + LZO_LEN - 1) / sectorsize);
cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
- kaddr = kmap(cur_page);
ASSERT(cur_page);
+ kaddr = kmap(cur_page);
seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
+ kunmap(cur_page);
cur_in += LZO_LEN;
/* Copy the compressed segment payload into workspace */
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index d8d268ca8aa7..0e239a4c3b26 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -60,8 +60,7 @@ enum btrfs_rbio_ops {
};
struct btrfs_raid_bio {
- struct btrfs_fs_info *fs_info;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
/* while we're doing rmw on a stripe
* we put it into a hash table so we can
@@ -192,7 +191,7 @@ static void scrub_parity_work(struct btrfs_work *work);
static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
{
btrfs_init_work(&rbio->work, work_func, NULL, NULL);
- btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
+ btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}
/*
@@ -271,7 +270,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
*/
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
- u64 num = rbio->bbio->raid_map[0];
+ u64 num = rbio->bioc->raid_map[0];
/*
* we shift down quite a bit. We're using byte
@@ -345,7 +344,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
return;
- table = rbio->fs_info->stripe_hash_table;
+ table = rbio->bioc->fs_info->stripe_hash_table;
h = table->table + bucket;
/* hold the lock for the bucket because we may be
@@ -400,7 +399,7 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
return;
- table = rbio->fs_info->stripe_hash_table;
+ table = rbio->bioc->fs_info->stripe_hash_table;
spin_lock_irqsave(&table->cache_lock, flags);
__remove_rbio_from_cache(rbio);
@@ -460,7 +459,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
return;
- table = rbio->fs_info->stripe_hash_table;
+ table = rbio->bioc->fs_info->stripe_hash_table;
spin_lock_irqsave(&table->cache_lock, flags);
spin_lock(&rbio->bio_list_lock);
@@ -559,8 +558,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
test_bit(RBIO_CACHE_BIT, &cur->flags))
return 0;
- if (last->bbio->raid_map[0] !=
- cur->bbio->raid_map[0])
+ if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
return 0;
/* we can't merge with different operations */
@@ -669,11 +667,11 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
struct btrfs_raid_bio *cache_drop = NULL;
int ret = 0;
- h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
+ h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
spin_lock_irqsave(&h->lock, flags);
list_for_each_entry(cur, &h->hash_list, hash_list) {
- if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0])
+ if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
continue;
spin_lock(&cur->bio_list_lock);
@@ -751,7 +749,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
int keep_cache = 0;
bucket = rbio_bucket(rbio);
- h = rbio->fs_info->stripe_hash_table->table + bucket;
+ h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
if (list_empty(&rbio->plug_list))
cache_rbio(rbio);
@@ -838,7 +836,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
}
}
- btrfs_put_bbio(rbio->bbio);
+ btrfs_put_bioc(rbio->bioc);
kfree(rbio);
}
@@ -865,7 +863,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
struct bio *extra;
if (rbio->generic_bio_cnt)
- btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+ btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
/*
* At this moment, rbio->bio_list is empty, however since rbio does not
@@ -906,7 +904,7 @@ static void raid_write_end_io(struct bio *bio)
/* OK, we have read all the stripes we need to. */
max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
- 0 : rbio->bbio->max_errors;
+ 0 : rbio->bioc->max_errors;
if (atomic_read(&rbio->error) > max_errors)
err = BLK_STS_IOERR;
@@ -961,12 +959,12 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
* this does not allocate any pages for rbio->pages.
*/
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
- struct btrfs_bio *bbio,
+ struct btrfs_io_context *bioc,
u64 stripe_len)
{
struct btrfs_raid_bio *rbio;
int nr_data = 0;
- int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+ int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
int num_pages = rbio_nr_pages(stripe_len, real_stripes);
int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
void *p;
@@ -987,8 +985,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
spin_lock_init(&rbio->bio_list_lock);
INIT_LIST_HEAD(&rbio->stripe_cache);
INIT_LIST_HEAD(&rbio->hash_list);
- rbio->bbio = bbio;
- rbio->fs_info = fs_info;
+ rbio->bioc = bioc;
rbio->stripe_len = stripe_len;
rbio->nr_pages = num_pages;
rbio->real_stripes = real_stripes;
@@ -1015,9 +1012,9 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
#undef CONSUME_ALLOC
- if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
nr_data = real_stripes - 1;
- else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+ else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
nr_data = real_stripes - 2;
else
BUG();
@@ -1077,10 +1074,10 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
struct bio *last = bio_list->tail;
int ret;
struct bio *bio;
- struct btrfs_bio_stripe *stripe;
+ struct btrfs_io_stripe *stripe;
u64 disk_start;
- stripe = &rbio->bbio->stripes[stripe_nr];
+ stripe = &rbio->bioc->stripes[stripe_nr];
disk_start = stripe->physical + (page_index << PAGE_SHIFT);
/* if the device is missing, just fail this stripe */
@@ -1105,8 +1102,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
}
/* put a new bio on the list */
- bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
- btrfs_io_bio(bio)->device = stripe->dev;
+ bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
+ btrfs_bio(bio)->device = stripe->dev;
bio->bi_iter.bi_size = 0;
bio_set_dev(bio, stripe->dev->bdev);
bio->bi_iter.bi_sector = disk_start >> 9;
@@ -1155,11 +1152,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
int i = 0;
start = bio->bi_iter.bi_sector << 9;
- stripe_offset = start - rbio->bbio->raid_map[0];
+ stripe_offset = start - rbio->bioc->raid_map[0];
page_index = stripe_offset >> PAGE_SHIFT;
if (bio_flagged(bio, BIO_CLONED))
- bio->bi_iter = btrfs_io_bio(bio)->iter;
+ bio->bi_iter = btrfs_bio(bio)->iter;
bio_for_each_segment(bvec, bio, iter) {
rbio->bio_pages[page_index + i] = bvec.bv_page;
@@ -1179,7 +1176,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
*/
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
- struct btrfs_bio *bbio = rbio->bbio;
+ struct btrfs_io_context *bioc = rbio->bioc;
void **pointers = rbio->finish_pointers;
int nr_data = rbio->nr_data;
int stripe;
@@ -1284,11 +1281,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
}
}
- if (likely(!bbio->num_tgtdevs))
+ if (likely(!bioc->num_tgtdevs))
goto write_data;
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- if (!bbio->tgtdev_map[stripe])
+ if (!bioc->tgtdev_map[stripe])
continue;
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
@@ -1302,7 +1299,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
}
ret = rbio_add_io_page(rbio, &bio_list, page,
- rbio->bbio->tgtdev_map[stripe],
+ rbio->bioc->tgtdev_map[stripe],
pagenr, rbio->stripe_len);
if (ret)
goto cleanup;
@@ -1339,12 +1336,12 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
{
u64 physical = bio->bi_iter.bi_sector;
int i;
- struct btrfs_bio_stripe *stripe;
+ struct btrfs_io_stripe *stripe;
physical <<= 9;
- for (i = 0; i < rbio->bbio->num_stripes; i++) {
- stripe = &rbio->bbio->stripes[i];
+ for (i = 0; i < rbio->bioc->num_stripes; i++) {
+ stripe = &rbio->bioc->stripes[i];
if (in_range(physical, stripe->physical, rbio->stripe_len) &&
stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
return i;
@@ -1365,7 +1362,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
int i;
for (i = 0; i < rbio->nr_data; i++) {
- u64 stripe_start = rbio->bbio->raid_map[i];
+ u64 stripe_start = rbio->bioc->raid_map[i];
if (in_range(logical, stripe_start, rbio->stripe_len))
return i;
@@ -1456,7 +1453,7 @@ static void raid_rmw_end_io(struct bio *bio)
if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
goto cleanup;
/*
@@ -1538,8 +1535,8 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
}
/*
- * the bbio may be freed once we submit the last bio. Make sure
- * not to touch it after that
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
@@ -1547,7 +1544,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
bio->bi_end_io = raid_rmw_end_io;
bio->bi_opf = REQ_OP_READ;
- btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
@@ -1719,17 +1716,18 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
/*
* our main entry point for writes from the rest of the FS.
*/
-int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len)
+int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
struct btrfs_plug_cb *plug = NULL;
struct blk_plug_cb *cb;
int ret;
- rbio = alloc_rbio(fs_info, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc, stripe_len);
if (IS_ERR(rbio)) {
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return PTR_ERR(rbio);
}
bio_list_add(&rbio->bio_list, bio);
@@ -1842,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
}
/* all raid6 handling here */
- if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
+ if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
/*
* single failure, rebuild from parity raid5
* style
@@ -1874,8 +1872,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
* here due to a crc mismatch and we can't give them the
* data they want
*/
- if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
- if (rbio->bbio->raid_map[faila] ==
+ if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
+ if (rbio->bioc->raid_map[faila] ==
RAID5_P_STRIPE) {
err = BLK_STS_IOERR;
goto cleanup;
@@ -1887,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
goto pstripe;
}
- if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
+ if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
raid6_datap_recov(rbio->real_stripes,
PAGE_SIZE, faila, pointers);
} else {
@@ -2006,7 +2004,7 @@ static void raid_recover_end_io(struct bio *bio)
if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
rbio_orig_end_io(rbio, BLK_STS_IOERR);
else
__raid_recover_end_io(rbio);
@@ -2074,7 +2072,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
* were up to date, or we might have no bios to read because
* the devices were gone.
*/
- if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
+ if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
__raid_recover_end_io(rbio);
return 0;
} else {
@@ -2083,8 +2081,8 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
}
/*
- * the bbio may be freed once we submit the last bio. Make sure
- * not to touch it after that
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
@@ -2092,7 +2090,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
bio->bi_end_io = raid_recover_end_io;
bio->bi_opf = REQ_OP_READ;
- btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
@@ -2116,22 +2114,22 @@ cleanup:
* so we assume the bio they send down corresponds to a failed part
* of the drive.
*/
-int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- int mirror_num, int generic_io)
+int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len, int mirror_num, int generic_io)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
int ret;
if (generic_io) {
- ASSERT(bbio->mirror_num == mirror_num);
- btrfs_io_bio(bio)->mirror_num = mirror_num;
+ ASSERT(bioc->mirror_num == mirror_num);
+ btrfs_bio(bio)->mirror_num = mirror_num;
}
- rbio = alloc_rbio(fs_info, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc, stripe_len);
if (IS_ERR(rbio)) {
if (generic_io)
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return PTR_ERR(rbio);
}
@@ -2142,11 +2140,11 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
btrfs_warn(fs_info,
- "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
+"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
__func__, bio->bi_iter.bi_sector << 9,
- (u64)bio->bi_iter.bi_size, bbio->map_type);
+ (u64)bio->bi_iter.bi_size, bioc->map_type);
if (generic_io)
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
kfree(rbio);
return -EIO;
}
@@ -2155,7 +2153,7 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
btrfs_bio_counter_inc_noblocked(fs_info);
rbio->generic_bio_cnt = 1;
} else {
- btrfs_get_bbio(bbio);
+ btrfs_get_bioc(bioc);
}
/*
@@ -2214,23 +2212,23 @@ static void read_rebuild_work(struct btrfs_work *work)
/*
* The following code is used to scrub/replace the parity stripe
*
- * Caller must have already increased bio_counter for getting @bbio.
+ * Caller must have already increased bio_counter for getting @bioc.
*
* Note: We must make sure all the pages added to the scrub/replace
* raid bio are correct and are not changed during the scrub/replace, i.e.
* those pages hold only metadata or file data with checksums.
*/
-struct btrfs_raid_bio *
-raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- struct btrfs_device *scrub_dev,
- unsigned long *dbitmap, int stripe_nsectors)
+struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
+ struct btrfs_io_context *bioc,
+ u64 stripe_len, struct btrfs_device *scrub_dev,
+ unsigned long *dbitmap, int stripe_nsectors)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
int i;
- rbio = alloc_rbio(fs_info, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc, stripe_len);
if (IS_ERR(rbio))
return NULL;
bio_list_add(&rbio->bio_list, bio);
@@ -2242,12 +2240,12 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
/*
- * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
+ * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
* to the end position, so this search can start from the first parity
* stripe.
*/
for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
- if (bbio->stripes[i].dev == scrub_dev) {
+ if (bioc->stripes[i].dev == scrub_dev) {
rbio->scrubp = i;
break;
}
@@ -2260,7 +2258,7 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
/*
- * We have already increased bio_counter when getting bbio, record it
+ * We have already increased bio_counter when getting bioc, record it
* so we can free it at rbio_orig_end_io().
*/
rbio->generic_bio_cnt = 1;
@@ -2275,10 +2273,10 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
int stripe_offset;
int index;
- ASSERT(logical >= rbio->bbio->raid_map[0]);
- ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
+ ASSERT(logical >= rbio->bioc->raid_map[0]);
+ ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] +
rbio->stripe_len * rbio->nr_data);
- stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
+ stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
index = stripe_offset >> PAGE_SHIFT;
rbio->bio_pages[index] = page;
}
@@ -2312,7 +2310,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int need_check)
{
- struct btrfs_bio *bbio = rbio->bbio;
+ struct btrfs_io_context *bioc = rbio->bioc;
void **pointers = rbio->finish_pointers;
unsigned long *pbitmap = rbio->finish_pbitmap;
int nr_data = rbio->nr_data;
@@ -2335,7 +2333,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
else
BUG();
- if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
+ if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
is_replace = 1;
bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
}
@@ -2435,7 +2433,7 @@ writeback:
page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
ret = rbio_add_io_page(rbio, &bio_list, page,
- bbio->tgtdev_map[rbio->scrubp],
+ bioc->tgtdev_map[rbio->scrubp],
pagenr, rbio->stripe_len);
if (ret)
goto cleanup;
@@ -2483,7 +2481,7 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
*/
static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
{
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
goto cleanup;
if (rbio->faila >= 0 || rbio->failb >= 0) {
@@ -2504,7 +2502,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
* the data, so the capability of the repair is declined.
* (In the case of RAID5, we can not repair anything)
*/
- if (dfail > rbio->bbio->max_errors - 1)
+ if (dfail > rbio->bioc->max_errors - 1)
goto cleanup;
/*
@@ -2625,8 +2623,8 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
}
/*
- * the bbio may be freed once we submit the last bio. Make sure
- * not to touch it after that
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
@@ -2634,7 +2632,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
bio->bi_end_io = raid56_parity_scrub_end_io;
bio->bi_opf = REQ_OP_READ;
- btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
@@ -2670,12 +2668,13 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
/* The following code is used for dev replace of a missing RAID 5/6 device. */
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 length)
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 length)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
- rbio = alloc_rbio(fs_info, bbio, length);
+ rbio = alloc_rbio(fs_info, bioc, length);
if (IS_ERR(rbio))
return NULL;
@@ -2695,7 +2694,7 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
}
/*
- * When we get bbio, we have already increased bio_counter, record it
+ * When we get bioc, we have already increased bio_counter, record it
* so we can free it at rbio_orig_end_io()
*/
rbio->generic_bio_cnt = 1;
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 2503485db859..72c00fc284b5 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -30,25 +30,23 @@ static inline int nr_data_stripes(const struct map_lookup *map)
struct btrfs_raid_bio;
struct btrfs_device;
-int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- int mirror_num, int generic_io);
-int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len);
+int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len, int mirror_num, int generic_io);
+int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len);
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
u64 logical);
-struct btrfs_raid_bio *
-raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- struct btrfs_device *scrub_dev,
- unsigned long *dbitmap, int stripe_nsectors);
+struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
+ struct btrfs_io_context *bioc, u64 stripe_len,
+ struct btrfs_device *scrub_dev,
+ unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 length);
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 length);
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
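Call sites elsewhere in the tree are updated to match these slimmer prototypes, since fs_info is now reachable through the io_context itself. A schematic before/after of such a caller (variable names are illustrative, not a literal hunk from this series):

/* Before: fs_info passed alongside the old btrfs_bio */
ret = raid56_parity_write(fs_info, bio, bbio, map_length);
ret = raid56_parity_recover(fs_info, bio, bbio, map_length, mirror_num, 1);

/* After: the btrfs_io_context carries fs_info, so it is dropped from the call */
ret = raid56_parity_write(bio, bioc, map_length);
ret = raid56_parity_recover(bio, bioc, map_length, mirror_num, 1);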
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 06713a8fe26b..eb96fdc3be25 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -227,7 +227,7 @@ start_machine:
}
static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
- struct btrfs_bio *bbio)
+ struct btrfs_io_context *bioc)
{
struct btrfs_fs_info *fs_info = dev->fs_info;
int ret;
@@ -275,11 +275,11 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
kref_init(&zone->refcnt);
zone->elems = 0;
zone->device = dev; /* our device always sits at index 0 */
- for (i = 0; i < bbio->num_stripes; ++i) {
+ for (i = 0; i < bioc->num_stripes; ++i) {
/* bounds have already been checked */
- zone->devs[i] = bbio->stripes[i].dev;
+ zone->devs[i] = bioc->stripes[i].dev;
}
- zone->ndevs = bbio->num_stripes;
+ zone->ndevs = bioc->num_stripes;
spin_lock(&fs_info->reada_lock);
ret = radix_tree_insert(&dev->reada_zones,
@@ -309,7 +309,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
int ret;
struct reada_extent *re = NULL;
struct reada_extent *re_exist = NULL;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
struct btrfs_device *dev;
struct btrfs_device *prev_dev;
u64 length;
@@ -345,28 +345,28 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
*/
length = fs_info->nodesize;
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &length, &bbio, 0);
- if (ret || !bbio || length < fs_info->nodesize)
+ &length, &bioc, 0);
+ if (ret || !bioc || length < fs_info->nodesize)
goto error;
- if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
+ if (bioc->num_stripes > BTRFS_MAX_MIRRORS) {
btrfs_err(fs_info,
"readahead: more than %d copies not supported",
BTRFS_MAX_MIRRORS);
goto error;
}
- real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+ real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
for (nzones = 0; nzones < real_stripes; ++nzones) {
struct reada_zone *zone;
- dev = bbio->stripes[nzones].dev;
+ dev = bioc->stripes[nzones].dev;
/* cannot read ahead on missing device. */
if (!dev->bdev)
continue;
- zone = reada_find_zone(dev, logical, bbio);
+ zone = reada_find_zone(dev, logical, bioc);
if (!zone)
continue;
@@ -464,7 +464,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
if (!have_zone)
goto error;
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return re;
error:
@@ -488,7 +488,7 @@ error:
kref_put(&zone->refcnt, reada_zone_release);
spin_unlock(&fs_info->reada_lock);
}
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
kfree(re);
return re_exist;
}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index d2062d5f71dd..e2b9f8616501 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -678,10 +678,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
if (generic_ref->type == BTRFS_REF_METADATA) {
if (!parent)
- ref_root = generic_ref->tree_ref.root;
+ ref_root = generic_ref->tree_ref.owning_root;
owner = generic_ref->tree_ref.level;
} else if (!parent) {
- ref_root = generic_ref->data_ref.ref_root;
+ ref_root = generic_ref->data_ref.owning_root;
owner = generic_ref->data_ref.ino;
offset = generic_ref->data_ref.offset;
}
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 9b0814318e72..e0f93b357548 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -138,7 +138,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
}
btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
- ClearPageChecked(page);
+ btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
out_unlock:
if (page) {
@@ -649,7 +649,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
struct inode *dst, u64 dst_loff)
{
- int ret;
+ int ret = 0;
u64 i, tail_len, chunk_count;
struct btrfs_root *root_dst = BTRFS_I(dst)->root;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 914d403b4415..33a0ee7ac590 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -25,6 +25,7 @@
#include "backref.h"
#include "misc.h"
#include "subpage.h"
+#include "zoned.h"
/*
* Relocation overview
@@ -1145,9 +1146,9 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(leaf, fi);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
num_bytes, parent);
- ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- key.objectid, key.offset);
+ key.objectid, key.offset,
+ root->root_key.objectid, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1156,9 +1157,9 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
num_bytes, parent);
- ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- key.objectid, key.offset);
+ key.objectid, key.offset,
+ root->root_key.objectid, false);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1367,8 +1368,8 @@ again:
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
blocksize, path->nodes[level]->start);
- ref.skip_qgroup = true;
- btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
+ btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
+ 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1376,8 +1377,8 @@ again:
}
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
blocksize, 0);
- ref.skip_qgroup = true;
- btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
+ btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
+ true);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1386,8 +1387,8 @@ again:
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
blocksize, path->nodes[level]->start);
- btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
- ref.skip_qgroup = true;
+ btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
+ 0, true);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1396,8 +1397,8 @@ again:
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
blocksize, 0);
- btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
- ref.skip_qgroup = true;
+ btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
+ 0, true);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -2473,9 +2474,9 @@ static int do_relocation(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
node->eb->start, blocksize,
upper->eb->start);
- ref.real_root = root->root_key.objectid;
btrfs_init_tree_ref(&ref, node->level,
- btrfs_header_owner(upper->eb));
+ btrfs_header_owner(upper->eb),
+ root->root_key.objectid, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (!ret)
ret = btrfs_drop_subtree(trans, root, eb,
@@ -2691,8 +2692,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
list_add_tail(&node->list, &rc->backref_cache.changed);
} else {
path->lowest_level = node->level;
+ if (root == root->fs_info->chunk_root)
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
btrfs_release_path(path);
+ if (root == root->fs_info->chunk_root)
+ btrfs_trans_release_chunk_metadata(trans);
if (ret > 0)
ret = 0;
}
@@ -2852,31 +2857,6 @@ static noinline_for_stack int prealloc_file_extent_cluster(
if (ret)
return ret;
- /*
- * On a zoned filesystem, we cannot preallocate the file region.
- * Instead, we dirty and fiemap_write the region.
- */
- if (btrfs_is_zoned(inode->root->fs_info)) {
- struct btrfs_root *root = inode->root;
- struct btrfs_trans_handle *trans;
-
- end = cluster->end - offset + 1;
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
- i_size_write(&inode->vfs_inode, end);
- ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- return ret;
- }
-
- return btrfs_end_transaction(trans);
- }
-
btrfs_inode_lock(&inode->vfs_inode, 0);
for (nr = 0; nr < cluster->nr; nr++) {
start = cluster->boundary[nr] - offset;
@@ -2903,9 +2883,8 @@ static noinline_for_stack int prealloc_file_extent_cluster(
return ret;
}
-static noinline_for_stack
-int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
- u64 block_start)
+static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode,
+ u64 start, u64 end, u64 block_start)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
@@ -3084,7 +3063,6 @@ release_page:
static int relocate_file_extent_cluster(struct inode *inode,
struct file_extent_cluster *cluster)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 offset = BTRFS_I(inode)->index_cnt;
unsigned long index;
unsigned long last_index;
@@ -3105,7 +3083,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
file_ra_state_init(ra, inode->i_mapping);
- ret = setup_extent_mapping(inode, cluster->start - offset,
+ ret = setup_relocation_extent_mapping(inode, cluster->start - offset,
cluster->end - offset, cluster->start);
if (ret)
goto out;
@@ -3114,8 +3092,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
for (index = (cluster->start - offset) >> PAGE_SHIFT;
index <= last_index && !ret; index++)
ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
- if (btrfs_is_zoned(fs_info) && !ret)
- ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (ret == 0)
WARN_ON(cluster_nr != cluster->nr);
out:
@@ -3770,12 +3746,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct btrfs_inode_item *item;
struct extent_buffer *leaf;
- u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
int ret;
- if (btrfs_is_zoned(trans->fs_info))
- flags &= ~BTRFS_INODE_PREALLOC;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -3790,7 +3762,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_generation(leaf, item, 1);
btrfs_set_inode_size(leaf, item, 0);
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
- btrfs_set_inode_flags(leaf, item, flags);
+ btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
+ BTRFS_INODE_PREALLOC);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
@@ -4063,6 +4036,9 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
rc->block_group->start,
rc->block_group->length);
+ ret = btrfs_zone_finish(rc->block_group);
+ WARN_ON(ret && ret != -EAGAIN);
+
while (1) {
int finishes_stage;
@@ -4386,8 +4362,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
if (!rc)
return 0;
- BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
- root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
+ BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));
level = btrfs_header_level(buf);
if (btrfs_header_generation(buf) <=
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 702dc5441f03..12ceb14a1141 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -39,10 +39,8 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
need_reset = 1;
}
if (need_reset) {
- memset(&item->generation_v2, 0,
- sizeof(*item) - offsetof(struct btrfs_root_item,
- generation_v2));
-
+ /* Clear all members from generation_v2 onwards. */
+ memset_startat(item, 0, generation_v2);
generate_random_guid(item->uuid);
}
}
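The hunk above swaps the open-coded memset()/offsetof() pair for memset_startat(), which zeroes a structure from a named member through the end of the object. A minimal userspace sketch of the equivalent operation follows; the struct layout is a simplified stand-in, not the real btrfs_root_item:

#include <stddef.h>
#include <string.h>

struct root_item_demo {
	unsigned char uuid[16];
	unsigned long long generation;
	unsigned long long generation_v2;	/* first member to clear */
	unsigned char reserved[32];
};

static void clear_from_generation_v2(struct root_item_demo *item)
{
	size_t off = offsetof(struct root_item_demo, generation_v2);

	/* memset_startat(item, 0, generation_v2) boils down to this */
	memset((char *)item + off, 0, sizeof(*item) - off);
}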
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 088641ba7a8e..cf82ea6f54fb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -57,7 +57,7 @@ struct scrub_ctx;
struct scrub_recover {
refcount_t refs;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
u64 map_length;
};
@@ -254,7 +254,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx);
static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
{
return spage->recover &&
- (spage->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
@@ -798,7 +798,7 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
{
if (refcount_dec_and_test(&recover->refs)) {
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bbio(recover->bbio);
+ btrfs_put_bioc(recover->bioc);
kfree(recover);
}
}
@@ -1027,8 +1027,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
sblock_other = sblocks_for_recheck + mirror_index;
} else {
struct scrub_recover *r = sblock_bad->pagev[0]->recover;
- int max_allowed = r->bbio->num_stripes -
- r->bbio->num_tgtdevs;
+ int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
if (mirror_index >= max_allowed)
break;
@@ -1218,14 +1217,14 @@ out:
return 0;
}
-static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
+static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
{
- if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
return 2;
- else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+ else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
return 3;
else
- return (int)bbio->num_stripes;
+ return (int)bioc->num_stripes;
}
static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
@@ -1269,7 +1268,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
u64 flags = original_sblock->pagev[0]->flags;
u64 have_csum = original_sblock->pagev[0]->have_csum;
struct scrub_recover *recover;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
u64 sublen;
u64 mapped_length;
u64 stripe_offset;
@@ -1288,7 +1287,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
while (length > 0) {
sublen = min_t(u64, length, fs_info->sectorsize);
mapped_length = sublen;
- bbio = NULL;
+ bioc = NULL;
/*
* With a length of sectorsize, each returned stripe represents
@@ -1296,27 +1295,27 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
*/
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &mapped_length, &bbio);
- if (ret || !bbio || mapped_length < sublen) {
- btrfs_put_bbio(bbio);
+ logical, &mapped_length, &bioc);
+ if (ret || !bioc || mapped_length < sublen) {
+ btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
return -EIO;
}
recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
if (!recover) {
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
return -ENOMEM;
}
refcount_set(&recover->refs, 1);
- recover->bbio = bbio;
+ recover->bioc = bioc;
recover->map_length = mapped_length;
BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
- nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
+ nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
@@ -1348,17 +1347,17 @@ leave_nomem:
sctx->fs_info->csum_size);
scrub_stripe_index_and_offset(logical,
- bbio->map_type,
- bbio->raid_map,
+ bioc->map_type,
+ bioc->raid_map,
mapped_length,
- bbio->num_stripes -
- bbio->num_tgtdevs,
+ bioc->num_stripes -
+ bioc->num_tgtdevs,
mirror_index,
&stripe_index,
&stripe_offset);
- spage->physical = bbio->stripes[stripe_index].physical +
+ spage->physical = bioc->stripes[stripe_index].physical +
stripe_offset;
- spage->dev = bbio->stripes[stripe_index].dev;
+ spage->dev = bioc->stripes[stripe_index].dev;
BUG_ON(page_index >= original_sblock->page_count);
spage->physical_for_dev_replace =
@@ -1401,7 +1400,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
bio->bi_end_io = scrub_bio_wait_endio;
mirror_num = spage->sblock->pagev[0]->mirror_num;
- ret = raid56_parity_recover(fs_info, bio, spage->recover->bbio,
+ ret = raid56_parity_recover(bio, spage->recover->bioc,
spage->recover->map_length,
mirror_num, 0);
if (ret)
@@ -1423,7 +1422,7 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
if (!first_page->dev->bdev)
goto out;
- bio = btrfs_io_bio_alloc(BIO_MAX_VECS);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
bio_set_dev(bio, first_page->dev->bdev);
for (page_num = 0; page_num < sblock->page_count; page_num++) {
@@ -1480,7 +1479,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
}
WARN_ON(!spage->page);
- bio = btrfs_io_bio_alloc(1);
+ bio = btrfs_bio_alloc(1);
bio_set_dev(bio, spage->dev->bdev);
bio_add_page(bio, spage->page, fs_info->sectorsize, 0);
@@ -1562,7 +1561,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
return -EIO;
}
- bio = btrfs_io_bio_alloc(1);
+ bio = btrfs_bio_alloc(1);
bio_set_dev(bio, spage_bad->dev->bdev);
bio->bi_iter.bi_sector = spage_bad->physical >> 9;
bio->bi_opf = REQ_OP_WRITE;
@@ -1676,7 +1675,7 @@ again:
sbio->dev = sctx->wr_tgtdev;
bio = sbio->bio;
if (!bio) {
- bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
+ bio = btrfs_bio_alloc(sctx->pages_per_wr_bio);
sbio->bio = bio;
}
@@ -2102,7 +2101,7 @@ again:
sbio->dev = spage->dev;
bio = sbio->bio;
if (!bio) {
- bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
+ bio = btrfs_bio_alloc(sctx->pages_per_rd_bio);
sbio->bio = bio;
}
@@ -2203,7 +2202,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
struct btrfs_fs_info *fs_info = sctx->fs_info;
u64 length = sblock->page_count * PAGE_SIZE;
u64 logical = sblock->pagev[0]->logical;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
struct bio *bio;
struct btrfs_raid_bio *rbio;
int ret;
@@ -2211,27 +2210,27 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &length, &bbio);
- if (ret || !bbio || !bbio->raid_map)
- goto bbio_out;
+ &length, &bioc);
+ if (ret || !bioc || !bioc->raid_map)
+ goto bioc_out;
if (WARN_ON(!sctx->is_dev_replace ||
- !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
+ !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
/*
* We shouldn't be scrubbing a missing device. Even for dev
* replace, we should only get here for RAID 5/6. We either
* managed to mount something with no mirrors remaining or
* there's a bug in scrub_remap_extent()/btrfs_map_block().
*/
- goto bbio_out;
+ goto bioc_out;
}
- bio = btrfs_io_bio_alloc(0);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
bio->bi_iter.bi_sector = logical >> 9;
bio->bi_private = sblock;
bio->bi_end_io = scrub_missing_raid56_end_io;
- rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
+ rbio = raid56_alloc_missing_rbio(bio, bioc, length);
if (!rbio)
goto rbio_out;
@@ -2249,9 +2248,9 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
rbio_out:
bio_put(bio);
-bbio_out:
+bioc_out:
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
@@ -2826,7 +2825,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct bio *bio;
struct btrfs_raid_bio *rbio;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
u64 length;
int ret;
@@ -2838,17 +2837,17 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
- &length, &bbio);
- if (ret || !bbio || !bbio->raid_map)
- goto bbio_out;
+ &length, &bioc);
+ if (ret || !bioc || !bioc->raid_map)
+ goto bioc_out;
- bio = btrfs_io_bio_alloc(0);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
bio->bi_iter.bi_sector = sparity->logic_start >> 9;
bio->bi_private = sparity;
bio->bi_end_io = scrub_parity_bio_endio;
- rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
- length, sparity->scrub_dev,
+ rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length,
+ sparity->scrub_dev,
sparity->dbitmap,
sparity->nsectors);
if (!rbio)
@@ -2860,9 +2859,9 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
rbio_out:
bio_put(bio);
-bbio_out:
+bioc_out:
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
spin_lock(&sctx->stat_lock);
@@ -2901,7 +2900,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_root *csum_root = fs_info->csum_root;
struct btrfs_extent_item *extent;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
u64 flags;
int ret;
int slot;
@@ -3044,22 +3043,22 @@ again:
extent_len);
mapped_length = extent_len;
- bbio = NULL;
+ bioc = NULL;
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
- extent_logical, &mapped_length, &bbio,
+ extent_logical, &mapped_length, &bioc,
0);
if (!ret) {
- if (!bbio || mapped_length < extent_len)
+ if (!bioc || mapped_length < extent_len)
ret = -EIO;
}
if (ret) {
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
goto out;
}
- extent_physical = bbio->stripes[0].physical;
- extent_mirror_num = bbio->mirror_num;
- extent_dev = bbio->stripes[0].dev;
- btrfs_put_bbio(bbio);
+ extent_physical = bioc->stripes[0].physical;
+ extent_mirror_num = bioc->mirror_num;
+ extent_dev = bioc->stripes[0].dev;
+ btrfs_put_bioc(bioc);
ret = btrfs_lookup_csums_range(csum_root,
extent_logical,
@@ -3956,7 +3955,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
int ret;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
	/* Seed devices of a new filesystem have their own generation. */
@@ -4068,6 +4067,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace)
{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
struct scrub_ctx *sctx;
int ret;
struct btrfs_device *dev;
@@ -4115,7 +4115,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
goto out_free_ctx;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -4288,11 +4288,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_scrub_progress *progress)
{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
struct btrfs_device *dev;
struct scrub_ctx *sctx = NULL;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (dev)
sctx = dev->scrub_ctx;
if (sctx)
@@ -4309,20 +4310,20 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
int *extent_mirror_num)
{
u64 mapped_length;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
int ret;
mapped_length = extent_len;
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
- &mapped_length, &bbio, 0);
- if (ret || !bbio || mapped_length < extent_len ||
- !bbio->stripes[0].dev->bdev) {
- btrfs_put_bbio(bbio);
+ &mapped_length, &bioc, 0);
+ if (ret || !bioc || mapped_length < extent_len ||
+ !bioc->stripes[0].dev->bdev) {
+ btrfs_put_bioc(bioc);
return;
}
- *extent_physical = bbio->stripes[0].physical;
- *extent_mirror_num = bbio->mirror_num;
- *extent_dev = bbio->stripes[0].dev;
- btrfs_put_bbio(bbio);
+ *extent_physical = bioc->stripes[0].physical;
+ *extent_mirror_num = bioc->mirror_num;
+ *extent_dev = bioc->stripes[0].dev;
+ btrfs_put_bioc(bioc);
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 72f9b865e847..040324d71118 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -84,6 +84,8 @@ struct send_ctx {
u64 total_send_size;
u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
+ /* Protocol version compatibility requested */
+ u32 proto;
struct btrfs_root *send_root;
struct btrfs_root *parent_root;
@@ -312,6 +314,16 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx,
sctx->parent_root->root_key.objectid : 0));
}
+__maybe_unused
+static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
+{
+ switch (sctx->proto) {
+ case 1: return cmd < __BTRFS_SEND_C_MAX_V1;
+ case 2: return cmd < __BTRFS_SEND_C_MAX_V2;
+ default: return false;
+ }
+}
+
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
static struct waiting_dir_move *
@@ -2720,19 +2732,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx)
if (S_ISDIR(sctx->cur_inode_mode)) {
ret = did_create_dir(sctx, sctx->cur_ino);
if (ret < 0)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
+ return ret;
+ else if (ret > 0)
+ return 0;
}
- ret = send_create_inode(sctx, sctx->cur_ino);
- if (ret < 0)
- goto out;
-
-out:
- return ret;
+ return send_create_inode(sctx, sctx->cur_ino);
}
struct recorded_ref {
@@ -7276,6 +7281,17 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
sctx->flags = arg->flags;
+ if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
+ if (arg->version > BTRFS_SEND_STREAM_VERSION) {
+ ret = -EPROTO;
+ goto out;
+ }
+ /* Zero means "use the highest version" */
+ sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
+ } else {
+ sctx->proto = 1;
+ }
+
sctx->send_filp = fget(arg->send_fd);
if (!sctx->send_filp) {
ret = -EBADF;
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index de91488b7cd0..23bcefc84e49 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -48,6 +48,7 @@ struct btrfs_tlv_header {
enum btrfs_send_cmd {
BTRFS_SEND_C_UNSPEC,
+ /* Version 1 */
BTRFS_SEND_C_SUBVOL,
BTRFS_SEND_C_SNAPSHOT,
@@ -76,6 +77,12 @@ enum btrfs_send_cmd {
BTRFS_SEND_C_END,
BTRFS_SEND_C_UPDATE_EXTENT,
+ __BTRFS_SEND_C_MAX_V1,
+
+ /* Version 2 */
+ __BTRFS_SEND_C_MAX_V2,
+
+ /* End */
__BTRFS_SEND_C_MAX,
};
#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
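The __BTRFS_SEND_C_MAX_V1/_V2 sentinels added above give proto_cmd_ok() a per-version upper bound for valid commands. A minimal sketch of the same gating pattern; the demo_* names are stand-ins, not kernel symbols:

#include <stdbool.h>

enum demo_send_cmd {
	DEMO_C_UNSPEC,
	/* ... version 1 commands ... */
	DEMO_C_UPDATE_EXTENT,
	__DEMO_C_MAX_V1,
	/* ... version 2 commands ... */
	__DEMO_C_MAX_V2,
	__DEMO_C_MAX,
};

struct demo_send_ctx {
	unsigned int proto;	/* negotiated stream version: 1 or 2 */
};

/* A command is valid only if it sits below its version's sentinel. */
static bool demo_proto_cmd_ok(const struct demo_send_ctx *sctx, int cmd)
{
	switch (sctx->proto) {
	case 1: return cmd < __DEMO_C_MAX_V1;
	case 2: return cmd < __DEMO_C_MAX_V2;
	default: return false;
	}
}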
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index aa5be0b24987..48d77f360a24 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -885,6 +885,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
{
struct reserve_ticket *ticket;
u64 tickets_id = space_info->tickets_id;
+ const bool aborted = BTRFS_FS_ERROR(fs_info);
trace_btrfs_fail_all_tickets(fs_info, space_info);
@@ -898,16 +899,19 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
- if (ticket->steal &&
+ if (!aborted && ticket->steal &&
steal_from_global_rsv(fs_info, space_info, ticket))
return true;
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
btrfs_info(fs_info, "failing ticket with %llu bytes",
ticket->bytes);
remove_ticket(space_info, ticket);
- ticket->error = -ENOSPC;
+ if (aborted)
+ ticket->error = -EIO;
+ else
+ ticket->error = -ENOSPC;
wake_up(&ticket->wait);
/*
@@ -916,7 +920,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
* here to see if we can make progress with the next ticket in
* the list.
*/
- btrfs_try_granting_tickets(fs_info, space_info);
+ if (!aborted)
+ btrfs_try_granting_tickets(fs_info, space_info);
}
return (tickets_id != space_info->tickets_id);
}
@@ -1172,6 +1177,10 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
spin_unlock(&space_info->lock);
return;
}
+
+ /* Something happened, fail everything and bail. */
+ if (BTRFS_FS_ERROR(fs_info))
+ goto aborted_fs;
last_tickets_id = space_info->tickets_id;
spin_unlock(&space_info->lock);
}
@@ -1202,9 +1211,20 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
} else {
flush_state = 0;
}
+
+ /* Something happened, fail everything and bail. */
+ if (BTRFS_FS_ERROR(fs_info))
+ goto aborted_fs;
+
}
spin_unlock(&space_info->lock);
}
+ return;
+
+aborted_fs:
+ maybe_fail_all_tickets(fs_info, space_info);
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
}
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index cb10e56ee31e..29bd8c7a7706 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -63,11 +63,41 @@
* This means a slightly higher tree locking latency.
*/
+void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize)
+{
+ unsigned int cur = 0;
+ unsigned int nr_bits;
+
+ ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize));
+
+ nr_bits = PAGE_SIZE / sectorsize;
+ subpage_info->bitmap_nr_bits = nr_bits;
+
+ subpage_info->uptodate_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->error_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->dirty_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->writeback_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->ordered_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->checked_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->total_nr_bits = cur;
+}
+
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page, enum btrfs_subpage_type type)
{
- struct btrfs_subpage *subpage = NULL;
- int ret;
+ struct btrfs_subpage *subpage;
/*
	 * We have cases like a dummy extent buffer page, which is not mapped
@@ -75,13 +105,15 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
*/
if (page->mapping)
ASSERT(PageLocked(page));
+
/* Either not subpage, or the page already has private attached */
if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
return 0;
- ret = btrfs_alloc_subpage(fs_info, &subpage, type);
- if (ret < 0)
- return ret;
+ subpage = btrfs_alloc_subpage(fs_info, type);
+ if (IS_ERR(subpage))
+ return PTR_ERR(subpage);
+
attach_page_private(page, subpage);
return 0;
}
@@ -100,24 +132,28 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
btrfs_free_subpage(subpage);
}
-int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- struct btrfs_subpage **ret,
- enum btrfs_subpage_type type)
+struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+ enum btrfs_subpage_type type)
{
- if (fs_info->sectorsize == PAGE_SIZE)
- return 0;
+ struct btrfs_subpage *ret;
+ unsigned int real_size;
+
+ ASSERT(fs_info->sectorsize < PAGE_SIZE);
+
+ real_size = struct_size(ret, bitmaps,
+ BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits));
+ ret = kzalloc(real_size, GFP_NOFS);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
- *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
- if (!*ret)
- return -ENOMEM;
- spin_lock_init(&(*ret)->lock);
+ spin_lock_init(&ret->lock);
if (type == BTRFS_SUBPAGE_METADATA) {
- atomic_set(&(*ret)->eb_refs, 0);
+ atomic_set(&ret->eb_refs, 0);
} else {
- atomic_set(&(*ret)->readers, 0);
- atomic_set(&(*ret)->writers, 0);
+ atomic_set(&ret->readers, 0);
+ atomic_set(&ret->writers, 0);
}
- return 0;
+ return ret;
}
void btrfs_free_subpage(struct btrfs_subpage *subpage)
@@ -222,8 +258,16 @@ static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
u32 orig_len = *len;
*start = max_t(u64, page_offset(page), orig_start);
- *len = min_t(u64, page_offset(page) + PAGE_SIZE,
- orig_start + orig_len) - *start;
+ /*
+ * For certain call sites like btrfs_drop_pages(), we may have pages
+	 * beyond the target range. In that case, just set @len to 0; the subpage
+	 * helpers can handle @len == 0 without any problem.
+ */
+ if (page_offset(page) >= orig_start + orig_len)
+ *len = 0;
+ else
+ *len = min_t(u64, page_offset(page) + PAGE_SIZE,
+ orig_start + orig_len) - *start;
}
void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
@@ -248,6 +292,16 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
btrfs_subpage_assert(fs_info, page, start, len);
+ /*
+	 * We have call sites passing @locked_page into
+ * extent_clear_unlock_delalloc() for compression path.
+ *
+ * This @locked_page is locked by plain lock_page(), thus its
+ * subpage::writers is 0. Handle them in a special way.
+ */
+ if (atomic_read(&subpage->writers) == 0)
+ return true;
+
ASSERT(atomic_read(&subpage->writers) >= nbits);
return atomic_sub_and_test(nbits, &subpage->writers);
}
@@ -289,37 +343,59 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
unlock_page(page);
}
-/*
- * Convert the [start, start + len) range into a u16 bitmap
- *
- * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0.
- */
-static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+static bool bitmap_test_range_all_set(unsigned long *addr, unsigned int start,
+ unsigned int nbits)
{
- const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
- const int nbits = len >> fs_info->sectorsize_bits;
+ unsigned int found_zero;
- btrfs_subpage_assert(fs_info, page, start, len);
+ found_zero = find_next_zero_bit(addr, start + nbits, start);
+ if (found_zero == start + nbits)
+ return true;
+ return false;
+}
- /*
- * Here nbits can be 16, thus can go beyond u16 range. We make the
- * first left shift to be calculate in unsigned long (at least u32),
- * then truncate the result to u16.
- */
- return (u16)(((1UL << nbits) - 1) << bit_start);
+static bool bitmap_test_range_all_zero(unsigned long *addr, unsigned int start,
+ unsigned int nbits)
+{
+ unsigned int found_set;
+
+ found_set = find_next_bit(addr, start + nbits, start);
+ if (found_set == start + nbits)
+ return true;
+ return false;
}
+#define subpage_calc_start_bit(fs_info, page, name, start, len) \
+({ \
+ unsigned int start_bit; \
+ \
+ btrfs_subpage_assert(fs_info, page, start, len); \
+ start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
+ start_bit += fs_info->subpage_info->name##_offset; \
+ start_bit; \
+})
+
+#define subpage_test_bitmap_all_set(fs_info, subpage, name) \
+ bitmap_test_range_all_set(subpage->bitmaps, \
+ fs_info->subpage_info->name##_offset, \
+ fs_info->subpage_info->bitmap_nr_bits)
+
+#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \
+ bitmap_test_range_all_zero(subpage->bitmaps, \
+ fs_info->subpage_info->name##_offset, \
+ fs_info->subpage_info->bitmap_nr_bits)
+
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->uptodate_bitmap |= tmp;
- if (subpage->uptodate_bitmap == U16_MAX)
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate))
SetPageUptodate(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -328,11 +404,12 @@ void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->uptodate_bitmap &= ~tmp;
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
ClearPageUptodate(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -341,11 +418,12 @@ void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ error, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->error_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
SetPageError(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -354,12 +432,13 @@ void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ error, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->error_bitmap &= ~tmp;
- if (subpage->error_bitmap == 0)
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, error))
ClearPageError(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -368,11 +447,12 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ dirty, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->dirty_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
spin_unlock_irqrestore(&subpage->lock, flags);
set_page_dirty(page);
}
@@ -391,13 +471,14 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ dirty, start, len);
unsigned long flags;
bool last = false;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->dirty_bitmap &= ~tmp;
- if (subpage->dirty_bitmap == 0)
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty))
last = true;
spin_unlock_irqrestore(&subpage->lock, flags);
return last;
@@ -417,11 +498,12 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->writeback_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
set_page_writeback(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -430,12 +512,13 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->writeback_bitmap &= ~tmp;
- if (subpage->writeback_bitmap == 0) {
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) {
ASSERT(PageWriteback(page));
end_page_writeback(page);
}
@@ -446,11 +529,12 @@ void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->ordered_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
SetPageOrdered(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -459,15 +543,46 @@ void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->ordered_bitmap &= ~tmp;
- if (subpage->ordered_bitmap == 0)
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered))
ClearPageOrdered(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
+
+void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ checked, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_set(fs_info, subpage, checked))
+ SetPageChecked(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ checked, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ ClearPageChecked(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
/*
* Unlike set/clear which is dependent on each page status, for test all bits
* are tested in the same way.
@@ -477,12 +592,14 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \
+ name, start, len); \
unsigned long flags; \
bool ret; \
\
spin_lock_irqsave(&subpage->lock, flags); \
- ret = ((subpage->name##_bitmap & tmp) == tmp); \
+ ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit, \
+ len >> fs_info->sectorsize_bits); \
spin_unlock_irqrestore(&subpage->lock, flags); \
return ret; \
}
@@ -491,6 +608,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
/*
* Note that, in selftests (extent-io-tests), we can have empty fs_info passed
@@ -561,6 +679,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
PageWriteback);
IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
PageOrdered);
+IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked);
/*
* Make sure not only the page dirty bit is cleared, but also subpage dirty bit
@@ -579,5 +698,48 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
return;
ASSERT(PagePrivate(page) && page->private);
- ASSERT(subpage->dirty_bitmap == 0);
+ ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty));
+}
+
+/*
+ * Handle different locked pages with different page sizes:
+ *
+ * - Page locked by plain lock_page()
+ * It should not have any subpage::writers count.
+ * Can be unlocked by unlock_page().
+ * This is the most common locked page for __extent_writepage() called
+ * inside extent_write_cache_pages() or extent_write_full_page().
+ * Rarer cases include the @locked_page from extent_write_locked_range().
+ *
+ * - Page locked by lock_delalloc_pages()
+ * There is only one caller, all pages except @locked_page for
+ * extent_write_locked_range().
+ * In this case, we have to call the subpage helper to handle it.
+ */
+void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
+ u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage;
+
+ ASSERT(PageLocked(page));
+ /* For regular page size case, we just unlock the page */
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return unlock_page(page);
+
+ ASSERT(PagePrivate(page) && page->private);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ /*
+	 * For the subpage case, there are two types of locked page: with or
+	 * without a writers count.
+ *
+ * Since we own the page lock, no one else could touch subpage::writers
+ * and we are safe to do several atomic operations without spinlock.
+ */
+	if (atomic_read(&subpage->writers) == 0)
+ /* No writers, locked by plain lock_page() */
+ return unlock_page(page);
+
+ /* Have writers, use proper subpage helper to end it */
+ btrfs_page_end_writer_lock(fs_info, page, start, len);
}
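bitmap_test_range_all_set() and bitmap_test_range_all_zero() above answer "is every bit in [start, start + nbits) set (or clear)?" by checking that find_next_zero_bit()/find_next_bit() finds nothing before the end of the range. A userspace sketch of the same check with a plain loop in place of the kernel bit-search helpers:

#include <stdbool.h>
#include <limits.h>

#define DEMO_BITS_PER_LONG	(sizeof(unsigned long) * CHAR_BIT)

/* True if every bit in [start, start + nbits) of @addr is set. */
static bool demo_range_all_set(const unsigned long *addr, unsigned int start,
			       unsigned int nbits)
{
	for (unsigned int i = start; i < start + nbits; i++) {
		if (!(addr[i / DEMO_BITS_PER_LONG] &
		      (1UL << (i % DEMO_BITS_PER_LONG))))
			return false;	/* found a clear bit in the range */
	}
	return true;
}

The all-zero variant is the same loop with the test inverted.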
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 0120948f37a1..7accb5c40d33 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -6,10 +6,38 @@
#include <linux/spinlock.h>
/*
- * Maximum page size we support is 64K, minimum sector size is 4K, u16 bitmap
- * is sufficient. Regular bitmap_* is not used due to size reasons.
+ * Extra info for subpage bitmap.
+ *
+ * For subpage we pack all uptodate/error/dirty/writeback/ordered bitmaps into
+ * one larger bitmap.
+ *
+ * This structure records how they are organized in the bitmap:
+ *
+ * /- uptodate_offset /- error_offset /- dirty_offset
+ * | | |
+ * v v v
+ * |u|u|u|u|........|u|u|e|e|.......|e|e| ... |o|o|
+ * |<- bitmap_nr_bits ->|
+ * |<--------------- total_nr_bits ---------------->|
*/
-#define BTRFS_SUBPAGE_BITMAP_SIZE 16
+struct btrfs_subpage_info {
+ /* Number of bits for each bitmap */
+ unsigned int bitmap_nr_bits;
+
+ /* Total number of bits for the whole bitmap */
+ unsigned int total_nr_bits;
+
+ /*
+ * *_start indicates where the bitmap starts, the length is always
+ * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize.
+ */
+ unsigned int uptodate_offset;
+ unsigned int error_offset;
+ unsigned int dirty_offset;
+ unsigned int writeback_offset;
+ unsigned int ordered_offset;
+ unsigned int checked_offset;
+};
/*
* Structure to trace status of each sector inside a page, attached to
@@ -18,10 +46,6 @@
struct btrfs_subpage {
/* Common members for both data and metadata pages */
spinlock_t lock;
- u16 uptodate_bitmap;
- u16 error_bitmap;
- u16 dirty_bitmap;
- u16 writeback_bitmap;
/*
* Both data and metadata needs to track how many readers are for the
* page.
@@ -38,14 +62,11 @@ struct btrfs_subpage {
* manages whether the subpage can be detached.
*/
atomic_t eb_refs;
- /* Structures only used by data */
- struct {
- atomic_t writers;
- /* Tracke pending ordered extent in this sector */
- u16 ordered_bitmap;
- };
+ /* Structures only used by data */
+ atomic_t writers;
};
+ unsigned long bitmaps[];
};
enum btrfs_subpage_type {
@@ -53,15 +74,15 @@ enum btrfs_subpage_type {
BTRFS_SUBPAGE_DATA,
};
+void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page, enum btrfs_subpage_type type);
void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page);
/* Allocate additional data where page represents more than one sector */
-int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- struct btrfs_subpage **ret,
- enum btrfs_subpage_type type);
+struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+ enum btrfs_subpage_type type);
void btrfs_free_subpage(struct btrfs_subpage *subpage);
void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
@@ -122,11 +143,14 @@ DECLARE_BTRFS_SUBPAGE_OPS(error);
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
DECLARE_BTRFS_SUBPAGE_OPS(writeback);
DECLARE_BTRFS_SUBPAGE_OPS(ordered);
+DECLARE_BTRFS_SUBPAGE_OPS(checked);
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len);
void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
struct page *page);
+void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
+ u64 start, u32 len);
#endif
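With the per-type u16 bitmaps folded into one variable-length bitmap, the bit for a given sector and status type lives at that type's offset plus the sector index within the page, which is what the subpage_calc_start_bit() macro in subpage.c computes. A userspace sketch of that addressing, assuming a 64K page and 4K sectors (16 bits per type); the demo_* names are illustrative:

#include <stdio.h>

#define DEMO_SECTORSIZE_BITS	12	/* 4K sectors */
#define DEMO_PAGE_SHIFT		16	/* 64K page */
#define DEMO_BITMAP_NR_BITS	(1U << (DEMO_PAGE_SHIFT - DEMO_SECTORSIZE_BITS))

enum demo_bitmap_type { DEMO_UPTODATE, DEMO_ERROR, DEMO_DIRTY };

/* Per-type offset into the combined bitmap, plus the sector index. */
static unsigned int demo_calc_start_bit(enum demo_bitmap_type type,
					unsigned int offset_in_page)
{
	unsigned int type_offset = type * DEMO_BITMAP_NR_BITS;

	return type_offset + (offset_in_page >> DEMO_SECTORSIZE_BITS);
}

int main(void)
{
	/* Dirty-state bit for the sector starting 16K into the page: 2*16 + 4 = 36 */
	printf("dirty bit index: %u\n", demo_calc_start_bit(DEMO_DIRTY, 16 * 1024));
	return 0;
}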
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 537d90bf5d84..a1c54a2c787c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1705,7 +1705,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
goto error_close_devices;
}
- bdev = fs_devices->latest_bdev;
+ bdev = fs_devices->latest_dev->bdev;
s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
fs_info);
if (IS_ERR(s)) {
@@ -2006,7 +2006,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto restore;
} else {
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
btrfs_err(fs_info,
"Remounting read-write after error is not allowed");
ret = -EINVAL;
@@ -2463,30 +2463,16 @@ static int btrfs_unfreeze(struct super_block *sb)
static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
{
struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
- struct btrfs_device *dev, *first_dev = NULL;
/*
- * Lightweight locking of the devices. We should not need
- * device_list_mutex here as we only read the device data and the list
- * is protected by RCU. Even if a device is deleted during the list
- * traversals, we'll get valid data, the freeing callback will wait at
- * least until the rcu_read_unlock.
+	 * There should always be a valid pointer in latest_dev; it may be stale
+	 * for a short moment while the device is being deleted, but it remains
+	 * valid until the end of the RCU grace period.
*/
rcu_read_lock();
- list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
- continue;
- if (!dev->name)
- continue;
- if (!first_dev || dev->devid < first_dev->devid)
- first_dev = dev;
- }
-
- if (first_dev)
- seq_escape(m, rcu_str_deref(first_dev->name), " \t\n\\");
- else
- WARN_ON(1);
+ seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\");
rcu_read_unlock();
+
return 0;
}
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 25a6f587852b..f9eff3b0f77c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -177,7 +177,7 @@ static ssize_t btrfs_feature_attr_show(struct kobject *kobj,
} else
val = can_modify_feature(fa);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
@@ -330,7 +330,7 @@ static const struct attribute_group btrfs_feature_attr_group = {
static ssize_t rmdir_subvol_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return scnprintf(buf, PAGE_SIZE, "0\n");
+ return sysfs_emit(buf, "0\n");
}
BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show);
@@ -345,12 +345,12 @@ static ssize_t supported_checksums_show(struct kobject *kobj,
* This "trick" only works as long as 'enum btrfs_csum_type' has
* no holes in it
*/
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
- (i == 0 ? "" : " "), btrfs_super_csum_name(i));
+ ret += sysfs_emit_at(buf, ret, "%s%s", (i == 0 ? "" : " "),
+ btrfs_super_csum_name(i));
}
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
@@ -358,7 +358,7 @@ BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
static ssize_t send_stream_version_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION);
+ return sysfs_emit(buf, "%d\n", BTRFS_SEND_STREAM_VERSION);
}
BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
@@ -378,9 +378,8 @@ static ssize_t supported_rescue_options_show(struct kobject *kobj,
int i;
for (i = 0; i < ARRAY_SIZE(rescue_opts); i++)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
- (i ? " " : ""), rescue_opts[i]);
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ ret += sysfs_emit_at(buf, ret, "%s%s", (i ? " " : ""), rescue_opts[i]);
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
BTRFS_ATTR(static_feature, supported_rescue_options,
@@ -394,10 +393,10 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
/* 4K sector size is also supported with 64K page size */
if (PAGE_SIZE == SZ_64K)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%u ", SZ_4K);
+ ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
/* Only sectorsize == PAGE_SIZE is now supported */
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE);
+ ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);
return ret;
}
@@ -437,7 +436,7 @@ static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%lld\n",
+ return sysfs_emit(buf, "%lld\n",
atomic64_read(&fs_info->discard_ctl.discardable_bytes));
}
BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show);
@@ -448,7 +447,7 @@ static ssize_t btrfs_discardable_extents_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%d\n",
+ return sysfs_emit(buf, "%d\n",
atomic_read(&fs_info->discard_ctl.discardable_extents));
}
BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show);
@@ -459,8 +458,8 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- fs_info->discard_ctl.discard_bitmap_bytes);
+ return sysfs_emit(buf, "%llu\n",
+ fs_info->discard_ctl.discard_bitmap_bytes);
}
BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show);
@@ -470,7 +469,7 @@ static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%lld\n",
+ return sysfs_emit(buf, "%lld\n",
atomic64_read(&fs_info->discard_ctl.discard_bytes_saved));
}
BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show);
@@ -481,8 +480,8 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- fs_info->discard_ctl.discard_extent_bytes);
+ return sysfs_emit(buf, "%llu\n",
+ fs_info->discard_ctl.discard_extent_bytes);
}
BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show);
@@ -492,8 +491,8 @@ static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n",
- READ_ONCE(fs_info->discard_ctl.iops_limit));
+ return sysfs_emit(buf, "%u\n",
+ READ_ONCE(fs_info->discard_ctl.iops_limit));
}
static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj,
@@ -523,8 +522,8 @@ static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n",
- READ_ONCE(fs_info->discard_ctl.kbps_limit));
+ return sysfs_emit(buf, "%u\n",
+ READ_ONCE(fs_info->discard_ctl.kbps_limit));
}
static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj,
@@ -553,8 +552,8 @@ static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- READ_ONCE(fs_info->discard_ctl.max_discard_size));
+ return sysfs_emit(buf, "%llu\n",
+ READ_ONCE(fs_info->discard_ctl.max_discard_size));
}
static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj,
@@ -627,7 +626,7 @@ static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
val = *value_ptr;
if (lock)
spin_unlock(lock);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
+ return sysfs_emit(buf, "%llu\n", val);
}
static ssize_t global_rsv_size_show(struct kobject *kobj,
@@ -673,7 +672,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
val += block_group->used;
}
up_read(&sinfo->groups_sem);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
+ return sysfs_emit(buf, "%llu\n", val);
}
/*
@@ -771,7 +770,7 @@ static ssize_t btrfs_label_show(struct kobject *kobj,
ssize_t ret;
spin_lock(&fs_info->super_lock);
- ret = scnprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
+ ret = sysfs_emit(buf, label[0] ? "%s\n" : "%s", label);
spin_unlock(&fs_info->super_lock);
return ret;
@@ -819,7 +818,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize);
}
BTRFS_ATTR(, nodesize, btrfs_nodesize_show);
@@ -829,8 +828,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n",
- fs_info->super_copy->sectorsize);
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
}
BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
@@ -840,7 +838,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
}
BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);
@@ -852,7 +850,7 @@ static ssize_t quota_override_show(struct kobject *kobj,
int quota_override;
quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
- return scnprintf(buf, PAGE_SIZE, "%d\n", quota_override);
+ return sysfs_emit(buf, "%d\n", quota_override);
}
static ssize_t quota_override_store(struct kobject *kobj,
@@ -890,8 +888,7 @@ static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%pU\n",
- fs_info->fs_devices->metadata_uuid);
+ return sysfs_emit(buf, "%pU\n", fs_info->fs_devices->metadata_uuid);
}
BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show);
@@ -902,9 +899,9 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj,
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
u16 csum_type = btrfs_super_csum_type(fs_info->super_copy);
- return scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
- btrfs_super_csum_name(csum_type),
- crypto_shash_driver_name(fs_info->csum_shash));
+ return sysfs_emit(buf, "%s (%s)\n",
+ btrfs_super_csum_name(csum_type),
+ crypto_shash_driver_name(fs_info->csum_shash));
}
BTRFS_ATTR(, checksum, btrfs_checksum_show);
@@ -941,7 +938,7 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
str = "UNKNOWN\n";
break;
}
- return scnprintf(buf, PAGE_SIZE, "%s", str);
+ return sysfs_emit(buf, "%s", str);
}
BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
@@ -950,7 +947,7 @@ static ssize_t btrfs_generation_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->generation);
+ return sysfs_emit(buf, "%llu\n", fs_info->generation);
}
BTRFS_ATTR(, generation, btrfs_generation_show);
@@ -1028,8 +1025,7 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
ssize_t ret;
- ret = scnprintf(buf, PAGE_SIZE, "%d\n",
- READ_ONCE(fs_info->bg_reclaim_threshold));
+ ret = sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold));
return ret;
}
@@ -1471,7 +1467,7 @@ static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show);
@@ -1484,7 +1480,7 @@ static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show);
@@ -1498,7 +1494,7 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show);
@@ -1509,8 +1505,7 @@ static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj,
struct btrfs_device *device = container_of(kobj, struct btrfs_device,
devid_kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- READ_ONCE(device->scrub_speed_max));
+ return sysfs_emit(buf, "%llu\n", READ_ONCE(device->scrub_speed_max));
}
static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj,
@@ -1538,7 +1533,7 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show);
@@ -1549,14 +1544,14 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
devid_kobj);
if (!device->dev_stats_valid)
- return scnprintf(buf, PAGE_SIZE, "invalid\n");
+ return sysfs_emit(buf, "invalid\n");
/*
* Print all at once so we get a snapshot of all values from the same
* time. Keep them in sync and in order of definition of
* btrfs_dev_stat_values.
*/
- return scnprintf(buf, PAGE_SIZE,
+ return sysfs_emit(buf,
"write_errs %d\n"
"read_errs %d\n"
"flush_errs %d\n"
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index df54cdfdc250..2a95f7224e18 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -60,7 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = 0;
- setup_items_for_insert(root, path, &key, &value_len, 1);
+ btrfs_setup_item_for_insert(root, path, &key, value_len);
item = btrfs_item_nr(0);
write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
value_len);
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 73e96d505f4f..c2e72e7a8ff0 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -112,7 +112,7 @@ static int test_find_delalloc(u32 sectorsize)
*/
set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL);
start = 0;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
@@ -143,7 +143,7 @@ static int test_find_delalloc(u32 sectorsize)
}
set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL);
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
@@ -177,14 +177,14 @@ static int test_find_delalloc(u32 sectorsize)
goto out_bits;
}
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (found) {
test_err("found range when we shouldn't have");
goto out_bits;
}
- if (end != (u64)-1) {
+ if (end != test_start + PAGE_SIZE - 1) {
test_err("did not return the proper end offset");
goto out_bits;
}
@@ -198,7 +198,7 @@ static int test_find_delalloc(u32 sectorsize)
*/
set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL);
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
@@ -233,7 +233,7 @@ static int test_find_delalloc(u32 sectorsize)
/* We unlocked it in the previous test */
lock_page(locked_page);
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
/*
* Currently if we fail to find dirty pages in the delalloc range we
* will adjust max_bytes down to PAGE_SIZE and then re-search. If
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index c9874b12d337..cac89c388131 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -33,7 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = start;
- setup_items_for_insert(root, &path, &key, &value_len, 1);
+ btrfs_setup_item_for_insert(root, &path, &key, value_len);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi, 1);
btrfs_set_file_extent_type(leaf, fi, type);
@@ -63,7 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- setup_items_for_insert(root, &path, &key, &value_len, 1);
+ btrfs_setup_item_for_insert(root, &path, &key, value_len);
}
/*
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 14b9fdc8aaa9..1c3a1189c0bd 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -283,7 +283,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->trans_lock);
loop:
/* The file system has been taken offline. No new transactions. */
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
spin_unlock(&fs_info->trans_lock);
return -EROFS;
}
@@ -331,7 +331,7 @@ loop:
*/
kfree(cur_trans);
goto loop;
- } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ } else if (BTRFS_FS_ERROR(fs_info)) {
spin_unlock(&fs_info->trans_lock);
kfree(cur_trans);
return -EROFS;
@@ -579,7 +579,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
bool do_chunk_alloc = false;
int ret;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return ERR_PTR(-EROFS);
if (current->journal_info) {
@@ -991,8 +991,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (throttle)
btrfs_run_delayed_iputs(info);
- if (TRANS_ABORTED(trans) ||
- test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {
+ if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) {
wake_up_process(info->transaction_kthread);
if (TRANS_ABORTED(trans))
err = trans->aborted;
@@ -2155,7 +2154,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* abort to prevent writing a new superblock that reflects a
* corrupt state (pointing to trees with unwritten nodes/leafs).
*/
- if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
ret = -EROFS;
goto cleanup_transaction;
}
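These transaction.c hunks replace open-coded test_bit() checks on fs_state with the BTRFS_FS_ERROR() helper, so the "is the filesystem dead?" test reads the same everywhere; note the last hunk also widens a TRANS_ABORTED-only check to the common helper. The helper's definition is not part of this diff, so the sketch below is an assumption about its shape (which bit it tests, and the unlikely() hint), not the actual macro:

/* Assumed shape of the helper used above; not copied from the patch. */
#define BTRFS_FS_ERROR(fs_info)						\
	(unlikely(test_bit(BTRFS_FS_STATE_ERROR, &(fs_info)->fs_state)))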
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index b415c5ec03ea..8ab33caf016f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -94,7 +94,7 @@ enum {
};
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
int inode_only,
struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
@@ -207,7 +207,7 @@ again:
}
atomic_inc(&root->log_writers);
- if (ctx && !ctx->logging_new_name) {
+ if (!ctx->logging_new_name) {
int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
@@ -368,25 +368,11 @@ static int process_one_buffer(struct btrfs_root *log,
return ret;
}
-/*
- * Item overwrite used by replay and tree logging. eb, slot and key all refer
- * to the src data we are copying out.
- *
- * root is the tree we are copying into, and path is a scratch
- * path for use in this function (it should be released on entry and
- * will be released on exit).
- *
- * If the key is already in the destination tree the existing item is
- * overwritten. If the existing item isn't big enough, it is extended.
- * If it is too large, it is truncated.
- *
- * If the key isn't in the destination yet, a new item is inserted.
- */
-static noinline int overwrite_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static int do_overwrite_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
{
int ret;
u32 item_size;
@@ -403,10 +389,22 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
item_size = btrfs_item_size_nr(eb, slot);
src_ptr = btrfs_item_ptr_offset(eb, slot);
- /* look for the key in the destination tree */
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
- return ret;
+ /* Our caller must have done a search for the key for us. */
+ ASSERT(path->nodes[0] != NULL);
+
+ /*
+ * And the slot must point to the exact key or the slot where the key
+	 * should be inserted (the first item with a key greater than 'key').
+ */
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ struct btrfs_key found_key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+ ret = btrfs_comp_cpu_keys(&found_key, key);
+ ASSERT(ret >= 0);
+ } else {
+ ret = 1;
+ }
if (ret == 0) {
char *src_copy;
@@ -585,6 +583,36 @@ no_copy:
}
/*
+ * Item overwrite used by replay and tree logging. eb, slot and key all refer
+ * to the src data we are copying out.
+ *
+ * root is the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and
+ * will be released on exit).
+ *
+ * If the key is already in the destination tree the existing item is
+ * overwritten. If the existing item isn't big enough, it is extended.
+ * If it is too large, it is truncated.
+ *
+ * If the key isn't in the destination yet, a new item is inserted.
+ */
+static int overwrite_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ int ret;
+
+ /* Look for the key in the destination tree. */
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ return do_overwrite_item(trans, root, path, eb, slot, key);
+}
+
+/*
* simple helper to read an inode off the disk from a given root
* This can only be called for subvolume roots and not for the log
*/
@@ -761,7 +789,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ins.objectid, ins.offset, 0);
btrfs_init_data_ref(&ref,
root->root_key.objectid,
- key->objectid, offset);
+ key->objectid, offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret)
goto out;
@@ -893,11 +921,11 @@ out:
* item
*/
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_inode *dir,
struct btrfs_dir_item *di)
{
+ struct btrfs_root *root = dir->root;
struct inode *inode;
char *name;
int name_len;
@@ -926,7 +954,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
+ ret = btrfs_unlink_inode(trans, dir, BTRFS_I(inode), name,
name_len);
if (ret)
goto out;
@@ -1091,7 +1119,7 @@ again:
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
- ret = btrfs_unlink_inode(trans, root, dir, inode,
+ ret = btrfs_unlink_inode(trans, dir, inode,
victim_name, victim_name_len);
kfree(victim_name);
if (ret)
@@ -1162,7 +1190,7 @@ again:
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
- ret = btrfs_unlink_inode(trans, root,
+ ret = btrfs_unlink_inode(trans,
BTRFS_I(victim_parent),
inode,
victim_name,
@@ -1192,7 +1220,7 @@ next:
if (IS_ERR(di)) {
return PTR_ERR(di);
} else if (di) {
- ret = drop_one_dir_item(trans, root, path, dir, di);
+ ret = drop_one_dir_item(trans, path, dir, di);
if (ret)
return ret;
}
@@ -1204,7 +1232,7 @@ next:
if (IS_ERR(di)) {
return PTR_ERR(di);
} else if (di) {
- ret = drop_one_dir_item(trans, root, path, dir, di);
+ ret = drop_one_dir_item(trans, path, dir, di);
if (ret)
return ret;
}
@@ -1324,7 +1352,7 @@ again:
kfree(name);
goto out;
}
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
inode, name, namelen);
kfree(name);
iput(dir);
@@ -1385,10 +1413,11 @@ out:
return ret;
}
-static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+static int add_link(struct btrfs_trans_handle *trans,
struct inode *dir, struct inode *inode, const char *name,
int namelen, u64 ref_index)
{
+ struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_dir_item *dir_item;
struct btrfs_key key;
struct btrfs_path *path;
@@ -1422,7 +1451,7 @@ static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
ret = -ENOENT;
goto out;
}
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(other_inode),
name, namelen);
if (ret)
goto out;
@@ -1568,7 +1597,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
ret = btrfs_inode_ref_exists(inode, dir, key->type,
name, namelen);
if (ret > 0) {
- ret = btrfs_unlink_inode(trans, root,
+ ret = btrfs_unlink_inode(trans,
BTRFS_I(dir),
BTRFS_I(inode),
name, namelen);
@@ -1584,7 +1613,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
goto out;
/* insert our name */
- ret = add_link(trans, root, dir, inode, name, namelen,
+ ret = add_link(trans, dir, inode, name, namelen,
ref_index);
if (ret)
goto out;
@@ -2021,7 +2050,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
if (!exists)
goto out;
- ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
+ ret = drop_one_dir_item(trans, path, BTRFS_I(dir), dst_di);
if (ret)
goto out;
@@ -2251,13 +2280,13 @@ out:
* to is unlinked
*/
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
struct btrfs_path *log_path,
struct inode *dir,
struct btrfs_key *dir_key)
{
+ struct btrfs_root *root = BTRFS_I(dir)->root;
int ret;
struct extent_buffer *eb;
int slot;
@@ -2318,7 +2347,7 @@ again:
}
inc_nlink(inode);
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
BTRFS_I(inode), name, name_len);
if (!ret)
ret = btrfs_run_delayed_items(trans);
@@ -2500,7 +2529,9 @@ again:
else {
ret = find_dir_range(log, path, dirid, key_type,
&range_start, &range_end);
- if (ret != 0)
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
break;
}
@@ -2529,7 +2560,7 @@ again:
if (found_key.offset > range_end)
break;
- ret = check_item_in_log(trans, root, log, path,
+ ret = check_item_in_log(trans, log, path,
log_path, dir,
&found_key);
if (ret)
@@ -3037,9 +3068,6 @@ static void wait_for_writer(struct btrfs_root *root)
static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
- if (!ctx)
- return;
-
mutex_lock(&root->log_mutex);
list_del_init(&ctx->list);
mutex_unlock(&root->log_mutex);
@@ -3328,7 +3356,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* writing the super here would result in transid mismatches. If there
* is an error here just bail.
*/
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
ret = -EIO;
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
@@ -3452,6 +3480,9 @@ static bool inode_logged(struct btrfs_trans_handle *trans,
if (inode->logged_trans == trans->transid)
return true;
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
+ return false;
+
/*
* The inode's logged_trans is always 0 when we load it (because it is
* not persisted in the inode item or elsewhere). So if it is 0, the
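The new early return lets inode_logged() answer cheaply when the root has never started a log tree in this transaction: if nothing under the root was logged, the inode cannot have been logged either, and the conservative "assume logged" fallback described in the comment above is never reached. A condensed sketch of the decision order, with that fallback reduced to a placeholder (the real function does more work there):

/* Simplified illustration of the fast paths; the tail is a placeholder. */
static bool inode_logged_sketch(struct btrfs_trans_handle *trans,
				struct btrfs_inode *inode)
{
	/* Logged in the current transaction for sure. */
	if (inode->logged_trans == trans->transid)
		return true;

	/* The root has no log tree, so nothing under it is logged. */
	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
		return false;

	/*
	 * logged_trans is 0 right after the inode is loaded, so here the
	 * answer is unknown; assume the worst case rather than miss a
	 * needed log update (placeholder for the real heuristics).
	 */
	return true;
}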
@@ -3490,10 +3521,10 @@ static bool inode_logged(struct btrfs_trans_handle *trans,
* This optimization allows us to avoid relogging the entire inode
* or the entire directory.
*/
-int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *dir, u64 index)
+void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *dir, u64 index)
{
struct btrfs_root *log;
struct btrfs_dir_item *di;
@@ -3503,11 +3534,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
u64 dir_ino = btrfs_ino(dir);
if (!inode_logged(trans, dir))
- return 0;
+ return;
ret = join_running_log_trans(root);
if (ret)
- return 0;
+ return;
mutex_lock(&dir->log_mutex);
@@ -3555,48 +3586,36 @@ fail:
btrfs_free_path(path);
out_unlock:
mutex_unlock(&dir->log_mutex);
- if (err == -ENOSPC) {
+ if (err < 0)
btrfs_set_log_full_commit(trans);
- err = 0;
- } else if (err < 0) {
- btrfs_abort_transaction(trans, err);
- }
-
btrfs_end_log_trans(root);
-
- return err;
}
/* see comments for btrfs_del_dir_entries_in_log */
-int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *inode, u64 dirid)
+void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *inode, u64 dirid)
{
struct btrfs_root *log;
u64 index;
int ret;
if (!inode_logged(trans, inode))
- return 0;
+ return;
ret = join_running_log_trans(root);
if (ret)
- return 0;
+ return;
log = root->log_root;
mutex_lock(&inode->log_mutex);
ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
dirid, &index);
mutex_unlock(&inode->log_mutex);
- if (ret == -ENOSPC) {
+ if (ret < 0 && ret != -ENOENT)
btrfs_set_log_full_commit(trans);
- ret = 0;
- } else if (ret < 0 && ret != -ENOENT)
- btrfs_abort_transaction(trans, ret);
btrfs_end_log_trans(root);
-
- return ret;
}
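With both deletion helpers now returning void, callers stop propagating errors from log maintenance; any real failure simply forces a full log commit via btrfs_set_log_full_commit(), and the benign -ENOENT from the inode-ref case is ignored. A hedged sketch of what a call site looks like after the change (the wrapper function and its surrounding context are illustrative, not taken from this patch):

/* Illustrative call site - only the two btrfs_del_* calls are real APIs. */
static void example_unlink_log_update(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_inode *dir,
				      struct btrfs_inode *inode,
				      const char *name, int name_len,
				      u64 index)
{
	/* No return values to check anymore. */
	btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
	btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
				   btrfs_ino(dir));
}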
/*
@@ -3632,31 +3651,231 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
return 0;
}
+static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct extent_buffer *src,
+ struct btrfs_path *dst_path,
+ int start_slot,
+ int count)
+{
+ char *ins_data = NULL;
+ struct btrfs_item_batch batch;
+ struct extent_buffer *dst;
+ unsigned long src_offset;
+ unsigned long dst_offset;
+ struct btrfs_key key;
+ u32 item_size;
+ int ret;
+ int i;
+
+ ASSERT(count > 0);
+ batch.nr = count;
+
+ if (count == 1) {
+ btrfs_item_key_to_cpu(src, &key, start_slot);
+ item_size = btrfs_item_size_nr(src, start_slot);
+ batch.keys = &key;
+ batch.data_sizes = &item_size;
+ batch.total_data_size = item_size;
+ } else {
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+
+ ins_data = kmalloc(count * sizeof(u32) +
+ count * sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
+
+ ins_sizes = (u32 *)ins_data;
+ ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ batch.total_data_size = 0;
+
+ for (i = 0; i < count; i++) {
+ const int slot = start_slot + i;
+
+ btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
+ ins_sizes[i] = btrfs_item_size_nr(src, slot);
+ batch.total_data_size += ins_sizes[i];
+ }
+ }
+
+ ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
+ if (ret)
+ goto out;
+
+ dst = dst_path->nodes[0];
+ /*
+ * Copy all the items in bulk, in a single copy operation. Item data is
+ * organized such that it's placed at the end of a leaf and from right
+ * to left. For example, the data for the second item ends at an offset
+ * that matches the offset where the data for the first item starts, the
+ * data for the third item ends at an offset that matches the offset
+ * where the data of the second item starts, and so on.
+ * Therefore our source and destination start offsets for copy match the
+ * offsets of the last items (highest slots).
+ */
+ dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
+ src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
+ copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
+ btrfs_release_path(dst_path);
+out:
+ kfree(ins_data);
+
+ return ret;
+}
+
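The single copy_extent_buffer() call in flush_dir_items_batch() leans on the leaf layout spelled out in its comment: item data grows from the end of the leaf toward the start, so the data of consecutive items forms one contiguous region whose lowest offset belongs to the last (highest-slot) item. A small stand-alone C program walking that arithmetic for three made-up item sizes, just to show why one copy of total_data_size bytes starting at the last item's offset covers the whole batch:

#include <stdio.h>

int main(void)
{
	/* Illustrative leaf: data area ends at offset 4096, three item sizes. */
	unsigned int leaf_end = 4096;
	unsigned int sizes[3] = { 100, 40, 60 };
	unsigned int off = leaf_end, total = 0;

	for (int i = 0; i < 3; i++) {
		off -= sizes[i];		/* data of item i starts here */
		total += sizes[i];
		printf("item %d: data offset %u, size %u\n", i, off, sizes[i]);
	}
	/* One copy of 'total' bytes starting at the last item's offset. */
	printf("copy %u bytes starting at offset %u\n", total, off);
	return 0;
}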
+static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+ int key_type,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ struct extent_buffer *src = path->nodes[0];
+ const int nritems = btrfs_header_nritems(src);
+ const u64 ino = btrfs_ino(inode);
+ const bool inode_logged_before = inode_logged(trans, inode);
+ u64 last_logged_key_offset;
+ bool last_found = false;
+ int batch_start = 0;
+ int batch_size = 0;
+ int i;
+
+ if (key_type == BTRFS_DIR_ITEM_KEY)
+ last_logged_key_offset = inode->last_dir_item_offset;
+ else
+ last_logged_key_offset = inode->last_dir_index_offset;
+
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_key key;
+ int ret;
+
+ btrfs_item_key_to_cpu(src, &key, i);
+
+ if (key.objectid != ino || key.type != key_type) {
+ last_found = true;
+ break;
+ }
+
+ ctx->last_dir_item_offset = key.offset;
+ /*
+ * We must make sure that when we log a directory entry, the
+ * corresponding inode, after log replay, has a matching link
+ * count. For example:
+ *
+ * touch foo
+ * mkdir mydir
+ * sync
+ * ln foo mydir/bar
+ * xfs_io -c "fsync" mydir
+ * <crash>
+ * <mount fs and log replay>
+ *
+ * Would result in a fsync log that when replayed, our file inode
+ * would have a link count of 1, but we get two directory entries
+ * pointing to the same inode. After removing one of the names,
+ * it would not be possible to remove the other name, which
+ * resulted always in stale file handle errors, and would not be
+ * possible to rmdir the parent directory, since its i_size could
+ * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
+ * resulting in -ENOTEMPTY errors.
+ */
+ if (!ctx->log_new_dentries) {
+ struct btrfs_dir_item *di;
+ struct btrfs_key di_key;
+
+ di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(src, di, &di_key);
+ if ((btrfs_dir_transid(src, di) == trans->transid ||
+ btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
+ di_key.type != BTRFS_ROOT_ITEM_KEY)
+ ctx->log_new_dentries = true;
+ }
+
+ if (!inode_logged_before)
+ goto add_to_batch;
+
+ /*
+ * If the inode was logged before and has logged dir items, we can skip
+ * checking whether any item with a key offset larger than the last one
+ * we logged is in the log tree, saving time and avoiding contention on
+ * the log tree.
+ */
+ if (key.offset > last_logged_key_offset)
+ goto add_to_batch;
+ /*
+ * Check if the key was already logged before. If not we can add
+ * it to a batch for bulk insertion.
+ */
+ ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ btrfs_release_path(dst_path);
+ goto add_to_batch;
+ }
+
+ /*
+ * Item exists in the log. Overwrite the item in the log if it
+ * has different content or do nothing if it has exactly the same
+ * content. And then flush the current batch if any - do it after
+ * overwriting the current item, otherwise we would deadlock, since we
+ * are holding a path for the existing item.
+ */
+ ret = do_overwrite_item(trans, log, dst_path, src, i, &key);
+ if (ret < 0)
+ return ret;
+
+ if (batch_size > 0) {
+ ret = flush_dir_items_batch(trans, log, src, dst_path,
+ batch_start, batch_size);
+ if (ret < 0)
+ return ret;
+ batch_size = 0;
+ }
+ continue;
+add_to_batch:
+ if (batch_size == 0)
+ batch_start = i;
+ batch_size++;
+ }
+
+ if (batch_size > 0) {
+ int ret;
+
+ ret = flush_dir_items_batch(trans, log, src, dst_path,
+ batch_start, batch_size);
+ if (ret < 0)
+ return ret;
+ }
+
+ return last_found ? 1 : 0;
+}
+
/*
* log all the items included in the current transaction for a given
* directory. This also creates the range items in the log tree required
* to replay anything deleted before the fsync
*/
static noinline int log_dir_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path, int key_type,
struct btrfs_log_ctx *ctx,
u64 min_offset, u64 *last_offset_ret)
{
struct btrfs_key min_key;
+ struct btrfs_root *root = inode->root;
struct btrfs_root *log = root->log_root;
- struct extent_buffer *src;
int err = 0;
int ret;
- int i;
- int nritems;
u64 first_offset = min_offset;
u64 last_offset = (u64)-1;
u64 ino = btrfs_ino(inode);
- log = root->log_root;
-
min_key.objectid = ino;
min_key.type = key_type;
min_key.offset = min_offset;
@@ -3730,62 +3949,14 @@ search:
* from our directory
*/
while (1) {
- struct btrfs_key tmp;
- src = path->nodes[0];
- nritems = btrfs_header_nritems(src);
- for (i = path->slots[0]; i < nritems; i++) {
- struct btrfs_dir_item *di;
-
- btrfs_item_key_to_cpu(src, &min_key, i);
-
- if (min_key.objectid != ino || min_key.type != key_type)
- goto done;
-
- if (need_resched()) {
- btrfs_release_path(path);
- cond_resched();
- goto search;
- }
-
- ret = overwrite_item(trans, log, dst_path, src, i,
- &min_key);
- if (ret) {
+ ret = process_dir_items_leaf(trans, inode, path, dst_path,
+ key_type, ctx);
+ if (ret != 0) {
+ if (ret < 0)
err = ret;
- goto done;
- }
-
- /*
- * We must make sure that when we log a directory entry,
- * the corresponding inode, after log replay, has a
- * matching link count. For example:
- *
- * touch foo
- * mkdir mydir
- * sync
- * ln foo mydir/bar
- * xfs_io -c "fsync" mydir
- * <crash>
- * <mount fs and log replay>
- *
- * Would result in a fsync log that when replayed, our
- * file inode would have a link count of 1, but we get
- * two directory entries pointing to the same inode.
- * After removing one of the names, it would not be
- * possible to remove the other name, which resulted
- * always in stale file handle errors, and would not
- * be possible to rmdir the parent directory, since
- * its i_size could never decrement to the value
- * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
- */
- di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
- btrfs_dir_item_key_to_cpu(src, di, &tmp);
- if (ctx &&
- (btrfs_dir_transid(src, di) == trans->transid ||
- btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
- tmp.type != BTRFS_ROOT_ITEM_KEY)
- ctx->log_new_dentries = true;
+ goto done;
}
- path->slots[0] = nritems;
+ path->slots[0] = btrfs_header_nritems(path->nodes[0]);
/*
* look ahead to the next item and see if it is also
@@ -3799,21 +3970,26 @@ search:
err = ret;
goto done;
}
- btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
- if (tmp.objectid != ino || tmp.type != key_type) {
+ btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
+ if (min_key.objectid != ino || min_key.type != key_type) {
last_offset = (u64)-1;
goto done;
}
if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
ret = overwrite_item(trans, log, dst_path,
path->nodes[0], path->slots[0],
- &tmp);
+ &min_key);
if (ret)
err = ret;
else
- last_offset = tmp.offset;
+ last_offset = min_key.offset;
goto done;
}
+ if (need_resched()) {
+ btrfs_release_path(path);
+ cond_resched();
+ goto search;
+ }
}
done:
btrfs_release_path(path);
@@ -3846,7 +4022,7 @@ done:
* key logged by this transaction.
*/
static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path,
struct btrfs_log_ctx *ctx)
@@ -3856,11 +4032,33 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
int ret;
int key_type = BTRFS_DIR_ITEM_KEY;
+ /*
+ * If this is the first time the inode is being logged in the current
+ * transaction, or it was logged before but was evicted and reloaded
+ * later, in which case its logged_trans is 0, reset the values of the
+ * last logged key offsets. Note that we don't use the helper function
+ * inode_logged() here - that is because it returns true after an inode
+ * eviction, assuming the worst case as it cannot know for sure if the
+ * inode was logged before. So we cannot skip key searches in the case
+ * the inode was evicted, because it may not have been logged in this
+ * transaction and may have been logged in a past transaction, so we
+ * need to reset the last dir item and index offsets to (u64)-1.
+ */
+ if (inode->logged_trans != trans->transid) {
+ inode->last_dir_item_offset = (u64)-1;
+ inode->last_dir_index_offset = (u64)-1;
+ }
again:
min_key = 0;
max_key = 0;
+ if (key_type == BTRFS_DIR_ITEM_KEY)
+ ctx->last_dir_item_offset = inode->last_dir_item_offset;
+ else
+ ctx->last_dir_item_offset = inode->last_dir_index_offset;
+
while (1) {
- ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
+ ret = log_dir_items(trans, inode, path, dst_path, key_type,
ctx, min_key, &max_key);
if (ret)
return ret;
@@ -3870,8 +4068,11 @@ again:
}
if (key_type == BTRFS_DIR_ITEM_KEY) {
+ inode->last_dir_item_offset = ctx->last_dir_item_offset;
key_type = BTRFS_DIR_INDEX_KEY;
goto again;
+ } else {
+ inode->last_dir_index_offset = ctx->last_dir_item_offset;
}
return 0;
}
@@ -3882,17 +4083,21 @@ again:
* This cannot be run for file data extents because it does not
* free the extents they point to.
*/
-static int drop_objectid_items(struct btrfs_trans_handle *trans,
+static int drop_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
- u64 objectid, int max_key_type)
+ struct btrfs_inode *inode,
+ int max_key_type)
{
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
int start_slot;
- key.objectid = objectid;
+ if (!inode_logged(trans, inode))
+ return 0;
+
+ key.objectid = btrfs_ino(inode);
key.type = max_key_type;
key.offset = (u64)-1;
@@ -3909,7 +4114,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
- if (found_key.objectid != objectid)
+ if (found_key.objectid != key.objectid)
break;
found_key.offset = 0;
@@ -3934,6 +4139,21 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
return ret;
}
+static int truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log_root,
+ struct btrfs_inode *inode,
+ u64 new_size, u32 min_type)
+{
+ int ret;
+
+ do {
+ ret = btrfs_truncate_inode_items(trans, log_root, inode,
+ new_size, min_type, NULL);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
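truncate_inode_items() simply factors out the retry loop that two later hunks in this file used to open-code around btrfs_truncate_inode_items(), which may return -EAGAIN to request another pass. The shape is the usual retry-until-done wrapper; a tiny self-contained C illustration of the same pattern (the worker function is a stand-in, not a btrfs API):

#include <errno.h>
#include <stdio.h>

/* Stand-in for a callee that needs a couple of passes before it succeeds. */
static int do_some_work(int *passes_left)
{
	if (--*passes_left > 0)
		return -EAGAIN;	/* caller should simply call again */
	return 0;
}

static int do_all_work(int passes)
{
	int ret;

	do {
		ret = do_some_work(&passes);
	} while (ret == -EAGAIN);

	return ret;
}

int main(void)
{
	printf("result: %d\n", do_all_work(3));
	return 0;
}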
static void fill_inode_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf,
struct btrfs_inode_item *item,
@@ -4106,6 +4326,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
+ struct btrfs_item_batch batch;
char *ins_data;
int i;
struct list_head ordered_sums;
@@ -4120,13 +4341,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
ins_sizes = (u32 *)ins_data;
ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ batch.total_data_size = 0;
+ batch.nr = nr;
for (i = 0; i < nr; i++) {
ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
+ batch.total_data_size += ins_sizes[i];
btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
}
- ret = btrfs_insert_empty_items(trans, log, dst_path,
- ins_keys, ins_sizes, nr);
+ ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
if (ret) {
kfree(ins_data);
return ret;
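copy_items() now fills a struct btrfs_item_batch instead of passing the key array, size array and item count to btrfs_insert_empty_items() as separate arguments (flush_dir_items_batch() earlier does the same). The header declaring the struct is not in this diff; the sketch below infers its fields from how they are used here, so types and const-ness are assumptions:

/* Inferred from usage in this patch, not copied from ctree.h. */
struct btrfs_item_batch {
	const struct btrfs_key *keys;	/* keys of the items to insert */
	const u32 *data_sizes;		/* data size for each item */
	u32 total_data_size;		/* sum of data_sizes[] */
	int nr;				/* number of items in the batch */
};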
@@ -4338,13 +4563,13 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
}
static int log_one_extent(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, struct btrfs_root *root,
+ struct btrfs_inode *inode,
const struct extent_map *em,
struct btrfs_path *path,
struct btrfs_log_ctx *ctx)
{
struct btrfs_drop_extents_args drop_args = { 0 };
- struct btrfs_root *log = root->log_root;
+ struct btrfs_root *log = inode->root->log_root;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
struct btrfs_map_token token;
@@ -4357,14 +4582,25 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- drop_args.path = path;
- drop_args.start = em->start;
- drop_args.end = em->start + em->len;
- drop_args.replace_extent = true;
- drop_args.extent_item_size = sizeof(*fi);
- ret = btrfs_drop_extents(trans, log, inode, &drop_args);
- if (ret)
- return ret;
+ /*
+ * If this is the first time we are logging the inode in the current
+ * transaction, we can avoid btrfs_drop_extents(), which is expensive
+ * because it does a deletion search, which always acquires write locks
+ * for extent buffers at levels 2, 1 and 0. This not only wastes time
+ * but also adds significant contention in a log tree, since log trees
+ * are small, with a root at level 2 or 3 at most, due to their short
+ * life span.
+ */
+ if (inode_logged(trans, inode)) {
+ drop_args.path = path;
+ drop_args.start = em->start;
+ drop_args.end = em->start + em->len;
+ drop_args.replace_extent = true;
+ drop_args.extent_item_size = sizeof(*fi);
+ ret = btrfs_drop_extents(trans, log, inode, &drop_args);
+ if (ret)
+ return ret;
+ }
if (!drop_args.extent_inserted) {
key.objectid = btrfs_ino(inode);
@@ -4522,13 +4758,9 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
* Avoid logging extent items logged in past fsync calls
* and leading to duplicate keys in the log tree.
*/
- do {
- ret = btrfs_truncate_inode_items(trans,
- root->log_root,
- inode, truncate_offset,
- BTRFS_EXTENT_DATA_KEY,
- NULL);
- } while (ret == -EAGAIN);
+ ret = truncate_inode_items(trans, root->log_root, inode,
+ truncate_offset,
+ BTRFS_EXTENT_DATA_KEY);
if (ret)
goto out;
dropped_extents = true;
@@ -4555,7 +4787,6 @@ out:
}
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_log_ctx *ctx)
@@ -4620,7 +4851,7 @@ process:
write_unlock(&tree->lock);
- ret = log_one_extent(trans, inode, root, em, path, ctx);
+ ret = log_one_extent(trans, inode, em, path, ctx);
write_lock(&tree->lock);
clear_em_logging(tree, em);
free_extent_map(em);
@@ -4709,11 +4940,11 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
* with a journal, ext3/4, xfs, f2fs, etc).
*/
static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path)
{
+ struct btrfs_root *root = inode->root;
int ret;
struct btrfs_key key;
const u64 ino = btrfs_ino(inode);
@@ -4787,10 +5018,10 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
* truncate operation that changes the inode's size.
*/
static int btrfs_log_holes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
const u64 ino = btrfs_ino(inode);
@@ -5067,7 +5298,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
} else {
- ret = btrfs_log_inode(trans, root,
+ ret = btrfs_log_inode(trans,
BTRFS_I(inode),
LOG_OTHER_INODE_ALL,
ctx);
@@ -5127,8 +5358,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
- ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_OTHER_INODE, ctx);
+ ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx);
if (ret) {
btrfs_add_delayed_iput(inode);
continue;
@@ -5239,7 +5469,7 @@ again:
&other_ino, &other_parent);
if (ret < 0) {
return ret;
- } else if (ret > 0 && ctx &&
+ } else if (ret > 0 &&
other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
if (ins_nr > 0) {
ins_nr++;
@@ -5339,7 +5569,7 @@ next_key:
* This handles both files and directories.
*/
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
int inode_only,
struct btrfs_log_ctx *ctx)
{
@@ -5347,7 +5577,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_path *dst_path;
struct btrfs_key min_key;
struct btrfs_key max_key;
- struct btrfs_root *log = root->log_root;
+ struct btrfs_root *log = inode->root->log_root;
int err = 0;
int ret = 0;
bool fast_search = false;
@@ -5389,22 +5619,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* Only run delayed items if we are a directory. We want to make sure
* all directory indexes hit the fs/subvolume tree so we can find them
* and figure out which index ranges have to be logged.
- *
- * Otherwise commit the delayed inode only if the full sync flag is set,
- * as we want to make sure an up to date version is in the subvolume
- * tree so copy_inode_items_to_log() / copy_items() can find it and copy
- * it to the log tree. For a non full sync, we always log the inode item
- * based on the in-memory struct btrfs_inode which is always up to date.
*/
- if (S_ISDIR(inode->vfs_inode.i_mode))
- ret = btrfs_commit_inode_delayed_items(trans, inode);
- else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
- ret = btrfs_commit_inode_delayed_inode(inode);
-
- if (ret) {
- btrfs_free_path(path);
- btrfs_free_path(dst_path);
- return ret;
+ if (S_ISDIR(inode->vfs_inode.i_mode)) {
+ err = btrfs_commit_inode_delayed_items(trans, inode);
+ if (err)
+ goto out;
}
if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
@@ -5443,9 +5662,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
if (inode_only == LOG_INODE_EXISTS)
max_key_type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino, max_key_type);
+ ret = drop_inode_items(trans, log, path, inode, max_key_type);
} else {
- if (inode_only == LOG_INODE_EXISTS) {
+ if (inode_only == LOG_INODE_EXISTS && inode_logged(trans, inode)) {
/*
* Make sure the new inode item we write to the log has
* the same isize as the current one (if it exists).
@@ -5467,19 +5686,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
&inode->runtime_flags)) {
if (inode_only == LOG_INODE_EXISTS) {
max_key.type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino,
- max_key.type);
+ ret = drop_inode_items(trans, log, path, inode,
+ max_key.type);
} else {
clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags);
clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&inode->runtime_flags);
- while(1) {
- ret = btrfs_truncate_inode_items(trans,
- log, inode, 0, 0, NULL);
- if (ret != -EAGAIN)
- break;
- }
+ if (inode_logged(trans, inode))
+ ret = truncate_inode_items(trans, log,
+ inode, 0, 0);
}
} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&inode->runtime_flags) ||
@@ -5487,8 +5703,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (inode_only == LOG_INODE_ALL)
fast_search = true;
max_key.type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino,
- max_key.type);
+ ret = drop_inode_items(trans, log, path, inode,
+ max_key.type);
} else {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
@@ -5511,14 +5727,14 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
+ err = btrfs_log_all_xattrs(trans, inode, path, dst_path);
if (err)
goto out_unlock;
xattrs_logged = true;
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_holes(trans, root, inode, path);
+ err = btrfs_log_holes(trans, inode, path);
if (err)
goto out_unlock;
}
@@ -5538,16 +5754,14 @@ log_extents:
* BTRFS_INODE_COPY_EVERYTHING set.
*/
if (!xattrs_logged && inode->logged_trans < trans->transid) {
- err = btrfs_log_all_xattrs(trans, root, inode, path,
- dst_path);
+ err = btrfs_log_all_xattrs(trans, inode, path, dst_path);
if (err)
goto out_unlock;
btrfs_release_path(path);
}
}
if (fast_search) {
- ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- ctx);
+ ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
if (ret) {
err = ret;
goto out_unlock;
@@ -5562,59 +5776,52 @@ log_extents:
}
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
- ret = log_directory_changes(trans, root, inode, path, dst_path,
- ctx);
+ ret = log_directory_changes(trans, inode, path, dst_path, ctx);
if (ret) {
err = ret;
goto out_unlock;
}
}
+ spin_lock(&inode->lock);
+ inode->logged_trans = trans->transid;
/*
- * If we are logging that an ancestor inode exists as part of logging a
- * new name from a link or rename operation, don't mark the inode as
- * logged - otherwise if an explicit fsync is made against an ancestor,
- * the fsync considers the inode in the log and doesn't sync the log,
- * resulting in the ancestor missing after a power failure unless the
- * log was synced as part of an fsync against any other unrelated inode.
- * So keep it simple for this case and just don't flag the ancestors as
- * logged.
+ * Don't update last_log_commit if we logged that an inode exists.
+ * We do this for three reasons:
+ *
+ * 1) We might have had buffered writes to this inode that were
+ * flushed and had their ordered extents completed in this
+ * transaction, but we did not previously log the inode with
+ * LOG_INODE_ALL. Later the inode was evicted and after that
+ * it was loaded again and this LOG_INODE_EXISTS log operation
+ * happened. We must make sure that if an explicit fsync against
+ * the inode is performed later, it logs the new extents, an
+ * updated inode item, etc, and syncs the log. The same logic
+ * applies to direct IO writes instead of buffered writes.
+ *
+ * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
+ * is logged with an i_size of 0 or whatever value was logged
+ * before. If later the i_size of the inode is increased by a
+ * truncate operation, the log is synced through an fsync of
+ * some other inode and then finally an explicit fsync against
+ * this inode is made, we must make sure this fsync logs the
+ * inode with the new i_size, the hole between old i_size and
+ * the new i_size, and syncs the log.
+ *
+ * 3) If we are logging that an ancestor inode exists as part of
+ * logging a new name from a link or rename operation, don't update
+ * its last_log_commit - otherwise if an explicit fsync is made
+ * against an ancestor, the fsync considers the inode in the log
+ * and doesn't sync the log, resulting in the ancestor missing after
+ * a power failure unless the log was synced as part of an fsync
+ * against any other unrelated inode.
*/
- if (!ctx ||
- !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
- &inode->vfs_inode != ctx->inode)) {
- spin_lock(&inode->lock);
- inode->logged_trans = trans->transid;
- /*
- * Don't update last_log_commit if we logged that an inode exists.
- * We do this for two reasons:
- *
- * 1) We might have had buffered writes to this inode that were
- * flushed and had their ordered extents completed in this
- * transaction, but we did not previously log the inode with
- * LOG_INODE_ALL. Later the inode was evicted and after that
- * it was loaded again and this LOG_INODE_EXISTS log operation
- * happened. We must make sure that if an explicit fsync against
- * the inode is performed later, it logs the new extents, an
- * updated inode item, etc, and syncs the log. The same logic
- * applies to direct IO writes instead of buffered writes.
- *
- * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
- * is logged with an i_size of 0 or whatever value was logged
- * before. If later the i_size of the inode is increased by a
- * truncate operation, the log is synced through an fsync of
- * some other inode and then finally an explicit fsync against
- * this inode is made, we must make sure this fsync logs the
- * inode with the new i_size, the hole between old i_size and
- * the new i_size, and syncs the log.
- */
- if (inode_only != LOG_INODE_EXISTS)
- inode->last_log_commit = inode->last_sub_trans;
- spin_unlock(&inode->lock);
- }
+ if (inode_only != LOG_INODE_EXISTS)
+ inode->last_log_commit = inode->last_sub_trans;
+ spin_unlock(&inode->lock);
out_unlock:
mutex_unlock(&inode->log_mutex);
-
+out:
btrfs_free_path(path);
btrfs_free_path(dst_path);
return err;
@@ -5714,6 +5921,14 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
struct btrfs_dir_list *dir_elem;
int ret = 0;
+ /*
+ * If we are logging a new name, as part of a link or rename operation,
+ * don't bother logging new dentries, as we just want to log the names
+ * of an inode and that any new parents exist.
+ */
+ if (ctx->logging_new_name)
+ return 0;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -5790,7 +6005,7 @@ process_leaf:
ctx->log_new_dentries = false;
if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
log_mode = LOG_INODE_ALL;
- ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
+ ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
log_mode, ctx);
btrfs_add_delayed_iput(di_inode);
if (ret)
@@ -5934,11 +6149,10 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
continue;
}
- if (ctx)
- ctx->log_new_dentries = false;
- ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
+ ctx->log_new_dentries = false;
+ ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
LOG_INODE_ALL, ctx);
- if (!ret && ctx && ctx->log_new_dentries)
+ if (!ret && ctx->log_new_dentries)
ret = log_new_dir_dentries(trans, root,
BTRFS_I(dir_inode), ctx);
btrfs_add_delayed_iput(dir_inode);
@@ -5984,7 +6198,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
if (BTRFS_I(inode)->generation >= trans->transid &&
need_log_inode(trans, BTRFS_I(inode)))
- ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
+ ret = btrfs_log_inode(trans, BTRFS_I(inode),
LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
if (ret)
@@ -6039,7 +6253,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
if (inode->generation >= trans->transid &&
need_log_inode(trans, inode)) {
- ret = btrfs_log_inode(trans, root, inode,
+ ret = btrfs_log_inode(trans, inode,
LOG_INODE_EXISTS, ctx);
if (ret)
break;
@@ -6182,7 +6396,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
+ ret = btrfs_log_inode(trans, inode, inode_only, ctx);
if (ret)
goto end_trans;
@@ -6199,7 +6413,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
}
- if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
+ if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
log_dentries = true;
/*
@@ -6325,8 +6539,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
ret = walk_log_tree(trans, log_root_tree, &wc);
if (ret) {
- btrfs_handle_fs_error(fs_info, ret,
- "Failed to pin buffers while recovering log root tree.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -6339,8 +6552,7 @@ again:
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
if (ret < 0) {
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't find tree log root.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
if (ret > 0) {
@@ -6357,8 +6569,7 @@ again:
log = btrfs_read_tree_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't read tree log root.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -6386,8 +6597,7 @@ again:
if (!ret)
goto next;
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't read target root for tree log recovery.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -6395,14 +6605,15 @@ again:
ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
if (ret)
/* The loop needs to continue due to the root refs */
- btrfs_handle_fs_error(fs_info, ret,
- "failed to record the log root in transaction");
+ btrfs_abort_transaction(trans, ret);
else
ret = walk_log_tree(trans, log, &wc);
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
ret = fixup_inode_link_counts(trans, wc.replay_dest,
path);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
@@ -6419,6 +6630,8 @@ again:
* could only happen during mount.
*/
ret = btrfs_init_root_free_objectid(root);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
wc.replay_dest->log_root = NULL;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 731bd9c029f5..f6811c3df38a 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -17,6 +17,8 @@ struct btrfs_log_ctx {
int log_transid;
bool log_new_dentries;
bool logging_new_name;
+ /* Tracks the last logged dir item/index key offset. */
+ u64 last_dir_item_offset;
struct inode *inode;
struct list_head list;
/* Only used for fast fsyncs. */
@@ -68,14 +70,14 @@ int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
struct btrfs_log_ctx *ctx);
-int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *dir, u64 index);
-int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *inode, u64 dirid);
+void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *dir, u64 index);
+void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *inode, u64 dirid);
void btrfs_end_log_trans(struct btrfs_root *root);
void btrfs_pin_log_trans(struct btrfs_root *root);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2ec3b8ac8fa3..61ac57bcbf1a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -14,6 +14,7 @@
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
+#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
@@ -250,7 +251,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret,
+ struct btrfs_io_context **bioc_ret,
int mirror_num, int need_raid_map);
/*
@@ -508,7 +509,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
}
if (flush)
- filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
+ sync_blockdev(*bdev);
ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
if (ret) {
blkdev_put(*bdev, flags);
@@ -812,9 +813,13 @@ static noinline struct btrfs_device *device_list_add(const char *path,
device = NULL;
} else {
+ struct btrfs_dev_lookup_args args = {
+ .devid = devid,
+ .uuid = disk_super->dev_item.uuid,
+ };
+
mutex_lock(&fs_devices->device_list_mutex);
- device = btrfs_find_device(fs_devices, devid,
- disk_super->dev_item.uuid, NULL);
+ device = btrfs_find_device(fs_devices, &args);
/*
* If this disk has been pulled into an fs devices created by
@@ -1091,7 +1096,7 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
__btrfs_free_extra_devids(seed_dev, &latest_dev);
- fs_devices->latest_bdev = latest_dev->bdev;
+ fs_devices->latest_dev = latest_dev;
mutex_unlock(&uuid_mutex);
}
@@ -1122,8 +1127,10 @@ static void btrfs_close_one_device(struct btrfs_device *device)
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+ clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
fs_devices->missing_devices--;
+ }
btrfs_close_bdev(device);
if (device->bdev) {
@@ -1222,7 +1229,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
return -EINVAL;
fs_devices->opened = 1;
- fs_devices->latest_bdev = latest_dev->bdev;
+ fs_devices->latest_dev = latest_dev;
fs_devices->total_rw_bytes = 0;
fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
fs_devices->read_policy = BTRFS_READ_POLICY_PID;
@@ -1286,7 +1293,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
pgoff_t index;
/* make sure our super fits in the device */
- if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
+ if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
return ERR_PTR(-EINVAL);
/* make sure our super fits in the page */
@@ -1843,8 +1850,10 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
+ btrfs_reserve_chunk_metadata(trans, true);
ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
&key, sizeof(*dev_item));
+ btrfs_trans_release_chunk_metadata(trans);
if (ret)
goto out;
@@ -1882,18 +1891,22 @@ out:
/*
* Function to update ctime/mtime for a given device path.
* Mainly used for ctime/mtime based probe like libblkid.
+ *
+ * We don't care about errors here; this is just to be kind to userspace.
*/
-static void update_dev_time(struct block_device *bdev)
+static void update_dev_time(const char *device_path)
{
- struct inode *inode = bdev->bd_inode;
+ struct path path;
struct timespec64 now;
+ int ret;
- /* Shouldn't happen but just in case. */
- if (!inode)
+ ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
+ if (ret)
return;
- now = current_time(inode);
- generic_update_time(inode, &now, S_MTIME | S_CTIME);
+ now = current_time(d_inode(path.dentry));
+ inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
+ path_put(&path);
}
static int btrfs_rm_dev_item(struct btrfs_device *device)
@@ -1917,7 +1930,9 @@ static int btrfs_rm_dev_item(struct btrfs_device *device)
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -1986,7 +2001,7 @@ static struct btrfs_device * btrfs_find_next_active_device(
}
/*
- * Helper function to check if the given device is part of s_bdev / latest_bdev
+ * Helper function to check if the given device is part of s_bdev / latest_dev
* and replace it with the provided or the next active device. In the context
* where this function is called, there should always be another device (or
* this_dev) which is active.
@@ -2005,8 +2020,8 @@ void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
(fs_info->sb->s_bdev == device->bdev))
fs_info->sb->s_bdev = next_device->bdev;
- if (fs_info->fs_devices->latest_bdev == device->bdev)
- fs_info->fs_devices->latest_bdev = next_device->bdev;
+ if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
+ fs_info->fs_devices->latest_dev = next_device;
}
/*
@@ -2069,11 +2084,12 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
/* Update ctime/mtime for device path for libblkid */
- update_dev_time(bdev);
+ update_dev_time(device_path);
}
-int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
- u64 devid, struct block_device **bdev, fmode_t *mode)
+int btrfs_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ struct block_device **bdev, fmode_t *mode)
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
@@ -2081,22 +2097,23 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
u64 num_devices;
int ret = 0;
- mutex_lock(&uuid_mutex);
-
+ /*
+ * The device list in fs_devices is accessed without locks (neither
+ * uuid_mutex nor device_list_mutex) as it won't change on a mounted
+ * filesystem and another device rm cannot run.
+ */
num_devices = btrfs_num_devices(fs_info);
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
goto out;
- device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
-
- if (IS_ERR(device)) {
- if (PTR_ERR(device) == -ENOENT &&
- device_path && strcmp(device_path, "missing") == 0)
+ device = btrfs_find_device(fs_info->fs_devices, args);
+ if (!device) {
+ if (args->missing)
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else
- ret = PTR_ERR(device);
+ ret = -ENOENT;
goto out;
}
@@ -2126,11 +2143,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
mutex_unlock(&fs_info->chunk_mutex);
}
- mutex_unlock(&uuid_mutex);
ret = btrfs_shrink_device(device, 0);
if (!ret)
btrfs_reada_remove_dev(device);
- mutex_lock(&uuid_mutex);
if (ret)
goto error_undo;
@@ -2159,7 +2174,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
/*
* In normal cases the cur_devices == fs_devices. But in case
* of deleting a seed device, the cur_devices should point to
- * its own fs_devices listed under the fs_devices->seed.
+ * its own fs_devices listed under the fs_devices->seed_list.
*/
cur_devices = device->fs_devices;
mutex_lock(&fs_devices->device_list_mutex);
@@ -2210,14 +2225,21 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
synchronize_rcu();
btrfs_free_device(device);
- if (cur_devices->open_devices == 0) {
+ /*
+ * This can happen if cur_devices is the private seed devices list. We
+ * cannot call close_fs_devices() here because it expects the uuid_mutex
+ * to be held, but in fact we don't need that for the private
+ * seed_devices, we can simply decrement cur_devices->opened and then
+ * remove it from our list and free the fs_devices.
+ */
+ if (cur_devices->num_devices == 0) {
list_del_init(&cur_devices->seed_list);
- close_fs_devices(cur_devices);
+ ASSERT(cur_devices->opened == 1);
+ cur_devices->opened--;
free_fs_devices(cur_devices);
}
out:
- mutex_unlock(&uuid_mutex);
return ret;
error_undo:
@@ -2305,13 +2327,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
mutex_unlock(&fs_devices->device_list_mutex);
- /*
- * The update_dev_time() with in btrfs_scratch_superblocks()
- * may lead to a call to btrfs_show_devname() which will try
- * to hold device_list_mutex. And here this device
- * is already out of device list, so we don't have to hold
- * the device_list_mutex lock.
- */
btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
tgtdev->name->str);
@@ -2320,69 +2335,98 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
btrfs_free_device(tgtdev);
}
-static struct btrfs_device *btrfs_find_device_by_path(
- struct btrfs_fs_info *fs_info, const char *device_path)
+/**
+ * Populate args from device at path
+ *
+ * @fs_info: the filesystem
+ * @args: the args to populate
+ * @path: the path to the device
+ *
+ * This will read the super block of the device at @path and populate @args with
+ * the devid, fsid, and uuid. This is meant to be used for ioctls that need to
+ * lookup a device to operate on, but need to do it before we take any locks.
+ * This properly handles the special case of "missing" that a user may pass in,
+ * and does some basic sanity checks. The caller must make sure that @path is
+ * properly NUL terminated before calling in, and must call
+ * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
+ * uuid buffers.
+ *
+ * Return: 0 for success, -errno for failure
+ */
+int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ const char *path)
{
- int ret = 0;
struct btrfs_super_block *disk_super;
- u64 devid;
- u8 *dev_uuid;
struct block_device *bdev;
- struct btrfs_device *device;
+ int ret;
- ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
- fs_info->bdev_holder, 0, &bdev, &disk_super);
- if (ret)
- return ERR_PTR(ret);
+ if (!path || !path[0])
+ return -EINVAL;
+ if (!strcmp(path, "missing")) {
+ args->missing = true;
+ return 0;
+ }
- devid = btrfs_stack_device_id(&disk_super->dev_item);
- dev_uuid = disk_super->dev_item.uuid;
+ args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
+ args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
+ if (!args->uuid || !args->fsid) {
+ btrfs_put_dev_args_from_path(args);
+ return -ENOMEM;
+ }
+
+ ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0,
+ &bdev, &disk_super);
+ if (ret)
+ return ret;
+ args->devid = btrfs_stack_device_id(&disk_super->dev_item);
+ memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->metadata_uuid);
+ memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
else
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->fsid);
-
+ memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
btrfs_release_disk_super(disk_super);
- if (!device)
- device = ERR_PTR(-ENOENT);
blkdev_put(bdev, FMODE_READ);
- return device;
+ return 0;
}
/*
- * Lookup a device given by device id, or the path if the id is 0.
+ * Only use this jointly with btrfs_get_dev_args_from_path() because we will
+ * allocate our ->uuid and ->fsid pointers; everybody else uses local variables
+ * that don't need to be freed.
*/
+void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
+{
+ kfree(args->uuid);
+ kfree(args->fsid);
+ args->uuid = NULL;
+ args->fsid = NULL;
+}
+
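An illustrative usage sketch (editor's addition, not part of this patch) of the contract described in the kernel-doc above; the wrapper name is hypothetical and the error handling is minimal:

static int example_lookup_by_path(struct btrfs_fs_info *fs_info,
				  const char *path)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_device *device;
	int ret;

	/* Read the super block at @path, fill devid/uuid/fsid (or ->missing). */
	ret = btrfs_get_dev_args_from_path(fs_info, &args, path);
	if (ret)
		return ret;

	/* Locks may be taken here; the lookup itself no longer opens the device. */
	device = btrfs_find_device(fs_info->fs_devices, &args);

	/* Always free the uuid/fsid buffers allocated by the _get_ helper. */
	btrfs_put_dev_args_from_path(&args);

	return device ? 0 : -ENOENT;
}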
struct btrfs_device *btrfs_find_device_by_devspec(
struct btrfs_fs_info *fs_info, u64 devid,
const char *device_path)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_device *device;
+ int ret;
if (devid) {
- device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
- NULL);
+ args.devid = devid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device)
return ERR_PTR(-ENOENT);
return device;
}
- if (!device_path || !device_path[0])
- return ERR_PTR(-EINVAL);
-
- if (strcmp(device_path, "missing") == 0) {
- /* Find first missing device */
- list_for_each_entry(device, &fs_info->fs_devices->devices,
- dev_list) {
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
- &device->dev_state) && !device->bdev)
- return device;
- }
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
+ if (ret)
+ return ERR_PTR(ret);
+ device = btrfs_find_device(fs_info->fs_devices, &args);
+ btrfs_put_dev_args_from_path(&args);
+ if (!device)
return ERR_PTR(-ENOENT);
- }
-
- return btrfs_find_device_by_path(fs_info, device_path);
+ return device;
}
/*
@@ -2459,6 +2503,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
*/
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_path *path;
@@ -2468,7 +2513,6 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
struct btrfs_key key;
u8 fs_uuid[BTRFS_FSID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
- u64 devid;
int ret;
path = btrfs_alloc_path();
@@ -2480,7 +2524,9 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
key.type = BTRFS_DEV_ITEM_KEY;
while (1) {
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret < 0)
goto error;
@@ -2505,13 +2551,14 @@ next_slot:
dev_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_item);
- devid = btrfs_device_id(leaf, dev_item);
+ args.devid = btrfs_device_id(leaf, dev_item);
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid);
+ args.uuid = dev_uuid;
+ args.fsid = fs_uuid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
@@ -2610,8 +2657,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
device->sector_size = fs_info->sectorsize;
- device->total_bytes = round_down(i_size_read(bdev->bd_inode),
- fs_info->sectorsize);
+ device->total_bytes =
+ round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
@@ -2627,6 +2674,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
btrfs_abort_transaction(trans, ret);
goto error_trans;
}
+ btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
+ device);
}
device->fs_devices = fs_devices;
@@ -2733,7 +2782,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
btrfs_forget_devices(device_path);
/* Update ctime/mtime for blkid or udev */
- update_dev_time(bdev);
+ update_dev_time(device_path);
return ret;
@@ -2826,6 +2875,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_super_block *super_copy = fs_info->super_copy;
u64 old_total;
u64 diff;
+ int ret;
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return -EACCES;
@@ -2854,7 +2904,11 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
&trans->transaction->dev_update_list);
mutex_unlock(&fs_info->chunk_mutex);
- return btrfs_update_device(trans, device);
+ btrfs_reserve_chunk_metadata(trans, false);
+ ret = btrfs_update_device(trans, device);
+ btrfs_trans_release_chunk_metadata(trans);
+
+ return ret;
}
static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
@@ -3096,7 +3150,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *sys_bg;
- sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+ sys_bg = btrfs_create_chunk(trans, sys_flags);
if (IS_ERR(sys_bg)) {
ret = PTR_ERR(sys_bg);
btrfs_abort_transaction(trans, ret);
@@ -4889,8 +4943,10 @@ again:
round_down(old_total - diff, fs_info->sectorsize));
mutex_unlock(&fs_info->chunk_mutex);
+ btrfs_reserve_chunk_metadata(trans, false);
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
@@ -4973,7 +5029,7 @@ static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
}
/*
- * Structure used internally for __btrfs_alloc_chunk() function.
+ * Structure used internally for btrfs_create_chunk() function.
* Wraps needed parameters.
*/
struct alloc_chunk_ctl {
@@ -5377,7 +5433,7 @@ error_del_extent:
return block_group;
}
-struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type)
{
struct btrfs_fs_info *info = trans->fs_info;
@@ -5578,12 +5634,12 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
*/
alloc_profile = btrfs_metadata_alloc_profile(fs_info);
- meta_bg = btrfs_alloc_chunk(trans, alloc_profile);
+ meta_bg = btrfs_create_chunk(trans, alloc_profile);
if (IS_ERR(meta_bg))
return PTR_ERR(meta_bg);
alloc_profile = btrfs_system_alloc_profile(fs_info);
- sys_bg = btrfs_alloc_chunk(trans, alloc_profile);
+ sys_bg = btrfs_create_chunk(trans, alloc_profile);
if (IS_ERR(sys_bg))
return PTR_ERR(sys_bg);
@@ -5597,17 +5653,17 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
return btrfs_raid_array[index].tolerated_failures;
}
-int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct extent_map *em;
struct map_lookup *map;
- int readonly = 0;
int miss_ndevs = 0;
int i;
+ bool ret = true;
em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
if (IS_ERR(em))
- return 1;
+ return false;
map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
@@ -5618,21 +5674,20 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
}
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
&map->stripes[i].dev->dev_state)) {
- readonly = 1;
+ ret = false;
goto end;
}
}
/*
- * If the number of missing devices is larger than max errors,
- * we can not write the data into that chunk successfully, so
- * set it readonly.
+ * If the number of missing devices is larger than max errors, we can
+ * not write the data into that chunk successfully.
*/
if (miss_ndevs > btrfs_chunk_max_errors(map))
- readonly = 1;
+ ret = false;
end:
free_extent_map(em);
- return readonly;
+ return ret;
}
void btrfs_mapping_tree_free(struct extent_map_tree *tree)
@@ -5795,7 +5850,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
}
/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
-static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
+static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
{
int i;
int again = 1;
@@ -5804,52 +5859,55 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
again = 0;
for (i = 0; i < num_stripes - 1; i++) {
/* Swap if parity is on a smaller index */
- if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
- swap(bbio->stripes[i], bbio->stripes[i + 1]);
- swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
+ if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
+ swap(bioc->stripes[i], bioc->stripes[i + 1]);
+ swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
again = 1;
}
}
}
}
-static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
+static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+ int total_stripes,
+ int real_stripes)
{
- struct btrfs_bio *bbio = kzalloc(
- /* the size of the btrfs_bio */
- sizeof(struct btrfs_bio) +
- /* plus the variable array for the stripes */
- sizeof(struct btrfs_bio_stripe) * (total_stripes) +
- /* plus the variable array for the tgt dev */
+ struct btrfs_io_context *bioc = kzalloc(
+ /* The size of btrfs_io_context */
+ sizeof(struct btrfs_io_context) +
+ /* Plus the variable array for the stripes */
+ sizeof(struct btrfs_io_stripe) * (total_stripes) +
+ /* Plus the variable array for the tgt dev */
sizeof(int) * (real_stripes) +
/*
- * plus the raid_map, which includes both the tgt dev
- * and the stripes
+ * Plus the raid_map, which includes both the tgt dev
+ * and the stripes.
*/
sizeof(u64) * (total_stripes),
GFP_NOFS|__GFP_NOFAIL);
- atomic_set(&bbio->error, 0);
- refcount_set(&bbio->refs, 1);
+ atomic_set(&bioc->error, 0);
+ refcount_set(&bioc->refs, 1);
- bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
- bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
+ bioc->fs_info = fs_info;
+ bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
+ bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
- return bbio;
+ return bioc;
}
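For orientation (editor's note, not part of the patch), the single kzalloc() above packs the variable-length data back to back; the two pointer assignments that follow depend on exactly this order:

/*
 * [ struct btrfs_io_context                  ]  fixed header
 * [ struct btrfs_io_stripe x total_stripes   ]  bioc->stripes[] (flexible array)
 * [ int                    x real_stripes    ]  bioc->tgtdev_map
 * [ u64                    x total_stripes   ]  bioc->raid_map
 */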
-void btrfs_get_bbio(struct btrfs_bio *bbio)
+void btrfs_get_bioc(struct btrfs_io_context *bioc)
{
- WARN_ON(!refcount_read(&bbio->refs));
- refcount_inc(&bbio->refs);
+ WARN_ON(!refcount_read(&bioc->refs));
+ refcount_inc(&bioc->refs);
}
-void btrfs_put_bbio(struct btrfs_bio *bbio)
+void btrfs_put_bioc(struct btrfs_io_context *bioc)
{
- if (!bbio)
+ if (!bioc)
return;
- if (refcount_dec_and_test(&bbio->refs))
- kfree(bbio);
+ if (refcount_dec_and_test(&bioc->refs))
+ kfree(bioc);
}
/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
@@ -5859,11 +5917,11 @@ void btrfs_put_bbio(struct btrfs_bio *bbio)
*/
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret,
- struct btrfs_bio **bbio_ret)
+ struct btrfs_io_context **bioc_ret)
{
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
u64 length = *length_ret;
u64 offset;
u64 stripe_nr;
@@ -5882,8 +5940,8 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
int ret = 0;
int i;
- /* discard always return a bbio */
- ASSERT(bbio_ret);
+ /* Discard always returns a bioc. */
+ ASSERT(bioc_ret);
em = btrfs_get_chunk_map(fs_info, logical, length);
if (IS_ERR(em))
@@ -5946,26 +6004,25 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
&stripe_index);
}
- bbio = alloc_btrfs_bio(num_stripes, 0);
- if (!bbio) {
+ bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0);
+ if (!bioc) {
ret = -ENOMEM;
goto out;
}
for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical =
+ bioc->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
- bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+ bioc->stripes[i].dev = map->stripes[stripe_index].dev;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
- bbio->stripes[i].length = stripes_per_dev *
+ bioc->stripes[i].length = stripes_per_dev *
map->stripe_len;
if (i / sub_stripes < remaining_stripes)
- bbio->stripes[i].length +=
- map->stripe_len;
+ bioc->stripes[i].length += map->stripe_len;
/*
* Special for the first stripe and
@@ -5976,19 +6033,17 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
* off end_off
*/
if (i < sub_stripes)
- bbio->stripes[i].length -=
- stripe_offset;
+ bioc->stripes[i].length -= stripe_offset;
if (stripe_index >= last_stripe &&
stripe_index <= (last_stripe +
sub_stripes - 1))
- bbio->stripes[i].length -=
- stripe_end_offset;
+ bioc->stripes[i].length -= stripe_end_offset;
if (i == sub_stripes - 1)
stripe_offset = 0;
} else {
- bbio->stripes[i].length = length;
+ bioc->stripes[i].length = length;
}
stripe_index++;
@@ -5998,9 +6053,9 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
}
}
- *bbio_ret = bbio;
- bbio->map_type = map->type;
- bbio->num_stripes = num_stripes;
+ *bioc_ret = bioc;
+ bioc->map_type = map->type;
+ bioc->num_stripes = num_stripes;
out:
free_extent_map(em);
return ret;
@@ -6024,7 +6079,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
u64 srcdev_devid, int *mirror_num,
u64 *physical)
{
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
int num_stripes;
int index_srcdev = 0;
int found = 0;
@@ -6033,20 +6088,20 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
int ret = 0;
ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &length, &bbio, 0, 0);
+ logical, &length, &bioc, 0, 0);
if (ret) {
- ASSERT(bbio == NULL);
+ ASSERT(bioc == NULL);
return ret;
}
- num_stripes = bbio->num_stripes;
+ num_stripes = bioc->num_stripes;
if (*mirror_num > num_stripes) {
/*
* BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
* that means that the requested area is not left of the left
* cursor
*/
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return -EIO;
}
@@ -6056,7 +6111,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
* pointer to the one of the target drive.
*/
for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid != srcdev_devid)
+ if (bioc->stripes[i].dev->devid != srcdev_devid)
continue;
/*
@@ -6064,15 +6119,15 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
* mirror with the lowest physical address
*/
if (found &&
- physical_of_found <= bbio->stripes[i].physical)
+ physical_of_found <= bioc->stripes[i].physical)
continue;
index_srcdev = i;
found = 1;
- physical_of_found = bbio->stripes[i].physical;
+ physical_of_found = bioc->stripes[i].physical;
}
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
ASSERT(found);
if (!found)
@@ -6103,12 +6158,12 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
}
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
- struct btrfs_bio **bbio_ret,
+ struct btrfs_io_context **bioc_ret,
struct btrfs_dev_replace *dev_replace,
u64 logical,
int *num_stripes_ret, int *max_errors_ret)
{
- struct btrfs_bio *bbio = *bbio_ret;
+ struct btrfs_io_context *bioc = *bioc_ret;
u64 srcdev_devid = dev_replace->srcdev->devid;
int tgtdev_indexes = 0;
int num_stripes = *num_stripes_ret;
@@ -6138,17 +6193,17 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
*/
index_where_to_add = num_stripes;
for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ if (bioc->stripes[i].dev->devid == srcdev_devid) {
/* write to new disk, too */
- struct btrfs_bio_stripe *new =
- bbio->stripes + index_where_to_add;
- struct btrfs_bio_stripe *old =
- bbio->stripes + i;
+ struct btrfs_io_stripe *new =
+ bioc->stripes + index_where_to_add;
+ struct btrfs_io_stripe *old =
+ bioc->stripes + i;
new->physical = old->physical;
new->length = old->length;
new->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[i] = index_where_to_add;
+ bioc->tgtdev_map[i] = index_where_to_add;
index_where_to_add++;
max_errors++;
tgtdev_indexes++;
@@ -6168,30 +6223,29 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
* full copy of the source drive.
*/
for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ if (bioc->stripes[i].dev->devid == srcdev_devid) {
/*
* In case of DUP, in order to keep it simple,
* only add the mirror with the lowest physical
* address
*/
if (found &&
- physical_of_found <=
- bbio->stripes[i].physical)
+ physical_of_found <= bioc->stripes[i].physical)
continue;
index_srcdev = i;
found = 1;
- physical_of_found = bbio->stripes[i].physical;
+ physical_of_found = bioc->stripes[i].physical;
}
}
if (found) {
- struct btrfs_bio_stripe *tgtdev_stripe =
- bbio->stripes + num_stripes;
+ struct btrfs_io_stripe *tgtdev_stripe =
+ bioc->stripes + num_stripes;
tgtdev_stripe->physical = physical_of_found;
tgtdev_stripe->length =
- bbio->stripes[index_srcdev].length;
+ bioc->stripes[index_srcdev].length;
tgtdev_stripe->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[index_srcdev] = num_stripes;
+ bioc->tgtdev_map[index_srcdev] = num_stripes;
tgtdev_indexes++;
num_stripes++;
@@ -6200,8 +6254,8 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
*num_stripes_ret = num_stripes;
*max_errors_ret = max_errors;
- bbio->num_tgtdevs = tgtdev_indexes;
- *bbio_ret = bbio;
+ bioc->num_tgtdevs = tgtdev_indexes;
+ *bioc_ret = bioc;
}
static bool need_full_stripe(enum btrfs_map_op op)
@@ -6304,7 +6358,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret,
+ struct btrfs_io_context **bioc_ret,
int mirror_num, int need_raid_map)
{
struct extent_map *em;
@@ -6319,7 +6373,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
int num_stripes;
int max_errors = 0;
int tgtdev_indexes = 0;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int dev_replace_is_ongoing = 0;
int num_alloc_stripes;
@@ -6328,7 +6382,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
u64 raid56_full_stripe_start = (u64)-1;
struct btrfs_io_geometry geom;
- ASSERT(bbio_ret);
+ ASSERT(bioc_ret);
ASSERT(op != BTRFS_MAP_DISCARD);
em = btrfs_get_chunk_map(fs_info, logical, *length);
@@ -6472,20 +6526,20 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
tgtdev_indexes = num_stripes;
}
- bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
- if (!bbio) {
+ bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
+ if (!bioc) {
ret = -ENOMEM;
goto out;
}
for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical = map->stripes[stripe_index].physical +
+ bioc->stripes[i].physical = map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
- bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+ bioc->stripes[i].dev = map->stripes[stripe_index].dev;
stripe_index++;
}
- /* build raid_map */
+ /* Build raid_map */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
(need_full_stripe(op) || mirror_num > 1)) {
u64 tmp;
@@ -6497,15 +6551,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
/* Fill in the logical address of each stripe */
tmp = stripe_nr * data_stripes;
for (i = 0; i < data_stripes; i++)
- bbio->raid_map[(i+rot) % num_stripes] =
+ bioc->raid_map[(i + rot) % num_stripes] =
em->start + (tmp + i) * map->stripe_len;
- bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+ bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- bbio->raid_map[(i+rot+1) % num_stripes] =
+ bioc->raid_map[(i + rot + 1) % num_stripes] =
RAID6_Q_STRIPE;
- sort_parity_stripes(bbio, num_stripes);
+ sort_parity_stripes(bioc, num_stripes);
}
if (need_full_stripe(op))
@@ -6513,15 +6567,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
need_full_stripe(op)) {
- handle_ops_on_dev_replace(op, &bbio, dev_replace, logical,
+ handle_ops_on_dev_replace(op, &bioc, dev_replace, logical,
&num_stripes, &max_errors);
}
- *bbio_ret = bbio;
- bbio->map_type = map->type;
- bbio->num_stripes = num_stripes;
- bbio->max_errors = max_errors;
- bbio->mirror_num = mirror_num;
+ *bioc_ret = bioc;
+ bioc->map_type = map->type;
+ bioc->num_stripes = num_stripes;
+ bioc->max_errors = max_errors;
+ bioc->mirror_num = mirror_num;
/*
* this is the case that REQ_READ && dev_replace_is_ongoing &&
@@ -6530,9 +6584,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
*/
if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
WARN_ON(num_stripes > 1);
- bbio->stripes[0].dev = dev_replace->tgtdev;
- bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
- bbio->mirror_num = map->num_stripes + 1;
+ bioc->stripes[0].dev = dev_replace->tgtdev;
+ bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
+ bioc->mirror_num = map->num_stripes + 1;
}
out:
if (dev_replace_is_ongoing) {
@@ -6546,43 +6600,43 @@ out:
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret, int mirror_num)
+ struct btrfs_io_context **bioc_ret, int mirror_num)
{
if (op == BTRFS_MAP_DISCARD)
return __btrfs_map_block_for_discard(fs_info, logical,
- length, bbio_ret);
+ length, bioc_ret);
- return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
+ return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
mirror_num, 0);
}
/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret)
+ struct btrfs_io_context **bioc_ret)
{
- return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
+ return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
}
-static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
+static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
{
- bio->bi_private = bbio->private;
- bio->bi_end_io = bbio->end_io;
+ bio->bi_private = bioc->private;
+ bio->bi_end_io = bioc->end_io;
bio_endio(bio);
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
}
static void btrfs_end_bio(struct bio *bio)
{
- struct btrfs_bio *bbio = bio->bi_private;
+ struct btrfs_io_context *bioc = bio->bi_private;
int is_orig_bio = 0;
if (bio->bi_status) {
- atomic_inc(&bbio->error);
+ atomic_inc(&bioc->error);
if (bio->bi_status == BLK_STS_IOERR ||
bio->bi_status == BLK_STS_TARGET) {
- struct btrfs_device *dev = btrfs_io_bio(bio)->device;
+ struct btrfs_device *dev = btrfs_bio(bio)->device;
ASSERT(dev->bdev);
if (btrfs_op(bio) == BTRFS_MAP_WRITE)
@@ -6597,22 +6651,22 @@ static void btrfs_end_bio(struct bio *bio)
}
}
- if (bio == bbio->orig_bio)
+ if (bio == bioc->orig_bio)
is_orig_bio = 1;
- btrfs_bio_counter_dec(bbio->fs_info);
+ btrfs_bio_counter_dec(bioc->fs_info);
- if (atomic_dec_and_test(&bbio->stripes_pending)) {
+ if (atomic_dec_and_test(&bioc->stripes_pending)) {
if (!is_orig_bio) {
bio_put(bio);
- bio = bbio->orig_bio;
+ bio = bioc->orig_bio;
}
- btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+ btrfs_bio(bio)->mirror_num = bioc->mirror_num;
/* only send an error to the higher layers if it is
* beyond the tolerance of the btrfs bio
*/
- if (atomic_read(&bbio->error) > bbio->max_errors) {
+ if (atomic_read(&bioc->error) > bioc->max_errors) {
bio->bi_status = BLK_STS_IOERR;
} else {
/*
@@ -6622,19 +6676,19 @@ static void btrfs_end_bio(struct bio *bio)
bio->bi_status = BLK_STS_OK;
}
- btrfs_end_bbio(bbio, bio);
+ btrfs_end_bioc(bioc, bio);
} else if (!is_orig_bio) {
bio_put(bio);
}
}
-static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
+static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
u64 physical, struct btrfs_device *dev)
{
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
- bio->bi_private = bbio;
- btrfs_io_bio(bio)->device = dev;
+ bio->bi_private = bioc;
+ btrfs_bio(bio)->device = dev;
bio->bi_end_io = btrfs_end_bio;
bio->bi_iter.bi_sector = physical >> 9;
/*
@@ -6663,20 +6717,20 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
btrfsic_submit_bio(bio);
}
-static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
+static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
{
- atomic_inc(&bbio->error);
- if (atomic_dec_and_test(&bbio->stripes_pending)) {
+ atomic_inc(&bioc->error);
+ if (atomic_dec_and_test(&bioc->stripes_pending)) {
/* Should be the original bio. */
- WARN_ON(bio != bbio->orig_bio);
+ WARN_ON(bio != bioc->orig_bio);
- btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+ btrfs_bio(bio)->mirror_num = bioc->mirror_num;
bio->bi_iter.bi_sector = logical >> 9;
- if (atomic_read(&bbio->error) > bbio->max_errors)
+ if (atomic_read(&bioc->error) > bioc->max_errors)
bio->bi_status = BLK_STS_IOERR;
else
bio->bi_status = BLK_STS_OK;
- btrfs_end_bbio(bbio, bio);
+ btrfs_end_bioc(bioc, bio);
}
}
@@ -6691,36 +6745,34 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int ret;
int dev_nr;
int total_devs;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
length = bio->bi_iter.bi_size;
map_length = length;
btrfs_bio_counter_inc_blocked(fs_info);
ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
- &map_length, &bbio, mirror_num, 1);
+ &map_length, &bioc, mirror_num, 1);
if (ret) {
btrfs_bio_counter_dec(fs_info);
return errno_to_blk_status(ret);
}
- total_devs = bbio->num_stripes;
- bbio->orig_bio = first_bio;
- bbio->private = first_bio->bi_private;
- bbio->end_io = first_bio->bi_end_io;
- bbio->fs_info = fs_info;
- atomic_set(&bbio->stripes_pending, bbio->num_stripes);
+ total_devs = bioc->num_stripes;
+ bioc->orig_bio = first_bio;
+ bioc->private = first_bio->bi_private;
+ bioc->end_io = first_bio->bi_end_io;
+ atomic_set(&bioc->stripes_pending, bioc->num_stripes);
- if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
+ if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- ret = raid56_parity_write(fs_info, bio, bbio,
- map_length);
+ ret = raid56_parity_write(bio, bioc, map_length);
} else {
- ret = raid56_parity_recover(fs_info, bio, bbio,
- map_length, mirror_num, 1);
+ ret = raid56_parity_recover(bio, bioc, map_length,
+ mirror_num, 1);
}
btrfs_bio_counter_dec(fs_info);
@@ -6735,12 +6787,12 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
}
for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
- dev = bbio->stripes[dev_nr].dev;
+ dev = bioc->stripes[dev_nr].dev;
if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
&dev->dev_state) ||
(btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
- bbio_error(bbio, first_bio, logical);
+ bioc_error(bioc, first_bio, logical);
continue;
}
@@ -6749,12 +6801,39 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
else
bio = first_bio;
- submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
+ submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
}
btrfs_bio_counter_dec(fs_info);
return BLK_STS_OK;
}
+static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
+ const struct btrfs_fs_devices *fs_devices)
+{
+ if (args->fsid == NULL)
+ return true;
+ if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
+ return true;
+ return false;
+}
+
+static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
+ const struct btrfs_device *device)
+{
+ ASSERT((args->devid != (u64)-1) || args->missing);
+
+ if ((args->devid != (u64)-1) && device->devid != args->devid)
+ return false;
+ if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
+ return false;
+ if (!args->missing)
+ return true;
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
+ !device->bdev)
+ return true;
+ return false;
+}
+
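A short sketch (editor's addition, not in the patch) of how the matching rules above handle the "missing" case; the helper name is hypothetical:

static struct btrfs_device *example_find_first_missing(
				const struct btrfs_fs_devices *fs_devices)
{
	BTRFS_DEV_LOOKUP_ARGS(args);

	/*
	 * devid stays at the (u64)-1 wildcard and uuid stays NULL, so
	 * dev_args_match_device() skips those checks and matches the first
	 * device that is in the FS metadata but has no bdev attached.
	 */
	args.missing = true;
	return btrfs_find_device(fs_devices, &args);
}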
/*
* Find a device specified by @devid or @uuid in the list of @fs_devices, or
* return NULL.
@@ -6762,31 +6841,25 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
* If devid and uuid are both specified, the match must be exact, otherwise
* only devid is used.
*/
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, u8 *uuid, u8 *fsid)
+struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
+ const struct btrfs_dev_lookup_args *args)
{
struct btrfs_device *device;
struct btrfs_fs_devices *seed_devs;
- if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ if (dev_args_match_fs_devices(args, fs_devices)) {
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- if (device->devid == devid &&
- (!uuid || memcmp(device->uuid, uuid,
- BTRFS_UUID_SIZE) == 0))
+ if (dev_args_match_device(args, device))
return device;
}
}
list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
- if (!fsid ||
- !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
- list_for_each_entry(device, &seed_devs->devices,
- dev_list) {
- if (device->devid == devid &&
- (!uuid || memcmp(device->uuid, uuid,
- BTRFS_UUID_SIZE) == 0))
- return device;
- }
+ if (!dev_args_match_fs_devices(args, seed_devs))
+ continue;
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
+ if (dev_args_match_device(args, device))
+ return device;
}
}
@@ -6952,6 +7025,7 @@ static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct map_lookup *map;
@@ -7029,11 +7103,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
devid = btrfs_stripe_devid_nr(leaf, chunk, i);
+ args.devid = devid;
read_extent_buffer(leaf, uuid, (unsigned long)
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
- map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
- devid, uuid, NULL);
+ args.uuid = uuid;
+ map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!map->stripes[i].dev &&
!btrfs_test_opt(fs_info, DEGRADED)) {
free_extent_map(em);
@@ -7151,6 +7226,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
static int read_one_dev(struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
@@ -7159,11 +7235,13 @@ static int read_one_dev(struct extent_buffer *leaf,
u8 fs_uuid[BTRFS_FSID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
- devid = btrfs_device_id(leaf, dev_item);
+ devid = args.devid = btrfs_device_id(leaf, dev_item);
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
+ args.uuid = dev_uuid;
+ args.fsid = fs_uuid;
if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
fs_devices = open_seed_devices(fs_info, fs_uuid);
@@ -7171,8 +7249,7 @@ static int read_one_dev(struct extent_buffer *leaf,
return PTR_ERR(fs_devices);
}
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid);
+ device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_report_missing_device(fs_info, devid,
@@ -7236,7 +7313,7 @@ static int read_one_dev(struct extent_buffer *leaf,
fill_device_from_item(leaf, dev_item, device);
if (device->bdev) {
- u64 max_total_bytes = i_size_read(device->bdev->bd_inode);
+ u64 max_total_bytes = bdev_nr_bytes(device->bdev);
if (device->total_bytes > max_total_bytes) {
btrfs_err(fs_info,
@@ -7841,12 +7918,14 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_get_dev_stats *stats)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_device *dev;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
int i;
mutex_lock(&fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
+ args.devid = stats->devid;
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
@@ -7922,6 +8001,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
u64 chunk_offset, u64 devid,
u64 physical_offset, u64 physical_len)
{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
@@ -7977,7 +8057,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
/* Make sure no dev extent is beyond device boundary */
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!dev) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2183361db614..3b8130680749 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -236,17 +236,40 @@ struct btrfs_fs_devices {
bool fsid_change;
struct list_head fs_list;
+ /*
+ * Number of devices under this fsid, including missing and
+ * replace-target devices and excluding seed devices.
+ */
u64 num_devices;
+
+ /*
+ * The number of devices that were successfully opened, including the
+ * replace-target and excluding seed devices.
+ */
u64 open_devices;
+
+ /* The number of devices that are under the chunk allocation list. */
u64 rw_devices;
+
+ /* Count of missing devices under this fsid excluding seed device. */
u64 missing_devices;
u64 total_rw_bytes;
+
+ /*
+ * Count of devices from btrfs_super_block::num_devices for this fsid,
+ * which includes the seed device and excludes the transient
+ * replace-target device.
+ */
u64 total_devices;
/* Highest generation number of seen devices */
u64 latest_generation;
- struct block_device *latest_bdev;
+ /*
+ * The mount device, or the device with the highest generation after a
+ * removal or replace.
+ */
+ struct btrfs_device *latest_dev;
/* all of the devices in the FS, protected by a mutex
* so we can safely walk it to write out the supers without
@@ -300,48 +323,62 @@ struct btrfs_fs_devices {
/ sizeof(struct btrfs_stripe) + 1)
/*
- * we need the mirror number and stripe index to be passed around
- * the call chain while we are processing end_io (especially errors).
- * Really, what we need is a btrfs_bio structure that has this info
- * and is properly sized with its stripe array, but we're not there
- * quite yet. We have our own btrfs bioset, and all of the bios
- * we allocate are actually btrfs_io_bios. We'll cram as much of
- * struct btrfs_bio as we can into this over time.
+ * Additional info to pass along with a bio.
+ *
+ * Mostly for btrfs-specific features like csum and mirror_num.
*/
-struct btrfs_io_bio {
+struct btrfs_bio {
unsigned int mirror_num;
+
+ /* @device is for stripe IO submission. */
struct btrfs_device *device;
- u64 logical;
u8 *csum;
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
struct bvec_iter iter;
+
/*
* This member must come last, bio_alloc_bioset will allocate enough
- * bytes for entire btrfs_io_bio but relies on bio being last.
+ * bytes for entire btrfs_bio but relies on bio being last.
*/
struct bio bio;
};
-static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
+static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
{
- return container_of(bio, struct btrfs_io_bio, bio);
+ return container_of(bio, struct btrfs_bio, bio);
}
-static inline void btrfs_io_bio_free_csum(struct btrfs_io_bio *io_bio)
+static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
{
- if (io_bio->csum != io_bio->csum_inline) {
- kfree(io_bio->csum);
- io_bio->csum = NULL;
+ if (bbio->csum != bbio->csum_inline) {
+ kfree(bbio->csum);
+ bbio->csum = NULL;
}
}
-struct btrfs_bio_stripe {
+struct btrfs_io_stripe {
struct btrfs_device *dev;
u64 physical;
u64 length; /* only used for discard mappings */
};
-struct btrfs_bio {
+/*
+ * Context for IO submission for device stripe.
+ *
+ * - Track the unfinished mirrors for mirror based profiles
+ * Mirror based profiles are SINGLE/DUP/RAID1/RAID10.
+ *
+ * - Contain the logical -> physical mapping info
+ * Used by submit_stripe_bio() for mapping logical bio
+ * into physical device address.
+ *
+ * - Contain device replace info
+ * Used by handle_ops_on_dev_replace() to copy logical bios
+ * into the new device.
+ *
+ * - Contain RAID56 full stripe logical bytenrs
+ */
+struct btrfs_io_context {
refcount_t refs;
atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
@@ -361,7 +398,7 @@ struct btrfs_bio {
* so raid_map[0] is the start of our full stripe
*/
u64 *raid_map;
- struct btrfs_bio_stripe stripes[];
+ struct btrfs_io_stripe stripes[];
};
struct btrfs_device_info {
@@ -396,11 +433,11 @@ struct map_lookup {
int num_stripes;
int sub_stripes;
int verified_stripes; /* For mount time dev extent verification */
- struct btrfs_bio_stripe stripes[];
+ struct btrfs_io_stripe stripes[];
};
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
- (sizeof(struct btrfs_bio_stripe) * (n)))
+ (sizeof(struct btrfs_io_stripe) * (n)))
struct btrfs_balance_args;
struct btrfs_balance_progress;
@@ -414,6 +451,22 @@ struct btrfs_balance_control {
struct btrfs_balance_progress stat;
};
+/*
+ * Search for a given device by the set parameters
+ */
+struct btrfs_dev_lookup_args {
+ u64 devid;
+ u8 *uuid;
+ u8 *fsid;
+ bool missing;
+};
+
+/* We have to initialize to -1 because BTRFS_DEV_REPLACE_DEVID is 0 */
+#define BTRFS_DEV_LOOKUP_ARGS_INIT { .devid = (u64)-1 }
+
+#define BTRFS_DEV_LOOKUP_ARGS(name) \
+ struct btrfs_dev_lookup_args name = BTRFS_DEV_LOOKUP_ARGS_INIT
+
enum btrfs_map_op {
BTRFS_MAP_READ,
BTRFS_MAP_WRITE,
@@ -437,20 +490,20 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio)
}
}
-void btrfs_get_bbio(struct btrfs_bio *bbio);
-void btrfs_put_bbio(struct btrfs_bio *bbio);
+void btrfs_get_bioc(struct btrfs_io_context *bioc);
+void btrfs_put_bioc(struct btrfs_io_context *bioc);
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret, int mirror_num);
+ struct btrfs_io_context **bioc_ret, int mirror_num);
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret);
+ struct btrfs_io_context **bioc_ret);
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
enum btrfs_map_op op, u64 logical,
struct btrfs_io_geometry *io_geom);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
-struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type);
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
@@ -467,19 +520,23 @@ void btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info,
u64 devid,
const char *devpath);
+int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ const char *path);
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
const u8 *uuid);
+void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
void btrfs_free_device(struct btrfs_device *device);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
- const char *device_path, u64 devid,
+ struct btrfs_dev_lookup_args *args,
struct block_device **bdev, fmode_t *mode);
void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, u8 *uuid, u8 *fsid);
+struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
+ const struct btrfs_dev_lookup_args *args);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
int btrfs_balance(struct btrfs_fs_info *fs_info,
@@ -493,7 +550,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_uuid_scan_kthread(void *data);
-int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset);
+bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 8a4514283a4b..2837b4c8424d 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -138,7 +138,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
* matches our target xattr, so lets check.
*/
ret = 0;
- btrfs_assert_tree_locked(path->nodes[0]);
+ btrfs_assert_tree_write_locked(path->nodes[0]);
di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
if (!di && !(flags & XATTR_REPLACE)) {
ret = -ENOSPC;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 47af1ab3bf12..67d932d70798 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -4,6 +4,7 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
+#include <linux/atomic.h>
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
@@ -39,12 +40,30 @@
#define BTRFS_NR_SB_LOG_ZONES 2
/*
+ * Minimum number of active zones we need:
+ *
+ * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
+ * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
+ * - 1 zone for tree-log dedicated block group
+ * - 1 zone for relocation
+ */
+#define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5)
+
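Editor's note on the arithmetic, assuming the current value of BTRFS_SUPER_MIRROR_MAX (3):

/* 3 superblock mirror zones + 3 (SYSTEM/META/DATA) + 1 tree-log + 1 relocation = 8 */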
+/*
* Maximum supported zone size. Currently, SMR disks have a zone size of
* 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
* expect the zone size to become larger than 8GiB in the near future.
*/
#define BTRFS_MAX_ZONE_SIZE SZ_8G
+#define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
+
+static inline bool sb_zone_is_full(const struct blk_zone *zone)
+{
+ return (zone->cond == BLK_ZONE_COND_FULL) ||
+ (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
+}
+
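A worked example (editor's addition) for the helper above; with the 4 KiB BTRFS_SUPER_INFO_SIZE, SUPER_INFO_SECTORS is 8:

/*
 * zone->start = 0, zone->capacity = 0x10000, zone->wp = 0xfffc:
 * wp + 8 = 0x10004 > 0x10000, so the zone is treated as full for
 * superblock logging even though zone->cond != BLK_ZONE_COND_FULL.
 * With wp = 0xfff8 the last superblock copy still fits exactly and
 * the zone is not yet full.
 */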
static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
struct blk_zone *zones = data;
@@ -60,14 +79,13 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
bool empty[BTRFS_NR_SB_LOG_ZONES];
bool full[BTRFS_NR_SB_LOG_ZONES];
sector_t sector;
+ int i;
- ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
- zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);
-
- empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
- empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
- full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
- full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
+ empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
+ full[i] = sb_zone_is_full(&zones[i]);
+ }
/*
* Possible states of log buffer zones
@@ -296,6 +314,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_zoned_device_info *zone_info = NULL;
struct block_device *bdev = device->bdev;
+ struct request_queue *queue = bdev_get_queue(bdev);
+ unsigned int max_active_zones;
+ unsigned int nactive;
sector_t nr_sectors;
sector_t sector = 0;
struct blk_zone *zones = NULL;
@@ -351,6 +372,17 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
+ max_active_zones = queue_max_active_zones(queue);
+ if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
+ btrfs_err_in_rcu(fs_info,
+"zoned: %s: max active zones %u is too small, need at least %u active zones",
+ rcu_str_deref(device->name), max_active_zones,
+ BTRFS_MIN_ACTIVE_ZONES);
+ ret = -EINVAL;
+ goto out;
+ }
+ zone_info->max_active_zones = max_active_zones;
+
zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
if (!zone_info->seq_zones) {
ret = -ENOMEM;
@@ -363,6 +395,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
goto out;
}
+ zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->active_zones) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
if (!zones) {
ret = -ENOMEM;
@@ -370,6 +408,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
}
/* Get zones type */
+ nactive = 0;
while (sector < nr_sectors) {
nr_zones = BTRFS_REPORT_NR_ZONES;
ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
@@ -380,8 +419,17 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
for (i = 0; i < nr_zones; i++) {
if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
__set_bit(nreported, zone_info->seq_zones);
- if (zones[i].cond == BLK_ZONE_COND_EMPTY)
+ switch (zones[i].cond) {
+ case BLK_ZONE_COND_EMPTY:
__set_bit(nreported, zone_info->empty_zones);
+ break;
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ __set_bit(nreported, zone_info->active_zones);
+ nactive++;
+ break;
+ }
nreported++;
}
sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
@@ -396,6 +444,19 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
goto out;
}
+ if (max_active_zones) {
+ if (nactive > max_active_zones) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: %u active zones on %s exceeds max_active_zones %u",
+ nactive, rcu_str_deref(device->name),
+ max_active_zones);
+ ret = -EIO;
+ goto out;
+ }
+ atomic_set(&zone_info->active_zones_left,
+ max_active_zones - nactive);
+ }
+
/* Validate superblock log */
nr_zones = BTRFS_NR_SB_LOG_ZONES;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -478,6 +539,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
out:
kfree(zones);
out_free_zone_info:
+ bitmap_free(zone_info->active_zones);
bitmap_free(zone_info->empty_zones);
bitmap_free(zone_info->seq_zones);
kfree(zone_info);
@@ -493,6 +555,7 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
if (!zone_info)
return;
+ bitmap_free(zone_info->active_zones);
bitmap_free(zone_info->seq_zones);
bitmap_free(zone_info->empty_zones);
kfree(zone_info);
@@ -585,7 +648,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
/*
* stripe_size is always aligned to BTRFS_STRIPE_LEN in
- * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
+ * btrfs_create_chunk(). Since we want stripe_len == zone_size,
* check the alignment here.
*/
if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
@@ -664,7 +727,7 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
reset = &zones[1];
if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
- ASSERT(reset->cond == BLK_ZONE_COND_FULL);
+ ASSERT(sb_zone_is_full(reset));
ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
reset->start, reset->len,
@@ -676,9 +739,20 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
reset->wp = reset->start;
}
} else if (ret != -ENOENT) {
- /* For READ, we want the precious one */
+ /*
+ * For READ, we want the previous one. Move write pointer to
+ * the end of a zone, if it is at the head of a zone.
+ */
+ u64 zone_end = 0;
+
if (wp == zones[0].start << SECTOR_SHIFT)
- wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
+ zone_end = zones[1].start + zones[1].capacity;
+ else if (wp == zones[1].start << SECTOR_SHIFT)
+ zone_end = zones[0].start + zones[0].capacity;
+ if (zone_end)
+ wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
+ BTRFS_SUPER_INFO_SIZE);
+
wp -= BTRFS_SUPER_INFO_SIZE;
}
@@ -771,36 +845,56 @@ static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
return true;
}
-void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
{
struct btrfs_zoned_device_info *zinfo = device->zone_info;
struct blk_zone *zone;
+ int i;
if (!is_sb_log_zone(zinfo, mirror))
- return;
+ return 0;
zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
- if (zone->cond != BLK_ZONE_COND_FULL) {
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ /* Advance the next zone */
+ if (zone->cond == BLK_ZONE_COND_FULL) {
+ zone++;
+ continue;
+ }
+
if (zone->cond == BLK_ZONE_COND_EMPTY)
zone->cond = BLK_ZONE_COND_IMP_OPEN;
- zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
+ zone->wp += SUPER_INFO_SECTORS;
+
+ if (sb_zone_is_full(zone)) {
+ /*
+ * No room left to write new superblock. Since
+ * superblock is written with REQ_SYNC, it is safe to
+ * finish the zone now.
+ *
+ * If the write pointer is exactly at the capacity,
+ * explicit ZONE_FINISH is not necessary.
+ */
+ if (zone->wp != zone->start + zone->capacity) {
+ int ret;
+
+ ret = blkdev_zone_mgmt(device->bdev,
+ REQ_OP_ZONE_FINISH, zone->start,
+ zone->len, GFP_NOFS);
+ if (ret)
+ return ret;
+ }
- if (zone->wp == zone->start + zone->len)
+ zone->wp = zone->start + zone->len;
zone->cond = BLK_ZONE_COND_FULL;
-
- return;
+ }
+ return 0;
}
- zone++;
- ASSERT(zone->cond != BLK_ZONE_COND_FULL);
- if (zone->cond == BLK_ZONE_COND_EMPTY)
- zone->cond = BLK_ZONE_COND_IMP_OPEN;
-
- zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
-
- if (zone->wp == zone->start + zone->len)
- zone->cond = BLK_ZONE_COND_FULL;
+ /* All the zones are FULL. Should not reach here. */
+ ASSERT(0);
+ return -EIO;
}
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
@@ -895,6 +989,41 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
return pos;
}
+static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+ unsigned int zno = (pos >> zone_info->zone_size_shift);
+
+ /* We can use any number of zones */
+ if (zone_info->max_active_zones == 0)
+ return true;
+
+ if (!test_bit(zno, zone_info->active_zones)) {
+ /* Active zone left? */
+ if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
+ return false;
+ if (test_and_set_bit(zno, zone_info->active_zones)) {
+ /* Someone already set the bit */
+ atomic_inc(&zone_info->active_zones_left);
+ }
+ }
+
+ return true;
+}
+
+static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+ unsigned int zno = (pos >> zone_info->zone_size_shift);
+
+ /* We can use any number of zones */
+ if (zone_info->max_active_zones == 0)
+ return;
+
+ if (test_and_clear_bit(zno, zone_info->active_zones))
+ atomic_inc(&zone_info->active_zones_left);
+}
+
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
u64 length, u64 *bytes)
{
@@ -910,6 +1039,7 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
*bytes = length;
while (length) {
btrfs_dev_set_zone_empty(device, physical);
+ btrfs_dev_clear_active_zone(device, physical);
physical += device->zone_info->zone_size;
length -= device->zone_info->zone_size;
}
@@ -1039,6 +1169,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
int i;
unsigned int nofs_flag;
u64 *alloc_offsets = NULL;
+ u64 *caps = NULL;
+ unsigned long *active = NULL;
u64 last_alloc = 0;
u32 num_sequential = 0, num_conventional = 0;
@@ -1063,10 +1195,28 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
map = em->map_lookup;
+ cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
+ if (!cache->physical_map) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
if (!alloc_offsets) {
- free_extent_map(em);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
+ if (!caps) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
+ if (!active) {
+ ret = -ENOMEM;
+ goto out;
}
for (i = 0; i < map->num_stripes; i++) {
@@ -1131,6 +1281,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
+ caps[i] = (zone.capacity << SECTOR_SHIFT);
+
switch (zone.cond) {
case BLK_ZONE_COND_OFFLINE:
case BLK_ZONE_COND_READONLY:
@@ -1144,14 +1296,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
alloc_offsets[i] = 0;
break;
case BLK_ZONE_COND_FULL:
- alloc_offsets[i] = fs_info->zone_size;
+ alloc_offsets[i] = caps[i];
break;
default:
/* Partially used zone */
alloc_offsets[i] =
((zone.wp - zone.start) << SECTOR_SHIFT);
+ __set_bit(i, active);
break;
}
+
+ /*
+ * Consider a zone as active if we can allow any number of
+ * active zones.
+ */
+ if (!device->zone_info->max_active_zones)
+ __set_bit(i, active);
}
if (num_sequential > 0)
@@ -1169,6 +1329,9 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
* calculate_alloc_pointer() which takes extent buffer
* locks to avoid deadlock.
*/
+
+ /* Zone capacity is always zone size in emulation */
+ cache->zone_capacity = cache->length;
if (new) {
cache->alloc_offset = 0;
goto out;
@@ -1195,6 +1358,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
cache->alloc_offset = alloc_offsets[0];
+ cache->zone_capacity = caps[0];
+ cache->zone_is_active = test_bit(0, active);
break;
case BTRFS_BLOCK_GROUP_DUP:
case BTRFS_BLOCK_GROUP_RAID1:
@@ -1210,6 +1375,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
+ if (cache->zone_is_active) {
+ btrfs_get_block_group(cache);
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ }
+
out:
if (cache->alloc_offset > fs_info->zone_size) {
btrfs_err(fs_info,
@@ -1218,6 +1390,14 @@ out:
ret = -EIO;
}
+ if (cache->alloc_offset > cache->zone_capacity) {
+ btrfs_err(fs_info,
+"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
+ cache->alloc_offset, cache->zone_capacity,
+ cache->start);
+ ret = -EIO;
+ }
+
/* An extent is allocated after the write pointer */
if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
btrfs_err(fs_info,
@@ -1229,6 +1409,12 @@ out:
if (!ret)
cache->meta_write_pointer = cache->alloc_offset + cache->start;
+ if (ret) {
+ kfree(cache->physical_map);
+ cache->physical_map = NULL;
+ }
+ bitmap_free(active);
+ kfree(caps);
kfree(alloc_offsets);
free_extent_map(em);
@@ -1243,17 +1429,15 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
return;
WARN_ON(cache->bytes_super != 0);
- unusable = cache->alloc_offset - cache->used;
- free = cache->length - cache->alloc_offset;
+ unusable = (cache->alloc_offset - cache->used) +
+ (cache->length - cache->zone_capacity);
+ free = cache->zone_capacity - cache->alloc_offset;
/* We only need ->free_space in ALLOC_SEQ block groups */
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
cache->free_space_ctl->free_space = free;
cache->zone_unusable = unusable;
-
- /* Should not have any excluded extents. Just in case, though */
- btrfs_free_excluded_extents(cache);
}
void btrfs_redirty_list_add(struct btrfs_transaction *trans,
@@ -1304,6 +1488,17 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
if (!is_data_inode(&inode->vfs_inode))
return false;
+ /*
+ * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
+ * extent layout the relocation code has.
+ * Furthermore we have set aside our own block group from which only the
+ * relocation "process" can allocate, and we make sure only one process
+ * at a time can add pages to an extent that gets relocated, so it's
+ * safe to use regular REQ_OP_WRITE for this special case.
+ */
+ if (btrfs_is_data_reloc_root(inode->root))
+ return false;
+
cache = btrfs_lookup_block_group(fs_info, start);
ASSERT(cache);
if (!cache)
@@ -1440,27 +1635,27 @@ int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 len
static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
struct blk_zone *zone)
{
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
u64 mapped_length = PAGE_SIZE;
unsigned int nofs_flag;
int nmirrors;
int i, ret;
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &mapped_length, &bbio);
- if (ret || !bbio || mapped_length < PAGE_SIZE) {
- btrfs_put_bbio(bbio);
+ &mapped_length, &bioc);
+ if (ret || !bioc || mapped_length < PAGE_SIZE) {
+ btrfs_put_bioc(bioc);
return -EIO;
}
- if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
return -EINVAL;
nofs_flag = memalloc_nofs_save();
- nmirrors = (int)bbio->num_stripes;
+ nmirrors = (int)bioc->num_stripes;
for (i = 0; i < nmirrors; i++) {
- u64 physical = bbio->stripes[i].physical;
- struct btrfs_device *dev = bbio->stripes[i].dev;
+ u64 physical = bioc->stripes[i].physical;
+ struct btrfs_device *dev = bioc->stripes[i].dev;
/* Missing device */
if (!dev->bdev)
@@ -1530,3 +1725,251 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
return device;
}
+
+/**
+ * btrfs_zone_activate - activate a block group and its underlying device zones
+ * @block_group: the block group to activate
+ *
+ * Return: true on success, false otherwise
+ */
+bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 physical;
+ bool ret;
+
+ if (!btrfs_is_zoned(block_group->fs_info))
+ return true;
+
+ map = block_group->physical_map;
+ /* Currently support SINGLE profile only */
+ ASSERT(map->num_stripes == 1);
+ device = map->stripes[0].dev;
+ physical = map->stripes[0].physical;
+
+ if (device->zone_info->max_active_zones == 0)
+ return true;
+
+ spin_lock(&block_group->lock);
+
+ if (block_group->zone_is_active) {
+ ret = true;
+ goto out_unlock;
+ }
+
+ /* No space left */
+ if (block_group->alloc_offset == block_group->zone_capacity) {
+ ret = false;
+ goto out_unlock;
+ }
+
+ if (!btrfs_dev_set_active_zone(device, physical)) {
+ /* Cannot activate the zone */
+ ret = false;
+ goto out_unlock;
+ }
+
+ /* Successfully activated all the zones */
+ block_group->zone_is_active = 1;
+
+ spin_unlock(&block_group->lock);
+
+ /* For the active block group list */
+ btrfs_get_block_group(block_group);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ASSERT(list_empty(&block_group->active_bg_list));
+ list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ return true;
+
+out_unlock:
+ spin_unlock(&block_group->lock);
+ return ret;
+}
+
+int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 physical;
+ int ret = 0;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ map = block_group->physical_map;
+ /* Currently support SINGLE profile only */
+ ASSERT(map->num_stripes == 1);
+
+ device = map->stripes[0].dev;
+ physical = map->stripes[0].physical;
+
+ if (device->zone_info->max_active_zones == 0)
+ return 0;
+
+ spin_lock(&block_group->lock);
+ if (!block_group->zone_is_active) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+
+ /* Check if we have unwritten allocated space */
+ if ((block_group->flags &
+ (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
+ block_group->alloc_offset > block_group->meta_write_pointer) {
+ spin_unlock(&block_group->lock);
+ return -EAGAIN;
+ }
+ spin_unlock(&block_group->lock);
+
+ ret = btrfs_inc_block_group_ro(block_group, false);
+ if (ret)
+ return ret;
+
+ /* Ensure all writes in this block group finish */
+ btrfs_wait_block_group_reservations(block_group);
+ /* No need to wait for NOCOW writers. Zoned mode does not allow that. */
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
+ block_group->length);
+
+ spin_lock(&block_group->lock);
+
+ /*
+ * Bail out if someone already deactivated the block group, or if
+ * there is still allocated space left in the block group.
+ */
+ if (!block_group->zone_is_active) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return 0;
+ }
+
+ if (block_group->reserved) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return -EAGAIN;
+ }
+
+ block_group->zone_is_active = 0;
+ block_group->alloc_offset = block_group->zone_capacity;
+ block_group->free_space_ctl->free_space = 0;
+ btrfs_clear_treelog_bg(block_group);
+ spin_unlock(&block_group->lock);
+
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ device->zone_info->zone_size >> SECTOR_SHIFT,
+ GFP_NOFS);
+ btrfs_dec_block_group_ro(block_group);
+
+ if (!ret) {
+ btrfs_dev_clear_active_zone(device, physical);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ASSERT(!list_empty(&block_group->active_bg_list));
+ list_del_init(&block_group->active_bg_list);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ /* For active_bg_list */
+ btrfs_put_block_group(block_group);
+ }
+
+ return ret;
+}
+
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index)
+{
+ struct btrfs_device *device;
+ bool ret = false;
+
+ if (!btrfs_is_zoned(fs_devices->fs_info))
+ return true;
+
+ /* Non-single profiles are not supported yet */
+ if (raid_index != BTRFS_RAID_SINGLE)
+ return false;
+
+ /* Check if there is a device with active zones left */
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+
+ if (!device->bdev)
+ continue;
+
+ if (!zinfo->max_active_zones ||
+ atomic_read(&zinfo->active_zones_left)) {
+ ret = true;
+ break;
+ }
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ return ret;
+}
+
+void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
+{
+ struct btrfs_block_group *block_group;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 physical;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ block_group = btrfs_lookup_block_group(fs_info, logical);
+ ASSERT(block_group);
+
+ if (logical + length < block_group->start + block_group->zone_capacity)
+ goto out;
+
+ spin_lock(&block_group->lock);
+
+ if (!block_group->zone_is_active) {
+ spin_unlock(&block_group->lock);
+ goto out;
+ }
+
+ block_group->zone_is_active = 0;
+ /* We should have consumed all the free space */
+ ASSERT(block_group->alloc_offset == block_group->zone_capacity);
+ ASSERT(block_group->free_space_ctl->free_space == 0);
+ btrfs_clear_treelog_bg(block_group);
+ spin_unlock(&block_group->lock);
+
+ map = block_group->physical_map;
+ device = map->stripes[0].dev;
+ physical = map->stripes[0].physical;
+
+ if (!device->zone_info->max_active_zones)
+ goto out;
+
+ btrfs_dev_clear_active_zone(device, physical);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ASSERT(!list_empty(&block_group->active_bg_list));
+ list_del_init(&block_group->active_bg_list);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ btrfs_put_block_group(block_group);
+
+out:
+ btrfs_put_block_group(block_group);
+}
+
+void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg == bg->start)
+ fs_info->data_reloc_bg = 0;
+ spin_unlock(&fs_info->relocation_bg_lock);
+}
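The max_active_zones accounting added in this file (btrfs_dev_set_active_zone() / btrfs_dev_clear_active_zone()) reserves one slot from an atomic budget and records the zone in a bitmap, handing the slot back whenever a concurrent caller wins the race. Below is a minimal userspace sketch of that reservation pattern, assuming hypothetical names and C11 atomics in place of the kernel's atomic_t and bitmap helpers; it is an illustration, not the btrfs implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES 8

static atomic_int active_left = 4;	/* remaining activations (max_active_zones) */
static atomic_ulong active_map;		/* one bit per zone, like zinfo->active_zones */

static bool zone_try_activate(unsigned int zno)
{
	unsigned long bit = 1UL << zno;

	if (atomic_load(&active_map) & bit)	/* already counted as active */
		return true;

	/* Take one slot from the budget; put it back if none were left. */
	if (atomic_fetch_sub(&active_left, 1) <= 0) {
		atomic_fetch_add(&active_left, 1);
		return false;
	}

	/* Mark the zone active; if another thread beat us, return the slot. */
	if (atomic_fetch_or(&active_map, bit) & bit)
		atomic_fetch_add(&active_left, 1);

	return true;
}

static void zone_deactivate(unsigned int zno)
{
	unsigned long bit = 1UL << zno;

	/* Only return a slot if the zone was actually marked active. */
	if (atomic_fetch_and(&active_map, ~bit) & bit)
		atomic_fetch_add(&active_left, 1);
}

int main(void)
{
	for (unsigned int i = 0; i < NR_ZONES; i++)
		printf("zone %u -> %s\n", i,
		       zone_try_activate(i) ? "active" : "refused");

	zone_deactivate(0);
	printf("zone 7 after a deactivation -> %s\n",
	       zone_try_activate(7) ? "active" : "refused");
	return 0;
}

One difference worth noting: the kernel uses atomic_dec_if_positive(), which never lets the counter drop below zero, while the sketch compensates by handing the slot back when the old value was already zero.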
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 4b299705bb12..e53ab7b96437 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -23,8 +23,11 @@ struct btrfs_zoned_device_info {
u64 zone_size;
u8 zone_size_shift;
u32 nr_zones;
+ unsigned int max_active_zones;
+ atomic_t active_zones_left;
unsigned long *seq_zones;
unsigned long *empty_zones;
+ unsigned long *active_zones;
struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
};
@@ -40,7 +43,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
u64 *bytenr_ret);
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
u64 *bytenr_ret);
-void btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
+int btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror);
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
u64 hole_end, u64 num_bytes);
@@ -66,6 +69,13 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
u64 physical_start, u64 physical_pos);
struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
u64 logical, u64 length);
+bool btrfs_zone_activate(struct btrfs_block_group *block_group);
+int btrfs_zone_finish(struct btrfs_block_group *block_group);
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+ int raid_index);
+void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length);
+void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@@ -113,8 +123,10 @@ static inline int btrfs_sb_log_location(struct btrfs_device *device, int mirror,
return 0;
}
-static inline void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
-{ }
+static inline int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+{
+ return 0;
+}
static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
@@ -199,6 +211,27 @@ static inline struct btrfs_device *btrfs_zoned_get_device(
return ERR_PTR(-EOPNOTSUPP);
}
+static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+{
+ return true;
+}
+
+static inline int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+ return 0;
+}
+
+static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+ int raid_index)
+{
+ return true;
+}
+
+static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length) { }
+
+static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
+
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
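With the per-block-group zone capacity introduced in the zoned.c hunk above, btrfs_calc_zone_unusable() counts the bytes between the capacity and the zone length as unusable and measures free space against the capacity instead of the full length. A small standalone sketch of that arithmetic, with made-up numbers (illustration only):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t length        = 256ULL << 20;	/* block group length (zone size) */
	uint64_t zone_capacity = 200ULL << 20;	/* writable bytes reported by the device */
	uint64_t alloc_offset  = 150ULL << 20;	/* write pointer inside the zone */
	uint64_t used          = 120ULL << 20;	/* live data below the write pointer */

	/*
	 * Bytes written and later freed cannot be rewritten until the zone
	 * is reset, and bytes past the capacity cannot be written at all,
	 * so both count as unusable.
	 */
	uint64_t unusable   = (alloc_offset - used) + (length - zone_capacity);
	uint64_t free_space = zone_capacity - alloc_offset;

	printf("free %llu MiB, unusable %llu MiB\n",
	       (unsigned long long)(free_space >> 20),
	       (unsigned long long)(unusable >> 20));
	return 0;
}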
diff --git a/fs/buffer.c b/fs/buffer.c
index c615387aedca..46bc589b7a03 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -878,7 +878,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
{
sector_t retval = ~((sector_t)0);
- loff_t sz = i_size_read(bdev->bd_inode);
+ loff_t sz = bdev_nr_bytes(bdev);
if (sz) {
unsigned int sizebits = blksize_bits(size);
@@ -897,7 +897,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
struct buffer_head *head = page_buffers(page);
struct buffer_head *bh = head;
int uptodate = PageUptodate(page);
- sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
+ sector_t end_block = blkdev_max_block(bdev, size);
do {
if (!buffer_mapped(bh)) {
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index fac2e8e7b533..effe37ef8629 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -37,11 +37,11 @@ static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki)
/*
* Handle completion of a read from the cache.
*/
-static void cachefiles_read_complete(struct kiocb *iocb, long ret, long ret2)
+static void cachefiles_read_complete(struct kiocb *iocb, long ret)
{
struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
- _enter("%ld,%ld", ret, ret2);
+ _enter("%ld", ret);
if (ki->term_func) {
if (ret >= 0)
@@ -139,7 +139,7 @@ static int cachefiles_read(struct netfs_cache_resources *cres,
fallthrough;
default:
ki->was_async = false;
- cachefiles_read_complete(&ki->iocb, ret, 0);
+ cachefiles_read_complete(&ki->iocb, ret);
if (ret > 0)
ret = 0;
break;
@@ -159,12 +159,12 @@ presubmission_error:
/*
* Handle completion of a write to the cache.
*/
-static void cachefiles_write_complete(struct kiocb *iocb, long ret, long ret2)
+static void cachefiles_write_complete(struct kiocb *iocb, long ret)
{
struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
struct inode *inode = file_inode(ki->iocb.ki_filp);
- _enter("%ld,%ld", ret, ret2);
+ _enter("%ld", ret);
/* Tell lockdep we inherited freeze protection from submission thread */
__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
@@ -244,7 +244,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
fallthrough;
default:
ki->was_async = false;
- cachefiles_write_complete(&ki->iocb, ret, 0);
+ cachefiles_write_complete(&ki->iocb, ret);
if (ret > 0)
ret = 0;
break;
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 8ffc40e84a59..fcf4f3b72923 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -25,20 +25,20 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
struct cachefiles_object *object;
struct fscache_retrieval *op = monitor->op;
struct wait_page_key *key = _key;
- struct page *page = wait->private;
+ struct folio *folio = wait->private;
ASSERT(key);
_enter("{%lu},%u,%d,{%p,%u}",
monitor->netfs_page->index, mode, sync,
- key->page, key->bit_nr);
+ key->folio, key->bit_nr);
- if (key->page != page || key->bit_nr != PG_locked)
+ if (key->folio != folio || key->bit_nr != PG_locked)
return 0;
- _debug("--- monitor %p %lx ---", page, page->flags);
+ _debug("--- monitor %p %lx ---", folio, folio->flags);
- if (!PageUptodate(page) && !PageError(page)) {
+ if (!folio_test_uptodate(folio) && !folio_test_error(folio)) {
/* unlocked, not uptodate and not erronous? */
_debug("page probably truncated");
}
@@ -107,7 +107,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
put_page(backpage2);
INIT_LIST_HEAD(&monitor->op_link);
- add_page_wait_queue(backpage, &monitor->monitor);
+ folio_add_wait_queue(page_folio(backpage), &monitor->monitor);
if (trylock_page(backpage)) {
ret = -EIO;
@@ -294,7 +294,7 @@ monitor_backing_page:
get_page(backpage);
monitor->back_page = backpage;
monitor->monitor.private = backpage;
- add_page_wait_queue(backpage, &monitor->monitor);
+ folio_add_wait_queue(page_folio(backpage), &monitor->monitor);
monitor = NULL;
/* but the page may have been read before the monitor was installed, so
@@ -548,7 +548,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
get_page(backpage);
monitor->back_page = backpage;
monitor->monitor.private = backpage;
- add_page_wait_queue(backpage, &monitor->monitor);
+ folio_add_wait_queue(page_folio(backpage), &monitor->monitor);
monitor = NULL;
/* but the page may have been read before the monitor was
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index b39aebc2ed95..e53c8541f5b2 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -63,7 +63,7 @@
(CONGESTION_ON_THRESH(congestion_kb) >> 2))
static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
- struct page *page, void **_fsdata);
+ struct folio *folio, void **_fsdata);
static inline struct ceph_snap_context *page_snap_context(struct page *page)
{
@@ -317,13 +317,14 @@ static const struct netfs_read_request_ops ceph_netfs_read_ops = {
};
/* read a single page, without unlocking it. */
-static int ceph_readpage(struct file *file, struct page *page)
+static int ceph_readpage(struct file *file, struct page *subpage)
{
+ struct folio *folio = page_folio(subpage);
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_vino vino = ceph_vino(inode);
- u64 off = page_offset(page);
- u64 len = thp_size(page);
+ size_t len = folio_size(folio);
+ u64 off = folio_file_pos(folio);
if (ci->i_inline_version != CEPH_INLINE_NONE) {
/*
@@ -331,19 +332,19 @@ static int ceph_readpage(struct file *file, struct page *page)
* into page cache while getting Fcr caps.
*/
if (off == 0) {
- unlock_page(page);
+ folio_unlock(folio);
return -EINVAL;
}
- zero_user_segment(page, 0, thp_size(page));
- SetPageUptodate(page);
- unlock_page(page);
+ zero_user_segment(&folio->page, 0, folio_size(folio));
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return 0;
}
- dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n",
- vino.ino, vino.snap, file, off, len, page, page->index);
+ dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n",
+ vino.ino, vino.snap, file, off, len, folio, folio_index(folio));
- return netfs_readpage(file, page, &ceph_netfs_read_ops, NULL);
+ return netfs_readpage(file, folio, &ceph_netfs_read_ops, NULL);
}
static void ceph_readahead(struct readahead_control *ractl)
@@ -1187,18 +1188,18 @@ ceph_find_incompatible(struct page *page)
}
static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
- struct page *page, void **_fsdata)
+ struct folio *folio, void **_fsdata)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_snap_context *snapc;
- snapc = ceph_find_incompatible(page);
+ snapc = ceph_find_incompatible(folio_page(folio, 0));
if (snapc) {
int r;
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
if (IS_ERR(snapc))
return PTR_ERR(snapc);
@@ -1216,12 +1217,12 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned
* clean, or already dirty within the same snap context.
*/
static int ceph_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len, unsigned aop_flags,
struct page **pagep, void **fsdata)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct page *page = NULL;
+ struct folio *folio = NULL;
pgoff_t index = pos >> PAGE_SHIFT;
int r;
@@ -1230,39 +1231,43 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
* for inline_version sent to the MDS.
*/
if (ci->i_inline_version != CEPH_INLINE_NONE) {
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page)
+ unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
+ if (aop_flags & AOP_FLAG_NOFS)
+ fgp_flags |= FGP_NOFS;
+ folio = __filemap_get_folio(mapping, index, fgp_flags,
+ mapping_gfp_mask(mapping));
+ if (!folio)
return -ENOMEM;
/*
* The inline_version on a new inode is set to 1. If that's the
- * case, then the page is brand new and isn't yet Uptodate.
+ * case, then the folio is brand new and isn't yet Uptodate.
*/
r = 0;
if (index == 0 && ci->i_inline_version != 1) {
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n",
ci->i_inline_version);
r = -EINVAL;
}
goto out;
}
- zero_user_segment(page, 0, thp_size(page));
- SetPageUptodate(page);
+ zero_user_segment(&folio->page, 0, folio_size(folio));
+ folio_mark_uptodate(folio);
goto out;
}
- r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &page, NULL,
+ r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL,
&ceph_netfs_read_ops, NULL);
out:
if (r == 0)
- wait_on_page_fscache(page);
+ folio_wait_fscache(folio);
if (r < 0) {
- if (page)
- put_page(page);
+ if (folio)
+ folio_put(folio);
} else {
- WARN_ON_ONCE(!PageLocked(page));
- *pagep = page;
+ WARN_ON_ONCE(!folio_test_locked(folio));
+ *pagep = &folio->page;
}
return r;
}
@@ -1273,32 +1278,33 @@ out:
*/
static int ceph_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct page *subpage, void *fsdata)
{
+ struct folio *folio = page_folio(subpage);
struct inode *inode = file_inode(file);
bool check_cap = false;
- dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
- inode, page, (int)pos, (int)copied, (int)len);
+ dout("write_end file %p inode %p folio %p %d~%d (%d)\n", file,
+ inode, folio, (int)pos, (int)copied, (int)len);
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
/* just return that nothing was copied on a short copy */
if (copied < len) {
copied = 0;
goto out;
}
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
/* did file size increase? */
if (pos+copied > i_size_read(inode))
check_cap = ceph_inode_set_size(inode, pos+copied);
- set_page_dirty(page);
+ folio_mark_dirty(folio);
out:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
if (check_cap)
ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 220a41831b46..02a0a0fd9ccd 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1027,7 +1027,7 @@ static void ceph_aio_complete(struct inode *inode,
ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
CEPH_CAP_FILE_RD));
- aio_req->iocb->ki_complete(aio_req->iocb, ret, 0);
+ aio_req->iocb->ki_complete(aio_req->iocb, ret);
ceph_free_cap_flush(aio_req->prealloc_cf);
kfree(aio_req);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 74c227d9abf5..d1f154aec249 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -305,9 +305,6 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
- /* No mandatory locks */
- if (fl->fl_type & LOCK_MAND)
- return -EOPNOTSUPP;
if (ceph_inode_is_shutdown(inode))
return -ESTALE;
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 159a1ffa4f4b..fcf7dfdecf96 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1311,7 +1311,7 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
int err;
err = security_dentry_init_security(dentry, mode, &dentry->d_name,
- &as_ctx->sec_ctx,
+ &name, &as_ctx->sec_ctx,
&as_ctx->sec_ctxlen);
if (err < 0) {
WARN_ON_ONCE(err != -EOPNOTSUPP);
@@ -1335,7 +1335,6 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
* It only supports single security module and only selinux has
* dentry_init_security hook.
*/
- name = XATTR_NAME_SELINUX;
name_len = strlen(name);
err = ceph_pagelist_reserve(pagelist,
4 * 2 + name_len + as_ctx->sec_ctxlen);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 9fa930dfd78d..dca42aa87d30 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -38,7 +38,6 @@
#include <linux/key-type.h>
#include "cifs_spnego.h"
#include "fscache.h"
-#include "smb2pdu.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
#include "dfs_cache.h"
#endif
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index e916470468ea..abff31dcd005 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -20,6 +20,7 @@
#include <crypto/internal/hash.h>
#include <linux/scatterlist.h>
#include <uapi/linux/cifs/cifs_mount.h>
+#include "../smbfs_common/smb2pdu.h"
#include "smb2pdu.h"
#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
@@ -776,7 +777,7 @@ revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
static inline void
revert_current_mid_from_hdr(struct TCP_Server_Info *server,
- const struct smb2_sync_hdr *shdr)
+ const struct smb2_hdr *shdr)
{
unsigned int num = le16_to_cpu(shdr->CreditCharge);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c3b94c1e4591..0abbff4e4135 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -677,7 +677,7 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed)
static unsigned int
smb2_get_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buffer;
/*
* SMB1 does not use credits.
@@ -794,7 +794,6 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
*/
}
- kfree(server->hostname);
kfree(server);
length = atomic_dec_return(&tcpSesAllocCount);
@@ -878,7 +877,7 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
static void
smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buffer;
int scredits, in_flight;
/*
@@ -1235,6 +1234,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *
if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
return 0;
+ if (strcasecmp(server->hostname, ctx->server_hostname))
+ return 0;
+
if (!match_address(server, addr,
(struct sockaddr *)&ctx->srcaddr))
return 0;
@@ -1336,6 +1338,7 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
kfree(server->session_key.response);
server->session_key.response = NULL;
server->session_key.len = 0;
+ kfree(server->hostname);
task = xchg(&server->tsk, NULL);
if (task)
@@ -1361,14 +1364,15 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx)
goto out_err;
}
+ tcp_ses->hostname = kstrdup(ctx->server_hostname, GFP_KERNEL);
+ if (!tcp_ses->hostname) {
+ rc = -ENOMEM;
+ goto out_err;
+ }
+
tcp_ses->ops = ctx->ops;
tcp_ses->vals = ctx->vals;
cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
- tcp_ses->hostname = extract_hostname(ctx->UNC);
- if (IS_ERR(tcp_ses->hostname)) {
- rc = PTR_ERR(tcp_ses->hostname);
- goto out_err_crypto_release;
- }
tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId);
tcp_ses->noblockcnt = ctx->rootfs;
@@ -1497,8 +1501,7 @@ out_err_crypto_release:
out_err:
if (tcp_ses) {
- if (!IS_ERR(tcp_ses->hostname))
- kfree(tcp_ses->hostname);
+ kfree(tcp_ses->hostname);
if (tcp_ses->ssocket)
sock_release(tcp_ses->ssocket);
kfree(tcp_ses);
@@ -2646,11 +2649,12 @@ generic_ip_connect(struct TCP_Server_Info *server)
rc = 0;
if (rc < 0) {
cifs_dbg(FYI, "Error %d connecting to server\n", rc);
+ trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc);
sock_release(socket);
server->ssocket = NULL;
return rc;
}
-
+ trace_smb3_connect_done(server->hostname, server->conn_id, &server->dstaddr);
if (sport == htons(RFC1001_PORT))
rc = ip_rfc1001_connect(server);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 13f3182cf796..1b855fcb179e 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3184,7 +3184,7 @@ restart_loop:
mutex_unlock(&ctx->aio_mutex);
if (ctx->iocb && ctx->iocb->ki_complete)
- ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0);
+ ctx->iocb->ki_complete(ctx->iocb, ctx->rc);
else
complete(&ctx->done);
}
@@ -3917,7 +3917,7 @@ again:
mutex_unlock(&ctx->aio_mutex);
if (ctx->iocb && ctx->iocb->ki_complete)
- ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0);
+ ctx->iocb->ki_complete(ctx->iocb, ctx->rc);
else
complete(&ctx->done);
}
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 3109def8e199..38d96a480745 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -116,6 +116,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_flag("nosharesock", Opt_nosharesock),
fsparam_flag_no("persistenthandles", Opt_persistent),
fsparam_flag_no("resilienthandles", Opt_resilient),
+ fsparam_flag_no("tcpnodelay", Opt_tcp_nodelay),
fsparam_flag("domainauto", Opt_domainauto),
fsparam_flag("rdma", Opt_rdma),
fsparam_flag("modesid", Opt_modesid),
@@ -318,6 +319,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
DUP_CTX_STR(mount_options);
DUP_CTX_STR(username);
DUP_CTX_STR(password);
+ DUP_CTX_STR(server_hostname);
DUP_CTX_STR(UNC);
DUP_CTX_STR(source);
DUP_CTX_STR(domainname);
@@ -456,6 +458,11 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx)
if (!pos)
return -EINVAL;
+ /* record the server hostname */
+ ctx->server_hostname = kstrndup(devname + 2, pos - devname - 2, GFP_KERNEL);
+ if (!ctx->server_hostname)
+ return -ENOMEM;
+
/* skip past delimiter */
++pos;
@@ -1383,6 +1390,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
}
}
break;
+ case Opt_tcp_nodelay:
+ /* tcp nodelay should not usually be needed since we CORK/UNCORK the socket */
+ if (result.negated)
+ ctx->sockopt_tcp_nodelay = false;
+ else
+ ctx->sockopt_tcp_nodelay = true;
+ break;
case Opt_domainauto:
ctx->domainauto = true;
break;
@@ -1496,6 +1510,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx)
ctx->username = NULL;
kfree_sensitive(ctx->password);
ctx->password = NULL;
+ kfree(ctx->server_hostname);
+ ctx->server_hostname = NULL;
kfree(ctx->UNC);
ctx->UNC = NULL;
kfree(ctx->source);
diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h
index a42ba71d7a81..b2d22cf9cb18 100644
--- a/fs/cifs/fs_context.h
+++ b/fs/cifs/fs_context.h
@@ -98,6 +98,7 @@ enum cifs_param {
Opt_nosharesock,
Opt_persistent,
Opt_resilient,
+ Opt_tcp_nodelay,
Opt_domainauto,
Opt_rdma,
Opt_modesid,
@@ -166,6 +167,7 @@ struct smb3_fs_context {
char *password;
char *domainname;
char *source;
+ char *server_hostname;
char *UNC;
char *nodename;
char *iocharset; /* local code page for mapping to and from Unicode */
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index bb1185fff8cc..ba2c3e897b29 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -152,7 +152,7 @@ cifs_buf_get(void)
* SMB2 header is bigger than CIFS one - no problems to clean some
* more bytes for CIFS.
*/
- size_t buf_size = sizeof(struct smb2_sync_hdr);
+ size_t buf_size = sizeof(struct smb2_hdr);
/*
* We could use negotiated size instead of max_msgsize -
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 181514b8770d..194799ddd382 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -2439,14 +2439,16 @@ smb2_print_status(__le32 status)
int
map_smb2_to_linux_error(char *buf, bool log_err)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
unsigned int i;
int rc = -EIO;
__le32 smb2err = shdr->Status;
if (smb2err == 0) {
- trace_smb3_cmd_done(shdr->TreeId, shdr->SessionId,
- le16_to_cpu(shdr->Command), le64_to_cpu(shdr->MessageId));
+ trace_smb3_cmd_done(le32_to_cpu(shdr->Id.SyncId.TreeId),
+ le64_to_cpu(shdr->SessionId),
+ le16_to_cpu(shdr->Command),
+ le64_to_cpu(shdr->MessageId));
return 0;
}
@@ -2470,8 +2472,10 @@ map_smb2_to_linux_error(char *buf, bool log_err)
cifs_dbg(FYI, "Mapping SMB2 status code 0x%08x to POSIX err %d\n",
__le32_to_cpu(smb2err), rc);
- trace_smb3_cmd_err(shdr->TreeId, shdr->SessionId,
- le16_to_cpu(shdr->Command),
- le64_to_cpu(shdr->MessageId), le32_to_cpu(smb2err), rc);
+ trace_smb3_cmd_err(le32_to_cpu(shdr->Id.SyncId.TreeId),
+ le64_to_cpu(shdr->SessionId),
+ le16_to_cpu(shdr->Command),
+ le64_to_cpu(shdr->MessageId),
+ le32_to_cpu(smb2err), rc);
return rc;
}
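The cifs changes in this and the following files switch from the old smb2_sync_hdr to the shared smb2_hdr, whose SessionId and TreeId fields are kept in on-the-wire little-endian order, so callers now convert with le64_to_cpu()/le32_to_cpu() at the point of use, as in the tracepoint arguments above. A rough userspace analogue of that convert-on-read pattern, using glibc's le64toh()/le32toh() and a hypothetical, much-reduced header layout (not the real SMB2 header):

#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Hypothetical, much-reduced wire header: fields stay in little-endian
 * wire order in memory and are converted only when consumed.
 */
struct wire_hdr {
	uint64_t session_id;	/* __le64 on the wire */
	uint32_t tree_id;	/* __le32 on the wire */
} __attribute__((packed));

int main(void)
{
	unsigned char raw[12] = {
		0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11,	/* SessionId */
		0x04, 0x03, 0x02, 0x01,				/* TreeId */
	};
	struct wire_hdr hdr;

	memcpy(&hdr, raw, sizeof(hdr));
	printf("session 0x%llx tree 0x%x\n",
	       (unsigned long long)le64toh(hdr.session_id),
	       (unsigned int)le32toh(hdr.tree_id));
	return 0;
}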
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 29b5554f6263..cdcdef32759e 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -8,7 +8,6 @@
*
*/
#include <linux/ctype.h>
-#include "smb2pdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
#include "smb2proto.h"
@@ -19,7 +18,7 @@
#include "nterr.h"
static int
-check_smb2_hdr(struct smb2_sync_hdr *shdr, __u64 mid)
+check_smb2_hdr(struct smb2_hdr *shdr, __u64 mid)
{
__u64 wire_mid = le64_to_cpu(shdr->MessageId);
@@ -81,9 +80,9 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
/* SMB2_OPLOCK_BREAK */ cpu_to_le16(24)
};
-#define SMB311_NEGPROT_BASE_SIZE (sizeof(struct smb2_sync_hdr) + sizeof(struct smb2_negotiate_rsp))
+#define SMB311_NEGPROT_BASE_SIZE (sizeof(struct smb2_hdr) + sizeof(struct smb2_negotiate_rsp))
-static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len,
+static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len,
__u32 non_ctxlen)
{
__u16 neg_count;
@@ -135,13 +134,13 @@ static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len,
int
smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
- struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)shdr;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
+ struct smb2_pdu *pdu = (struct smb2_pdu *)shdr;
__u64 mid;
__u32 clc_len; /* calculated length */
int command;
- int pdu_size = sizeof(struct smb2_sync_pdu);
- int hdr_size = sizeof(struct smb2_sync_hdr);
+ int pdu_size = sizeof(struct smb2_pdu);
+ int hdr_size = sizeof(struct smb2_hdr);
/*
* Add function to do table lookup of StructureSize by command
@@ -155,7 +154,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
/* decrypt frame now that it is completely read in */
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &srvr->smb_ses_list, smb_ses_list) {
- if (ses->Suid == thdr->SessionId)
+ if (ses->Suid == le64_to_cpu(thdr->SessionId))
break;
}
spin_unlock(&cifs_tcp_ses_lock);
@@ -296,7 +295,7 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = {
* area and the offset to it (from the beginning of the smb are also returned.
*/
char *
-smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr)
+smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr)
{
*off = 0;
*len = 0;
@@ -401,8 +400,8 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr)
unsigned int
smb2_calc_size(void *buf, struct TCP_Server_Info *srvr)
{
- struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)buf;
- struct smb2_sync_hdr *shdr = &pdu->sync_hdr;
+ struct smb2_pdu *pdu = (struct smb2_pdu *)buf;
+ struct smb2_hdr *shdr = &pdu->hdr;
int offset; /* the offset from the beginning of SMB to data area */
int data_length; /* the length of the variable length data area */
/* Structure Size has already been checked to make sure it is 64 */
@@ -669,7 +668,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
cifs_dbg(FYI, "Checking for oplock break\n");
- if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK)
+ if (rsp->hdr.Command != SMB2_OPLOCK_BREAK)
return false;
if (rsp->StructureSize !=
@@ -816,25 +815,25 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
int
smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server)
{
- struct smb2_sync_hdr *sync_hdr = mid->resp_buf;
+ struct smb2_hdr *hdr = mid->resp_buf;
struct smb2_create_rsp *rsp = mid->resp_buf;
struct cifs_tcon *tcon;
int rc;
- if ((mid->optype & CIFS_CP_CREATE_CLOSE_OP) || sync_hdr->Command != SMB2_CREATE ||
- sync_hdr->Status != STATUS_SUCCESS)
+ if ((mid->optype & CIFS_CP_CREATE_CLOSE_OP) || hdr->Command != SMB2_CREATE ||
+ hdr->Status != STATUS_SUCCESS)
return 0;
- tcon = smb2_find_smb_tcon(server, sync_hdr->SessionId,
- sync_hdr->TreeId);
+ tcon = smb2_find_smb_tcon(server, le64_to_cpu(hdr->SessionId),
+ le32_to_cpu(hdr->Id.SyncId.TreeId));
if (!tcon)
return -ENOENT;
rc = __smb2_handle_cancelled_cmd(tcon,
- le16_to_cpu(sync_hdr->Command),
- le64_to_cpu(sync_hdr->MessageId),
- rsp->PersistentFileId,
- rsp->VolatileFileId);
+ le16_to_cpu(hdr->Command),
+ le64_to_cpu(hdr->MessageId),
+ le64_to_cpu(rsp->PersistentFileId),
+ le64_to_cpu(rsp->VolatileFileId));
if (rc)
cifs_put_tcon(tcon);
@@ -856,10 +855,10 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct kvec *iov, int nvec)
{
int i, rc;
struct sdesc *d;
- struct smb2_sync_hdr *hdr;
+ struct smb2_hdr *hdr;
struct TCP_Server_Info *server = cifs_ses_server(ses);
- hdr = (struct smb2_sync_hdr *)iov[0].iov_base;
+ hdr = (struct smb2_hdr *)iov[0].iov_base;
/* neg prot are always taken */
if (hdr->Command == SMB2_NEGOTIATE)
goto ok;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index bda606dc72b1..7acf71defea7 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -325,7 +325,7 @@ static struct mid_q_entry *
__smb2_find_mid(struct TCP_Server_Info *server, char *buf, bool dequeue)
{
struct mid_q_entry *mid;
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
__u64 wire_mid = le64_to_cpu(shdr->MessageId);
if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
@@ -367,11 +367,11 @@ static void
smb2_dump_detail(void *buf, struct TCP_Server_Info *server)
{
#ifdef CONFIG_CIFS_DEBUG2
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
cifs_server_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n",
shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId,
- shdr->ProcessId);
+ shdr->Id.SyncId.ProcessId);
cifs_server_dbg(VFS, "smb buf %p len %u\n", buf,
server->ops->calc_smb_size(buf, server));
#endif
@@ -885,10 +885,10 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
atomic_inc(&tcon->num_remote_opens);
o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
- oparms.fid->persistent_fid = o_rsp->PersistentFileId;
- oparms.fid->volatile_fid = o_rsp->VolatileFileId;
+ oparms.fid->persistent_fid = le64_to_cpu(o_rsp->PersistentFileId);
+ oparms.fid->volatile_fid = le64_to_cpu(o_rsp->VolatileFileId);
#ifdef CONFIG_CIFS_DEBUG2
- oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId);
+ oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId);
#endif /* CIFS_DEBUG2 */
tcon->crfid.tcon = tcon;
@@ -2391,12 +2391,12 @@ again:
/* If the open failed there is nothing to do */
op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
- if (op_rsp == NULL || op_rsp->sync_hdr.Status != STATUS_SUCCESS) {
+ if (op_rsp == NULL || op_rsp->hdr.Status != STATUS_SUCCESS) {
cifs_dbg(FYI, "query_dir_first: open failed rc=%d\n", rc);
goto qdf_free;
}
- fid->persistent_fid = op_rsp->PersistentFileId;
- fid->volatile_fid = op_rsp->VolatileFileId;
+ fid->persistent_fid = le64_to_cpu(op_rsp->PersistentFileId);
+ fid->volatile_fid = le64_to_cpu(op_rsp->VolatileFileId);
/* Anything else than ENODATA means a genuine error */
if (rc && rc != -ENODATA) {
@@ -2410,7 +2410,7 @@ again:
atomic_inc(&tcon->num_remote_opens);
qd_rsp = (struct smb2_query_directory_rsp *)rsp_iov[1].iov_base;
- if (qd_rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
+ if (qd_rsp->hdr.Status == STATUS_NO_MORE_FILES) {
trace_smb3_query_dir_done(xid, fid->persistent_fid,
tcon->tid, tcon->ses->Suid, 0, 0);
srch_inf->endOfSearch = true;
@@ -2462,7 +2462,7 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon,
static bool
smb2_is_status_pending(char *buf, struct TCP_Server_Info *server)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
int scredits, in_flight;
if (shdr->Status != STATUS_PENDING)
@@ -2489,13 +2489,14 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server)
static bool
smb2_is_session_expired(char *buf)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
if (shdr->Status != STATUS_NETWORK_SESSION_EXPIRED &&
shdr->Status != STATUS_USER_SESSION_DELETED)
return false;
- trace_smb3_ses_expired(shdr->TreeId, shdr->SessionId,
+ trace_smb3_ses_expired(le32_to_cpu(shdr->Id.SyncId.TreeId),
+ le64_to_cpu(shdr->SessionId),
le16_to_cpu(shdr->Command),
le64_to_cpu(shdr->MessageId));
cifs_dbg(FYI, "Session expired or deleted\n");
@@ -2506,7 +2507,7 @@ smb2_is_session_expired(char *buf)
static bool
smb2_is_status_io_timeout(char *buf)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
if (shdr->Status == STATUS_IO_TIMEOUT)
return true;
@@ -2517,7 +2518,7 @@ smb2_is_status_io_timeout(char *buf)
static void
smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
struct list_head *tmp, *tmp1;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
@@ -2530,7 +2531,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
list_for_each(tmp1, &ses->tcon_list) {
tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
- if (tcon->tid == shdr->TreeId) {
+ if (tcon->tid == le32_to_cpu(shdr->Id.SyncId.TreeId)) {
tcon->need_reconnect = true;
spin_unlock(&cifs_tcp_ses_lock);
pr_warn_once("Server share %s deleted.\n",
@@ -2558,9 +2559,9 @@ smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
void
smb2_set_related(struct smb_rqst *rqst)
{
- struct smb2_sync_hdr *shdr;
+ struct smb2_hdr *shdr;
- shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base);
+ shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base);
if (shdr == NULL) {
cifs_dbg(FYI, "shdr NULL in smb2_set_related\n");
return;
@@ -2573,13 +2574,13 @@ char smb2_padding[7] = {0, 0, 0, 0, 0, 0, 0};
void
smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst)
{
- struct smb2_sync_hdr *shdr;
+ struct smb2_hdr *shdr;
struct cifs_ses *ses = tcon->ses;
struct TCP_Server_Info *server = ses->server;
unsigned long len = smb_rqst_len(server, rqst);
int i, num_padding;
- shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base);
+ shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base);
if (shdr == NULL) {
cifs_dbg(FYI, "shdr NULL in smb2_set_next_command\n");
return;
@@ -3124,7 +3125,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
resp_buftype, rsp_iov);
create_rsp = rsp_iov[0].iov_base;
- if (create_rsp && create_rsp->sync_hdr.Status)
+ if (create_rsp && create_rsp->hdr.Status)
err_iov = rsp_iov[0];
ioctl_rsp = rsp_iov[1].iov_base;
@@ -4369,8 +4370,8 @@ static void
fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len,
struct smb_rqst *old_rq, __le16 cipher_type)
{
- struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)old_rq->rq_iov[0].iov_base;
+ struct smb2_hdr *shdr =
+ (struct smb2_hdr *)old_rq->rq_iov[0].iov_base;
memset(tr_hdr, 0, sizeof(struct smb2_transform_hdr));
tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM;
@@ -4496,7 +4497,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
struct crypto_aead *tfm;
unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
- rc = smb2_get_enc_key(server, tr_hdr->SessionId, enc, key);
+ rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not get %scryption key\n", __func__,
enc ? "en" : "de");
@@ -4788,7 +4789,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
unsigned int cur_page_idx;
unsigned int pad_len;
struct cifs_readdata *rdata = mid->callback_data;
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
struct bio_vec *bvec = NULL;
struct iov_iter iter;
struct kvec iov;
@@ -5117,7 +5118,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
{
int ret, length;
char *buf = server->smallbuf;
- struct smb2_sync_hdr *shdr;
+ struct smb2_hdr *shdr;
unsigned int pdu_length = server->pdu_size;
unsigned int buf_size;
struct mid_q_entry *mid_entry;
@@ -5147,7 +5148,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
next_is_large = server->large_buf;
one_more:
- shdr = (struct smb2_sync_hdr *)buf;
+ shdr = (struct smb2_hdr *)buf;
if (shdr->NextCommand) {
if (next_is_large)
next_buffer = (char *)cifs_buf_get();
@@ -5213,7 +5214,7 @@ smb3_receive_transform(struct TCP_Server_Info *server,
unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
if (pdu_length < sizeof(struct smb2_transform_hdr) +
- sizeof(struct smb2_sync_hdr)) {
+ sizeof(struct smb2_hdr)) {
cifs_server_dbg(VFS, "Transform message is too small (%u)\n",
pdu_length);
cifs_reconnect(server);
@@ -5246,7 +5247,7 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid)
static int
smb2_next_header(char *buf)
{
- struct smb2_sync_hdr *hdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
struct smb2_transform_hdr *t_hdr = (struct smb2_transform_hdr *)buf;
if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM)
@@ -5788,7 +5789,7 @@ struct smb_version_values smb20_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5809,7 +5810,7 @@ struct smb_version_values smb21_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5830,7 +5831,7 @@ struct smb_version_values smb3any_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5851,7 +5852,7 @@ struct smb_version_values smbdefault_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5872,7 +5873,7 @@ struct smb_version_values smb30_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5893,7 +5894,7 @@ struct smb_version_values smb302_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5914,7 +5915,7 @@ struct smb_version_values smb311_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_sync_hdr),
+ .header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 7829c590eeac..d2ecb2ea37c0 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -23,7 +23,6 @@
#include <linux/uuid.h>
#include <linux/pagemap.h>
#include <linux/xattr.h>
-#include "smb2pdu.h"
#include "cifsglob.h"
#include "cifsacl.h"
#include "cifsproto.h"
@@ -84,7 +83,7 @@ int smb3_encryption_required(const struct cifs_tcon *tcon)
}
static void
-smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
+smb2_hdr_assemble(struct smb2_hdr *shdr, __le16 smb2_cmd,
const struct cifs_tcon *tcon,
struct TCP_Server_Info *server)
{
@@ -104,7 +103,7 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
} else {
shdr->CreditRequest = cpu_to_le16(2);
}
- shdr->ProcessId = cpu_to_le32((__u16)current->tgid);
+ shdr->Id.SyncId.ProcessId = cpu_to_le32((__u16)current->tgid);
if (!tcon)
goto out;
@@ -115,10 +114,10 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
shdr->CreditCharge = cpu_to_le16(1);
/* else CreditCharge MBZ */
- shdr->TreeId = tcon->tid;
+ shdr->Id.SyncId.TreeId = cpu_to_le32(tcon->tid);
/* Uid is not converted */
if (tcon->ses)
- shdr->SessionId = tcon->ses->Suid;
+ shdr->SessionId = cpu_to_le64(tcon->ses->Suid);
/*
* If we would set SMB2_FLAGS_DFS_OPERATIONS on open we also would have
@@ -331,7 +330,7 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon,
void *buf,
unsigned int *total_len)
{
- struct smb2_sync_pdu *spdu = (struct smb2_sync_pdu *)buf;
+ struct smb2_pdu *spdu = (struct smb2_pdu *)buf;
/* lookup word count ie StructureSize from table */
__u16 parmsize = smb2_req_struct_sizes[le16_to_cpu(smb2_command)];
@@ -341,10 +340,10 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon,
*/
memset(buf, 0, 256);
- smb2_hdr_assemble(&spdu->sync_hdr, smb2_command, tcon, server);
+ smb2_hdr_assemble(&spdu->hdr, smb2_command, tcon, server);
spdu->StructureSize2 = cpu_to_le16(parmsize);
- *total_len = parmsize + sizeof(struct smb2_sync_hdr);
+ *total_len = parmsize + sizeof(struct smb2_hdr);
}
/*
@@ -367,7 +366,7 @@ static int __smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
}
fill_small_buf(smb2_command, tcon, server,
- (struct smb2_sync_hdr *)(*request_buf),
+ (struct smb2_hdr *)(*request_buf),
total_len);
if (tcon != NULL) {
@@ -414,8 +413,8 @@ build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt)
pneg_ctxt->ContextType = SMB2_PREAUTH_INTEGRITY_CAPABILITIES;
pneg_ctxt->DataLength = cpu_to_le16(38);
pneg_ctxt->HashAlgorithmCount = cpu_to_le16(1);
- pneg_ctxt->SaltLength = cpu_to_le16(SMB311_LINUX_CLIENT_SALT_SIZE);
- get_random_bytes(pneg_ctxt->Salt, SMB311_LINUX_CLIENT_SALT_SIZE);
+ pneg_ctxt->SaltLength = cpu_to_le16(SMB311_SALT_SIZE);
+ get_random_bytes(pneg_ctxt->Salt, SMB311_SALT_SIZE);
pneg_ctxt->HashAlgorithms = SMB2_PREAUTH_INTEGRITY_SHA512;
}
@@ -857,7 +856,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
if (rc)
return rc;
- req->sync_hdr.SessionId = 0;
+ req->hdr.SessionId = 0;
memset(server->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE);
memset(ses->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE);
@@ -1018,7 +1017,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
server->cipher_type = SMB2_ENCRYPTION_AES128_CCM;
security_blob = smb2_get_data_area_len(&blob_offset, &blob_length,
- (struct smb2_sync_hdr *)rsp);
+ (struct smb2_hdr *)rsp);
/*
* See MS-SMB2 section 2.2.4: if no blob, client picks default which
* for us will be
@@ -1250,23 +1249,23 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data)
return rc;
if (sess_data->ses->binding) {
- req->sync_hdr.SessionId = sess_data->ses->Suid;
- req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ req->hdr.SessionId = cpu_to_le64(sess_data->ses->Suid);
+ req->hdr.Flags |= SMB2_FLAGS_SIGNED;
req->PreviousSessionId = 0;
req->Flags = SMB2_SESSION_REQ_FLAG_BINDING;
} else {
/* First session, not a reauthenticate */
- req->sync_hdr.SessionId = 0;
+ req->hdr.SessionId = 0;
/*
* if reconnect, we need to send previous sess id
* otherwise it is 0
*/
- req->PreviousSessionId = sess_data->previous_session;
+ req->PreviousSessionId = cpu_to_le64(sess_data->previous_session);
req->Flags = 0; /* MBZ */
}
/* enough to enable echos and oplocks and one max size write */
- req->sync_hdr.CreditRequest = cpu_to_le16(130);
+ req->hdr.CreditRequest = cpu_to_le16(130);
/* only one of SMB2 signing flags may be set in SMB2 request */
if (server->sign)
@@ -1425,7 +1424,7 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
/* keep session id and flags if binding */
if (!ses->binding) {
- ses->Suid = rsp->sync_hdr.SessionId;
+ ses->Suid = le64_to_cpu(rsp->hdr.SessionId);
ses->session_flags = le16_to_cpu(rsp->SessionFlags);
}
@@ -1501,7 +1500,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
/* If true, rc here is expected and not an error */
if (sess_data->buf0_type != CIFS_NO_BUFFER &&
- rsp->sync_hdr.Status == STATUS_MORE_PROCESSING_REQUIRED)
+ rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED)
rc = 0;
if (rc)
@@ -1523,7 +1522,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
/* keep existing ses id and flags if binding */
if (!ses->binding) {
- ses->Suid = rsp->sync_hdr.SessionId;
+ ses->Suid = le64_to_cpu(rsp->hdr.SessionId);
ses->session_flags = le16_to_cpu(rsp->SessionFlags);
}
@@ -1558,7 +1557,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
goto out;
req = (struct smb2_sess_setup_req *) sess_data->iov[0].iov_base;
- req->sync_hdr.SessionId = ses->Suid;
+ req->hdr.SessionId = cpu_to_le64(ses->Suid);
rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses,
sess_data->nls_cp);
@@ -1584,7 +1583,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
/* keep existing ses id and flags if binding */
if (!ses->binding) {
- ses->Suid = rsp->sync_hdr.SessionId;
+ ses->Suid = le64_to_cpu(rsp->hdr.SessionId);
ses->session_flags = le16_to_cpu(rsp->SessionFlags);
}
@@ -1715,12 +1714,12 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
return rc;
/* since no tcon, smb2_init can not do this, so do here */
- req->sync_hdr.SessionId = ses->Suid;
+ req->hdr.SessionId = cpu_to_le64(ses->Suid);
if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
flags |= CIFS_TRANSFORM_REQ;
else if (server->sign)
- req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ req->hdr.Flags |= SMB2_FLAGS_SIGNED;
flags |= CIFS_NO_RSP_BUF;
@@ -1828,14 +1827,14 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
!(ses->session_flags &
(SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)) &&
((ses->user_name != NULL) || (ses->sectype == Kerberos)))
- req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ req->hdr.Flags |= SMB2_FLAGS_SIGNED;
memset(&rqst, 0, sizeof(struct smb_rqst));
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
/* Need 64 for max size write so ask for more in case not there yet */
- req->sync_hdr.CreditRequest = cpu_to_le16(64);
+ req->hdr.CreditRequest = cpu_to_le16(64);
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
@@ -1871,7 +1870,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
tcon->tidStatus = CifsGood;
tcon->need_reconnect = false;
- tcon->tid = rsp->sync_hdr.TreeId;
+ tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId);
strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
@@ -1892,9 +1891,8 @@ tcon_exit:
return rc;
tcon_error_exit:
- if (rsp && rsp->sync_hdr.Status == STATUS_BAD_NETWORK_NAME) {
+ if (rsp && rsp->hdr.Status == STATUS_BAD_NETWORK_NAME)
cifs_tcon_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
- }
goto tcon_exit;
}
@@ -2608,7 +2606,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
if (tcon->share_flags & SHI1005_FLAGS_DFS) {
int name_len;
- req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
+ req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
rc = alloc_path_with_tree_prefix(&copy_path, &copy_size,
&name_len,
tcon->treeName, utf16_path);
@@ -2672,11 +2670,13 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
}
rsp = (struct smb2_create_rsp *)rsp_iov.iov_base;
- trace_smb3_posix_mkdir_done(xid, rsp->PersistentFileId, tcon->tid,
+ trace_smb3_posix_mkdir_done(xid, le64_to_cpu(rsp->PersistentFileId),
+ tcon->tid,
ses->Suid, CREATE_NOT_FILE,
FILE_WRITE_ATTRIBUTES);
- SMB2_close(xid, tcon, rsp->PersistentFileId, rsp->VolatileFileId);
+ SMB2_close(xid, tcon, le64_to_cpu(rsp->PersistentFileId),
+ le64_to_cpu(rsp->VolatileFileId));
/* Eventually save off posix specific response info and timestaps */
(i.e. the POSIX create response timestamps and mode/ownership info are not yet parsed here)
@@ -2740,7 +2740,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
if (tcon->share_flags & SHI1005_FLAGS_DFS) {
int name_len;
- req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
+ req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
rc = alloc_path_with_tree_prefix(&copy_path, &copy_size,
&name_len,
tcon->treeName, path);
@@ -2943,16 +2943,17 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
}
goto creat_exit;
} else
- trace_smb3_open_done(xid, rsp->PersistentFileId, tcon->tid,
+ trace_smb3_open_done(xid, le64_to_cpu(rsp->PersistentFileId),
+ tcon->tid,
ses->Suid, oparms->create_options,
oparms->desired_access);
atomic_inc(&tcon->num_remote_opens);
- oparms->fid->persistent_fid = rsp->PersistentFileId;
- oparms->fid->volatile_fid = rsp->VolatileFileId;
+ oparms->fid->persistent_fid = le64_to_cpu(rsp->PersistentFileId);
+ oparms->fid->volatile_fid = le64_to_cpu(rsp->VolatileFileId);
oparms->fid->access = oparms->desired_access;
#ifdef CONFIG_CIFS_DEBUG2
- oparms->fid->mid = le64_to_cpu(rsp->sync_hdr.MessageId);
+ oparms->fid->mid = le64_to_cpu(rsp->hdr.MessageId);
#endif /* CIFS_DEBUG2 */
if (buf) {
@@ -3052,7 +3053,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
* response size smaller.
*/
req->MaxOutputResponse = cpu_to_le32(max_response_size);
- req->sync_hdr.CreditCharge =
+ req->hdr.CreditCharge =
cpu_to_le16(DIV_ROUND_UP(max(indatalen, max_response_size),
SMB2_MAX_BUFFER_SIZE));
if (is_fsctl)
@@ -3062,7 +3063,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
/* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */
if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO)
- req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ req->hdr.Flags |= SMB2_FLAGS_SIGNED;
return 0;
}
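The credit charge set above scales with the larger of the ioctl input and the expected output, in 64KiB units. For example, a call expecting up to a 1MiB response would be charged DIV_ROUND_UP(1048576, 65536) = 16 credits, assuming SMB2_MAX_BUFFER_SIZE is still 65536 here.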
@@ -3236,8 +3237,8 @@ SMB2_close_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
if (rc)
return rc;
- req->PersistentFileId = persistent_fid;
- req->VolatileFileId = volatile_fid;
+ req->PersistentFileId = cpu_to_le64(persistent_fid);
+ req->VolatileFileId = cpu_to_le64(volatile_fid);
if (query_attrs)
req->Flags = SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB;
else
@@ -3600,8 +3601,8 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst,
if (rc)
return rc;
- req->PersistentFileId = persistent_fid;
- req->VolatileFileId = volatile_fid;
+ req->PersistentFileId = cpu_to_le64(persistent_fid);
+ req->VolatileFileId = cpu_to_le64(volatile_fid);
/* See note 354 of MS-SMB2, 64K max */
req->OutputBufferLength =
cpu_to_le32(SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE);
@@ -3687,7 +3688,7 @@ smb2_echo_callback(struct mid_q_entry *mid)
if (mid->mid_state == MID_RESPONSE_RECEIVED
|| mid->mid_state == MID_RESPONSE_MALFORMED) {
- credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
credits.instance = server->reconnect_instance;
}
@@ -3787,7 +3788,7 @@ SMB2_echo(struct TCP_Server_Info *server)
if (rc)
return rc;
- req->sync_hdr.CreditRequest = cpu_to_le16(1);
+ req->hdr.CreditRequest = cpu_to_le16(1);
iov[0].iov_len = total_len;
iov[0].iov_base = (char *)req;
@@ -3823,8 +3824,8 @@ SMB2_flush_init(const unsigned int xid, struct smb_rqst *rqst,
if (rc)
return rc;
- req->PersistentFileId = persistent_fid;
- req->VolatileFileId = volatile_fid;
+ req->PersistentFileId = cpu_to_le64(persistent_fid);
+ req->VolatileFileId = cpu_to_le64(volatile_fid);
iov[0].iov_base = (char *)req;
iov[0].iov_len = total_len;
@@ -3890,8 +3891,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
unsigned int remaining_bytes, int request_type)
{
int rc = -EACCES;
- struct smb2_read_plain_req *req = NULL;
- struct smb2_sync_hdr *shdr;
+ struct smb2_read_req *req = NULL;
+ struct smb2_hdr *shdr;
struct TCP_Server_Info *server = io_parms->server;
rc = smb2_plain_req_init(SMB2_READ, io_parms->tcon, server,
@@ -3902,11 +3903,11 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
if (server == NULL)
return -ECONNABORTED;
- shdr = &req->sync_hdr;
- shdr->ProcessId = cpu_to_le32(io_parms->pid);
+ shdr = &req->hdr;
+ shdr->Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid);
- req->PersistentFileId = io_parms->persistent_fid;
- req->VolatileFileId = io_parms->volatile_fid;
+ req->PersistentFileId = cpu_to_le64(io_parms->persistent_fid);
+ req->VolatileFileId = cpu_to_le64(io_parms->volatile_fid);
req->ReadChannelInfoOffset = 0; /* reserved */
req->ReadChannelInfoLength = 0; /* reserved */
req->Channel = 0; /* reserved */
@@ -3940,7 +3941,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
if (need_invalidate)
req->Channel = SMB2_CHANNEL_RDMA_V1;
req->ReadChannelInfoOffset =
- cpu_to_le16(offsetof(struct smb2_read_plain_req, Buffer));
+ cpu_to_le16(offsetof(struct smb2_read_req, Buffer));
req->ReadChannelInfoLength =
cpu_to_le16(sizeof(struct smbd_buffer_descriptor_v1));
v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0];
@@ -3964,10 +3965,10 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
* Related requests use info from previous read request
* in chain.
*/
- shdr->SessionId = 0xFFFFFFFFFFFFFFFF;
- shdr->TreeId = 0xFFFFFFFF;
- req->PersistentFileId = 0xFFFFFFFFFFFFFFFF;
- req->VolatileFileId = 0xFFFFFFFFFFFFFFFF;
+ shdr->SessionId = cpu_to_le64(0xFFFFFFFFFFFFFFFF);
+ shdr->Id.SyncId.TreeId = cpu_to_le32(0xFFFFFFFF);
+ req->PersistentFileId = cpu_to_le64(0xFFFFFFFFFFFFFFFF);
+ req->VolatileFileId = cpu_to_le64(0xFFFFFFFFFFFFFFFF);
}
}
if (remaining_bytes > io_parms->length)
@@ -3985,8 +3986,8 @@ smb2_readv_callback(struct mid_q_entry *mid)
struct cifs_readdata *rdata = mid->callback_data;
struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
struct TCP_Server_Info *server = rdata->server;
- struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)rdata->iov[0].iov_base;
+ struct smb2_hdr *shdr =
+ (struct smb2_hdr *)rdata->iov[0].iov_base;
struct cifs_credits credits = { .value = 0, .instance = 0 };
struct smb_rqst rqst = { .rq_iov = &rdata->iov[1],
.rq_nvec = 1,
@@ -4072,7 +4073,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
{
int rc, flags = 0;
char *buf;
- struct smb2_sync_hdr *shdr;
+ struct smb2_hdr *shdr;
struct cifs_io_parms io_parms;
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 1 };
@@ -4105,7 +4106,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
rdata->iov[0].iov_base = buf;
rdata->iov[0].iov_len = total_len;
- shdr = (struct smb2_sync_hdr *)buf;
+ shdr = (struct smb2_hdr *)buf;
if (rdata->credits.value > 0) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
@@ -4144,7 +4145,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
{
struct smb_rqst rqst;
int resp_buftype, rc;
- struct smb2_read_plain_req *req = NULL;
+ struct smb2_read_req *req = NULL;
struct smb2_read_rsp *rsp = NULL;
struct kvec iov[1];
struct kvec rsp_iov;
@@ -4178,19 +4179,22 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
if (rc != -ENODATA) {
cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
cifs_dbg(VFS, "Send error in read = %d\n", rc);
- trace_smb3_read_err(xid, req->PersistentFileId,
+ trace_smb3_read_err(xid,
+ le64_to_cpu(req->PersistentFileId),
io_parms->tcon->tid, ses->Suid,
io_parms->offset, io_parms->length,
rc);
} else
- trace_smb3_read_done(xid, req->PersistentFileId,
- io_parms->tcon->tid, ses->Suid,
- io_parms->offset, 0);
+ trace_smb3_read_done(xid,
+ le64_to_cpu(req->PersistentFileId),
+ io_parms->tcon->tid, ses->Suid,
+ io_parms->offset, 0);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
cifs_small_buf_release(req);
return rc == -ENODATA ? 0 : rc;
} else
- trace_smb3_read_done(xid, req->PersistentFileId,
+ trace_smb3_read_done(xid,
+ le64_to_cpu(req->PersistentFileId),
io_parms->tcon->tid, ses->Suid,
io_parms->offset, io_parms->length);
@@ -4238,7 +4242,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
- credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
credits.instance = server->reconnect_instance;
wdata->result = smb2_check_receive(mid, server, 0);
if (wdata->result != 0)
@@ -4264,7 +4268,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->result = -EAGAIN;
break;
case MID_RESPONSE_MALFORMED:
- credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+ credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
credits.instance = server->reconnect_instance;
fallthrough;
default:
@@ -4311,7 +4315,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
{
int rc = -EACCES, flags = 0;
struct smb2_write_req *req = NULL;
- struct smb2_sync_hdr *shdr;
+ struct smb2_hdr *shdr;
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
struct TCP_Server_Info *server = wdata->server;
struct kvec iov[1];
@@ -4329,11 +4333,11 @@ smb2_async_writev(struct cifs_writedata *wdata,
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- shdr = (struct smb2_sync_hdr *)req;
- shdr->ProcessId = cpu_to_le32(wdata->cfile->pid);
+ shdr = (struct smb2_hdr *)req;
+ shdr->Id.SyncId.ProcessId = cpu_to_le32(wdata->cfile->pid);
- req->PersistentFileId = wdata->cfile->fid.persistent_fid;
- req->VolatileFileId = wdata->cfile->fid.volatile_fid;
+ req->PersistentFileId = cpu_to_le64(wdata->cfile->fid.persistent_fid);
+ req->VolatileFileId = cpu_to_le64(wdata->cfile->fid.volatile_fid);
req->WriteChannelInfoOffset = 0;
req->WriteChannelInfoLength = 0;
req->Channel = 0;
@@ -4430,7 +4434,8 @@ smb2_async_writev(struct cifs_writedata *wdata,
wdata, flags, &wdata->credits);
if (rc) {
- trace_smb3_write_err(0 /* no xid */, req->PersistentFileId,
+ trace_smb3_write_err(0 /* no xid */,
+ le64_to_cpu(req->PersistentFileId),
tcon->tid, tcon->ses->Suid, wdata->offset,
wdata->bytes, rc);
kref_put(&wdata->refcount, release);
@@ -4481,10 +4486,10 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
if (smb3_encryption_required(io_parms->tcon))
flags |= CIFS_TRANSFORM_REQ;
- req->sync_hdr.ProcessId = cpu_to_le32(io_parms->pid);
+ req->hdr.Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid);
- req->PersistentFileId = io_parms->persistent_fid;
- req->VolatileFileId = io_parms->volatile_fid;
+ req->PersistentFileId = cpu_to_le64(io_parms->persistent_fid);
+ req->VolatileFileId = cpu_to_le64(io_parms->volatile_fid);
req->WriteChannelInfoOffset = 0;
req->WriteChannelInfoLength = 0;
req->Channel = 0;
@@ -4512,7 +4517,8 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
rsp = (struct smb2_write_rsp *)rsp_iov.iov_base;
if (rc) {
- trace_smb3_write_err(xid, req->PersistentFileId,
+ trace_smb3_write_err(xid,
+ le64_to_cpu(req->PersistentFileId),
io_parms->tcon->tid,
io_parms->tcon->ses->Suid,
io_parms->offset, io_parms->length, rc);
@@ -4520,10 +4526,11 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
cifs_dbg(VFS, "Send error in write = %d\n", rc);
} else {
*nbytes = le32_to_cpu(rsp->DataLength);
- trace_smb3_write_done(xid, req->PersistentFileId,
- io_parms->tcon->tid,
- io_parms->tcon->ses->Suid,
- io_parms->offset, *nbytes);
+ trace_smb3_write_done(xid,
+ le64_to_cpu(req->PersistentFileId),
+ io_parms->tcon->tid,
+ io_parms->tcon->ses->Suid,
+ io_parms->offset, *nbytes);
}
cifs_small_buf_release(req);
@@ -4866,7 +4873,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
if (rc) {
if (rc == -ENODATA &&
- rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
+ rsp->hdr.Status == STATUS_NO_MORE_FILES) {
trace_smb3_query_dir_done(xid, persistent_fid,
tcon->tid, tcon->ses->Suid, index, 0);
srch_inf->endOfSearch = true;
@@ -4914,7 +4921,7 @@ SMB2_set_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
if (rc)
return rc;
- req->sync_hdr.ProcessId = cpu_to_le32(pid);
+ req->hdr.Id.SyncId.ProcessId = cpu_to_le32(pid);
req->InfoType = info_type;
req->FileInfoClass = info_class;
req->PersistentFileId = persistent_fid;
@@ -5074,7 +5081,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
req->VolatileFid = volatile_fid;
req->PersistentFid = persistent_fid;
req->OplockLevel = oplock_level;
- req->sync_hdr.CreditRequest = cpu_to_le16(1);
+ req->hdr.CreditRequest = cpu_to_le16(1);
flags |= CIFS_NO_RSP_BUF;
@@ -5376,7 +5383,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- req->sync_hdr.ProcessId = cpu_to_le32(pid);
+ req->hdr.Id.SyncId.ProcessId = cpu_to_le32(pid);
req->LockCount = cpu_to_le16(num_lock);
req->PersistentFileId = persist_fid;
@@ -5452,7 +5459,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- req->sync_hdr.CreditRequest = cpu_to_le16(1);
+ req->hdr.CreditRequest = cpu_to_le16(1);
req->StructureSize = cpu_to_le16(36);
total_len += 12;
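Nearly all of the smb2pdu.c churn above follows one pattern: the renamed header and the request/response structs now declare the on-the-wire fields (SessionId, TreeId, the persistent/volatile file ids) as __le64/__le32 instead of opaque __u64/__u32, so every store or load crosses the host/wire boundary through the byte-order helpers. A minimal sketch of the pattern, using hypothetical locals rather than any single call site:

	/* sketch only: shared-header fields are little-endian on the wire */
	req->hdr.SessionId = cpu_to_le64(ses->Suid);            /* host -> wire */
	req->PersistentFileId = cpu_to_le64(fid->persistent_fid);
	tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId);     /* wire -> host */
	ses->Suid = le64_to_cpu(rsp->hdr.SessionId);

In-memory state such as ses->Suid and tcon->tid stays host-endian; only the PDU fields carry little-endian values.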
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index f32c99c9ba13..33cfd0a1adf1 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -14,156 +14,12 @@
#include <net/sock.h>
#include "cifsacl.h"
-/*
- * Note that, due to trying to use names similar to the protocol specifications,
- * there are many mixed case field names in the structures below. Although
- * this does not match typical Linux kernel style, it is necessary to be
- * able to match against the protocol specfication.
- *
- * SMB2 commands
- * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
- * (ie no useful data other than the SMB error code itself) and are marked such.
- * Knowing this helps avoid response buffer allocations and copy in some cases.
- */
-
-/* List of commands in host endian */
-#define SMB2_NEGOTIATE_HE 0x0000
-#define SMB2_SESSION_SETUP_HE 0x0001
-#define SMB2_LOGOFF_HE 0x0002 /* trivial request/resp */
-#define SMB2_TREE_CONNECT_HE 0x0003
-#define SMB2_TREE_DISCONNECT_HE 0x0004 /* trivial req/resp */
-#define SMB2_CREATE_HE 0x0005
-#define SMB2_CLOSE_HE 0x0006
-#define SMB2_FLUSH_HE 0x0007 /* trivial resp */
-#define SMB2_READ_HE 0x0008
-#define SMB2_WRITE_HE 0x0009
-#define SMB2_LOCK_HE 0x000A
-#define SMB2_IOCTL_HE 0x000B
-#define SMB2_CANCEL_HE 0x000C
-#define SMB2_ECHO_HE 0x000D
-#define SMB2_QUERY_DIRECTORY_HE 0x000E
-#define SMB2_CHANGE_NOTIFY_HE 0x000F
-#define SMB2_QUERY_INFO_HE 0x0010
-#define SMB2_SET_INFO_HE 0x0011
-#define SMB2_OPLOCK_BREAK_HE 0x0012
-
-/* The same list in little endian */
-#define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE)
-#define SMB2_SESSION_SETUP cpu_to_le16(SMB2_SESSION_SETUP_HE)
-#define SMB2_LOGOFF cpu_to_le16(SMB2_LOGOFF_HE)
-#define SMB2_TREE_CONNECT cpu_to_le16(SMB2_TREE_CONNECT_HE)
-#define SMB2_TREE_DISCONNECT cpu_to_le16(SMB2_TREE_DISCONNECT_HE)
-#define SMB2_CREATE cpu_to_le16(SMB2_CREATE_HE)
-#define SMB2_CLOSE cpu_to_le16(SMB2_CLOSE_HE)
-#define SMB2_FLUSH cpu_to_le16(SMB2_FLUSH_HE)
-#define SMB2_READ cpu_to_le16(SMB2_READ_HE)
-#define SMB2_WRITE cpu_to_le16(SMB2_WRITE_HE)
-#define SMB2_LOCK cpu_to_le16(SMB2_LOCK_HE)
-#define SMB2_IOCTL cpu_to_le16(SMB2_IOCTL_HE)
-#define SMB2_CANCEL cpu_to_le16(SMB2_CANCEL_HE)
-#define SMB2_ECHO cpu_to_le16(SMB2_ECHO_HE)
-#define SMB2_QUERY_DIRECTORY cpu_to_le16(SMB2_QUERY_DIRECTORY_HE)
-#define SMB2_CHANGE_NOTIFY cpu_to_le16(SMB2_CHANGE_NOTIFY_HE)
-#define SMB2_QUERY_INFO cpu_to_le16(SMB2_QUERY_INFO_HE)
-#define SMB2_SET_INFO cpu_to_le16(SMB2_SET_INFO_HE)
-#define SMB2_OPLOCK_BREAK cpu_to_le16(SMB2_OPLOCK_BREAK_HE)
-
-#define SMB2_INTERNAL_CMD cpu_to_le16(0xFFFF)
-
-#define NUMBER_OF_SMB2_COMMANDS 0x0013
-
/* 52 transform hdr + 64 hdr + 88 create rsp */
#define SMB2_TRANSFORM_HEADER_SIZE 52
#define MAX_SMB2_HDR_SIZE 204
-#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
-#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
-#define SMB2_COMPRESSION_TRANSFORM_ID cpu_to_le32(0x424d53fc)
-
-/*
- * SMB2 Header Definition
- *
- * "MBZ" : Must be Zero
- * "BB" : BugBug, Something to check/review/analyze later
- * "PDU" : "Protocol Data Unit" (ie a network "frame")
- *
- */
-
-#define SMB2_HEADER_STRUCTURE_SIZE cpu_to_le16(64)
-
-struct smb2_sync_hdr {
- __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */
- __le16 StructureSize; /* 64 */
- __le16 CreditCharge; /* MBZ */
- __le32 Status; /* Error from server */
- __le16 Command;
- __le16 CreditRequest; /* CreditResponse */
- __le32 Flags;
- __le32 NextCommand;
- __le64 MessageId;
- __le32 ProcessId;
- __u32 TreeId; /* opaque - so do not make little endian */
- __u64 SessionId; /* opaque - so do not make little endian */
- __u8 Signature[16];
-} __packed;
-
/* The total header size for SMB2 read and write */
-#define SMB2_READWRITE_PDU_HEADER_SIZE (48 + sizeof(struct smb2_sync_hdr))
-
-struct smb2_sync_pdu {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize2; /* size of wct area (varies, request specific) */
-} __packed;
-
-#define SMB3_AES_CCM_NONCE 11
-#define SMB3_AES_GCM_NONCE 12
-
-/* Transform flags (for 3.0 dialect this flag indicates CCM */
-#define TRANSFORM_FLAG_ENCRYPTED 0x0001
-struct smb2_transform_hdr {
- __le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */
- __u8 Signature[16];
- __u8 Nonce[16];
- __le32 OriginalMessageSize;
- __u16 Reserved1;
- __le16 Flags; /* EncryptionAlgorithm for 3.0, enc enabled for 3.1.1 */
- __u64 SessionId;
-} __packed;
-
-/* See MS-SMB2 2.2.42 */
-struct smb2_compression_transform_hdr_unchained {
- __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */
- __le32 OriginalCompressedSegmentSize;
- __le16 CompressionAlgorithm;
- __le16 Flags;
- __le16 Length; /* if chained it is length, else offset */
-} __packed;
-
-/* See MS-SMB2 2.2.42.1 */
-#define SMB2_COMPRESSION_FLAG_NONE 0x0000
-#define SMB2_COMPRESSION_FLAG_CHAINED 0x0001
-
-struct compression_payload_header {
- __le16 CompressionAlgorithm;
- __le16 Flags;
- __le32 Length; /* length of compressed playload including field below if present */
- /* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */
-} __packed;
-
-/* See MS-SMB2 2.2.42.2 */
-struct smb2_compression_transform_hdr_chained {
- __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */
- __le32 OriginalCompressedSegmentSize;
- /* struct compression_payload_header[] */
-} __packed;
-
-/* See MS-SMB2 2.2.42.2.2 */
-struct compression_pattern_payload_v1 {
- __le16 Pattern;
- __le16 Reserved1;
- __le16 Reserved2;
- __le32 Repetitions;
-} __packed;
+#define SMB2_READWRITE_PDU_HEADER_SIZE (48 + sizeof(struct smb2_hdr))
/* See MS-SMB2 2.2.43 */
struct smb2_rdma_transform {
@@ -190,17 +46,6 @@ struct smb2_rdma_crypto_transform {
} __packed;
/*
- * SMB2 flag definitions
- */
-#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001)
-#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002)
-#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004)
-#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008)
-#define SMB2_FLAGS_PRIORITY_MASK cpu_to_le32(0x00000070) /* SMB3.1.1 */
-#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000)
-#define SMB2_FLAGS_REPLAY_OPERATION cpu_to_le32(0x20000000) /* SMB3 & up */
-
-/*
* Definitions for SMB2 Protocol Data Units (network frames)
*
* See MS-SMB2.PDF specification for protocol details.
@@ -214,7 +59,7 @@ struct smb2_rdma_crypto_transform {
#define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9)
struct smb2_err_rsp {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize;
__le16 Reserved; /* MBZ */
__le32 ByteCount; /* even if zero, at least one byte follows */
@@ -270,530 +115,6 @@ struct share_redirect_error_context_rsp {
/* __u8 ResourceName[] */ /* Name of share as counted Unicode string */
} __packed;
-#define SMB2_CLIENT_GUID_SIZE 16
-
-struct smb2_negotiate_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 36 */
- __le16 DialectCount;
- __le16 SecurityMode;
- __le16 Reserved; /* MBZ */
- __le32 Capabilities;
- __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE];
- /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */
- __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */
- __le16 NegotiateContextCount; /* SMB3.1.1 only. MBZ earlier */
- __le16 Reserved2;
- __le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */
-} __packed;
-
-/* Dialects */
-#define SMB10_PROT_ID 0x0000 /* local only, not sent on wire w/CIFS negprot */
-#define SMB20_PROT_ID 0x0202
-#define SMB21_PROT_ID 0x0210
-#define SMB30_PROT_ID 0x0300
-#define SMB302_PROT_ID 0x0302
-#define SMB311_PROT_ID 0x0311
-#define BAD_PROT_ID 0xFFFF
-
-/* SecurityMode flags */
-#define SMB2_NEGOTIATE_SIGNING_ENABLED 0x0001
-#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002
-#define SMB2_SEC_MODE_FLAGS_ALL 0x0003
-
-/* Capabilities flags */
-#define SMB2_GLOBAL_CAP_DFS 0x00000001
-#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */
-#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */
-#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */
-#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */
-#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */
-#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */
-/* Internal types */
-#define SMB2_NT_FIND 0x00100000
-#define SMB2_LARGE_FILES 0x00200000
-
-
-/* Negotiate Contexts - ContextTypes. See MS-SMB2 section 2.2.3.1 for details */
-#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1)
-#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2)
-#define SMB2_COMPRESSION_CAPABILITIES cpu_to_le16(3)
-#define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5)
-#define SMB2_TRANSPORT_CAPABILITIES cpu_to_le16(6)
-#define SMB2_RDMA_TRANSFORM_CAPABILITIES cpu_to_le16(7)
-#define SMB2_SIGNING_CAPABILITIES cpu_to_le16(8)
-#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100)
-
-struct smb2_neg_context {
- __le16 ContextType;
- __le16 DataLength;
- __le32 Reserved;
- /* Followed by array of data. NOTE: some servers require padding to 8 byte boundary */
-} __packed;
-
-#define SMB311_LINUX_CLIENT_SALT_SIZE 32
-/* Hash Algorithm Types */
-#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001)
-#define SMB2_PREAUTH_HASH_SIZE 64
-
-/*
- * SaltLength that the server send can be zero, so the only three required
- * fields (all __le16) end up six bytes total, so the minimum context data len
- * in the response is six bytes which accounts for
- *
- * HashAlgorithmCount, SaltLength, and 1 HashAlgorithm.
- */
-#define MIN_PREAUTH_CTXT_DATA_LEN 6
-
-struct smb2_preauth_neg_context {
- __le16 ContextType; /* 1 */
- __le16 DataLength;
- __le32 Reserved;
- __le16 HashAlgorithmCount; /* 1 */
- __le16 SaltLength;
- __le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */
- __u8 Salt[SMB311_LINUX_CLIENT_SALT_SIZE];
-} __packed;
-
-/* Encryption Algorithms Ciphers */
-#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001)
-#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002)
-/* we currently do not request AES256_CCM since presumably GCM faster */
-#define SMB2_ENCRYPTION_AES256_CCM cpu_to_le16(0x0003)
-#define SMB2_ENCRYPTION_AES256_GCM cpu_to_le16(0x0004)
-
-/* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */
-#define MIN_ENCRYPT_CTXT_DATA_LEN 4
-struct smb2_encryption_neg_context {
- __le16 ContextType; /* 2 */
- __le16 DataLength;
- __le32 Reserved;
- /* CipherCount usally 2, but can be 3 when AES256-GCM enabled */
- __le16 CipherCount; /* AES128-GCM and AES128-CCM by default */
- __le16 Ciphers[3];
-} __packed;
-
-/* See MS-SMB2 2.2.3.1.3 */
-#define SMB3_COMPRESS_NONE cpu_to_le16(0x0000)
-#define SMB3_COMPRESS_LZNT1 cpu_to_le16(0x0001)
-#define SMB3_COMPRESS_LZ77 cpu_to_le16(0x0002)
-#define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003)
-/* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */
-#define SMB3_COMPRESS_PATTERN cpu_to_le16(0x0004) /* Pattern_V1 */
-
-/* Compression Flags */
-#define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE cpu_to_le32(0x00000000)
-#define SMB2_COMPRESSION_CAPABILITIES_FLAG_CHAINED cpu_to_le32(0x00000001)
-
-struct smb2_compression_capabilities_context {
- __le16 ContextType; /* 3 */
- __le16 DataLength;
- __u32 Reserved;
- __le16 CompressionAlgorithmCount;
- __u16 Padding;
- __u32 Flags;
- __le16 CompressionAlgorithms[3];
- __u16 Pad; /* Some servers require pad to DataLen multiple of 8 */
- /* Check if pad needed */
-} __packed;
-
-/*
- * For smb2_netname_negotiate_context_id See MS-SMB2 2.2.3.1.4.
- * Its struct simply contains NetName, an array of Unicode characters
- */
-struct smb2_netname_neg_context {
- __le16 ContextType; /* 5 */
- __le16 DataLength;
- __le32 Reserved;
- __le16 NetName[]; /* hostname of target converted to UCS-2 */
-} __packed;
-
-/*
- * For smb2_transport_capabilities context see MS-SMB2 2.2.3.1.5
- * and 2.2.4.1.5
- */
-
-/* Flags */
-#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY 0x00000001
-
-struct smb2_transport_capabilities_context {
- __le16 ContextType; /* 6 */
- __le16 DataLength;
- __u32 Reserved;
- __le32 Flags;
- __u32 Pad;
-} __packed;
-
-/*
- * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6
- * and 2.2.4.1.6
- */
-
-/* RDMA Transform IDs */
-#define SMB2_RDMA_TRANSFORM_NONE 0x0000
-#define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001
-#define SMB2_RDMA_TRANSFORM_SIGNING 0x0002
-
-struct smb2_rdma_transform_capabilities_context {
- __le16 ContextType; /* 7 */
- __le16 DataLength;
- __u32 Reserved;
- __le16 TransformCount;
- __u16 Reserved1;
- __u32 Reserved2;
- __le16 RDMATransformIds[];
-} __packed;
-
-/*
- * For signing capabilities context see MS-SMB2 2.2.3.1.7
- * and 2.2.4.1.7
- */
-
-/* Signing algorithms */
-#define SIGNING_ALG_HMAC_SHA256 0
-#define SIGNING_ALG_AES_CMAC 1
-#define SIGNING_ALG_AES_GMAC 2
-
-struct smb2_signing_capabilities {
- __le16 ContextType; /* 8 */
- __le16 DataLength;
- __u32 Reserved;
- __le16 SigningAlgorithmCount;
- __le16 SigningAlgorithms[];
- /* Followed by padding to 8 byte boundary (required by some servers) */
-} __packed;
-
-#define POSIX_CTXT_DATA_LEN 16
-struct smb2_posix_neg_context {
- __le16 ContextType; /* 0x100 */
- __le16 DataLength;
- __le32 Reserved;
- __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */
-} __packed;
-
-struct smb2_negotiate_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 65 */
- __le16 SecurityMode;
- __le16 DialectRevision;
- __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */
- __u8 ServerGUID[16];
- __le32 Capabilities;
- __le32 MaxTransactSize;
- __le32 MaxReadSize;
- __le32 MaxWriteSize;
- __le64 SystemTime; /* MBZ */
- __le64 ServerStartTime;
- __le16 SecurityBufferOffset;
- __le16 SecurityBufferLength;
- __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */
- __u8 Buffer[1]; /* variable length GSS security buffer */
-} __packed;
-
-/* Flags */
-#define SMB2_SESSION_REQ_FLAG_BINDING 0x01
-#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04
-
-struct smb2_sess_setup_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 25 */
- __u8 Flags;
- __u8 SecurityMode;
- __le32 Capabilities;
- __le32 Channel;
- __le16 SecurityBufferOffset;
- __le16 SecurityBufferLength;
- __u64 PreviousSessionId;
- __u8 Buffer[1]; /* variable length GSS security buffer */
-} __packed;
-
-/* Currently defined SessionFlags */
-#define SMB2_SESSION_FLAG_IS_GUEST 0x0001
-#define SMB2_SESSION_FLAG_IS_NULL 0x0002
-#define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004
-struct smb2_sess_setup_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 SessionFlags;
- __le16 SecurityBufferOffset;
- __le16 SecurityBufferLength;
- __u8 Buffer[1]; /* variable length GSS security buffer */
-} __packed;
-
-struct smb2_logoff_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-struct smb2_logoff_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-/* Flags/Reserved for SMB3.1.1 */
-#define SMB2_TREE_CONNECT_FLAG_CLUSTER_RECONNECT cpu_to_le16(0x0001)
-#define SMB2_TREE_CONNECT_FLAG_REDIRECT_TO_OWNER cpu_to_le16(0x0002)
-#define SMB2_TREE_CONNECT_FLAG_EXTENSION_PRESENT cpu_to_le16(0x0004)
-
-struct smb2_tree_connect_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 Flags; /* Reserved MBZ for dialects prior to SMB3.1.1 */
- __le16 PathOffset;
- __le16 PathLength;
- __u8 Buffer[1]; /* variable length */
-} __packed;
-
-/* See MS-SMB2 section 2.2.9.2 */
-/* Context Types */
-#define SMB2_RESERVED_TREE_CONNECT_CONTEXT_ID 0x0000
-#define SMB2_REMOTED_IDENTITY_TREE_CONNECT_CONTEXT_ID cpu_to_le16(0x0001)
-
-struct tree_connect_contexts {
- __le16 ContextType;
- __le16 DataLength;
- __le32 Reserved;
- __u8 Data[];
-} __packed;
-
-/* Remoted identity tree connect context structures - see MS-SMB2 2.2.9.2.1 */
-struct smb3_blob_data {
- __le16 BlobSize;
- __u8 BlobData[];
-} __packed;
-
-/* Valid values for Attr */
-#define SE_GROUP_MANDATORY 0x00000001
-#define SE_GROUP_ENABLED_BY_DEFAULT 0x00000002
-#define SE_GROUP_ENABLED 0x00000004
-#define SE_GROUP_OWNER 0x00000008
-#define SE_GROUP_USE_FOR_DENY_ONLY 0x00000010
-#define SE_GROUP_INTEGRITY 0x00000020
-#define SE_GROUP_INTEGRITY_ENABLED 0x00000040
-#define SE_GROUP_RESOURCE 0x20000000
-#define SE_GROUP_LOGON_ID 0xC0000000
-
-/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */
-
-struct sid_array_data {
- __le16 SidAttrCount;
- /* SidAttrList - array of sid_attr_data structs */
-} __packed;
-
-struct luid_attr_data {
-
-} __packed;
-
-/*
- * struct privilege_data is the same as BLOB_DATA - see MS-SMB2 2.2.9.2.1.5
- * but with size of LUID_ATTR_DATA struct and BlobData set to LUID_ATTR DATA
- */
-
-struct privilege_array_data {
- __le16 PrivilegeCount;
- /* array of privilege_data structs */
-} __packed;
-
-struct remoted_identity_tcon_context {
- __le16 TicketType; /* must be 0x0001 */
- __le16 TicketSize; /* total size of this struct */
- __le16 User; /* offset to SID_ATTR_DATA struct with user info */
- __le16 UserName; /* offset to null terminated Unicode username string */
- __le16 Domain; /* offset to null terminated Unicode domain name */
- __le16 Groups; /* offset to SID_ARRAY_DATA struct with group info */
- __le16 RestrictedGroups; /* similar to above */
- __le16 Privileges; /* offset to PRIVILEGE_ARRAY_DATA struct */
- __le16 PrimaryGroup; /* offset to SID_ARRAY_DATA struct */
- __le16 Owner; /* offset to BLOB_DATA struct */
- __le16 DefaultDacl; /* offset to BLOB_DATA struct */
- __le16 DeviceGroups; /* offset to SID_ARRAY_DATA struct */
- __le16 UserClaims; /* offset to BLOB_DATA struct */
- __le16 DeviceClaims; /* offset to BLOB_DATA struct */
- __u8 TicketInfo[]; /* variable length buf - remoted identity data */
-} __packed;
-
-struct smb2_tree_connect_req_extension {
- __le32 TreeConnectContextOffset;
- __le16 TreeConnectContextCount;
- __u8 Reserved[10];
- __u8 PathName[]; /* variable sized array */
- /* followed by array of TreeConnectContexts */
-} __packed;
-
-struct smb2_tree_connect_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 16 */
- __u8 ShareType; /* see below */
- __u8 Reserved;
- __le32 ShareFlags; /* see below */
- __le32 Capabilities; /* see below */
- __le32 MaximalAccess;
-} __packed;
-
-/* Possible ShareType values */
-#define SMB2_SHARE_TYPE_DISK 0x01
-#define SMB2_SHARE_TYPE_PIPE 0x02
-#define SMB2_SHARE_TYPE_PRINT 0x03
-
-/*
- * Possible ShareFlags - exactly one and only one of the first 4 caching flags
- * must be set (any of the remaining, SHI1005, flags may be set individually
- * or in combination.
- */
-#define SMB2_SHAREFLAG_MANUAL_CACHING 0x00000000
-#define SMB2_SHAREFLAG_AUTO_CACHING 0x00000010
-#define SMB2_SHAREFLAG_VDO_CACHING 0x00000020
-#define SMB2_SHAREFLAG_NO_CACHING 0x00000030
-#define SHI1005_FLAGS_DFS 0x00000001
-#define SHI1005_FLAGS_DFS_ROOT 0x00000002
-#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100
-#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200
-#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400
-#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800
-#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000
-#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000
-#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000
-#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000
-#define SMB2_SHAREFLAG_IDENTITY_REMOTING 0x00040000 /* 3.1.1 */
-#define SMB2_SHAREFLAG_COMPRESS_DATA 0x00100000 /* 3.1.1 */
-#define SHI1005_FLAGS_ALL 0x0014FF33
-
-/* Possible share capabilities */
-#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */
-#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */
-#define SMB2_SHARE_CAP_SCALEOUT cpu_to_le32(0x00000020) /* 3.0 */
-#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */
-#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */
-#define SMB2_SHARE_CAP_REDIRECT_TO_OWNER cpu_to_le32(0x00000100) /* 3.1.1 */
-
-struct smb2_tree_disconnect_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-struct smb2_tree_disconnect_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 4 */
- __le16 Reserved;
-} __packed;
-
-/* File Attrubutes */
-#define FILE_ATTRIBUTE_READONLY 0x00000001
-#define FILE_ATTRIBUTE_HIDDEN 0x00000002
-#define FILE_ATTRIBUTE_SYSTEM 0x00000004
-#define FILE_ATTRIBUTE_DIRECTORY 0x00000010
-#define FILE_ATTRIBUTE_ARCHIVE 0x00000020
-#define FILE_ATTRIBUTE_NORMAL 0x00000080
-#define FILE_ATTRIBUTE_TEMPORARY 0x00000100
-#define FILE_ATTRIBUTE_SPARSE_FILE 0x00000200
-#define FILE_ATTRIBUTE_REPARSE_POINT 0x00000400
-#define FILE_ATTRIBUTE_COMPRESSED 0x00000800
-#define FILE_ATTRIBUTE_OFFLINE 0x00001000
-#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x00002000
-#define FILE_ATTRIBUTE_ENCRYPTED 0x00004000
-#define FILE_ATTRIBUTE_INTEGRITY_STREAM 0x00008000
-#define FILE_ATTRIBUTE_NO_SCRUB_DATA 0x00020000
-
-/* Oplock levels */
-#define SMB2_OPLOCK_LEVEL_NONE 0x00
-#define SMB2_OPLOCK_LEVEL_II 0x01
-#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08
-#define SMB2_OPLOCK_LEVEL_BATCH 0x09
-#define SMB2_OPLOCK_LEVEL_LEASE 0xFF
-/* Non-spec internal type */
-#define SMB2_OPLOCK_LEVEL_NOCHANGE 0x99
-
-/* Desired Access Flags */
-#define FILE_READ_DATA_LE cpu_to_le32(0x00000001)
-#define FILE_WRITE_DATA_LE cpu_to_le32(0x00000002)
-#define FILE_APPEND_DATA_LE cpu_to_le32(0x00000004)
-#define FILE_READ_EA_LE cpu_to_le32(0x00000008)
-#define FILE_WRITE_EA_LE cpu_to_le32(0x00000010)
-#define FILE_EXECUTE_LE cpu_to_le32(0x00000020)
-#define FILE_READ_ATTRIBUTES_LE cpu_to_le32(0x00000080)
-#define FILE_WRITE_ATTRIBUTES_LE cpu_to_le32(0x00000100)
-#define FILE_DELETE_LE cpu_to_le32(0x00010000)
-#define FILE_READ_CONTROL_LE cpu_to_le32(0x00020000)
-#define FILE_WRITE_DAC_LE cpu_to_le32(0x00040000)
-#define FILE_WRITE_OWNER_LE cpu_to_le32(0x00080000)
-#define FILE_SYNCHRONIZE_LE cpu_to_le32(0x00100000)
-#define FILE_ACCESS_SYSTEM_SECURITY_LE cpu_to_le32(0x01000000)
-#define FILE_MAXIMAL_ACCESS_LE cpu_to_le32(0x02000000)
-#define FILE_GENERIC_ALL_LE cpu_to_le32(0x10000000)
-#define FILE_GENERIC_EXECUTE_LE cpu_to_le32(0x20000000)
-#define FILE_GENERIC_WRITE_LE cpu_to_le32(0x40000000)
-#define FILE_GENERIC_READ_LE cpu_to_le32(0x80000000)
-
-/* ShareAccess Flags */
-#define FILE_SHARE_READ_LE cpu_to_le32(0x00000001)
-#define FILE_SHARE_WRITE_LE cpu_to_le32(0x00000002)
-#define FILE_SHARE_DELETE_LE cpu_to_le32(0x00000004)
-#define FILE_SHARE_ALL_LE cpu_to_le32(0x00000007)
-
-/* CreateDisposition Flags */
-#define FILE_SUPERSEDE_LE cpu_to_le32(0x00000000)
-#define FILE_OPEN_LE cpu_to_le32(0x00000001)
-#define FILE_CREATE_LE cpu_to_le32(0x00000002)
-#define FILE_OPEN_IF_LE cpu_to_le32(0x00000003)
-#define FILE_OVERWRITE_LE cpu_to_le32(0x00000004)
-#define FILE_OVERWRITE_IF_LE cpu_to_le32(0x00000005)
-
-/* CreateOptions Flags */
-#define FILE_DIRECTORY_FILE_LE cpu_to_le32(0x00000001)
-/* same as #define CREATE_NOT_FILE_LE cpu_to_le32(0x00000001) */
-#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002)
-#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004)
-#define FILE_NO_INTERMEDIATE_BUFFERRING_LE cpu_to_le32(0x00000008)
-#define FILE_SYNCHRONOUS_IO_ALERT_LE cpu_to_le32(0x00000010)
-#define FILE_SYNCHRONOUS_IO_NON_ALERT_LE cpu_to_le32(0x00000020)
-#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040)
-#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100)
-#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200)
-#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800)
-#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000)
-#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000)
-#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000)
-#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000)
-#define FILE_RESERVE_OPFILTER_LE cpu_to_le32(0x00100000)
-#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000)
-#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000)
-#define FILE_OPEN_FOR_FREE_SPACE_QUERY_LE cpu_to_le32(0x00800000)
-
-#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \
- | FILE_READ_ATTRIBUTES_LE)
-#define FILE_WRITE_RIGHTS_LE (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE \
- | FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE)
-#define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE)
-
-/* Impersonation Levels. See MS-WPO section 9.7 and MSDN-IMPERS */
-#define IL_ANONYMOUS cpu_to_le32(0x00000000)
-#define IL_IDENTIFICATION cpu_to_le32(0x00000001)
-#define IL_IMPERSONATION cpu_to_le32(0x00000002)
-#define IL_DELEGATE cpu_to_le32(0x00000003)
-
-/* Create Context Values */
-#define SMB2_CREATE_EA_BUFFER "ExtA" /* extended attributes */
-#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */
-#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ"
-#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC"
-#define SMB2_CREATE_ALLOCATION_SIZE "AISi"
-#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc"
-#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp"
-#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid"
-#define SMB2_CREATE_REQUEST_LEASE "RqLs"
-#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q"
-#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C"
-#define SMB2_CREATE_APP_INSTANCE_ID 0x45BCA66AEFA7F74A9008FA462E144D74
-#define SMB2_CREATE_APP_INSTANCE_VERSION 0xB982D0B73B56074FA07B524A8116A010
-#define SVHDX_OPEN_DEVICE_CONTEX 0x9CCBCF9E04C1E643980E158DA1F6EC83
-#define SMB2_CREATE_TAG_POSIX 0x93AD25509CB411E7B42383DE968BCD7C
-
-/* Flag (SMB3 open response) values */
-#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01
-
/*
* Maximum number of iovs we need for an open/create request.
* [0] : struct smb2_create_req
@@ -807,26 +128,6 @@ struct smb2_tree_disconnect_rsp {
*/
#define SMB2_CREATE_IOV_SIZE 8
-struct smb2_create_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 57 */
- __u8 SecurityFlags;
- __u8 RequestedOplockLevel;
- __le32 ImpersonationLevel;
- __le64 SmbCreateFlags;
- __le64 Reserved;
- __le32 DesiredAccess;
- __le32 FileAttributes;
- __le32 ShareAccess;
- __le32 CreateDisposition;
- __le32 CreateOptions;
- __le16 NameOffset;
- __le16 NameLength;
- __le32 CreateContextsOffset;
- __le32 CreateContextsLength;
- __u8 Buffer[];
-} __packed;
-
/*
* Maximum size of a SMB2_CREATE response is 64 (smb2 header) +
* 88 (fixed part of create response) + 520 (path) + 208 (contexts) +
@@ -834,37 +135,6 @@ struct smb2_create_req {
*/
#define MAX_SMB2_CREATE_RESPONSE_SIZE 880
-struct smb2_create_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 89 */
- __u8 OplockLevel;
- __u8 Flag; /* 0x01 if reparse point */
- __le32 CreateAction;
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le64 AllocationSize;
- __le64 EndofFile;
- __le32 FileAttributes;
- __le32 Reserved2;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __le32 CreateContextsOffset;
- __le32 CreateContextsLength;
- __u8 Buffer[1];
-} __packed;
-
-struct create_context {
- __le32 Next;
- __le16 NameOffset;
- __le16 NameLength;
- __le16 Reserved;
- __le16 DataOffset;
- __le32 DataLength;
- __u8 Buffer[];
-} __packed;
-
#define SMB2_LEASE_READ_CACHING_HE 0x01
#define SMB2_LEASE_HANDLE_CACHING_HE 0x02
#define SMB2_LEASE_WRITE_CACHING_HE 0x04
@@ -1210,7 +480,7 @@ struct duplicate_extents_to_file {
#define SMB2_IOCTL_IOV_SIZE 2
struct smb2_ioctl_req {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 57 */
__u16 Reserved;
__le32 CtlCode;
@@ -1228,7 +498,7 @@ struct smb2_ioctl_req {
} __packed;
struct smb2_ioctl_rsp {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 57 */
__u16 Reserved;
__le32 CtlCode;
@@ -1243,161 +513,6 @@ struct smb2_ioctl_rsp {
/* char * buffer[] */
} __packed;
-/* Currently defined values for close flags */
-#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001)
-struct smb2_close_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 24 */
- __le16 Flags;
- __le32 Reserved;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
-} __packed;
-
-/*
- * Maximum size of a SMB2_CLOSE response is 64 (smb2 header) + 60 (data)
- */
-#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124
-
-struct smb2_close_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* 60 */
- __le16 Flags;
- __le32 Reserved;
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */
- __le64 EndOfFile;
- __le32 Attributes;
-} __packed;
-
-struct smb2_flush_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 24 */
- __le16 Reserved1;
- __le32 Reserved2;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
-} __packed;
-
-struct smb2_flush_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize;
- __le16 Reserved;
-} __packed;
-
-/* For read request Flags field below, following flag is defined for SMB3.02 */
-#define SMB2_READFLAG_READ_UNBUFFERED 0x01
-#define SMB2_READFLAG_REQUEST_COMPRESSED 0x02 /* See MS-SMB2 2.2.19 */
-
-/* Channel field for read and write: exactly one of following flags can be set*/
-#define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000)
-#define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001) /* SMB3 or later */
-#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002) /* >= SMB3.02 */
-#define SMB2_CHANNEL_RDMA_TRANSFORM cpu_to_le32(0x00000003) /* >= SMB3.02, only used on write */
-
-/* SMB2 read request without RFC1001 length at the beginning */
-struct smb2_read_plain_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 49 */
- __u8 Padding; /* offset from start of SMB2 header to place read */
- __u8 Flags; /* MBZ unless SMB3.02 or later */
- __le32 Length;
- __le64 Offset;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __le32 MinimumCount;
- __le32 Channel; /* MBZ except for SMB3 or later */
- __le32 RemainingBytes;
- __le16 ReadChannelInfoOffset;
- __le16 ReadChannelInfoLength;
- __u8 Buffer[1];
-} __packed;
-
-/* Read flags */
-#define SMB2_READFLAG_RESPONSE_NONE 0x00000000
-#define SMB2_READFLAG_RESPONSE_RDMA_TRANSFORM 0x00000001
-
-struct smb2_read_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 17 */
- __u8 DataOffset;
- __u8 Reserved;
- __le32 DataLength;
- __le32 DataRemaining;
- __u32 Flags;
- __u8 Buffer[1];
-} __packed;
-
-/* For write request Flags field below the following flags are defined: */
-#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 /* SMB2.1 or later */
-#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */
-
-struct smb2_write_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 49 */
- __le16 DataOffset; /* offset from start of SMB2 header to write data */
- __le32 Length;
- __le64 Offset;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __le32 Channel; /* MBZ unless SMB3.02 or later */
- __le32 RemainingBytes;
- __le16 WriteChannelInfoOffset;
- __le16 WriteChannelInfoLength;
- __le32 Flags;
- __u8 Buffer[1];
-} __packed;
-
-struct smb2_write_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 17 */
- __u8 DataOffset;
- __u8 Reserved;
- __le32 DataLength;
- __le32 DataRemaining;
- __u32 Reserved2;
- __u8 Buffer[1];
-} __packed;
-
-/* notify flags */
-#define SMB2_WATCH_TREE 0x0001
-
-/* notify completion filter flags. See MS-FSCC 2.6 and MS-SMB2 2.2.35 */
-#define FILE_NOTIFY_CHANGE_FILE_NAME 0x00000001
-#define FILE_NOTIFY_CHANGE_DIR_NAME 0x00000002
-#define FILE_NOTIFY_CHANGE_ATTRIBUTES 0x00000004
-#define FILE_NOTIFY_CHANGE_SIZE 0x00000008
-#define FILE_NOTIFY_CHANGE_LAST_WRITE 0x00000010
-#define FILE_NOTIFY_CHANGE_LAST_ACCESS 0x00000020
-#define FILE_NOTIFY_CHANGE_CREATION 0x00000040
-#define FILE_NOTIFY_CHANGE_EA 0x00000080
-#define FILE_NOTIFY_CHANGE_SECURITY 0x00000100
-#define FILE_NOTIFY_CHANGE_STREAM_NAME 0x00000200
-#define FILE_NOTIFY_CHANGE_STREAM_SIZE 0x00000400
-#define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800
-
-struct smb2_change_notify_req {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize;
- __le16 Flags;
- __le32 OutputBufferLength;
- __u64 PersistentFileId; /* opaque endianness */
- __u64 VolatileFileId; /* opaque endianness */
- __le32 CompletionFilter;
- __u32 Reserved;
-} __packed;
-
-struct smb2_change_notify_rsp {
- struct smb2_sync_hdr sync_hdr;
- __le16 StructureSize; /* Must be 9 */
- __le16 OutputBufferOffset;
- __le32 OutputBufferLength;
- __u8 Buffer[1]; /* array of file notify structs */
-} __packed;
-
#define SMB2_LOCKFLAG_SHARED_LOCK 0x0001
#define SMB2_LOCKFLAG_EXCLUSIVE_LOCK 0x0002
#define SMB2_LOCKFLAG_UNLOCK 0x0004
@@ -1411,7 +526,7 @@ struct smb2_lock_element {
} __packed;
struct smb2_lock_req {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 48 */
__le16 LockCount;
/*
@@ -1426,19 +541,19 @@ struct smb2_lock_req {
} __packed;
struct smb2_lock_rsp {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 4 */
__le16 Reserved;
} __packed;
struct smb2_echo_req {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 4 */
__u16 Reserved;
} __packed;
struct smb2_echo_rsp {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 4 */
__u16 Reserved;
} __packed;
@@ -1468,7 +583,7 @@ struct smb2_echo_rsp {
*/
struct smb2_query_directory_req {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 33 */
__u8 FileInformationClass;
__u8 Flags;
@@ -1482,7 +597,7 @@ struct smb2_query_directory_req {
} __packed;
struct smb2_query_directory_rsp {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 9 */
__le16 OutputBufferOffset;
__le32 OutputBufferLength;
@@ -1515,7 +630,7 @@ struct smb2_query_directory_rsp {
#define SL_INDEX_SPECIFIED 0x00000004
struct smb2_query_info_req {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 41 */
__u8 InfoType;
__u8 FileInfoClass;
@@ -1531,7 +646,7 @@ struct smb2_query_info_req {
} __packed;
struct smb2_query_info_rsp {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 9 */
__le16 OutputBufferOffset;
__le32 OutputBufferLength;
@@ -1548,7 +663,7 @@ struct smb2_query_info_rsp {
#define SMB2_SET_INFO_IOV_SIZE 3
struct smb2_set_info_req {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 33 */
__u8 InfoType;
__u8 FileInfoClass;
@@ -1562,12 +677,12 @@ struct smb2_set_info_req {
} __packed;
struct smb2_set_info_rsp {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 2 */
} __packed;
struct smb2_oplock_break {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 24 */
__u8 OplockLevel;
__u8 Reserved;
@@ -1579,7 +694,7 @@ struct smb2_oplock_break {
#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01)
struct smb2_lease_break {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 44 */
__le16 Epoch;
__le32 Flags;
@@ -1592,7 +707,7 @@ struct smb2_lease_break {
} __packed;
struct smb2_lease_ack {
- struct smb2_sync_hdr sync_hdr;
+ struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 36 */
__le16 Reserved;
__le32 Flags;
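The definitions deleted from this header move to a copy shared with the in-kernel server; the replacement struct smb2_hdr itself is not part of this diff. From the accessors used above (hdr.Id.SyncId.ProcessId, hdr.Id.SyncId.TreeId, le64_to_cpu(hdr.SessionId)), its layout is presumably the old smb2_sync_hdr with the ProcessId/TreeId pair folded into a union that can also carry an async id, and with the formerly opaque fields typed little-endian, roughly:

	/* sketch, inferred from the call sites above, not copied from the shared header */
	struct smb2_hdr {
		__le32 ProtocolId;	/* 0xFE 'S' 'M' 'B' */
		__le16 StructureSize;	/* 64 */
		__le16 CreditCharge;
		__le32 Status;
		__le16 Command;
		__le16 CreditRequest;
		__le32 Flags;
		__le32 NextCommand;
		__le64 MessageId;
		union {
			struct {
				__le32 ProcessId;
				__le32 TreeId;
			} __packed SyncId;
			__le64 AsyncId;
		} __packed Id;
		__le64 SessionId;
		__u8   Signature[16];
	} __packed;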
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 547945443fa7..096fada16ebd 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -25,7 +25,7 @@ extern int smb2_check_message(char *buf, unsigned int length,
struct TCP_Server_Info *server);
extern unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *server);
extern char *smb2_get_data_area_len(int *off, int *len,
- struct smb2_sync_hdr *shdr);
+ struct smb2_hdr *shdr);
extern __le16 *cifs_convert_path_to_utf16(const char *from,
struct cifs_sb_info *cifs_sb);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index f59b956f9d25..2bf047b390a9 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -19,7 +19,6 @@
#include <linux/mempool.h>
#include <linux/highmem.h>
#include <crypto/aead.h>
-#include "smb2pdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
#include "smb2proto.h"
@@ -213,14 +212,14 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
unsigned char smb2_signature[SMB2_HMACSHA256_SIZE];
unsigned char *sigptr = smb2_signature;
struct kvec *iov = rqst->rq_iov;
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base;
struct cifs_ses *ses;
struct shash_desc *shash;
struct crypto_shash *hash;
struct sdesc *sdesc = NULL;
struct smb_rqst drqst;
- ses = smb2_find_smb_ses(server, shdr->SessionId);
+ ses = smb2_find_smb_ses(server, le64_to_cpu(shdr->SessionId));
if (!ses) {
cifs_server_dbg(VFS, "%s: Could not find session\n", __func__);
return 0;
@@ -534,14 +533,14 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
unsigned char smb3_signature[SMB2_CMACAES_SIZE];
unsigned char *sigptr = smb3_signature;
struct kvec *iov = rqst->rq_iov;
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base;
+ struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base;
struct shash_desc *shash;
struct crypto_shash *hash;
struct sdesc *sdesc = NULL;
struct smb_rqst drqst;
u8 key[SMB3_SIGN_KEY_SIZE];
- rc = smb2_get_sign_key(shdr->SessionId, server, key);
+ rc = smb2_get_sign_key(le64_to_cpu(shdr->SessionId), server, key);
if (rc)
return 0;
@@ -611,12 +610,12 @@ static int
smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
int rc = 0;
- struct smb2_sync_hdr *shdr;
+ struct smb2_hdr *shdr;
struct smb2_sess_setup_req *ssr;
bool is_binding;
bool is_signed;
- shdr = (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+ shdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
ssr = (struct smb2_sess_setup_req *)shdr;
is_binding = shdr->Command == SMB2_SESSION_SETUP &&
@@ -642,8 +641,8 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
unsigned int rc;
char server_response_sig[SMB2_SIGNATURE_SIZE];
- struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+ struct smb2_hdr *shdr =
+ (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
if ((shdr->Command == SMB2_NEGOTIATE) ||
(shdr->Command == SMB2_SESSION_SETUP) ||
@@ -689,7 +688,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
*/
static inline void
smb2_seq_num_into_buf(struct TCP_Server_Info *server,
- struct smb2_sync_hdr *shdr)
+ struct smb2_hdr *shdr)
{
unsigned int i, num = le16_to_cpu(shdr->CreditCharge);
@@ -700,7 +699,7 @@ smb2_seq_num_into_buf(struct TCP_Server_Info *server,
}
static struct mid_q_entry *
-smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
+smb2_mid_entry_alloc(const struct smb2_hdr *shdr,
struct TCP_Server_Info *server)
{
struct mid_q_entry *temp;
@@ -732,14 +731,15 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
atomic_inc(&midCount);
temp->mid_state = MID_REQUEST_ALLOCATED;
- trace_smb3_cmd_enter(shdr->TreeId, shdr->SessionId,
- le16_to_cpu(shdr->Command), temp->mid);
+ trace_smb3_cmd_enter(le32_to_cpu(shdr->Id.SyncId.TreeId),
+ le64_to_cpu(shdr->SessionId),
+ le16_to_cpu(shdr->Command), temp->mid);
return temp;
}
static int
smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server,
- struct smb2_sync_hdr *shdr, struct mid_q_entry **mid)
+ struct smb2_hdr *shdr, struct mid_q_entry **mid)
{
if (server->tcpStatus == CifsExiting)
return -ENOENT;
@@ -807,8 +807,8 @@ smb2_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *server,
struct smb_rqst *rqst)
{
int rc;
- struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+ struct smb2_hdr *shdr =
+ (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
struct mid_q_entry *mid;
smb2_seq_num_into_buf(server, shdr);
@@ -833,8 +833,8 @@ struct mid_q_entry *
smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
{
int rc;
- struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+ struct smb2_hdr *shdr =
+ (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
struct mid_q_entry *mid;
if (server->tcpStatus == CifsNeedNegotiate &&
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index dafcb6ab050d..6cecf302dcfd 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -11,6 +11,8 @@
#define _CIFS_TRACE_H
#include <linux/tracepoint.h>
+#include <linux/net.h>
+#include <linux/inet.h>
/*
* Please use this 3-part article as a reference for writing new tracepoints:
@@ -854,6 +856,75 @@ DEFINE_EVENT(smb3_lease_err_class, smb3_##name, \
DEFINE_SMB3_LEASE_ERR_EVENT(lease_err);
+DECLARE_EVENT_CLASS(smb3_connect_class,
+ TP_PROTO(char *hostname,
+ __u64 conn_id,
+ const struct __kernel_sockaddr_storage *dst_addr),
+ TP_ARGS(hostname, conn_id, dst_addr),
+ TP_STRUCT__entry(
+ __string(hostname, hostname)
+ __field(__u64, conn_id)
+ __array(__u8, dst_addr, sizeof(struct sockaddr_storage))
+ ),
+ TP_fast_assign(
+ struct sockaddr_storage *pss = NULL;
+
+ __entry->conn_id = conn_id;
+ pss = (struct sockaddr_storage *)__entry->dst_addr;
+ *pss = *dst_addr;
+ __assign_str(hostname, hostname);
+ ),
+ TP_printk("conn_id=0x%llx server=%s addr=%pISpsfc",
+ __entry->conn_id,
+ __get_str(hostname),
+ __entry->dst_addr)
+)
+
+#define DEFINE_SMB3_CONNECT_EVENT(name) \
+DEFINE_EVENT(smb3_connect_class, smb3_##name, \
+ TP_PROTO(char *hostname, \
+ __u64 conn_id, \
+ const struct __kernel_sockaddr_storage *addr), \
+ TP_ARGS(hostname, conn_id, addr))
+
+DEFINE_SMB3_CONNECT_EVENT(connect_done);
+
+DECLARE_EVENT_CLASS(smb3_connect_err_class,
+ TP_PROTO(char *hostname, __u64 conn_id,
+ const struct __kernel_sockaddr_storage *dst_addr, int rc),
+ TP_ARGS(hostname, conn_id, dst_addr, rc),
+ TP_STRUCT__entry(
+ __string(hostname, hostname)
+ __field(__u64, conn_id)
+ __array(__u8, dst_addr, sizeof(struct sockaddr_storage))
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ struct sockaddr_storage *pss = NULL;
+
+ __entry->conn_id = conn_id;
+ __entry->rc = rc;
+ pss = (struct sockaddr_storage *)__entry->dst_addr;
+ *pss = *dst_addr;
+ __assign_str(hostname, hostname);
+ ),
+ TP_printk("rc=%d conn_id=0x%llx server=%s addr=%pISpsfc",
+ __entry->rc,
+ __entry->conn_id,
+ __get_str(hostname),
+ __entry->dst_addr)
+)
+
+#define DEFINE_SMB3_CONNECT_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_connect_err_class, smb3_##name, \
+ TP_PROTO(char *hostname, \
+ __u64 conn_id, \
+ const struct __kernel_sockaddr_storage *addr, \
+ int rc), \
+ TP_ARGS(hostname, conn_id, addr, rc))
+
+DEFINE_SMB3_CONNECT_ERR_EVENT(connect_err);
+
DECLARE_EVENT_CLASS(smb3_reconnect_class,
TP_PROTO(__u64 currmid,
__u64 conn_id,
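These two event classes follow the usual tracepoint pattern: each DEFINE_EVENT() above expands into a callable trace_smb3_connect_done() / trace_smb3_connect_err() hook. A minimal sketch of how a connection path might fire them, with hypothetical local names (illustrative only, not taken from fs/cifs/connect.c):

/* Illustrative only: hypothetical caller of the new connect tracepoints. */
static void example_trace_connect_result(char *hostname, __u64 conn_id,
					 const struct __kernel_sockaddr_storage *dstaddr,
					 int rc)
{
	if (rc)
		trace_smb3_connect_err(hostname, conn_id, dstaddr, rc);
	else
		trace_smb3_connect_done(hostname, conn_id, dstaddr);
}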
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 06855f6c7902..62a3d2565c26 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -63,9 +63,10 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
struct inode *inode;
struct coda_inode_info *cii;
unsigned long hash = coda_f2i(fid);
+ umode_t inode_type = coda_inode_type(attr);
+retry:
inode = iget5_locked(sb, hash, coda_test_inode, coda_set_inode, fid);
-
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -75,11 +76,15 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
inode->i_ino = hash;
/* inode is locked and unique, no need to grab cii->c_lock */
cii->c_mapcount = 0;
+ coda_fill_inode(inode, attr);
unlock_new_inode(inode);
+ } else if ((inode->i_mode & S_IFMT) != inode_type) {
+ /* Inode has changed type, mark bad and grab a new one */
+ remove_inode_hash(inode);
+ coda_flag_inode(inode, C_PURGE);
+ iput(inode);
+ goto retry;
}
-
- /* always replace the attributes, type might have changed */
- coda_fill_inode(inode, attr);
return inode;
}
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 2e1a5a192074..903ca8fa4b9b 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -87,28 +87,27 @@ static struct coda_timespec timespec64_to_coda(struct timespec64 ts64)
}
/* utility functions below */
+umode_t coda_inode_type(struct coda_vattr *attr)
+{
+ switch (attr->va_type) {
+ case C_VREG:
+ return S_IFREG;
+ case C_VDIR:
+ return S_IFDIR;
+ case C_VLNK:
+ return S_IFLNK;
+ case C_VNON:
+ default:
+ return 0;
+ }
+}
+
void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
{
- int inode_type;
- /* inode's i_flags, i_ino are set by iget
- XXX: is this all we need ??
- */
- switch (attr->va_type) {
- case C_VNON:
- inode_type = 0;
- break;
- case C_VREG:
- inode_type = S_IFREG;
- break;
- case C_VDIR:
- inode_type = S_IFDIR;
- break;
- case C_VLNK:
- inode_type = S_IFLNK;
- break;
- default:
- inode_type = 0;
- }
+ /* inode's i_flags, i_ino are set by iget
+ * XXX: is this all we need ??
+ */
+ umode_t inode_type = coda_inode_type(attr);
inode->i_mode |= inode_type;
if (attr->va_mode != (u_short) -1)
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index e7b27754ce78..9be281bbcc06 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -53,10 +53,11 @@ int coda_getattr(struct user_namespace *, const struct path *, struct kstat *,
u32, unsigned int);
int coda_setattr(struct user_namespace *, struct dentry *, struct iattr *);
-/* this file: heloers */
+/* this file: helpers */
char *coda_f2s(struct CodaFid *f);
int coda_iscontrol(const char *name, size_t length);
+umode_t coda_inode_type(struct coda_vattr *attr);
void coda_vattr_to_iattr(struct inode *, struct coda_vattr *);
void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *);
unsigned short coda_flags_to_cflags(unsigned short);
@@ -83,6 +84,9 @@ static __inline__ void coda_flag_inode(struct inode *inode, int flag)
{
struct coda_inode_info *cii = ITOC(inode);
+ if (!inode)
+ return;
+
spin_lock(&cii->c_lock);
cii->c_flags |= flag;
spin_unlock(&cii->c_lock);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index d69989c1bac3..328d7a684b63 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -317,13 +317,10 @@ static int coda_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
coda_dir_drop_nlink(old_dir);
coda_dir_inc_nlink(new_dir);
}
- coda_dir_update_mtime(old_dir);
- coda_dir_update_mtime(new_dir);
coda_flag_inode(d_inode(new_dentry), C_VATTR);
- } else {
- coda_flag_inode(old_dir, C_VATTR);
- coda_flag_inode(new_dir, C_VATTR);
}
+ coda_dir_update_mtime(old_dir);
+ coda_dir_update_mtime(new_dir);
}
return error;
}
@@ -499,15 +496,20 @@ out:
*/
static int coda_dentry_delete(const struct dentry * dentry)
{
- int flags;
+ struct inode *inode;
+ struct coda_inode_info *cii;
if (d_really_is_negative(dentry))
return 0;
- flags = (ITOC(d_inode(dentry))->c_flags) & C_PURGE;
- if (is_bad_inode(d_inode(dentry)) || flags) {
+ inode = d_inode(dentry);
+ if (!inode || is_bad_inode(inode))
return 1;
- }
+
+ cii = ITOC(inode);
+ if (cii->c_flags & C_PURGE)
+ return 1;
+
return 0;
}
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ef5ca22bfb3e..29dd87be2fb8 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -8,6 +8,7 @@
* to the Coda project. Contact Peter Braam <coda@cs.cmu.edu>.
*/
+#include <linux/refcount.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/time.h>
@@ -28,7 +29,7 @@
#include "coda_int.h"
struct coda_vm_ops {
- atomic_t refcnt;
+ refcount_t refcnt;
struct file *coda_file;
const struct vm_operations_struct *host_vm_ops;
struct vm_operations_struct vm_ops;
@@ -98,7 +99,7 @@ coda_vm_open(struct vm_area_struct *vma)
struct coda_vm_ops *cvm_ops =
container_of(vma->vm_ops, struct coda_vm_ops, vm_ops);
- atomic_inc(&cvm_ops->refcnt);
+ refcount_inc(&cvm_ops->refcnt);
if (cvm_ops->host_vm_ops && cvm_ops->host_vm_ops->open)
cvm_ops->host_vm_ops->open(vma);
@@ -113,7 +114,7 @@ coda_vm_close(struct vm_area_struct *vma)
if (cvm_ops->host_vm_ops && cvm_ops->host_vm_ops->close)
cvm_ops->host_vm_ops->close(vma);
- if (atomic_dec_and_test(&cvm_ops->refcnt)) {
+ if (refcount_dec_and_test(&cvm_ops->refcnt)) {
vma->vm_ops = cvm_ops->host_vm_ops;
fput(cvm_ops->coda_file);
kfree(cvm_ops);
@@ -189,7 +190,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
cvm_ops->vm_ops.open = coda_vm_open;
cvm_ops->vm_ops.close = coda_vm_close;
cvm_ops->coda_file = coda_file;
- atomic_set(&cvm_ops->refcnt, 1);
+ refcount_set(&cvm_ops->refcnt, 1);
vma->vm_ops = &cvm_ops->vm_ops;
}
@@ -238,11 +239,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
struct coda_file_info *cfi;
struct coda_inode_info *cii;
struct inode *host_inode;
- int err;
cfi = coda_ftoc(coda_file);
- err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode),
+ venus_close(coda_inode->i_sb, coda_i2f(coda_inode),
coda_flags, coda_file->f_cred->fsuid);
host_inode = file_inode(cfi->cfi_container);
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 240669f51eac..b39580ad4ce5 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -122,14 +122,10 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
hdr.opcode, hdr.unique);
nbytes = size;
}
- dcbuf = kvmalloc(nbytes, GFP_KERNEL);
- if (!dcbuf) {
- retval = -ENOMEM;
- goto out;
- }
- if (copy_from_user(dcbuf, buf, nbytes)) {
- kvfree(dcbuf);
- retval = -EFAULT;
+
+ dcbuf = vmemdup_user(buf, nbytes);
+ if (IS_ERR(dcbuf)) {
+ retval = PTR_ERR(dcbuf);
goto out;
}
@@ -388,7 +384,7 @@ MODULE_AUTHOR("Jan Harkes, Peter J. Braam");
MODULE_DESCRIPTION("Coda Distributed File System VFS interface");
MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR);
MODULE_LICENSE("GPL");
-MODULE_VERSION("7.0");
+MODULE_VERSION("7.2");
static int __init init_coda(void)
{
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index eb3b1898da46..59f6cfd06f96 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -744,7 +744,8 @@ static int coda_upcall(struct venus_comm *vcp,
list_add_tail(&req->uc_chain, &vcp->vc_pending);
wake_up_interruptible(&vcp->vc_waitq);
- if (req->uc_flags & CODA_REQ_ASYNC) {
+ /* We can return early on asynchronous requests */
+ if (outSize == NULL) {
mutex_unlock(&vcp->vc_mutex);
return 0;
}
diff --git a/fs/coredump.c b/fs/coredump.c
index 3224dee44d30..a6b3c196cdef 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -359,7 +359,7 @@ static int zap_process(struct task_struct *start, int exit_code, int flags)
for_each_thread(start, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
- if (t != current && t->mm) {
+ if (t != current && !(t->flags & PF_POSTCOREDUMP)) {
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
nr++;
@@ -369,99 +369,34 @@ static int zap_process(struct task_struct *start, int exit_code, int flags)
return nr;
}
-static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+static int zap_threads(struct task_struct *tsk,
struct core_state *core_state, int exit_code)
{
- struct task_struct *g, *p;
- unsigned long flags;
int nr = -EAGAIN;
spin_lock_irq(&tsk->sighand->siglock);
if (!signal_group_exit(tsk->signal)) {
- mm->core_state = core_state;
+ tsk->signal->core_state = core_state;
tsk->signal->group_exit_task = tsk;
nr = zap_process(tsk, exit_code, 0);
clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
+ tsk->flags |= PF_DUMPCORE;
+ atomic_set(&core_state->nr_threads, nr);
}
spin_unlock_irq(&tsk->sighand->siglock);
- if (unlikely(nr < 0))
- return nr;
-
- tsk->flags |= PF_DUMPCORE;
- if (atomic_read(&mm->mm_users) == nr + 1)
- goto done;
- /*
- * We should find and kill all tasks which use this mm, and we should
- * count them correctly into ->nr_threads. We don't take tasklist
- * lock, but this is safe wrt:
- *
- * fork:
- * None of sub-threads can fork after zap_process(leader). All
- * processes which were created before this point should be
- * visible to zap_threads() because copy_process() adds the new
- * process to the tail of init_task.tasks list, and lock/unlock
- * of ->siglock provides a memory barrier.
- *
- * do_exit:
- * The caller holds mm->mmap_lock. This means that the task which
- * uses this mm can't pass exit_mm(), so it can't exit or clear
- * its ->mm.
- *
- * de_thread:
- * It does list_replace_rcu(&leader->tasks, &current->tasks),
- * we must see either old or new leader, this does not matter.
- * However, it can change p->sighand, so lock_task_sighand(p)
- * must be used. Since p->mm != NULL and we hold ->mmap_lock
- * it can't fail.
- *
- * Note also that "g" can be the old leader with ->mm == NULL
- * and already unhashed and thus removed from ->thread_group.
- * This is OK, __unhash_process()->list_del_rcu() does not
- * clear the ->next pointer, we will find the new leader via
- * next_thread().
- */
- rcu_read_lock();
- for_each_process(g) {
- if (g == tsk->group_leader)
- continue;
- if (g->flags & PF_KTHREAD)
- continue;
-
- for_each_thread(g, p) {
- if (unlikely(!p->mm))
- continue;
- if (unlikely(p->mm == mm)) {
- lock_task_sighand(p, &flags);
- nr += zap_process(p, exit_code,
- SIGNAL_GROUP_EXIT);
- unlock_task_sighand(p, &flags);
- }
- break;
- }
- }
- rcu_read_unlock();
-done:
- atomic_set(&core_state->nr_threads, nr);
return nr;
}
static int coredump_wait(int exit_code, struct core_state *core_state)
{
struct task_struct *tsk = current;
- struct mm_struct *mm = tsk->mm;
int core_waiters = -EBUSY;
init_completion(&core_state->startup);
core_state->dumper.task = tsk;
core_state->dumper.next = NULL;
- if (mmap_write_lock_killable(mm))
- return -EINTR;
-
- if (!mm->core_state)
- core_waiters = zap_threads(tsk, mm, core_state, exit_code);
- mmap_write_unlock(mm);
-
+ core_waiters = zap_threads(tsk, core_state, exit_code);
if (core_waiters > 0) {
struct core_thread *ptr;
@@ -483,7 +418,7 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
return core_waiters;
}
-static void coredump_finish(struct mm_struct *mm, bool core_dumped)
+static void coredump_finish(bool core_dumped)
{
struct core_thread *curr, *next;
struct task_struct *task;
@@ -493,22 +428,21 @@ static void coredump_finish(struct mm_struct *mm, bool core_dumped)
current->signal->group_exit_code |= 0x80;
current->signal->group_exit_task = NULL;
current->signal->flags = SIGNAL_GROUP_EXIT;
+ next = current->signal->core_state->dumper.next;
+ current->signal->core_state = NULL;
spin_unlock_irq(&current->sighand->siglock);
- next = mm->core_state->dumper.next;
while ((curr = next) != NULL) {
next = curr->next;
task = curr->task;
/*
- * see exit_mm(), curr->task must not see
+ * see coredump_task_exit(), curr->task must not see
* ->task == NULL before we read ->next.
*/
smp_mb();
curr->task = NULL;
wake_up_process(task);
}
-
- mm->core_state = NULL;
}
static bool dump_interrupted(void)
@@ -839,7 +773,7 @@ fail_dropcount:
fail_unlock:
kfree(argv);
kfree(cn.corename);
- coredump_finish(mm, core_dumped);
+ coredump_finish(core_dumped);
revert_creds(old_cred);
fail_creds:
put_cred(cred);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 2be65269a987..666aa380011e 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -209,7 +209,7 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
return read_buffers[i] + blk_offset;
}
- devsize = mapping->host->i_size >> PAGE_SHIFT;
+ devsize = bdev_nr_bytes(sb->s_bdev) >> PAGE_SHIFT;
/* Ok, read in BLKS_PER_BUF pages completely first. */
for (i = 0; i < BLKS_PER_BUF; i++) {
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 68a2de6b5a9b..bfc2a5b74ed3 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -1,23 +1,10 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * This contains encryption functions for per-file encryption.
+ * Utility functions for file contents encryption/decryption on
+ * block device-based filesystems.
*
* Copyright (C) 2015, Google, Inc.
* Copyright (C) 2015, Motorola Mobility
- *
- * Written by Michael Halcrow, 2014.
- *
- * Filename encryption additions
- * Uday Savagaonkar, 2014
- * Encryption policy handling additions
- * Ildar Muslukhov, 2014
- * Add fscrypt_pullback_bio_page()
- * Jaegeuk Kim, 2015.
- *
- * This has not yet undergone a rigorous security audit.
- *
- * The usage of AES-XTS should conform to recommendations in NIST
- * Special Publication 800-38E and IEEE P1619/D16.
*/
#include <linux/pagemap.h>
@@ -26,6 +13,21 @@
#include <linux/namei.h>
#include "fscrypt_private.h"
+/**
+ * fscrypt_decrypt_bio() - decrypt the contents of a bio
+ * @bio: the bio to decrypt
+ *
+ * Decrypt the contents of a "read" bio following successful completion of the
+ * underlying disk read. The bio must be reading a whole number of blocks of an
+ * encrypted file directly into the page cache. If the bio is reading the
+ * ciphertext into bounce pages instead of the page cache (for example, because
+ * the file is also compressed, so decompression is required after decryption),
+ * then this function isn't applicable. This function may sleep, so it must be
+ * called from a workqueue rather than from the bio's bi_end_io callback.
+ *
+ * This function sets PG_error on any pages that contain any blocks that failed
+ * to be decrypted. The filesystem must not mark such pages uptodate.
+ */
void fscrypt_decrypt_bio(struct bio *bio)
{
struct bio_vec *bv;
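The new kerneldoc spells out the calling contract: decryption may sleep, so it has to be deferred to process context, and pages flagged with PG_error must not be marked uptodate. A minimal sketch of a read-completion worker honouring that contract, with hypothetical struct and function names (not from any in-tree filesystem):

/* Illustrative only: deferring fscrypt_decrypt_bio() to a workqueue. */
struct example_read_work {
	struct work_struct work;
	struct bio *bio;
};

static void example_read_work_fn(struct work_struct *work)
{
	struct example_read_work *rw =
		container_of(work, struct example_read_work, work);
	struct bio *bio = rw->bio;
	struct bio_vec *bv;
	struct bvec_iter_all iter_all;

	fscrypt_decrypt_bio(bio);	/* may sleep; never call from ->bi_end_io */

	bio_for_each_segment_all(bv, bio, iter_all) {
		struct page *page = bv->bv_page;

		/* blocks that failed to decrypt were flagged with PG_error */
		if (!PageError(page))
			SetPageUptodate(page);
		unlock_page(page);
	}
	bio_put(bio);
	kfree(rw);
}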
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index eb538c28df94..a9be4bc74a94 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -429,8 +429,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
if (fscrypt_has_encryption_key(dir)) {
if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy,
- iname->len,
- dir->i_sb->s_cop->max_namelen,
+ iname->len, NAME_MAX,
&fname->crypto_buf.len))
return -ENAMETOOLONG;
fname->crypto_buf.name = kmalloc(fname->crypto_buf.len,
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 3fa965eb3336..5b0a9e6478b5 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -20,6 +20,11 @@
#define FSCRYPT_FILE_NONCE_SIZE 16
+/*
+ * Minimum size of an fscrypt master key. Note: a longer key will be required
+ * if ciphers with a 256-bit security strength are used. This is just the
+ * absolute minimum, which applies when only 128-bit encryption is used.
+ */
#define FSCRYPT_MIN_KEY_SIZE 16
#define FSCRYPT_CONTEXT_V1 1
@@ -413,7 +418,11 @@ struct fscrypt_master_key_secret {
*/
struct fscrypt_hkdf hkdf;
- /* Size of the raw key in bytes. Set even if ->raw isn't set. */
+ /*
+ * Size of the raw key in bytes. This remains set even if ->raw was
+ * zeroized due to no longer being needed. I.e. we still remember the
+ * size of the key even if we don't need to remember the key itself.
+ */
u32 size;
/* For v1 policy keys: the raw key. Wiped for v2 policy keys. */
@@ -549,8 +558,9 @@ int __init fscrypt_init_keyring(void);
struct fscrypt_mode {
const char *friendly_name;
const char *cipher_str;
- int keysize;
- int ivsize;
+ int keysize; /* key size in bytes */
+ int security_strength; /* security strength in bytes */
+ int ivsize; /* IV size in bytes */
int logged_impl_name;
enum blk_crypto_mode_num blk_crypto_mode;
};
diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c
index e0ec21055505..7607d18b35fc 100644
--- a/fs/crypto/hkdf.c
+++ b/fs/crypto/hkdf.c
@@ -16,9 +16,14 @@
/*
* HKDF supports any unkeyed cryptographic hash algorithm, but fscrypt uses
- * SHA-512 because it is reasonably secure and efficient; and since it produces
- * a 64-byte digest, deriving an AES-256-XTS key preserves all 64 bytes of
- * entropy from the master key and requires only one iteration of HKDF-Expand.
+ * SHA-512 because it is well-established, secure, and reasonably efficient.
+ *
+ * HKDF-SHA256 was also considered, as its 256-bit security strength would be
+ * sufficient here. A 512-bit security strength is "nice to have", though.
+ * Also, on 64-bit CPUs, SHA-512 is usually just as fast as SHA-256. In the
+ * common case of deriving an AES-256-XTS key (512 bits), that can result in
+ * HKDF-SHA512 being much faster than HKDF-SHA256, as the longer digest size of
+ * SHA-512 causes HKDF-Expand to only need to do one iteration rather than two.
*/
#define HKDF_HMAC_ALG "hmac(sha512)"
#define HKDF_HASHLEN SHA512_DIGEST_SIZE
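To make the performance claim in the comment concrete: HKDF-Expand emits output one digest at a time, so the iteration count is the ceiling of the derived key length over the digest length. A tiny illustration of the arithmetic (not kernel code):

/* Illustrative arithmetic only: HKDF-Expand iteration counts. */
#define EXPAND_ITERS(okm_len, digest_len) \
	(((okm_len) + (digest_len) - 1) / (digest_len))

/* Deriving a 64-byte AES-256-XTS key:
 *   HKDF-SHA512: EXPAND_ITERS(64, 64) == 1 iteration
 *   HKDF-SHA256: EXPAND_ITERS(64, 32) == 2 iterations
 */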
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index bca9c6658a7c..eede186b04ce 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -19,6 +19,7 @@ struct fscrypt_mode fscrypt_modes[] = {
.friendly_name = "AES-256-XTS",
.cipher_str = "xts(aes)",
.keysize = 64,
+ .security_strength = 32,
.ivsize = 16,
.blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS,
},
@@ -26,12 +27,14 @@ struct fscrypt_mode fscrypt_modes[] = {
.friendly_name = "AES-256-CTS-CBC",
.cipher_str = "cts(cbc(aes))",
.keysize = 32,
+ .security_strength = 32,
.ivsize = 16,
},
[FSCRYPT_MODE_AES_128_CBC] = {
.friendly_name = "AES-128-CBC-ESSIV",
.cipher_str = "essiv(cbc(aes),sha256)",
.keysize = 16,
+ .security_strength = 16,
.ivsize = 16,
.blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV,
},
@@ -39,12 +42,14 @@ struct fscrypt_mode fscrypt_modes[] = {
.friendly_name = "AES-128-CTS-CBC",
.cipher_str = "cts(cbc(aes))",
.keysize = 16,
+ .security_strength = 16,
.ivsize = 16,
},
[FSCRYPT_MODE_ADIANTUM] = {
.friendly_name = "Adiantum",
.cipher_str = "adiantum(xchacha12,aes)",
.keysize = 32,
+ .security_strength = 32,
.ivsize = 32,
.blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM,
},
@@ -117,8 +122,9 @@ err_free_tfm:
/*
* Prepare the crypto transform object or blk-crypto key in @prep_key, given the
- * raw key, encryption mode, and flag indicating which encryption implementation
- * (fs-layer or blk-crypto) will be used.
+ * raw key, encryption mode (@ci->ci_mode), flag indicating which encryption
+ * implementation (fs-layer or blk-crypto) will be used (@ci->ci_inlinecrypt),
+ * and IV generation method (@ci->ci_policy.flags).
*/
int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key, const struct fscrypt_info *ci)
@@ -358,6 +364,45 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
}
/*
+ * Check whether the size of the given master key (@mk) is appropriate for the
+ * encryption settings which a particular file will use (@ci).
+ *
+ * If the file uses a v1 encryption policy, then the master key must be at least
+ * as long as the derived key, as this is a requirement of the v1 KDF.
+ *
+ * Otherwise, the KDF can accept any size key, so we enforce a slightly looser
+ * requirement: we require that the size of the master key be at least the
+ * maximum security strength of any algorithm whose key will be derived from it
+ * (but in practice we only need to consider @ci->ci_mode, since any other
+ * possible subkeys such as DIRHASH and INODE_HASH will never increase the
+ * required key size over @ci->ci_mode). This allows AES-256-XTS keys to be
+ * derived from a 256-bit master key, which is cryptographically sufficient,
+ * rather than requiring a 512-bit master key which is unnecessarily long. (We
+ * still allow 512-bit master keys if the user chooses to use them, though.)
+ */
+static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
+ const struct fscrypt_info *ci)
+{
+ unsigned int min_keysize;
+
+ if (ci->ci_policy.version == FSCRYPT_POLICY_V1)
+ min_keysize = ci->ci_mode->keysize;
+ else
+ min_keysize = ci->ci_mode->security_strength;
+
+ if (mk->mk_secret.size < min_keysize) {
+ fscrypt_warn(NULL,
+ "key with %s %*phN is too short (got %u bytes, need %u+ bytes)",
+ master_key_spec_type(&mk->mk_spec),
+ master_key_spec_len(&mk->mk_spec),
+ (u8 *)&mk->mk_spec.u,
+ mk->mk_secret.size, min_keysize);
+ return false;
+ }
+ return true;
+}
+
+/*
* Find the master key, then set up the inode's actual encryption key.
*
* If the master key is found in the filesystem-level keyring, then the
@@ -422,18 +467,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
goto out_release_key;
}
- /*
- * Require that the master key be at least as long as the derived key.
- * Otherwise, the derived key cannot possibly contain as much entropy as
- * that required by the encryption mode it will be used for. For v1
- * policies it's also required for the KDF to work at all.
- */
- if (mk->mk_secret.size < ci->ci_mode->keysize) {
- fscrypt_warn(NULL,
- "key with %s %*phN is too short (got %u bytes, need %u+ bytes)",
- master_key_spec_type(&mk_spec),
- master_key_spec_len(&mk_spec), (u8 *)&mk_spec.u,
- mk->mk_secret.size, ci->ci_mode->keysize);
+ if (!fscrypt_valid_master_key_size(mk, ci)) {
err = -ENOKEY;
goto out_release_key;
}
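In effect the new check reduces to a per-mode minimum that depends on the policy version. Taking AES-256-XTS (keysize 64, security_strength 32, as set earlier in fscrypt_modes[]) as an example, a hedged restatement of the rule:

/* Illustrative only: minimum master key size for AES-256-XTS. */
static unsigned int example_min_mk_size_aes_256_xts(int policy_version)
{
	/* the v1 KDF needs a master key at least as long as the derived key */
	if (policy_version == FSCRYPT_POLICY_V1)
		return 64;	/* keysize */
	/* v2 policies (HKDF) only need the mode's security strength */
	return 32;		/* security_strength */
}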
diff --git a/fs/d_path.c b/fs/d_path.c
index cd60c7535181..e4e0ebad1f15 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -77,9 +77,8 @@ static bool prepend(struct prepend_buffer *p, const char *str, int namelen)
/**
* prepend_name - prepend a pathname in front of current buffer pointer
- * @buffer: buffer pointer
- * @buflen: allocated length of the buffer
- * @name: name string and length qstr structure
+ * @p: prepend buffer which contains buffer pointer and allocated length
+ * @name: name string and length qstr structure
*
* With RCU path tracing, it may race with d_move(). Use READ_ONCE() to
* make sure that either the old or the new name pointer and length are
@@ -141,8 +140,7 @@ static int __prepend_path(const struct dentry *dentry, const struct mount *mnt,
* prepend_path - Prepend path string to a buffer
* @path: the dentry/vfsmount to report
* @root: root vfsmnt/dentry
- * @buffer: pointer to the end of the buffer
- * @buflen: pointer to buffer length
+ * @p: prepend buffer which contains buffer pointer and allocated length
*
* The function will first try to write out the pathname without taking any
* lock other than the RCU read lock to make sure that dentries won't go away.
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b2e86e739d7a..654443558047 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -119,7 +119,6 @@ struct dio {
int flags; /* doesn't change */
int op;
int op_flags;
- blk_qc_t bio_cookie;
struct gendisk *bio_disk;
struct inode *inode;
loff_t i_size; /* i_size when submitted */
@@ -308,7 +307,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
if (ret > 0 && dio->op == REQ_OP_WRITE)
ret = generic_write_sync(dio->iocb, ret);
- dio->iocb->ki_complete(dio->iocb, ret, 0);
+ dio->iocb->ki_complete(dio->iocb, ret);
}
kmem_cache_free(dio_cache, dio);
@@ -438,11 +437,10 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
dio->bio_disk = bio->bi_bdev->bd_disk;
- if (sdio->submit_io) {
+ if (sdio->submit_io)
sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio);
- dio->bio_cookie = BLK_QC_T_NONE;
- } else
- dio->bio_cookie = submit_bio(bio);
+ else
+ submit_bio(bio);
sdio->bio = NULL;
sdio->boundary = 0;
@@ -481,9 +479,7 @@ static struct bio *dio_await_one(struct dio *dio)
__set_current_state(TASK_UNINTERRUPTIBLE);
dio->waiter = current;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
- !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true))
- blk_io_schedule();
+ blk_io_schedule();
/* wake up sets us TASK_RUNNING */
spin_lock_irqsave(&dio->bio_lock, flags);
dio->waiter = NULL;
@@ -1214,8 +1210,6 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
} else {
dio->op = REQ_OP_READ;
}
- if (iocb->ki_flags & IOCB_HIPRI)
- dio->op_flags |= REQ_HIPRI;
/*
* For AIO O_(D)SYNC writes we need to defer completions to a workqueue
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 14b747026742..f57255ab88ed 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -6,16 +6,22 @@ config EROFS_FS
select FS_IOMAP
select LIBCRC32C
help
- EROFS (Enhanced Read-Only File System) is a lightweight
- read-only file system with modern designs (eg. page-sized
- blocks, inline xattrs/data, etc.) for scenarios which need
- high-performance read-only requirements, e.g. Android OS
- for mobile phones and LIVECDs.
+ EROFS (Enhanced Read-Only File System) is a lightweight read-only
+ file system with modern designs (e.g. no buffer heads, inline
+ xattrs/data, chunk-based deduplication, multiple devices, etc.) for
+ scenarios which need high-performance read-only solutions, e.g.
+ smartphones with Android OS, LiveCDs and high-density hosts with
+ numerous containers;
- It also provides fixed-sized output compression support,
- which improves storage density, keeps relatively higher
- compression ratios, which is more useful to achieve high
- performance for embedded devices with limited memory.
+ It also provides fixed-sized output compression support in order to
+ improve storage density as well as keep relatively higher compression
+ ratios and implements in-place decompression to reuse the file page
+ for compressed data temporarily with proper strategies, which is
+ quite useful to ensure guaranteed end-to-end runtime decompression
+ performance under extreme memory pressure without extra cost.
+
+ See the documentation at <file:Documentation/filesystems/erofs.rst>
+ for more details.
If unsure, say N.
@@ -76,3 +82,19 @@ config EROFS_FS_ZIP
Enable fixed-sized output compression for EROFS.
If you don't want to enable compression feature, say N.
+
+config EROFS_FS_ZIP_LZMA
+ bool "EROFS LZMA compressed data support"
+ depends on EROFS_FS_ZIP
+ select XZ_DEC
+ select XZ_DEC_MICROLZMA
+ help
+ Saying Y here includes support for reading EROFS file systems
+ containing LZMA compressed data, specifically called microLZMA. It
+ gives better compression ratios than the LZ4 algorithm, at the
+ expense of more CPU overhead.
+
+ LZMA support is an experimental feature for now and so most file
+ systems will be readable without selecting this option.
+
+ If unsure, say N.
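For reference, the resulting option hierarchy is small: EROFS_FS_ZIP_LZMA sits under EROFS_FS_ZIP and pulls in the microLZMA XZ decoder via select. A .config fragment using the symbols named in this Kconfig (assuming a build that wants the new decompressor) would look like:

CONFIG_EROFS_FS=y
CONFIG_EROFS_FS_ZIP=y
CONFIG_EROFS_FS_ZIP_LZMA=y
# selected automatically by EROFS_FS_ZIP_LZMA:
CONFIG_XZ_DEC=y
CONFIG_XZ_DEC_MICROLZMA=y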
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 1f9aced49070..756fe2d65272 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_EROFS_FS) += erofs.o
erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o
erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
+erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 3701c72bacb2..579406504919 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -8,11 +8,6 @@
#include "internal.h"
-enum {
- Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
- Z_EROFS_COMPRESSION_RUNTIME_MAX
-};
-
struct z_erofs_decompress_req {
struct super_block *sb;
struct page **in, **out;
@@ -25,6 +20,12 @@ struct z_erofs_decompress_req {
bool inplace_io, partial_decoding;
};
+struct z_erofs_decompressor {
+ int (*decompress)(struct z_erofs_decompress_req *rq,
+ struct page **pagepool);
+ char *name;
+};
+
/* some special page->private (unsigned long, see below) */
#define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2)
#define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2)
@@ -63,7 +64,7 @@ static inline bool z_erofs_is_shortlived_page(struct page *page)
return true;
}
-static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool,
+static inline bool z_erofs_put_shortlivedpage(struct page **pagepool,
struct page *page)
{
if (!z_erofs_is_shortlived_page(page))
@@ -74,13 +75,22 @@ static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool,
put_page(page);
} else {
/* follow the pcluster rule above. */
- set_page_private(page, 0);
- list_add(&page->lru, pagepool);
+ erofs_pagepool_add(pagepool, page);
}
return true;
}
+#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
+static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
+ struct page *page)
+{
+ return page->mapping == MNGD_MAPPING(sbi);
+}
+
int z_erofs_decompress(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool);
+ struct page **pagepool);
+/* prototypes for specific algorithms */
+int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool);
#endif
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 9db829715652..0e35ef3f9f3d 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -89,6 +89,7 @@ static int erofs_map_blocks(struct inode *inode,
erofs_off_t pos;
int err = 0;
+ map->m_deviceid = 0;
if (map->m_la >= inode->i_size) {
/* leave out-of-bound access unmapped */
map->m_flags = 0;
@@ -135,14 +136,8 @@ static int erofs_map_blocks(struct inode *inode,
map->m_flags = 0;
break;
default:
- /* only one device is supported for now */
- if (idx->device_id) {
- erofs_err(sb, "invalid device id %u @ %llu for nid %llu",
- le16_to_cpu(idx->device_id),
- chunknr, vi->nid);
- err = -EFSCORRUPTED;
- goto out_unlock;
- }
+ map->m_deviceid = le16_to_cpu(idx->device_id) &
+ EROFS_SB(sb)->device_id_mask;
map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr));
map->m_flags = EROFS_MAP_MAPPED;
break;
@@ -155,11 +150,55 @@ out:
return err;
}
+int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
+{
+ struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
+ struct erofs_device_info *dif;
+ int id;
+
+ /* primary device by default */
+ map->m_bdev = sb->s_bdev;
+ map->m_daxdev = EROFS_SB(sb)->dax_dev;
+
+ if (map->m_deviceid) {
+ down_read(&devs->rwsem);
+ dif = idr_find(&devs->tree, map->m_deviceid - 1);
+ if (!dif) {
+ up_read(&devs->rwsem);
+ return -ENODEV;
+ }
+ map->m_bdev = dif->bdev;
+ map->m_daxdev = dif->dax_dev;
+ up_read(&devs->rwsem);
+ } else if (devs->extra_devices) {
+ down_read(&devs->rwsem);
+ idr_for_each_entry(&devs->tree, dif, id) {
+ erofs_off_t startoff, length;
+
+ if (!dif->mapped_blkaddr)
+ continue;
+ startoff = blknr_to_addr(dif->mapped_blkaddr);
+ length = blknr_to_addr(dif->blocks);
+
+ if (map->m_pa >= startoff &&
+ map->m_pa < startoff + length) {
+ map->m_pa -= startoff;
+ map->m_bdev = dif->bdev;
+ map->m_daxdev = dif->dax_dev;
+ break;
+ }
+ }
+ up_read(&devs->rwsem);
+ }
+ return 0;
+}
+
static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
{
int ret;
struct erofs_map_blocks map;
+ struct erofs_map_dev mdev;
map.m_la = offset;
map.m_llen = length;
@@ -168,8 +207,16 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
if (ret < 0)
return ret;
- iomap->bdev = inode->i_sb->s_bdev;
- iomap->dax_dev = EROFS_I_SB(inode)->dax_dev;
+ mdev = (struct erofs_map_dev) {
+ .m_deviceid = map.m_deviceid,
+ .m_pa = map.m_pa,
+ };
+ ret = erofs_map_dev(inode->i_sb, &mdev);
+ if (ret)
+ return ret;
+
+ iomap->bdev = mdev.m_bdev;
+ iomap->dax_dev = mdev.m_daxdev;
iomap->offset = map.m_la;
iomap->length = map.m_llen;
iomap->flags = 0;
@@ -188,15 +235,15 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
iomap->type = IOMAP_INLINE;
ipage = erofs_get_meta_page(inode->i_sb,
- erofs_blknr(map.m_pa));
+ erofs_blknr(mdev.m_pa));
if (IS_ERR(ipage))
return PTR_ERR(ipage);
iomap->inline_data = page_address(ipage) +
- erofs_blkoff(map.m_pa);
+ erofs_blkoff(mdev.m_pa);
iomap->private = ipage;
} else {
iomap->type = IOMAP_MAPPED;
- iomap->addr = map.m_pa;
+ iomap->addr = mdev.m_pa;
}
return 0;
}
@@ -287,7 +334,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (!err)
return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
- NULL, 0);
+ NULL, 0, 0);
if (err < 0)
return err;
}
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index a5bc4b1b7813..bf37fc76b182 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -16,17 +16,6 @@
#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32)
#endif
-struct z_erofs_decompressor {
- /*
- * if destpages have sparsed pages, fill them with bounce pages.
- * it also check whether destpages indicate continuous physical memory.
- */
- int (*prepare_destpages)(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool);
- int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out);
- char *name;
-};
-
int z_erofs_load_lz4_config(struct super_block *sb,
struct erofs_super_block *dsb,
struct z_erofs_lz4_cfgs *lz4, int size)
@@ -63,8 +52,12 @@ int z_erofs_load_lz4_config(struct super_block *sb,
return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
}
-static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+/*
+ * Fill all gaps with bounce pages if it's a sparse page list. Also check if
+ * all physical pages are consecutive, which is common for moderate compression ratios (CR).
+ */
+static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
{
const unsigned int nr =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -119,7 +112,7 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
return kaddr ? 1 : 0;
}
-static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq,
+static void *z_erofs_lz4_handle_inplace_io(struct z_erofs_decompress_req *rq,
void *inpage, unsigned int *inputmargin, int *maptype,
bool support_0padding)
{
@@ -189,7 +182,8 @@ docopy:
return src;
}
-static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
+static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq,
+ u8 *out)
{
unsigned int inputmargin;
u8 *headpage, *src;
@@ -216,8 +210,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
}
rq->inputsize -= inputmargin;
- src = z_erofs_handle_inplace_io(rq, headpage, &inputmargin, &maptype,
- support_0padding);
+ src = z_erofs_lz4_handle_inplace_io(rq, headpage, &inputmargin,
+ &maptype, support_0padding);
if (IS_ERR(src))
return PTR_ERR(src);
@@ -233,7 +227,6 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
ret, rq->inputsize, inputmargin, rq->outputsize);
- WARN_ON(1);
print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
16, 1, src + inputmargin, rq->inputsize, true);
print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
@@ -242,6 +235,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
if (ret >= 0)
memset(out + ret, 0, rq->outputsize - ret);
ret = -EIO;
+ } else {
+ ret = 0;
}
if (maptype == 0) {
@@ -257,86 +252,25 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
return ret;
}
-static struct z_erofs_decompressor decompressors[] = {
- [Z_EROFS_COMPRESSION_SHIFTED] = {
- .name = "shifted"
- },
- [Z_EROFS_COMPRESSION_LZ4] = {
- .prepare_destpages = z_erofs_lz4_prepare_destpages,
- .decompress = z_erofs_lz4_decompress,
- .name = "lz4"
- },
-};
-
-static void copy_from_pcpubuf(struct page **out, const char *dst,
- unsigned short pageofs_out,
- unsigned int outputsize)
-{
- const char *end = dst + outputsize;
- const unsigned int righthalf = PAGE_SIZE - pageofs_out;
- const char *cur = dst - pageofs_out;
-
- while (cur < end) {
- struct page *const page = *out++;
-
- if (page) {
- char *buf = kmap_atomic(page);
-
- if (cur >= dst) {
- memcpy(buf, cur, min_t(uint, PAGE_SIZE,
- end - cur));
- } else {
- memcpy(buf + pageofs_out, cur + pageofs_out,
- min_t(uint, righthalf, end - cur));
- }
- kunmap_atomic(buf);
- }
- cur += PAGE_SIZE;
- }
-}
-
-static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
{
const unsigned int nrpages_out =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
- const struct z_erofs_decompressor *alg = decompressors + rq->alg;
unsigned int dst_maptype;
void *dst;
int ret;
- /* two optimized fast paths only for non bigpcluster cases yet */
- if (rq->inputsize <= PAGE_SIZE) {
- if (nrpages_out == 1 && !rq->inplace_io) {
- DBG_BUGON(!*rq->out);
- dst = kmap_atomic(*rq->out);
- dst_maptype = 0;
- goto dstmap_out;
- }
-
- /*
- * For the case of small output size (especially much less
- * than PAGE_SIZE), memcpy the decompressed data rather than
- * compressed data is preferred.
- */
- if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
- dst = erofs_get_pcpubuf(1);
- if (IS_ERR(dst))
- return PTR_ERR(dst);
-
- rq->inplace_io = false;
- ret = alg->decompress(rq, dst);
- if (!ret)
- copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
- rq->outputsize);
-
- erofs_put_pcpubuf(dst);
- return ret;
- }
+ /* one optimized fast path only for non bigpcluster cases yet */
+ if (rq->inputsize <= PAGE_SIZE && nrpages_out == 1 && !rq->inplace_io) {
+ DBG_BUGON(!*rq->out);
+ dst = kmap_atomic(*rq->out);
+ dst_maptype = 0;
+ goto dstmap_out;
}
/* general decoding path which can be used for all cases */
- ret = alg->prepare_destpages(rq, pagepool);
+ ret = z_erofs_lz4_prepare_dstpages(rq, pagepool);
if (ret < 0)
return ret;
if (ret) {
@@ -351,7 +285,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
dst_maptype = 2;
dstmap_out:
- ret = alg->decompress(rq, dst + rq->pageofs_out);
+ ret = z_erofs_lz4_decompress_mem(rq, dst + rq->pageofs_out);
if (!dst_maptype)
kunmap_atomic(dst);
@@ -360,8 +294,8 @@ dstmap_out:
return ret;
}
-static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
{
const unsigned int nrpages_out =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -399,10 +333,25 @@ static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
return 0;
}
+static struct z_erofs_decompressor decompressors[] = {
+ [Z_EROFS_COMPRESSION_SHIFTED] = {
+ .decompress = z_erofs_shifted_transform,
+ .name = "shifted"
+ },
+ [Z_EROFS_COMPRESSION_LZ4] = {
+ .decompress = z_erofs_lz4_decompress,
+ .name = "lz4"
+ },
+#ifdef CONFIG_EROFS_FS_ZIP_LZMA
+ [Z_EROFS_COMPRESSION_LZMA] = {
+ .decompress = z_erofs_lzma_decompress,
+ .name = "lzma"
+ },
+#endif
+};
+
int z_erofs_decompress(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+ struct page **pagepool)
{
- if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED)
- return z_erofs_shifted_transform(rq, pagepool);
- return z_erofs_decompress_generic(rq, pagepool);
+ return decompressors[rq->alg].decompress(rq, pagepool);
}
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
new file mode 100644
index 000000000000..50045510a1f4
--- /dev/null
+++ b/fs/erofs/decompressor_lzma.c
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/xz.h>
+#include <linux/module.h>
+#include "compress.h"
+
+struct z_erofs_lzma {
+ struct z_erofs_lzma *next;
+ struct xz_dec_microlzma *state;
+ struct xz_buf buf;
+ u8 bounce[PAGE_SIZE];
+};
+
+/* considering the LZMA performance, no need to use a lockless list for now */
+static DEFINE_SPINLOCK(z_erofs_lzma_lock);
+static unsigned int z_erofs_lzma_max_dictsize;
+static unsigned int z_erofs_lzma_nstrms, z_erofs_lzma_avail_strms;
+static struct z_erofs_lzma *z_erofs_lzma_head;
+static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq);
+
+module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444);
+
+void z_erofs_lzma_exit(void)
+{
+ /* there should be no running fs instance */
+ while (z_erofs_lzma_avail_strms) {
+ struct z_erofs_lzma *strm;
+
+ spin_lock(&z_erofs_lzma_lock);
+ strm = z_erofs_lzma_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_lzma_lock);
+ DBG_BUGON(1);
+ return;
+ }
+ z_erofs_lzma_head = NULL;
+ spin_unlock(&z_erofs_lzma_lock);
+
+ while (strm) {
+ struct z_erofs_lzma *n = strm->next;
+
+ if (strm->state)
+ xz_dec_microlzma_end(strm->state);
+ kfree(strm);
+ --z_erofs_lzma_avail_strms;
+ strm = n;
+ }
+ }
+}
+
+int z_erofs_lzma_init(void)
+{
+ unsigned int i;
+
+ /* by default, use # of possible CPUs instead */
+ if (!z_erofs_lzma_nstrms)
+ z_erofs_lzma_nstrms = num_possible_cpus();
+
+ for (i = 0; i < z_erofs_lzma_nstrms; ++i) {
+ struct z_erofs_lzma *strm = kzalloc(sizeof(*strm), GFP_KERNEL);
+
+ if (!strm) {
+ z_erofs_lzma_exit();
+ return -ENOMEM;
+ }
+ spin_lock(&z_erofs_lzma_lock);
+ strm->next = z_erofs_lzma_head;
+ z_erofs_lzma_head = strm;
+ spin_unlock(&z_erofs_lzma_lock);
+ ++z_erofs_lzma_avail_strms;
+ }
+ return 0;
+}
+
+int z_erofs_load_lzma_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lzma_cfgs *lzma, int size)
+{
+ static DEFINE_MUTEX(lzma_resize_mutex);
+ unsigned int dict_size, i;
+ struct z_erofs_lzma *strm, *head = NULL;
+ int err;
+
+ if (!lzma || size < sizeof(struct z_erofs_lzma_cfgs)) {
+ erofs_err(sb, "invalid lzma cfgs, size=%u", size);
+ return -EINVAL;
+ }
+ if (lzma->format) {
+ erofs_err(sb, "unidentified lzma format %x, please check kernel version",
+ le16_to_cpu(lzma->format));
+ return -EINVAL;
+ }
+ dict_size = le32_to_cpu(lzma->dict_size);
+ if (dict_size > Z_EROFS_LZMA_MAX_DICT_SIZE || dict_size < 4096) {
+ erofs_err(sb, "unsupported lzma dictionary size %u",
+ dict_size);
+ return -EINVAL;
+ }
+
+ erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!");
+
+ /* avoid deadlock in case two z_erofs_load_lzma_config() calls race */
+ mutex_lock(&lzma_resize_mutex);
+
+ if (z_erofs_lzma_max_dictsize >= dict_size) {
+ mutex_unlock(&lzma_resize_mutex);
+ return 0;
+ }
+
+ /* 1. collect/isolate all streams for the following check */
+ for (i = 0; i < z_erofs_lzma_avail_strms; ++i) {
+ struct z_erofs_lzma *last;
+
+again:
+ spin_lock(&z_erofs_lzma_lock);
+ strm = z_erofs_lzma_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_lzma_lock);
+ wait_event(z_erofs_lzma_wq,
+ READ_ONCE(z_erofs_lzma_head));
+ goto again;
+ }
+ z_erofs_lzma_head = NULL;
+ spin_unlock(&z_erofs_lzma_lock);
+
+ for (last = strm; last->next; last = last->next)
+ ++i;
+ last->next = head;
+ head = strm;
+ }
+
+ err = 0;
+ /* 2. walk each isolated stream and grow max dict_size if needed */
+ for (strm = head; strm; strm = strm->next) {
+ if (strm->state)
+ xz_dec_microlzma_end(strm->state);
+ strm->state = xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size);
+ if (!strm->state)
+ err = -ENOMEM;
+ }
+
+ /* 3. push back all to the global list and update max dict_size */
+ spin_lock(&z_erofs_lzma_lock);
+ DBG_BUGON(z_erofs_lzma_head);
+ z_erofs_lzma_head = head;
+ spin_unlock(&z_erofs_lzma_lock);
+
+ z_erofs_lzma_max_dictsize = dict_size;
+ mutex_unlock(&lzma_resize_mutex);
+ return err;
+}
+
+int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
+{
+ const unsigned int nrpages_out =
+ PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ const unsigned int nrpages_in =
+ PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
+ unsigned int inputmargin, inlen, outlen, pageofs;
+ struct z_erofs_lzma *strm;
+ u8 *kin;
+ bool bounced = false;
+ int no, ni, j, err = 0;
+
+ /* 1. get the exact LZMA compressed size */
+ kin = kmap(*rq->in);
+ inputmargin = 0;
+ while (!kin[inputmargin & ~PAGE_MASK])
+ if (!(++inputmargin & ~PAGE_MASK))
+ break;
+
+ if (inputmargin >= PAGE_SIZE) {
+ kunmap(*rq->in);
+ return -EFSCORRUPTED;
+ }
+ rq->inputsize -= inputmargin;
+
+ /* 2. get an available lzma context */
+again:
+ spin_lock(&z_erofs_lzma_lock);
+ strm = z_erofs_lzma_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_lzma_lock);
+ wait_event(z_erofs_lzma_wq, READ_ONCE(z_erofs_lzma_head));
+ goto again;
+ }
+ z_erofs_lzma_head = strm->next;
+ spin_unlock(&z_erofs_lzma_lock);
+
+ /* 3. multi-call decompress */
+ inlen = rq->inputsize;
+ outlen = rq->outputsize;
+ xz_dec_microlzma_reset(strm->state, inlen, outlen,
+ !rq->partial_decoding);
+ pageofs = rq->pageofs_out;
+ strm->buf.in = kin + inputmargin;
+ strm->buf.in_pos = 0;
+ strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - inputmargin);
+ inlen -= strm->buf.in_size;
+ strm->buf.out = NULL;
+ strm->buf.out_pos = 0;
+ strm->buf.out_size = 0;
+
+ for (ni = 0, no = -1;;) {
+ enum xz_ret xz_err;
+
+ if (strm->buf.out_pos == strm->buf.out_size) {
+ if (strm->buf.out) {
+ kunmap(rq->out[no]);
+ strm->buf.out = NULL;
+ }
+
+ if (++no >= nrpages_out || !outlen) {
+ erofs_err(rq->sb, "decompressed buf out of bound");
+ err = -EFSCORRUPTED;
+ break;
+ }
+ strm->buf.out_pos = 0;
+ strm->buf.out_size = min_t(u32, outlen,
+ PAGE_SIZE - pageofs);
+ outlen -= strm->buf.out_size;
+ if (rq->out[no])
+ strm->buf.out = kmap(rq->out[no]) + pageofs;
+ pageofs = 0;
+ } else if (strm->buf.in_pos == strm->buf.in_size) {
+ kunmap(rq->in[ni]);
+
+ if (++ni >= nrpages_in || !inlen) {
+ erofs_err(rq->sb, "compressed buf out of bound");
+ err = -EFSCORRUPTED;
+ break;
+ }
+ strm->buf.in_pos = 0;
+ strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE);
+ inlen -= strm->buf.in_size;
+ kin = kmap(rq->in[ni]);
+ strm->buf.in = kin;
+ bounced = false;
+ }
+
+ /*
+ * Handle overlapping: use the bounce buffer if the compressed
+ * data is still being processed; otherwise, use short-lived
+ * pages from the on-stack pagepool when pages are shared
+ * within the same request.
+ */
+ if (!bounced && rq->out[no] == rq->in[ni]) {
+ memcpy(strm->bounce, strm->buf.in, strm->buf.in_size);
+ strm->buf.in = strm->bounce;
+ bounced = true;
+ }
+ for (j = ni + 1; j < nrpages_in; ++j) {
+ struct page *tmppage;
+
+ if (rq->out[no] != rq->in[j])
+ continue;
+
+ DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb),
+ rq->in[j]));
+ tmppage = erofs_allocpage(pagepool,
+ GFP_KERNEL | __GFP_NOFAIL);
+ set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
+ copy_highpage(tmppage, rq->in[j]);
+ rq->in[j] = tmppage;
+ }
+ xz_err = xz_dec_microlzma_run(strm->state, &strm->buf);
+ DBG_BUGON(strm->buf.out_pos > strm->buf.out_size);
+ DBG_BUGON(strm->buf.in_pos > strm->buf.in_size);
+
+ if (xz_err != XZ_OK) {
+ if (xz_err == XZ_STREAM_END && !outlen)
+ break;
+ erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]",
+ xz_err, rq->inputsize, rq->outputsize);
+ err = -EFSCORRUPTED;
+ break;
+ }
+ }
+ if (no < nrpages_out && strm->buf.out)
+ kunmap(rq->out[no]);
+ if (ni < nrpages_in)
+ kunmap(rq->in[ni]);
+ /* 4. push back LZMA stream context to the global list */
+ spin_lock(&z_erofs_lzma_lock);
+ strm->next = z_erofs_lzma_head;
+ z_erofs_lzma_head = strm;
+ spin_unlock(&z_erofs_lzma_lock);
+ wake_up(&z_erofs_lzma_wq);
+ return err;
+}
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index b0b23f41abc3..083997a034e5 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -21,14 +21,29 @@
#define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002
#define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002
#define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004
+#define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008
+#define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008
#define EROFS_ALL_FEATURE_INCOMPAT \
(EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \
EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
- EROFS_FEATURE_INCOMPAT_CHUNKED_FILE)
+ EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
+ EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
+ EROFS_FEATURE_INCOMPAT_COMPR_HEAD2)
#define EROFS_SB_EXTSLOT_SIZE 16
+struct erofs_deviceslot {
+ union {
+ u8 uuid[16]; /* used for device manager later */
+ u8 userdata[64]; /* digest(sha256), etc. */
+ } u;
+ __le32 blocks; /* total fs blocks of this device */
+ __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */
+ u8 reserved[56];
+};
+#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot)
+
/* erofs on-disk super block (currently 128 bytes) */
struct erofs_super_block {
__le32 magic; /* file system magic number */
@@ -54,7 +69,9 @@ struct erofs_super_block {
/* customized sliding window size instead of 64k by default */
__le16 lz4_max_distance;
} __packed u1;
- __u8 reserved2[42];
+ __le16 extra_devices; /* # of devices besides the primary device */
+ __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */
+ __u8 reserved2[38];
};
/*
@@ -238,7 +255,7 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
/* 8-byte inode chunk indexes */
struct erofs_inode_chunk_index {
__le16 advise; /* always 0, don't care for now */
- __le16 device_id; /* back-end storage id, always 0 for now */
+ __le16 device_id; /* back-end storage id (with bits masked) */
__le32 blkaddr; /* start block address of this inode chunk */
};
@@ -247,10 +264,11 @@ struct erofs_inode_chunk_index {
/* available compression algorithm types (for h_algorithmtype) */
enum {
- Z_EROFS_COMPRESSION_LZ4 = 0,
+ Z_EROFS_COMPRESSION_LZ4 = 0,
+ Z_EROFS_COMPRESSION_LZMA = 1,
Z_EROFS_COMPRESSION_MAX
};
-#define Z_EROFS_ALL_COMPR_ALGS (1 << (Z_EROFS_COMPRESSION_MAX - 1))
+#define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1)
/* 14 bytes (+ length field = 16 bytes) */
struct z_erofs_lz4_cfgs {
@@ -259,6 +277,15 @@ struct z_erofs_lz4_cfgs {
u8 reserved[10];
} __packed;
+/* 14 bytes (+ length field = 16 bytes) */
+struct z_erofs_lzma_cfgs {
+ __le32 dict_size;
+ __le16 format;
+ u8 reserved[8];
+} __packed;
+
+#define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE)
+
/*
* bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
* e.g. for 4k logical cluster size, 4B if compacted 2B is off;
@@ -288,35 +315,34 @@ struct z_erofs_map_header {
#define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8
/*
- * Fixed-sized output compression ondisk Logical Extent cluster type:
- * 0 - literal (uncompressed) cluster
- * 1 - compressed cluster (for the head logical cluster)
- * 2 - compressed cluster (for the other logical clusters)
+ * Fixed-sized output compression on-disk logical cluster type:
+ * 0 - literal (uncompressed) lcluster
+ * 1,3 - compressed lcluster (for HEAD lclusters)
+ * 2 - compressed lcluster (for NONHEAD lclusters)
*
* In detail,
- * 0 - literal (uncompressed) cluster,
+ * 0 - literal (uncompressed) lcluster,
* di_advise = 0
- * di_clusterofs = the literal data offset of the cluster
- * di_blkaddr = the blkaddr of the literal cluster
+ * di_clusterofs = the literal data offset of the lcluster
+ * di_blkaddr = the blkaddr of the literal pcluster
*
- * 1 - compressed cluster (for the head logical cluster)
- * di_advise = 1
- * di_clusterofs = the decompressed data offset of the cluster
- * di_blkaddr = the blkaddr of the compressed cluster
+ * 1,3 - compressed lcluster (for HEAD lclusters)
+ * di_advise = 1 or 3
+ * di_clusterofs = the decompressed data offset of the lcluster
+ * di_blkaddr = the blkaddr of the compressed pcluster
*
- * 2 - compressed cluster (for the other logical clusters)
+ * 2 - compressed lcluster (for NONHEAD lclusters)
* di_advise = 2
* di_clusterofs =
- * the decompressed data offset in its own head cluster
- * di_u.delta[0] = distance to its corresponding head cluster
- * di_u.delta[1] = distance to its corresponding tail cluster
- * (di_advise could be 0, 1 or 2)
+ * the decompressed data offset in its own HEAD lcluster
+ * di_u.delta[0] = distance to this HEAD lcluster
+ * di_u.delta[1] = distance to the next HEAD lcluster
*/
enum {
Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0,
- Z_EROFS_VLE_CLUSTER_TYPE_HEAD = 1,
+ Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 = 1,
Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2,
- Z_EROFS_VLE_CLUSTER_TYPE_RESERVED = 3,
+ Z_EROFS_VLE_CLUSTER_TYPE_HEAD2 = 3,
Z_EROFS_VLE_CLUSTER_TYPE_MAX
};
@@ -384,6 +410,7 @@ static inline void erofs_check_ondisk_layout_definitions(void)
/* keep in sync between 2 index structures for better extendibility */
BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) !=
sizeof(struct z_erofs_vle_decompressed_index));
+ BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128);
BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) <
Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1);
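The rewritten comment above enumerates the four on-disk lcluster types by di_advise value. A hedged helper that simply restates that mapping in code (hypothetical, not part of the patch):

/* Illustrative only: names the lcluster types documented above. */
static const char *example_lcluster_type_name(unsigned int di_advise)
{
	switch (di_advise) {
	case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:	/* 0: literal (uncompressed) */
		return "plain";
	case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:	/* 1: compressed HEAD */
	case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:	/* 3: compressed HEAD */
		return "head";
	case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:	/* 2: compressed NONHEAD */
		return "nonhead";
	default:
		return "invalid";
	}
}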
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index a552399e211d..2345f1de438e 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -192,7 +192,7 @@ static struct page *erofs_read_inode(struct inode *inode,
inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
inode->i_flags &= ~S_DAX;
- if (test_opt(&sbi->ctx, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
+ if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
vi->datalayout == EROFS_INODE_FLAT_PLAIN)
inode->i_flags |= S_DAX;
if (!nblks)
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 9524e155b38f..3265688af7f9 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -47,7 +47,16 @@ typedef u64 erofs_off_t;
/* data type for filesystem-wide blocks number */
typedef u32 erofs_blk_t;
-struct erofs_fs_context {
+struct erofs_device_info {
+ char *path;
+ struct block_device *bdev;
+ struct dax_device *dax_dev;
+
+ u32 blocks;
+ u32 mapped_blkaddr;
+};
+
+struct erofs_mount_opts {
#ifdef CONFIG_EROFS_FS_ZIP
/* current strategy of how to use managed cache */
unsigned char cache_strategy;
@@ -60,6 +69,18 @@ struct erofs_fs_context {
unsigned int mount_opt;
};
+struct erofs_dev_context {
+ struct idr tree;
+ struct rw_semaphore rwsem;
+
+ unsigned int extra_devices;
+};
+
+struct erofs_fs_context {
+ struct erofs_mount_opts opt;
+ struct erofs_dev_context *devs;
+};
+
/* all filesystem-wide lz4 configurations */
struct erofs_sb_lz4_info {
/* # of pages needed for EROFS lz4 rolling decompression */
@@ -69,6 +90,7 @@ struct erofs_sb_lz4_info {
};
struct erofs_sb_info {
+ struct erofs_mount_opts opt; /* options */
#ifdef CONFIG_EROFS_FS_ZIP
/* list for all registered superblocks, mainly for shrinker */
struct list_head list;
@@ -85,12 +107,16 @@ struct erofs_sb_info {
struct erofs_sb_lz4_info lz4;
#endif /* CONFIG_EROFS_FS_ZIP */
+ struct erofs_dev_context *devs;
struct dax_device *dax_dev;
- u32 blocks;
+ u64 total_blocks;
+ u32 primarydevice_blocks;
+
u32 meta_blkaddr;
#ifdef CONFIG_EROFS_FS_XATTR
u32 xattr_blkaddr;
#endif
+ u16 device_id_mask; /* valid bits of device id to be used */
/* inode slot unit size in bit shift */
unsigned char islotbits;
@@ -108,8 +134,6 @@ struct erofs_sb_info {
u8 volume_name[16]; /* volume name */
u32 feature_compat;
u32 feature_incompat;
-
- struct erofs_fs_context ctx; /* options */
};
#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
@@ -121,9 +145,9 @@ struct erofs_sb_info {
#define EROFS_MOUNT_DAX_ALWAYS 0x00000040
#define EROFS_MOUNT_DAX_NEVER 0x00000080
-#define clear_opt(ctx, option) ((ctx)->mount_opt &= ~EROFS_MOUNT_##option)
-#define set_opt(ctx, option) ((ctx)->mount_opt |= EROFS_MOUNT_##option)
-#define test_opt(ctx, option) ((ctx)->mount_opt & EROFS_MOUNT_##option)
+#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option)
+#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
+#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option)
enum {
EROFS_ZIP_CACHE_DISABLED,
@@ -237,6 +261,7 @@ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \
EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING)
EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS)
EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER)
+EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
/* atomic flag definitions */
@@ -307,6 +332,19 @@ static inline unsigned int erofs_inode_datalayout(unsigned int value)
EROFS_I_DATALAYOUT_BITS);
}
+/*
+ * Different from grab_cache_page_nowait(), reclaiming is never triggered
+ * when allocating new pages.
+ */
+static inline
+struct page *erofs_grab_cache_page_nowait(struct address_space *mapping,
+ pgoff_t index)
+{
+ return pagecache_get_page(mapping, index,
+ FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
+ readahead_gfp_mask(mapping) & ~__GFP_RECLAIM);
+}
+
extern const struct super_operations erofs_sops;
extern const struct address_space_operations erofs_raw_access_aops;
@@ -338,7 +376,7 @@ extern const struct address_space_operations z_erofs_aops;
* of the corresponding uncompressed data in the file.
*/
enum {
- BH_Zipped = BH_PrivateStart,
+ BH_Encoded = BH_PrivateStart,
BH_FullMapped,
};
@@ -346,8 +384,8 @@ enum {
#define EROFS_MAP_MAPPED (1 << BH_Mapped)
/* Located in metadata (could be copied from bd_inode) */
#define EROFS_MAP_META (1 << BH_Meta)
-/* The extent has been compressed */
-#define EROFS_MAP_ZIPPED (1 << BH_Zipped)
+/* The extent is encoded */
+#define EROFS_MAP_ENCODED (1 << BH_Encoded)
/* The length of extent is full */
#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped)
@@ -355,6 +393,8 @@ struct erofs_map_blocks {
erofs_off_t m_pa, m_la;
u64 m_plen, m_llen;
+ unsigned short m_deviceid;
+ char m_algorithmformat;
unsigned int m_flags;
struct page *mpage;
@@ -367,6 +407,13 @@ struct erofs_map_blocks {
* approach instead if possible since it's more metadata lightweight.)
*/
#define EROFS_GET_BLOCKS_FIEMAP 0x0002
+/* Used to map the whole extent if non-negligible data is requested for LZMA */
+#define EROFS_GET_BLOCKS_READMORE 0x0004
+
+enum {
+ Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
+ Z_EROFS_COMPRESSION_RUNTIME_MAX
+};
/* zmap.c */
extern const struct iomap_ops z_erofs_iomap_report_ops;
@@ -386,9 +433,18 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode,
}
#endif /* !CONFIG_EROFS_FS_ZIP */
+struct erofs_map_dev {
+ struct block_device *m_bdev;
+ struct dax_device *m_daxdev;
+
+ erofs_off_t m_pa;
+ unsigned int m_deviceid;
+};
+
/* data.c */
extern const struct file_operations erofs_file_fops;
struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr);
+int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
@@ -443,7 +499,14 @@ void erofs_pcpubuf_init(void);
void erofs_pcpubuf_exit(void);
/* utils.c / zdata.c */
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
+struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp);
+static inline void erofs_pagepool_add(struct page **pagepool,
+ struct page *page)
+{
+ set_page_private(page, (unsigned long)*pagepool);
+ *pagepool = page;
+}
+void erofs_release_pages(struct page **pagepool);
#ifdef CONFIG_EROFS_FS_ZIP
int erofs_workgroup_put(struct erofs_workgroup *grp);
@@ -483,6 +546,26 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb,
}
#endif /* !CONFIG_EROFS_FS_ZIP */
+#ifdef CONFIG_EROFS_FS_ZIP_LZMA
+int z_erofs_lzma_init(void);
+void z_erofs_lzma_exit(void);
+int z_erofs_load_lzma_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lzma_cfgs *lzma, int size);
+#else
+static inline int z_erofs_lzma_init(void) { return 0; }
+static inline int z_erofs_lzma_exit(void) { return 0; }
+static inline int z_erofs_load_lzma_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lzma_cfgs *lzma, int size) {
+ if (lzma) {
+ erofs_err(sb, "lzma algorithm isn't enabled");
+ return -EINVAL;
+ }
+ return 0;
+}
+#endif	/* !CONFIG_EROFS_FS_ZIP_LZMA */
+
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#endif /* __EROFS_INTERNAL_H */
diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c
index 6c885575128a..a2efd833d1b6 100644
--- a/fs/erofs/pcpubuf.c
+++ b/fs/erofs/pcpubuf.c
@@ -49,7 +49,7 @@ int erofs_pcpubuf_growsize(unsigned int nrpages)
{
static DEFINE_MUTEX(pcb_resize_mutex);
static unsigned int pcb_nrpages;
- LIST_HEAD(pagepool);
+ struct page *pagepool = NULL;
int delta, cpu, ret, i;
mutex_lock(&pcb_resize_mutex);
@@ -102,13 +102,13 @@ int erofs_pcpubuf_growsize(unsigned int nrpages)
vunmap(old_ptr);
free_pagearray:
while (i)
- list_add(&oldpages[--i]->lru, &pagepool);
+ erofs_pagepool_add(&pagepool, oldpages[--i]);
kfree(oldpages);
if (ret)
break;
}
pcb_nrpages = nrpages;
- put_pages_list(&pagepool);
+ erofs_release_pages(&pagepool);
out:
mutex_unlock(&pcb_resize_mutex);
return ret;
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 11b88559f8bf..6a969b1e0ee6 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -225,6 +225,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
case Z_EROFS_COMPRESSION_LZ4:
ret = z_erofs_load_lz4_config(sb, dsb, data, size);
break;
+ case Z_EROFS_COMPRESSION_LZMA:
+ ret = z_erofs_load_lzma_config(sb, dsb, data, size);
+ break;
default:
DBG_BUGON(1);
ret = -EFAULT;
@@ -252,6 +255,79 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
}
#endif
+static int erofs_init_devices(struct super_block *sb,
+ struct erofs_super_block *dsb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ unsigned int ondisk_extradevs;
+ erofs_off_t pos;
+ struct page *page = NULL;
+ struct erofs_device_info *dif;
+ struct erofs_deviceslot *dis;
+ void *ptr;
+ int id, err = 0;
+
+ sbi->total_blocks = sbi->primarydevice_blocks;
+ if (!erofs_sb_has_device_table(sbi))
+ ondisk_extradevs = 0;
+ else
+ ondisk_extradevs = le16_to_cpu(dsb->extra_devices);
+
+ if (ondisk_extradevs != sbi->devs->extra_devices) {
+ erofs_err(sb, "extra devices don't match (ondisk %u, given %u)",
+ ondisk_extradevs, sbi->devs->extra_devices);
+ return -EINVAL;
+ }
+ if (!ondisk_extradevs)
+ return 0;
+
+ sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1;
+ pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE;
+ down_read(&sbi->devs->rwsem);
+ idr_for_each_entry(&sbi->devs->tree, dif, id) {
+ erofs_blk_t blk = erofs_blknr(pos);
+ struct block_device *bdev;
+
+ if (!page || page->index != blk) {
+ if (page) {
+ kunmap(page);
+ unlock_page(page);
+ put_page(page);
+ }
+
+ page = erofs_get_meta_page(sb, blk);
+ if (IS_ERR(page)) {
+ up_read(&sbi->devs->rwsem);
+ return PTR_ERR(page);
+ }
+ ptr = kmap(page);
+ }
+ dis = ptr + erofs_blkoff(pos);
+
+ bdev = blkdev_get_by_path(dif->path,
+ FMODE_READ | FMODE_EXCL,
+ sb->s_type);
+ if (IS_ERR(bdev)) {
+ err = PTR_ERR(bdev);
+ goto err_out;
+ }
+ dif->bdev = bdev;
+ dif->dax_dev = fs_dax_get_by_bdev(bdev);
+ dif->blocks = le32_to_cpu(dis->blocks);
+ dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
+ sbi->total_blocks += dif->blocks;
+ pos += EROFS_DEVT_SLOT_SIZE;
+ }
+err_out:
+ up_read(&sbi->devs->rwsem);
+ if (page) {
+ kunmap(page);
+ unlock_page(page);
+ put_page(page);
+ }
+ return err;
+}
+
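Each extra device occupies one fixed 128-byte erofs_deviceslot on disk (the new BUILD_BUG_ON earlier in this patch pins that size), so the walk above simply advances pos by EROFS_DEVT_SLOT_SIZE and re-reads the metadata page whenever erofs_blknr(pos) changes; with a 4 KiB block size that is every 32 slots. device_id_mask is sized as roundup_pow_of_two(extra_devices + 1) - 1, so for example 3 extra devices give a mask of 0x3 and 4 extra devices give 0x7.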
static int erofs_read_superblock(struct super_block *sb)
{
struct erofs_sb_info *sbi;
@@ -303,7 +379,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->sb_size);
goto out;
}
- sbi->blocks = le32_to_cpu(dsb->blocks);
+ sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks);
sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
#ifdef CONFIG_EROFS_FS_XATTR
sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
@@ -330,6 +406,11 @@ static int erofs_read_superblock(struct super_block *sb)
ret = erofs_load_compr_cfgs(sb, dsb);
else
ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0);
+ if (ret < 0)
+ goto out;
+
+ /* handle multiple devices */
+ ret = erofs_init_devices(sb, dsb);
out:
kunmap(page);
put_page(page);
@@ -340,15 +421,15 @@ out:
static void erofs_default_options(struct erofs_fs_context *ctx)
{
#ifdef CONFIG_EROFS_FS_ZIP
- ctx->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
- ctx->max_sync_decompress_pages = 3;
- ctx->readahead_sync_decompress = false;
+ ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND;
+ ctx->opt.max_sync_decompress_pages = 3;
+ ctx->opt.readahead_sync_decompress = false;
#endif
#ifdef CONFIG_EROFS_FS_XATTR
- set_opt(ctx, XATTR_USER);
+ set_opt(&ctx->opt, XATTR_USER);
#endif
#ifdef CONFIG_EROFS_FS_POSIX_ACL
- set_opt(ctx, POSIX_ACL);
+ set_opt(&ctx->opt, POSIX_ACL);
#endif
}
@@ -358,6 +439,7 @@ enum {
Opt_cache_strategy,
Opt_dax,
Opt_dax_enum,
+ Opt_device,
Opt_err
};
@@ -381,6 +463,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
erofs_param_cache_strategy),
fsparam_flag("dax", Opt_dax),
fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums),
+ fsparam_string("device", Opt_device),
{}
};
@@ -392,12 +475,12 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
switch (mode) {
case EROFS_MOUNT_DAX_ALWAYS:
warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- set_opt(ctx, DAX_ALWAYS);
- clear_opt(ctx, DAX_NEVER);
+ set_opt(&ctx->opt, DAX_ALWAYS);
+ clear_opt(&ctx->opt, DAX_NEVER);
return true;
case EROFS_MOUNT_DAX_NEVER:
- set_opt(ctx, DAX_NEVER);
- clear_opt(ctx, DAX_ALWAYS);
+ set_opt(&ctx->opt, DAX_NEVER);
+ clear_opt(&ctx->opt, DAX_ALWAYS);
return true;
default:
DBG_BUGON(1);
@@ -412,9 +495,10 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
static int erofs_fc_parse_param(struct fs_context *fc,
struct fs_parameter *param)
{
- struct erofs_fs_context *ctx __maybe_unused = fc->fs_private;
+ struct erofs_fs_context *ctx = fc->fs_private;
struct fs_parse_result result;
- int opt;
+ struct erofs_device_info *dif;
+ int opt, ret;
opt = fs_parse(fc, erofs_fs_parameters, param, &result);
if (opt < 0)
@@ -424,9 +508,9 @@ static int erofs_fc_parse_param(struct fs_context *fc,
case Opt_user_xattr:
#ifdef CONFIG_EROFS_FS_XATTR
if (result.boolean)
- set_opt(ctx, XATTR_USER);
+ set_opt(&ctx->opt, XATTR_USER);
else
- clear_opt(ctx, XATTR_USER);
+ clear_opt(&ctx->opt, XATTR_USER);
#else
errorfc(fc, "{,no}user_xattr options not supported");
#endif
@@ -434,16 +518,16 @@ static int erofs_fc_parse_param(struct fs_context *fc,
case Opt_acl:
#ifdef CONFIG_EROFS_FS_POSIX_ACL
if (result.boolean)
- set_opt(ctx, POSIX_ACL);
+ set_opt(&ctx->opt, POSIX_ACL);
else
- clear_opt(ctx, POSIX_ACL);
+ clear_opt(&ctx->opt, POSIX_ACL);
#else
errorfc(fc, "{,no}acl options not supported");
#endif
break;
case Opt_cache_strategy:
#ifdef CONFIG_EROFS_FS_ZIP
- ctx->cache_strategy = result.uint_32;
+ ctx->opt.cache_strategy = result.uint_32;
#else
errorfc(fc, "compression not supported, cache_strategy ignored");
#endif
@@ -456,6 +540,25 @@ static int erofs_fc_parse_param(struct fs_context *fc,
if (!erofs_fc_set_dax_mode(fc, result.uint_32))
return -EINVAL;
break;
+ case Opt_device:
+ dif = kzalloc(sizeof(*dif), GFP_KERNEL);
+ if (!dif)
+ return -ENOMEM;
+ dif->path = kstrdup(param->string, GFP_KERNEL);
+ if (!dif->path) {
+ kfree(dif);
+ return -ENOMEM;
+ }
+ down_write(&ctx->devs->rwsem);
+ ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL);
+ up_write(&ctx->devs->rwsem);
+ if (ret < 0) {
+ kfree(dif->path);
+ kfree(dif);
+ return ret;
+ }
+ ++ctx->devs->extra_devices;
+ break;
default:
return -ENOPARAM;
}
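In practice each extra blob is supplied at mount time through repeated device= options, e.g. mount -t erofs -o device=/dev/vdb,device=/dev/vdc /dev/vda /mnt (device names here are hypothetical); the number of device= occurrences has to match the extra_devices count recorded in the on-disk device table, which erofs_init_devices() verifies earlier in this patch.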
@@ -540,15 +643,19 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return -ENOMEM;
sb->s_fs_info = sbi;
+ sbi->opt = ctx->opt;
sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
+ sbi->devs = ctx->devs;
+ ctx->devs = NULL;
+
err = erofs_read_superblock(sb);
if (err)
return err;
- if (test_opt(ctx, DAX_ALWAYS) &&
+ if (test_opt(&sbi->opt, DAX_ALWAYS) &&
!dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) {
errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
- clear_opt(ctx, DAX_ALWAYS);
+ clear_opt(&sbi->opt, DAX_ALWAYS);
}
sb->s_flags |= SB_RDONLY | SB_NOATIME;
sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -557,13 +664,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_op = &erofs_sops;
sb->s_xattr = erofs_xattr_handlers;
- if (test_opt(ctx, POSIX_ACL))
+ if (test_opt(&sbi->opt, POSIX_ACL))
sb->s_flags |= SB_POSIXACL;
else
sb->s_flags &= ~SB_POSIXACL;
- sbi->ctx = *ctx;
-
#ifdef CONFIG_EROFS_FS_ZIP
xa_init(&sbi->managed_pslots);
#endif
@@ -607,20 +712,44 @@ static int erofs_fc_reconfigure(struct fs_context *fc)
DBG_BUGON(!sb_rdonly(sb));
- if (test_opt(ctx, POSIX_ACL))
+ if (test_opt(&ctx->opt, POSIX_ACL))
fc->sb_flags |= SB_POSIXACL;
else
fc->sb_flags &= ~SB_POSIXACL;
- sbi->ctx = *ctx;
+ sbi->opt = ctx->opt;
fc->sb_flags |= SB_RDONLY;
return 0;
}
+static int erofs_release_device_info(int id, void *ptr, void *data)
+{
+ struct erofs_device_info *dif = ptr;
+
+ fs_put_dax(dif->dax_dev);
+ if (dif->bdev)
+ blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
+ kfree(dif->path);
+ kfree(dif);
+ return 0;
+}
+
+static void erofs_free_dev_context(struct erofs_dev_context *devs)
+{
+ if (!devs)
+ return;
+ idr_for_each(&devs->tree, &erofs_release_device_info, NULL);
+ idr_destroy(&devs->tree);
+ kfree(devs);
+}
+
static void erofs_fc_free(struct fs_context *fc)
{
- kfree(fc->fs_private);
+ struct erofs_fs_context *ctx = fc->fs_private;
+
+ erofs_free_dev_context(ctx->devs);
+ kfree(ctx);
}
static const struct fs_context_operations erofs_context_ops = {
@@ -632,15 +761,21 @@ static const struct fs_context_operations erofs_context_ops = {
static int erofs_init_fs_context(struct fs_context *fc)
{
- fc->fs_private = kzalloc(sizeof(struct erofs_fs_context), GFP_KERNEL);
- if (!fc->fs_private)
- return -ENOMEM;
+ struct erofs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- /* set default mount options */
- erofs_default_options(fc->fs_private);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL);
+ if (!ctx->devs) {
+ kfree(ctx);
+ return -ENOMEM;
+ }
+ fc->fs_private = ctx;
+ idr_init(&ctx->devs->tree);
+ init_rwsem(&ctx->devs->rwsem);
+ erofs_default_options(ctx);
fc->ops = &erofs_context_ops;
-
return 0;
}
@@ -659,6 +794,8 @@ static void erofs_kill_sb(struct super_block *sb)
sbi = EROFS_SB(sb);
if (!sbi)
return;
+
+ erofs_free_dev_context(sbi->devs);
fs_put_dax(sbi->dax_dev);
kfree(sbi);
sb->s_fs_info = NULL;
@@ -706,6 +843,10 @@ static int __init erofs_module_init(void)
if (err)
goto shrinker_err;
+ err = z_erofs_lzma_init();
+ if (err)
+ goto lzma_err;
+
erofs_pcpubuf_init();
err = z_erofs_init_zip_subsystem();
if (err)
@@ -720,6 +861,8 @@ static int __init erofs_module_init(void)
fs_err:
z_erofs_exit_zip_subsystem();
zip_err:
+ z_erofs_lzma_exit();
+lzma_err:
erofs_exit_shrinker();
shrinker_err:
kmem_cache_destroy(erofs_inode_cachep);
@@ -730,11 +873,13 @@ icache_err:
static void __exit erofs_module_exit(void)
{
unregister_filesystem(&erofs_fs_type);
- z_erofs_exit_zip_subsystem();
- erofs_exit_shrinker();
- /* Ensure all RCU free inodes are safe before cache is destroyed. */
+ /* Ensure all RCU free inodes / pclusters are safe to be destroyed. */
rcu_barrier();
+
+ z_erofs_exit_zip_subsystem();
+ z_erofs_lzma_exit();
+ erofs_exit_shrinker();
kmem_cache_destroy(erofs_inode_cachep);
erofs_pcpubuf_exit();
}
@@ -748,7 +893,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = sb->s_magic;
buf->f_bsize = EROFS_BLKSIZ;
- buf->f_blocks = sbi->blocks;
+ buf->f_blocks = sbi->total_blocks;
buf->f_bfree = buf->f_bavail = 0;
buf->f_files = ULLONG_MAX;
@@ -763,31 +908,31 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
static int erofs_show_options(struct seq_file *seq, struct dentry *root)
{
struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
- struct erofs_fs_context *ctx = &sbi->ctx;
+ struct erofs_mount_opts *opt = &sbi->opt;
#ifdef CONFIG_EROFS_FS_XATTR
- if (test_opt(ctx, XATTR_USER))
+ if (test_opt(opt, XATTR_USER))
seq_puts(seq, ",user_xattr");
else
seq_puts(seq, ",nouser_xattr");
#endif
#ifdef CONFIG_EROFS_FS_POSIX_ACL
- if (test_opt(ctx, POSIX_ACL))
+ if (test_opt(opt, POSIX_ACL))
seq_puts(seq, ",acl");
else
seq_puts(seq, ",noacl");
#endif
#ifdef CONFIG_EROFS_FS_ZIP
- if (ctx->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
+ if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
seq_puts(seq, ",cache_strategy=disabled");
- else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
+ else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
seq_puts(seq, ",cache_strategy=readahead");
- else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
+ else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
seq_puts(seq, ",cache_strategy=readaround");
#endif
- if (test_opt(ctx, DAX_ALWAYS))
+ if (test_opt(opt, DAX_ALWAYS))
seq_puts(seq, ",dax=always");
- if (test_opt(ctx, DAX_NEVER))
+ if (test_opt(opt, DAX_NEVER))
seq_puts(seq, ",dax=never");
return 0;
}
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index bd86067a63f7..84da2c280012 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -6,20 +6,29 @@
#include "internal.h"
#include <linux/pagevec.h>
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
+struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp)
{
- struct page *page;
+ struct page *page = *pagepool;
- if (!list_empty(pool)) {
- page = lru_to_page(pool);
+ if (page) {
DBG_BUGON(page_ref_count(page) != 1);
- list_del(&page->lru);
+ *pagepool = (struct page *)page_private(page);
} else {
page = alloc_page(gfp);
}
return page;
}
+void erofs_release_pages(struct page **pagepool)
+{
+ while (*pagepool) {
+ struct page *page = *pagepool;
+
+ *pagepool = (struct page *)page_private(page);
+ put_page(page);
+ }
+}
+
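A minimal sketch of the new page-pool idiom this series introduces: the pool is a NULL-terminated chain threaded through page_private(), replacing the old struct list_head pool. The demo function itself is illustrative only:

	static void pagepool_demo(void)
	{
		struct page *pagepool = NULL, *page;
		int i;

		for (i = 0; i < 4; i++) {
			/* pops from the pool if non-empty, otherwise alloc_page() */
			page = erofs_allocpage(&pagepool, GFP_KERNEL);
			if (!page)
				break;
			/* ... use the page as a bounce/temporary buffer ... */
			erofs_pagepool_add(&pagepool, page);	/* keep it for reuse */
		}
		erofs_release_pages(&pagepool);	/* put_page() everything still chained */
	}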
#ifdef CONFIG_EROFS_FS_ZIP
/* global shrink count (for all mounted EROFS instances) */
static atomic_long_t erofs_global_shrink_cnt;
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 778f2c52295d..01c581e93c5f 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -429,7 +429,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
static bool erofs_xattr_user_list(struct dentry *dentry)
{
- return test_opt(&EROFS_SB(dentry->d_sb)->ctx, XATTR_USER);
+ return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER);
}
static bool erofs_xattr_trusted_list(struct dentry *dentry)
@@ -476,7 +476,7 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler,
switch (handler->flags) {
case EROFS_XATTR_INDEX_USER:
- if (!test_opt(&sbi->ctx, XATTR_USER))
+ if (!test_opt(&sbi->opt, XATTR_USER))
return -EOPNOTSUPP;
break;
case EROFS_XATTR_INDEX_TRUSTED:
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 11c7a1aaebad..9a249bfc2770 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -96,16 +96,9 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
DBG_BUGON(1);
}
-/*
- * a compressed_pages[] placeholder in order to avoid
- * being filled with file pages for in-place decompression.
- */
-#define PAGE_UNALLOCATED ((void *)0x5F0E4B1D)
-
/* how to allocate cached pages for a pcluster */
enum z_erofs_cache_alloctype {
DONTALLOC, /* don't allocate any cached pages */
- DELAYEDALLOC, /* delayed allocation (at the time of submitting io) */
/*
* try to use cached I/O if page allocation succeeds or fallback
* to in-place I/O instead to avoid any direct reclaim.
@@ -236,7 +229,7 @@ static DEFINE_MUTEX(z_pagemap_global_lock);
static void preload_compressed_pages(struct z_erofs_collector *clt,
struct address_space *mc,
enum z_erofs_cache_alloctype type,
- struct list_head *pagepool)
+ struct page **pagepool)
{
struct z_erofs_pcluster *pcl = clt->pcl;
bool standalone = true;
@@ -267,10 +260,6 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
		/* I/O is needed, not possible to decompress directly */
standalone = false;
switch (type) {
- case DELAYEDALLOC:
- t = tagptr_init(compressed_page_t,
- PAGE_UNALLOCATED);
- break;
case TRYALLOC:
newpage = erofs_allocpage(pagepool, gfp);
if (!newpage)
@@ -287,12 +276,10 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
continue;
- if (page) {
+ if (page)
put_page(page);
- } else if (newpage) {
- set_page_private(newpage, 0);
- list_add(&newpage->lru, pagepool);
- }
+ else if (newpage)
+ erofs_pagepool_add(pagepool, newpage);
}
/*
@@ -373,8 +360,8 @@ static bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
/* callers must be with collection lock held */
static int z_erofs_attach_page(struct z_erofs_collector *clt,
- struct page *page,
- enum z_erofs_page_type type)
+ struct page *page, enum z_erofs_page_type type,
+ bool pvec_safereuse)
{
int ret;
@@ -384,9 +371,9 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt,
z_erofs_try_inplace_io(clt, page))
return 0;
- ret = z_erofs_pagevec_enqueue(&clt->vector, page, type);
+ ret = z_erofs_pagevec_enqueue(&clt->vector, page, type,
+ pvec_safereuse);
clt->cl->vcnt += (unsigned int)ret;
-
return ret ? 0 : -EAGAIN;
}
@@ -476,6 +463,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
struct erofs_workgroup *grp;
int err;
+ if (!(map->m_flags & EROFS_MAP_ENCODED)) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+
/* no available pcluster, let's allocate one */
pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT);
if (IS_ERR(pcl))
@@ -483,16 +475,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
atomic_set(&pcl->obj.refcount, 1);
pcl->obj.index = map->m_pa >> PAGE_SHIFT;
-
+ pcl->algorithmformat = map->m_algorithmformat;
pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
(map->m_flags & EROFS_MAP_FULL_MAPPED ?
Z_EROFS_PCLUSTER_FULL_LENGTH : 0);
- if (map->m_flags & EROFS_MAP_ZIPPED)
- pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4;
- else
- pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
-
/* new pclusters should be claimed as type 1, primary and followed */
pcl->next = clt->owned_head;
clt->mode = COLLECT_PRIMARY_FOLLOWED;
@@ -643,7 +630,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
}
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page, struct list_head *pagepool)
+ struct page *page, struct page **pagepool)
{
struct inode *const inode = fe->inode;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
@@ -695,7 +682,7 @@ restart_now:
goto err_out;
/* preload all compressed pages (maybe downgrade role if necessary) */
- if (should_alloc_managed_pages(fe, sbi->ctx.cache_strategy, map->m_la))
+ if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, map->m_la))
cache_strategy = TRYALLOC;
else
cache_strategy = DONTALLOC;
@@ -729,7 +716,8 @@ hitted:
tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED);
retry:
- err = z_erofs_attach_page(clt, page, page_type);
+ err = z_erofs_attach_page(clt, page, page_type,
+ clt->mode >= COLLECT_PRIMARY_FOLLOWED);
/* should allocate an additional short-lived page for pagevec */
if (err == -EAGAIN) {
struct page *const newpage =
@@ -737,7 +725,7 @@ retry:
set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE);
err = z_erofs_attach_page(clt, newpage,
- Z_EROFS_PAGE_TYPE_EXCLUSIVE);
+ Z_EROFS_PAGE_TYPE_EXCLUSIVE, true);
if (!err)
goto retry;
}
@@ -796,7 +784,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
/* Use workqueue and sync decompression for atomic contexts only */
if (in_atomic() || irqs_disabled()) {
queue_work(z_erofs_workqueue, &io->u.work);
- sbi->ctx.readahead_sync_decompress = true;
+ sbi->opt.readahead_sync_decompress = true;
return;
}
z_erofs_decompressqueue_work(&io->u.work);
@@ -836,7 +824,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio)
static int z_erofs_decompress_pcluster(struct super_block *sb,
struct z_erofs_pcluster *pcl,
- struct list_head *pagepool)
+ struct page **pagepool)
{
struct erofs_sb_info *const sbi = EROFS_SB(sb);
struct z_erofs_pagevec_ctor ctor;
@@ -1036,7 +1024,7 @@ out:
}
static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
- struct list_head *pagepool)
+ struct page **pagepool)
{
z_erofs_next_pcluster_t owned = io->head;
@@ -1060,18 +1048,18 @@ static void z_erofs_decompressqueue_work(struct work_struct *work)
{
struct z_erofs_decompressqueue *bgq =
container_of(work, struct z_erofs_decompressqueue, u.work);
- LIST_HEAD(pagepool);
+ struct page *pagepool = NULL;
DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
z_erofs_decompress_queue(bgq, &pagepool);
- put_pages_list(&pagepool);
+ erofs_release_pages(&pagepool);
kvfree(bgq);
}
static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
unsigned int nr,
- struct list_head *pagepool,
+ struct page **pagepool,
struct address_space *mc,
gfp_t gfp)
{
@@ -1091,15 +1079,6 @@ repeat:
if (!page)
goto out_allocpage;
- /*
- * the cached page has not been allocated and
- * an placeholder is out there, prepare it now.
- */
- if (page == PAGE_UNALLOCATED) {
- tocache = true;
- goto out_allocpage;
- }
-
/* process the target tagged pointer */
t = tagptr_init(compressed_page_t, page);
justfound = tagptr_unfold_tags(t);
@@ -1173,7 +1152,7 @@ repeat:
out_allocpage:
page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
- list_add(&page->lru, pagepool);
+ erofs_pagepool_add(pagepool, page);
cond_resched();
goto repeat;
}
@@ -1257,7 +1236,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
static void z_erofs_submit_queue(struct super_block *sb,
struct z_erofs_decompress_frontend *f,
- struct list_head *pagepool,
+ struct page **pagepool,
struct z_erofs_decompressqueue *fgq,
bool *force_fg)
{
@@ -1266,8 +1245,9 @@ static void z_erofs_submit_queue(struct super_block *sb,
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
void *bi_private;
z_erofs_next_pcluster_t owned_head = f->clt.owned_head;
- /* since bio will be NULL, no need to initialize last_index */
+ /* bio is NULL initially, so no need to initialize last_{index,bdev} */
pgoff_t last_index;
+ struct block_device *last_bdev;
unsigned int nr_bios = 0;
struct bio *bio = NULL;
@@ -1279,6 +1259,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
q[JQ_SUBMIT]->head = owned_head;
do {
+ struct erofs_map_dev mdev;
struct z_erofs_pcluster *pcl;
pgoff_t cur, end;
unsigned int i = 0;
@@ -1290,7 +1271,13 @@ static void z_erofs_submit_queue(struct super_block *sb,
pcl = container_of(owned_head, struct z_erofs_pcluster, next);
- cur = pcl->obj.index;
+ /* no device id here, thus it will always succeed */
+ mdev = (struct erofs_map_dev) {
+ .m_pa = blknr_to_addr(pcl->obj.index),
+ };
+ (void)erofs_map_dev(sb, &mdev);
+
+ cur = erofs_blknr(mdev.m_pa);
end = cur + pcl->pclusterpages;
/* close the main owned chain at first */
@@ -1306,7 +1293,8 @@ static void z_erofs_submit_queue(struct super_block *sb,
if (!page)
continue;
- if (bio && cur != last_index + 1) {
+ if (bio && (cur != last_index + 1 ||
+ last_bdev != mdev.m_bdev)) {
submit_bio_retry:
submit_bio(bio);
bio = NULL;
@@ -1314,9 +1302,10 @@ submit_bio_retry:
if (!bio) {
bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
-
bio->bi_end_io = z_erofs_decompressqueue_endio;
- bio_set_dev(bio, sb->s_bdev);
+
+ bio_set_dev(bio, mdev.m_bdev);
+ last_bdev = mdev.m_bdev;
bio->bi_iter.bi_sector = (sector_t)cur <<
LOG_SECTORS_PER_BLOCK;
bio->bi_private = bi_private;
@@ -1355,7 +1344,7 @@ submit_bio_retry:
static void z_erofs_runqueue(struct super_block *sb,
struct z_erofs_decompress_frontend *f,
- struct list_head *pagepool, bool force_fg)
+ struct page **pagepool, bool force_fg)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
@@ -1377,18 +1366,87 @@ static void z_erofs_runqueue(struct super_block *sb,
z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
}
+/*
+ * Since partial uptodate is still unimplemented for now, we have to use
+ * approximate readmore strategies as a start.
+ */
+static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
+ struct readahead_control *rac,
+ erofs_off_t end,
+ struct page **pagepool,
+ bool backmost)
+{
+ struct inode *inode = f->inode;
+ struct erofs_map_blocks *map = &f->map;
+ erofs_off_t cur;
+ int err;
+
+ if (backmost) {
+ map->m_la = end;
+ err = z_erofs_map_blocks_iter(inode, map,
+ EROFS_GET_BLOCKS_READMORE);
+ if (err)
+ return;
+
+		/* expand ra for the trailing edge if readahead */
+ if (rac) {
+ loff_t newstart = readahead_pos(rac);
+
+ cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
+ readahead_expand(rac, newstart, cur - newstart);
+ return;
+ }
+ end = round_up(end, PAGE_SIZE);
+ } else {
+ end = round_up(map->m_la, PAGE_SIZE);
+
+ if (!map->m_llen)
+ return;
+ }
+
+ cur = map->m_la + map->m_llen - 1;
+ while (cur >= end) {
+ pgoff_t index = cur >> PAGE_SHIFT;
+ struct page *page;
+
+ page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
+ if (!page)
+ goto skip;
+
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto skip;
+ }
+
+ err = z_erofs_do_read_page(f, page, pagepool);
+ if (err)
+ erofs_err(inode->i_sb,
+ "readmore error at page %lu @ nid %llu",
+ index, EROFS_I(inode)->nid);
+ put_page(page);
+skip:
+ if (cur < PAGE_SIZE)
+ break;
+ cur = (index << PAGE_SHIFT) - 1;
+ }
+}
+
static int z_erofs_readpage(struct file *file, struct page *page)
{
struct inode *const inode = page->mapping->host;
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+ struct page *pagepool = NULL;
int err;
- LIST_HEAD(pagepool);
trace_erofs_readpage(page, false);
-
f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
+ z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1,
+ &pagepool, true);
err = z_erofs_do_read_page(&f, page, &pagepool);
+ z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false);
+
(void)z_erofs_collector_end(&f.clt);
/* if some compressed cluster ready, need submit them anyway */
@@ -1400,8 +1458,7 @@ static int z_erofs_readpage(struct file *file, struct page *page)
if (f.map.mpage)
put_page(f.map.mpage);
- /* clean up the remaining free pages */
- put_pages_list(&pagepool);
+ erofs_release_pages(&pagepool);
return err;
}
@@ -1409,29 +1466,19 @@ static void z_erofs_readahead(struct readahead_control *rac)
{
struct inode *const inode = rac->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
-
- unsigned int nr_pages = readahead_count(rac);
- bool sync = (sbi->ctx.readahead_sync_decompress &&
- nr_pages <= sbi->ctx.max_sync_decompress_pages);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
- struct page *page, *head = NULL;
- LIST_HEAD(pagepool);
-
- trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
+ struct page *pagepool = NULL, *head = NULL, *page;
+ unsigned int nr_pages;
f.readahead = true;
f.headoffset = readahead_pos(rac);
- while ((page = readahead_page(rac))) {
- prefetchw(&page->flags);
-
- /*
- * A pure asynchronous readahead is indicated if
- * a PG_readahead marked page is hitted at first.
- * Let's also do asynchronous decompression for this case.
- */
- sync &= !(PageReadahead(page) && !head);
+ z_erofs_pcluster_readmore(&f, rac, f.headoffset +
+ readahead_length(rac) - 1, &pagepool, true);
+ nr_pages = readahead_count(rac);
+ trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
+ while ((page = readahead_page(rac))) {
set_page_private(page, (unsigned long)head);
head = page;
}
@@ -1450,16 +1497,15 @@ static void z_erofs_readahead(struct readahead_control *rac)
page->index, EROFS_I(inode)->nid);
put_page(page);
}
-
+ z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
(void)z_erofs_collector_end(&f.clt);
- z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync);
-
+ z_erofs_runqueue(inode->i_sb, &f, &pagepool,
+ sbi->opt.readahead_sync_decompress &&
+ nr_pages <= sbi->opt.max_sync_decompress_pages);
if (f.map.mpage)
put_page(f.map.mpage);
-
- /* clean up the remaining free pages */
- put_pages_list(&pagepool);
+ erofs_release_pages(&pagepool);
}
const struct address_space_operations z_erofs_aops = {
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index 3a008f1b9f78..4a69515dea75 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -94,13 +94,6 @@ struct z_erofs_decompressqueue {
} u;
};
-#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
-static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
- struct page *page)
-{
- return page->mapping == MNGD_MAPPING(sbi);
-}
-
#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2
#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS)
@@ -186,4 +179,3 @@ static inline void z_erofs_onlinepage_endio(struct page *page)
#define Z_EROFS_VMAP_GLOBAL_PAGES 2048
#endif
-
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 7a6df35fdc91..660489a7fb64 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -28,7 +28,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
{
struct erofs_inode *const vi = EROFS_I(inode);
struct super_block *const sb = inode->i_sb;
- int err;
+ int err, headnr;
erofs_off_t pos;
struct page *page;
void *kaddr;
@@ -68,9 +68,11 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
- if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) {
- erofs_err(sb, "unknown compression format %u for nid %llu, please upgrade kernel",
- vi->z_algorithmtype[0], vi->nid);
+ headnr = 0;
+ if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
+ vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) {
+ erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
+ headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
err = -EOPNOTSUPP;
goto unmap_done;
}
@@ -111,7 +113,7 @@ struct z_erofs_maprecorder {
unsigned long lcn;
/* compression extent information gathered */
- u8 type;
+ u8 type, headtype;
u16 clusterofs;
u16 delta[2];
erofs_blk_t pblk, compressedlcs;
@@ -178,7 +180,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
m->clusterofs = 1 << vi->z_logical_clusterbits;
m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) {
- if (!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+ if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
+ Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
@@ -189,7 +192,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
break;
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
m->clusterofs = le16_to_cpu(di->di_clusterofs);
m->pblk = le32_to_cpu(di->di_u.blkaddr);
break;
@@ -446,9 +450,9 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
}
return z_erofs_extent_lookback(m, m->delta[0]);
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- map->m_flags &= ~EROFS_MAP_ZIPPED;
- fallthrough;
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
+ m->headtype = m->type;
map->m_la = (lcn << lclusterbits) | m->clusterofs;
break;
default:
@@ -471,13 +475,18 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
int err;
DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN &&
- m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD);
- if (!(map->m_flags & EROFS_MAP_ZIPPED) ||
- !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+ m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 &&
+ m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD2);
+ DBG_BUGON(m->type != m->headtype);
+
+ if (m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
+ ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1) &&
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) ||
+ ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) &&
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
map->m_plen = 1 << lclusterbits;
return 0;
}
-
lcn = m->lcn + 1;
if (m->compressedlcs)
goto out;
@@ -499,7 +508,8 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
switch (m->type) {
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
/*
* if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
* rather than CBLKCNT, it's a 1 lcluster-sized pcluster.
@@ -554,7 +564,8 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
DBG_BUGON(!m->delta[1] &&
m->clusterofs != 1 << lclusterbits);
} else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
- m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD) {
+ m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 ||
+ m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) {
/* go on until the next HEAD lcluster */
if (lcn != headlcn)
break;
@@ -609,16 +620,15 @@ int z_erofs_map_blocks_iter(struct inode *inode,
if (err)
goto unmap_out;
- map->m_flags = EROFS_MAP_ZIPPED; /* by default, compressed */
+ map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED;
end = (m.lcn + 1ULL) << lclusterbits;
switch (m.type) {
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- if (endoff >= m.clusterofs)
- map->m_flags &= ~EROFS_MAP_ZIPPED;
- fallthrough;
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
if (endoff >= m.clusterofs) {
+ m.headtype = m.type;
map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
break;
}
@@ -650,13 +660,22 @@ int z_erofs_map_blocks_iter(struct inode *inode,
map->m_llen = end - map->m_la;
map->m_pa = blknr_to_addr(m.pblk);
- map->m_flags |= EROFS_MAP_MAPPED;
err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
if (err)
goto out;
- if (flags & EROFS_GET_BLOCKS_FIEMAP) {
+ if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN)
+ map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
+ else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2)
+ map->m_algorithmformat = vi->z_algorithmtype[1];
+ else
+ map->m_algorithmformat = vi->z_algorithmtype[0];
+
+ if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
+ ((flags & EROFS_GET_BLOCKS_READMORE) &&
+ map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA &&
+ map->m_llen >= EROFS_BLKSIZ)) {
err = z_erofs_get_extent_decompressedlen(&m);
if (!err)
map->m_flags |= EROFS_MAP_FULL_MAPPED;
diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h
index dfd7fe0503bb..b05464f4a808 100644
--- a/fs/erofs/zpvec.h
+++ b/fs/erofs/zpvec.h
@@ -106,11 +106,18 @@ static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor,
struct page *page,
- enum z_erofs_page_type type)
+ enum z_erofs_page_type type,
+ bool pvec_safereuse)
{
- if (!ctor->next && type)
- if (ctor->index + 1 == ctor->nr)
+ if (!ctor->next) {
+ /* some pages cannot be reused as pvec safely without I/O */
+ if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && !pvec_safereuse)
+ type = Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED;
+
+ if (type != Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+ ctor->index + 1 == ctor->nr)
return false;
+ }
if (ctor->index >= ctor->nr)
z_erofs_pagevec_ctor_pagedown(ctor, false);
diff --git a/fs/exec.c b/fs/exec.c
index a098c133d8d7..537d92c41105 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -987,16 +987,14 @@ static int exec_mmap(struct mm_struct *mm)
if (old_mm) {
/*
- * Make sure that if there is a core dump in progress
- * for the old mm, we get out and die instead of going
- * through with the exec. We must hold mmap_lock around
- * checking core_state and changing tsk->mm.
+		 * If there is a pending fatal signal, perhaps a signal
+		 * whose default action is to create a coredump, get
+		 * out and die instead of going through with the exec.
*/
- mmap_read_lock(old_mm);
- if (unlikely(old_mm->core_state)) {
- mmap_read_unlock(old_mm);
+ ret = mmap_read_lock_killable(old_mm);
+ if (ret) {
up_write(&tsk->signal->exec_update_lock);
- return -EINTR;
+ return ret;
}
}
@@ -1852,7 +1850,7 @@ out:
* SIGSEGV.
*/
if (bprm->point_of_no_return && !fatal_signal_pending(current))
- force_sigsegv(SIGSEGV);
+ force_fatal_sig(SIGSEGV);
out_unmark:
current->fs->in_exec = 0;
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index ca37d4344361..1c7aa1ea4724 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -604,7 +604,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
exfat_save_attr(inode, info->attr);
inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) &
- ~(sbi->cluster_size - 1)) >> inode->i_blkbits;
+ ~((loff_t)sbi->cluster_size - 1)) >> inode->i_blkbits;
inode->i_mtime = info->mtime;
inode->i_ctime = info->mtime;
ei->i_crtime = info->crtime;
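The one-character fix above matters because sbi->cluster_size is a 32-bit unsigned int while the file size is a 64-bit loff_t: without the cast the complement is a 32-bit mask which, once widened, clears the upper half of the size for files of 4 GiB and larger. A stand-alone user-space sketch (not exfat code) of the effect:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		unsigned int cluster_size = 4096;
		int64_t i_size = 5LL << 30;	/* a 5 GiB file */

		int64_t bad  = (i_size + (cluster_size - 1)) & ~(cluster_size - 1);
		int64_t good = (i_size + (cluster_size - 1)) & ~((int64_t)cluster_size - 1);

		printf("bad  = %lld\n", (long long)bad);	/* 1073741824: high bits lost */
		printf("good = %lld\n", (long long)good);	/* 5368709120, as expected */
		return 0;
	}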
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3825195539d7..404dd50856e5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -17,6 +17,7 @@
#ifndef _EXT4_H
#define _EXT4_H
+#include <linux/refcount.h>
#include <linux/types.h>
#include <linux/blkdev.h>
#include <linux/magic.h>
@@ -241,7 +242,7 @@ typedef struct ext4_io_end {
struct bio *bio; /* Linked list of completed
* bios covering the extent */
unsigned int flag; /* unwritten or not */
- atomic_t count; /* reference counter */
+ refcount_t count; /* reference counter */
struct list_head list_vec; /* list of ext4_io_end_vec */
} ext4_io_end_t;
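Switching count from atomic_t to refcount_t buys saturation and WARN-on-misuse semantics; the matching call-site conversion lives in page-io.c (not part of this hunk) and follows the usual pattern, roughly:

	refcount_set(&io_end->count, 1);		/* was atomic_set() */
	refcount_inc(&io_end->count);			/* was atomic_inc() */
	if (refcount_dec_and_test(&io_end->count))	/* was atomic_dec_and_test() */
		ext4_release_io_end(io_end);		/* drop on last reference */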
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0e02571f2f82..0ecf819bf189 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -136,15 +136,25 @@ int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path)
{
+ int err = 0;
+
if (path->p_bh) {
/* path points to block */
BUFFER_TRACE(path->p_bh, "get_write_access");
- return ext4_journal_get_write_access(handle, inode->i_sb,
- path->p_bh, EXT4_JTR_NONE);
+ err = ext4_journal_get_write_access(handle, inode->i_sb,
+ path->p_bh, EXT4_JTR_NONE);
+ /*
+		 * The extent buffer's verified bit will be set again in
+		 * __ext4_ext_dirty(). We could be left with an inconsistent
+		 * buffer if the extent updating procedure breaks off due
+		 * to an error, so force it to be checked again.
+ */
+ if (!err)
+ clear_buffer_verified(path->p_bh);
}
/* path points to leaf/index in inode body */
/* we use in-core data, no need to protect them */
- return 0;
+ return err;
}
/*
@@ -165,6 +175,9 @@ static int __ext4_ext_dirty(const char *where, unsigned int line,
/* path points to block */
err = __ext4_handle_dirty_metadata(where, line, handle,
inode, path->p_bh);
+ /* Extents updating done, re-set verified flag */
+ if (!err)
+ set_buffer_verified(path->p_bh);
} else {
/* path points to leaf/index in inode body */
err = ext4_mark_inode_dirty(handle, inode);
@@ -354,9 +367,13 @@ static int ext4_valid_extent_idx(struct inode *inode,
static int ext4_valid_extent_entries(struct inode *inode,
struct ext4_extent_header *eh,
- ext4_fsblk_t *pblk, int depth)
+ ext4_lblk_t lblk, ext4_fsblk_t *pblk,
+ int depth)
{
unsigned short entries;
+ ext4_lblk_t lblock = 0;
+ ext4_lblk_t prev = 0;
+
if (eh->eh_entries == 0)
return 1;
@@ -365,31 +382,51 @@ static int ext4_valid_extent_entries(struct inode *inode,
if (depth == 0) {
/* leaf entries */
struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
- ext4_lblk_t lblock = 0;
- ext4_lblk_t prev = 0;
- int len = 0;
+
+ /*
+		 * The logical block in the first entry should equal
+		 * the number in the index block.
+ */
+ if (depth != ext_depth(inode) &&
+ lblk != le32_to_cpu(ext->ee_block))
+ return 0;
while (entries) {
if (!ext4_valid_extent(inode, ext))
return 0;
/* Check for overlapping extents */
lblock = le32_to_cpu(ext->ee_block);
- len = ext4_ext_get_actual_len(ext);
if ((lblock <= prev) && prev) {
*pblk = ext4_ext_pblock(ext);
return 0;
}
+ prev = lblock + ext4_ext_get_actual_len(ext) - 1;
ext++;
entries--;
- prev = lblock + len - 1;
}
} else {
struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
+
+ /*
+		 * The logical block in the first entry should equal
+		 * the number in the parent index block.
+ */
+ if (depth != ext_depth(inode) &&
+ lblk != le32_to_cpu(ext_idx->ei_block))
+ return 0;
while (entries) {
if (!ext4_valid_extent_idx(inode, ext_idx))
return 0;
+
+ /* Check for overlapping index extents */
+ lblock = le32_to_cpu(ext_idx->ei_block);
+ if ((lblock <= prev) && prev) {
+ *pblk = ext4_idx_pblock(ext_idx);
+ return 0;
+ }
ext_idx++;
entries--;
+ prev = lblock;
}
}
return 1;
@@ -397,7 +434,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
static int __ext4_ext_check(const char *function, unsigned int line,
struct inode *inode, struct ext4_extent_header *eh,
- int depth, ext4_fsblk_t pblk)
+ int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk)
{
const char *error_msg;
int max = 0, err = -EFSCORRUPTED;
@@ -423,7 +460,7 @@ static int __ext4_ext_check(const char *function, unsigned int line,
error_msg = "invalid eh_entries";
goto corrupted;
}
- if (!ext4_valid_extent_entries(inode, eh, &pblk, depth)) {
+ if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
error_msg = "invalid extent entries";
goto corrupted;
}
@@ -453,7 +490,7 @@ corrupted:
}
#define ext4_ext_check(inode, eh, depth, pblk) \
- __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
+ __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0)
int ext4_ext_check_inode(struct inode *inode)
{
@@ -486,16 +523,18 @@ static void ext4_cache_extents(struct inode *inode,
static struct buffer_head *
__read_extent_tree_block(const char *function, unsigned int line,
- struct inode *inode, ext4_fsblk_t pblk, int depth,
- int flags)
+ struct inode *inode, struct ext4_extent_idx *idx,
+ int depth, int flags)
{
struct buffer_head *bh;
int err;
gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS;
+ ext4_fsblk_t pblk;
if (flags & EXT4_EX_NOFAIL)
gfp_flags |= __GFP_NOFAIL;
+ pblk = ext4_idx_pblock(idx);
bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags);
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
@@ -508,8 +547,8 @@ __read_extent_tree_block(const char *function, unsigned int line,
}
if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
return bh;
- err = __ext4_ext_check(function, line, inode,
- ext_block_hdr(bh), depth, pblk);
+ err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh),
+ depth, pblk, le32_to_cpu(idx->ei_block));
if (err)
goto errout;
set_buffer_verified(bh);
@@ -527,8 +566,8 @@ errout:
}
-#define read_extent_tree_block(inode, pblk, depth, flags) \
- __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \
+#define read_extent_tree_block(inode, idx, depth, flags) \
+ __read_extent_tree_block(__func__, __LINE__, (inode), (idx), \
(depth), (flags))
/*
@@ -578,8 +617,7 @@ int ext4_ext_precache(struct inode *inode)
i--;
continue;
}
- bh = read_extent_tree_block(inode,
- ext4_idx_pblock(path[i].p_idx++),
+ bh = read_extent_tree_block(inode, path[i].p_idx++,
depth - i - 1,
EXT4_EX_FORCE_CACHE);
if (IS_ERR(bh)) {
@@ -714,13 +752,14 @@ ext4_ext_binsearch_idx(struct inode *inode,
r = EXT_LAST_INDEX(eh);
while (l <= r) {
m = l + (r - l) / 2;
+ ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
+ le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block),
+ r, le32_to_cpu(r->ei_block));
+
if (block < le32_to_cpu(m->ei_block))
r = m - 1;
else
l = m + 1;
- ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
- le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block),
- r, le32_to_cpu(r->ei_block));
}
path->p_idx = l - 1;
@@ -782,13 +821,14 @@ ext4_ext_binsearch(struct inode *inode,
while (l <= r) {
m = l + (r - l) / 2;
+ ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
+ le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block),
+ r, le32_to_cpu(r->ee_block));
+
if (block < le32_to_cpu(m->ee_block))
r = m - 1;
else
l = m + 1;
- ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
- le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block),
- r, le32_to_cpu(r->ee_block));
}
path->p_ext = l - 1;
@@ -884,8 +924,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_depth = i;
path[ppos].p_ext = NULL;
- bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
- flags);
+ bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags);
if (IS_ERR(bh)) {
ret = PTR_ERR(bh);
goto err;
@@ -1494,7 +1533,6 @@ static int ext4_ext_search_right(struct inode *inode,
struct ext4_extent_header *eh;
struct ext4_extent_idx *ix;
struct ext4_extent *ex;
- ext4_fsblk_t block;
int depth; /* Note, NOT eh_depth; depth from top of tree */
int ee_len;
@@ -1561,20 +1599,17 @@ got_index:
* follow it and find the closest allocated
* block to the right */
ix++;
- block = ext4_idx_pblock(ix);
while (++depth < path->p_depth) {
/* subtract from p_depth to get proper eh_depth */
- bh = read_extent_tree_block(inode, block,
- path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
ix = EXT_FIRST_INDEX(eh);
- block = ext4_idx_pblock(ix);
put_bh(bh);
}
- bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
@@ -2953,9 +2988,9 @@ again:
ext_debug(inode, "move to level %d (block %llu)\n",
i + 1, ext4_idx_pblock(path[i].p_idx));
memset(path + i + 1, 0, sizeof(*path));
- bh = read_extent_tree_block(inode,
- ext4_idx_pblock(path[i].p_idx), depth - i - 1,
- EXT4_EX_NOCACHE);
+ bh = read_extent_tree_block(inode, path[i].p_idx,
+ depth - i - 1,
+ EXT4_EX_NOCACHE);
if (IS_ERR(bh)) {
/* should we reset i_size? */
err = PTR_ERR(bh);
@@ -4978,36 +5013,6 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
/*
- * ext4_access_path:
- * Function to access the path buffer for marking it dirty.
- * It also checks if there are sufficient credits left in the journal handle
- * to update path.
- */
-static int
-ext4_access_path(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path)
-{
- int credits, err;
-
- if (!ext4_handle_valid(handle))
- return 0;
-
- /*
- * Check if need to extend journal credits
- * 3 for leaf, sb, and inode plus 2 (bmap and group
- * descriptor) for each block group; assume two block
- * groups
- */
- credits = ext4_writepage_trans_blocks(inode);
- err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0);
- if (err < 0)
- return err;
-
- err = ext4_ext_get_access(handle, inode, path);
- return err;
-}
-
-/*
* ext4_ext_shift_path_extents:
* Shift the extents of a path structure lying between path[depth].p_ext
* and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
@@ -5021,6 +5026,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
int depth, err = 0;
struct ext4_extent *ex_start, *ex_last;
bool update = false;
+ int credits, restart_credits;
depth = path->p_depth;
while (depth >= 0) {
@@ -5030,13 +5036,26 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
return -EFSCORRUPTED;
ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
+ /* leaf + sb + inode */
+ credits = 3;
+ if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) {
+ update = true;
+ /* extent tree + sb + inode */
+ credits = depth + 2;
+ }
- err = ext4_access_path(handle, inode, path + depth);
- if (err)
+ restart_credits = ext4_writepage_trans_blocks(inode);
+ err = ext4_datasem_ensure_credits(handle, inode, credits,
+ restart_credits, 0);
+ if (err) {
+ if (err > 0)
+ err = -EAGAIN;
goto out;
+ }
- if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
- update = true;
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
while (ex_start <= ex_last) {
if (SHIFT == SHIFT_LEFT) {
@@ -5067,7 +5086,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
}
/* Update index too */
- err = ext4_access_path(handle, inode, path + depth);
+ err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
@@ -5106,6 +5125,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
int ret = 0, depth;
struct ext4_extent *extent;
ext4_lblk_t stop, *iterator, ex_start, ex_end;
+ ext4_lblk_t tmp = EXT_MAX_BLOCKS;
/* Let path point to the last extent */
path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
@@ -5159,11 +5179,15 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
* till we reach stop. In case of right shift, iterator points to stop
* and it is decreased till we reach start.
*/
+again:
if (SHIFT == SHIFT_LEFT)
iterator = &start;
else
iterator = &stop;
+ if (tmp != EXT_MAX_BLOCKS)
+ *iterator = tmp;
+
/*
	 * It's safe to start updating extents. Start and stop are unsigned, so
* in case of right shift if extent with 0 block is reached, iterator
@@ -5192,6 +5216,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
}
}
+ tmp = *iterator;
if (SHIFT == SHIFT_LEFT) {
extent = EXT_LAST_EXTENT(path[depth].p_hdr);
*iterator = le32_to_cpu(extent->ee_block) +
@@ -5210,6 +5235,9 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
}
ret = ext4_ext_shift_path_extents(path, shift, inode,
handle, SHIFT);
+ /* iterator can be NULL which means we should break */
+ if (ret == -EAGAIN)
+ goto again;
if (ret)
break;
}
@@ -6043,6 +6071,9 @@ int ext4_ext_clear_bb(struct inode *inode)
int j, ret = 0;
struct ext4_map_blocks map;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
+ return 0;
+
	/* Determine the size of the file first */
path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
EXT4_EX_NOCACHE);
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 8ea5a81e6554..0f32b445582a 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -819,7 +819,9 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
if (ret)
return ret;
- if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
+ inode_len = EXT4_INODE_SIZE(inode->i_sb);
+ else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
inode_len += ei->i_extra_isize;
fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
@@ -1524,7 +1526,8 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
* crashing. This should be fixed but until then, we calculate
	 * the number of blocks in the inode.
*/
- ext4_ext_replay_set_iblocks(inode);
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
+ ext4_ext_replay_set_iblocks(inode);
inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
ext4_reset_inode_seed(inode);
@@ -1842,6 +1845,10 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
}
cur = 0;
end = EXT_MAX_BLOCKS;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
+ iput(inode);
+ continue;
+ }
while (cur < end) {
map.m_lblk = cur;
map.m_len = end - cur;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ac0e11bbb445..4c5f41052351 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -74,7 +74,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
return generic_file_read_iter(iocb, to);
}
- ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0);
+ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, 0);
inode_unlock_shared(inode);
file_accessed(iocb->ki_filp);
@@ -566,7 +566,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ilock_shared)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
- (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0);
+ (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
+ 0);
if (ret == -ENOTBLK)
ret = 0;
@@ -915,7 +916,7 @@ const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
.read_iter = ext4_file_read_iter,
.write_iter = ext4_file_write_iter,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0f06305167d5..bfd3545f1e5d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1711,16 +1711,13 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
}
/*
- * the buffer head associated with a delayed and not unwritten
- * block found in the extent status cache must contain an
- * invalid block number and have its BH_New and BH_Delay bits
- * set, reflecting the state assigned when the block was
- * initially delayed allocated
+	 * A delayed extent could have been allocated by fallocate,
+	 * so we need to check for that here.
*/
- if (ext4_es_is_delonly(&es)) {
- BUG_ON(bh->b_blocknr != invalid_block);
- BUG_ON(!buffer_new(bh));
- BUG_ON(!buffer_delay(bh));
+ if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
return 0;
}
@@ -4234,14 +4231,161 @@ out_trace:
return err;
}
+static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
+{
+ if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
+ return inode_peek_iversion_raw(inode);
+ else
+ return inode_peek_iversion(inode);
+}
+
+static int ext4_inode_blocks_set(struct ext4_inode *raw_inode,
+ struct ext4_inode_info *ei)
+{
+ struct inode *inode = &(ei->vfs_inode);
+ u64 i_blocks = READ_ONCE(inode->i_blocks);
+ struct super_block *sb = inode->i_sb;
+
+ if (i_blocks <= ~0U) {
+ /*
+ * i_blocks can be represented in a 32 bit variable
+		 * as a multiple of 512 bytes
+ */
+ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
+ raw_inode->i_blocks_high = 0;
+ ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+ return 0;
+ }
+
+ /*
+ * This should never happen since sb->s_maxbytes should not have
+	 * allowed this; sb->s_maxbytes was set according to the huge_file
+ * feature in ext4_fill_super().
+ */
+ if (!ext4_has_feature_huge_file(sb))
+ return -EFSCORRUPTED;
+
+ if (i_blocks <= 0xffffffffffffULL) {
+ /*
+ * i_blocks can be represented in a 48 bit variable
+		 * as a multiple of 512 bytes
+ */
+ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
+ raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
+ ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+ } else {
+ ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+ /* i_block is stored in file system block size */
+ i_blocks = i_blocks >> (inode->i_blkbits - 9);
+ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
+ raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
+ }
+ return 0;
+}
+
+static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ uid_t i_uid;
+ gid_t i_gid;
+ projid_t i_projid;
+ int block;
+ int err;
+
+ err = ext4_inode_blocks_set(raw_inode, ei);
+
+ raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+ i_uid = i_uid_read(inode);
+ i_gid = i_gid_read(inode);
+ i_projid = from_kprojid(&init_user_ns, ei->i_projid);
+ if (!(test_opt(inode->i_sb, NO_UID32))) {
+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
+ /*
+ * Fix up interoperability with old kernels. Otherwise,
+ * old inodes get re-used with the upper 16 bits of the
+ * uid/gid intact.
+ */
+ if (ei->i_dtime && list_empty(&ei->i_orphan)) {
+ raw_inode->i_uid_high = 0;
+ raw_inode->i_gid_high = 0;
+ } else {
+ raw_inode->i_uid_high =
+ cpu_to_le16(high_16_bits(i_uid));
+ raw_inode->i_gid_high =
+ cpu_to_le16(high_16_bits(i_gid));
+ }
+ } else {
+ raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
+ raw_inode->i_uid_high = 0;
+ raw_inode->i_gid_high = 0;
+ }
+ raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
+
+ EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
+ EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
+ EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+ EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
+
+ raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
+ raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
+ raw_inode->i_file_acl_high =
+ cpu_to_le16(ei->i_file_acl >> 32);
+ raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
+ ext4_isize_set(raw_inode, ei->i_disksize);
+
+ raw_inode->i_generation = cpu_to_le32(inode->i_generation);
+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+ if (old_valid_dev(inode->i_rdev)) {
+ raw_inode->i_block[0] =
+ cpu_to_le32(old_encode_dev(inode->i_rdev));
+ raw_inode->i_block[1] = 0;
+ } else {
+ raw_inode->i_block[0] = 0;
+ raw_inode->i_block[1] =
+ cpu_to_le32(new_encode_dev(inode->i_rdev));
+ raw_inode->i_block[2] = 0;
+ }
+ } else if (!ext4_has_inline_data(inode)) {
+ for (block = 0; block < EXT4_N_BLOCKS; block++)
+ raw_inode->i_block[block] = ei->i_data[block];
+ }
+
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+ u64 ivers = ext4_inode_peek_iversion(inode);
+
+ raw_inode->i_disk_version = cpu_to_le32(ivers);
+ if (ei->i_extra_isize) {
+ if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+ raw_inode->i_version_hi =
+ cpu_to_le32(ivers >> 32);
+ raw_inode->i_extra_isize =
+ cpu_to_le16(ei->i_extra_isize);
+ }
+ }
+
+ if (i_projid != EXT4_DEF_PROJID &&
+ !ext4_has_feature_project(inode->i_sb))
+ err = err ?: -EFSCORRUPTED;
+
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
+ raw_inode->i_projid = cpu_to_le32(i_projid);
+
+ ext4_inode_csum_set(inode, raw_inode, ei);
+ return err;
+}
+
/*
* ext4_get_inode_loc returns with an extra refcount against the inode's
- * underlying buffer_head on success. If 'in_mem' is true, we have all
- * data in memory that is needed to recreate the on-disk version of this
- * inode.
+ * underlying buffer_head on success. If we pass 'inode' and it does not
+ * have in-inode xattr, we have all inode data in memory that is needed
+ * to recreate the on-disk version of this inode.
*/
static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
- struct ext4_iloc *iloc, int in_mem,
+ struct inode *inode, struct ext4_iloc *iloc,
ext4_fsblk_t *ret_block)
{
struct ext4_group_desc *gdp;
@@ -4287,7 +4431,7 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
* is the only valid inode in the block, we need not read the
* block.
*/
- if (in_mem) {
+ if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
struct buffer_head *bitmap_bh;
int i, start;
@@ -4315,8 +4459,13 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
}
brelse(bitmap_bh);
if (i == start + inodes_per_block) {
+ struct ext4_inode *raw_inode =
+ (struct ext4_inode *) (bh->b_data + iloc->offset);
+
/* all other inodes are free, so skip I/O */
memset(bh->b_data, 0, bh->b_size);
+ if (!ext4_test_inode_state(inode, EXT4_STATE_NEW))
+ ext4_fill_raw_inode(inode, raw_inode);
set_buffer_uptodate(bh);
unlock_buffer(bh);
goto has_buffer;
@@ -4377,7 +4526,7 @@ static int __ext4_get_inode_loc_noinmem(struct inode *inode,
ext4_fsblk_t err_blk;
int ret;
- ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, 0,
+ ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc,
&err_blk);
if (ret == -EIO)
@@ -4392,9 +4541,8 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
ext4_fsblk_t err_blk;
int ret;
- /* We have all inode data except xattrs in memory here. */
- ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc,
- !ext4_test_inode_state(inode, EXT4_STATE_XATTR), &err_blk);
+ ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc,
+ &err_blk);
if (ret == -EIO)
ext4_error_inode_block(inode, err_blk, EIO,
@@ -4407,7 +4555,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
struct ext4_iloc *iloc)
{
- return __ext4_get_inode_loc(sb, ino, iloc, 0, NULL);
+ return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL);
}
static bool ext4_should_enable_dax(struct inode *inode)
@@ -4528,13 +4676,6 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
else
inode_set_iversion_queried(inode, val);
}
-static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
-{
- if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
- return inode_peek_iversion_raw(inode);
- else
- return inode_peek_iversion(inode);
-}
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ext4_iget_flags flags, const char *function,
@@ -4855,51 +4996,6 @@ bad_inode:
return ERR_PTR(ret);
}
-static int ext4_inode_blocks_set(handle_t *handle,
- struct ext4_inode *raw_inode,
- struct ext4_inode_info *ei)
-{
- struct inode *inode = &(ei->vfs_inode);
- u64 i_blocks = READ_ONCE(inode->i_blocks);
- struct super_block *sb = inode->i_sb;
-
- if (i_blocks <= ~0U) {
- /*
- * i_blocks can be represented in a 32 bit variable
- * as multiple of 512 bytes
- */
- raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
- raw_inode->i_blocks_high = 0;
- ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
- return 0;
- }
-
- /*
- * This should never happen since sb->s_maxbytes should not have
- * allowed this, sb->s_maxbytes was set according to the huge_file
- * feature in ext4_fill_super().
- */
- if (!ext4_has_feature_huge_file(sb))
- return -EFSCORRUPTED;
-
- if (i_blocks <= 0xffffffffffffULL) {
- /*
- * i_blocks can be represented in a 48 bit variable
- * as multiple of 512 bytes
- */
- raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
- raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
- ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
- } else {
- ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
- /* i_block is stored in file system block size */
- i_blocks = i_blocks >> (inode->i_blkbits - 9);
- raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
- raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
- }
- return 0;
-}
-
static void __ext4_update_other_inode_time(struct super_block *sb,
unsigned long orig_ino,
unsigned long ino,
@@ -4975,11 +5071,8 @@ static int ext4_do_update_inode(handle_t *handle,
struct ext4_inode_info *ei = EXT4_I(inode);
struct buffer_head *bh = iloc->bh;
struct super_block *sb = inode->i_sb;
- int err = 0, block;
+ int err;
int need_datasync = 0, set_large_file = 0;
- uid_t i_uid;
- gid_t i_gid;
- projid_t i_projid;
spin_lock(&ei->i_raw_lock);
@@ -4990,97 +5083,15 @@ static int ext4_do_update_inode(handle_t *handle,
if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
- err = ext4_inode_blocks_set(handle, raw_inode, ei);
-
- raw_inode->i_mode = cpu_to_le16(inode->i_mode);
- i_uid = i_uid_read(inode);
- i_gid = i_gid_read(inode);
- i_projid = from_kprojid(&init_user_ns, ei->i_projid);
- if (!(test_opt(inode->i_sb, NO_UID32))) {
- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
- /*
- * Fix up interoperability with old kernels. Otherwise,
- * old inodes get re-used with the upper 16 bits of the
- * uid/gid intact.
- */
- if (ei->i_dtime && list_empty(&ei->i_orphan)) {
- raw_inode->i_uid_high = 0;
- raw_inode->i_gid_high = 0;
- } else {
- raw_inode->i_uid_high =
- cpu_to_le16(high_16_bits(i_uid));
- raw_inode->i_gid_high =
- cpu_to_le16(high_16_bits(i_gid));
- }
- } else {
- raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
- raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
- raw_inode->i_uid_high = 0;
- raw_inode->i_gid_high = 0;
- }
- raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
-
- EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
- EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
-
- raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
- raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
- if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
- raw_inode->i_file_acl_high =
- cpu_to_le16(ei->i_file_acl >> 32);
- raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
- if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) {
- ext4_isize_set(raw_inode, ei->i_disksize);
+ if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode))
need_datasync = 1;
- }
if (ei->i_disksize > 0x7fffffffULL) {
if (!ext4_has_feature_large_file(sb) ||
- EXT4_SB(sb)->s_es->s_rev_level ==
- cpu_to_le32(EXT4_GOOD_OLD_REV))
+ EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV))
set_large_file = 1;
}
- raw_inode->i_generation = cpu_to_le32(inode->i_generation);
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
- if (old_valid_dev(inode->i_rdev)) {
- raw_inode->i_block[0] =
- cpu_to_le32(old_encode_dev(inode->i_rdev));
- raw_inode->i_block[1] = 0;
- } else {
- raw_inode->i_block[0] = 0;
- raw_inode->i_block[1] =
- cpu_to_le32(new_encode_dev(inode->i_rdev));
- raw_inode->i_block[2] = 0;
- }
- } else if (!ext4_has_inline_data(inode)) {
- for (block = 0; block < EXT4_N_BLOCKS; block++)
- raw_inode->i_block[block] = ei->i_data[block];
- }
-
- if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
- u64 ivers = ext4_inode_peek_iversion(inode);
-
- raw_inode->i_disk_version = cpu_to_le32(ivers);
- if (ei->i_extra_isize) {
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
- raw_inode->i_version_hi =
- cpu_to_le32(ivers >> 32);
- raw_inode->i_extra_isize =
- cpu_to_le16(ei->i_extra_isize);
- }
- }
- if (i_projid != EXT4_DEF_PROJID &&
- !ext4_has_feature_project(inode->i_sb))
- err = err ?: -EFSCORRUPTED;
-
- if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
- EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
- raw_inode->i_projid = cpu_to_le32(i_projid);
-
- ext4_inode_csum_set(inode, raw_inode, ei);
+ err = ext4_fill_raw_inode(inode, raw_inode);
spin_unlock(&ei->i_raw_lock);
if (err) {
EXT4_ERROR_INODE(inode, "corrupted inode contents");
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 72bfac2d6dce..215b7068f548 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -6299,7 +6299,6 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
{
ext4_grpblk_t next, count, free_count;
void *bitmap;
- int ret = 0;
bitmap = e4b->bd_bitmap;
start = (e4b->bd_info->bb_first_free > start) ?
@@ -6314,10 +6313,10 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
next = mb_find_next_bit(bitmap, max + 1, start);
if ((next - start) >= minblocks) {
- ret = ext4_trim_extent(sb, start, next - start, e4b);
+ int ret = ext4_trim_extent(sb, start, next - start, e4b);
+
if (ret && ret != -EOPNOTSUPP)
break;
- ret = 0;
count += next - start;
}
free_count += next - start;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index da7698341d7d..52c9bd154122 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1439,7 +1439,7 @@ static bool ext4_match(struct inode *parent,
fname->hinfo.minor_hash !=
EXT4_DIRENT_MINOR_HASH(de)) {
- return 0;
+ return false;
}
}
return !ext4_ci_compare(parent, &cf, de->name,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index f038d578d8d8..9cb261714991 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -279,14 +279,14 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
io_end->inode = inode;
INIT_LIST_HEAD(&io_end->list);
INIT_LIST_HEAD(&io_end->list_vec);
- atomic_set(&io_end->count, 1);
+ refcount_set(&io_end->count, 1);
}
return io_end;
}
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
- if (atomic_dec_and_test(&io_end->count)) {
+ if (refcount_dec_and_test(&io_end->count)) {
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
list_empty(&io_end->list_vec)) {
ext4_release_io_end(io_end);
@@ -300,7 +300,7 @@ int ext4_put_io_end(ext4_io_end_t *io_end)
{
int err = 0;
- if (atomic_dec_and_test(&io_end->count)) {
+ if (refcount_dec_and_test(&io_end->count)) {
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
err = ext4_convert_unwritten_io_end_vec(io_end->handle,
io_end);
@@ -314,7 +314,7 @@ int ext4_put_io_end(ext4_io_end_t *io_end)
ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
{
- atomic_inc(&io_end->count);
+ refcount_inc(&io_end->count);
return io_end;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 88d5d274a868..4e33b5eca694 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -46,6 +46,7 @@
#include <linux/part_stat.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
+#include <linux/fsnotify.h>
#include "ext4.h"
#include "ext4_extents.h" /* Needed for trace points definition */
@@ -759,6 +760,8 @@ void __ext4_error(struct super_block *sb, const char *function,
sb->s_id, function, line, current->comm, &vaf);
va_end(args);
}
+ fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED);
+
ext4_handle_error(sb, force_ro, error, 0, block, function, line);
}
@@ -789,6 +792,8 @@ void __ext4_error_inode(struct inode *inode, const char *function,
current->comm, &vaf);
va_end(args);
}
+ fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED);
+
ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block,
function, line);
}
@@ -827,6 +832,8 @@ void __ext4_error_file(struct file *file, const char *function,
current->comm, path, &vaf);
va_end(args);
}
+ fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED);
+
ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block,
function, line);
}
@@ -894,6 +901,7 @@ void __ext4_std_error(struct super_block *sb, const char *function,
printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
sb->s_id, function, line, errstr);
}
+ fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED);
ext4_handle_error(sb, false, -errno, 0, 0, function, line);
}
@@ -1572,7 +1580,6 @@ static const struct fscrypt_operations ext4_cryptops = {
.set_context = ext4_set_context,
.get_dummy_policy = ext4_get_dummy_policy,
.empty_dir = ext4_empty_dir,
- .max_namelen = EXT4_NAME_LEN,
.has_stable_inodes = ext4_has_stable_inodes,
.get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits,
};
@@ -3263,9 +3270,9 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
struct super_block *sb = elr->lr_super;
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
ext4_group_t group = elr->lr_next_group;
- unsigned long timeout = 0;
unsigned int prefetch_ios = 0;
int ret = 0;
+ u64 start_time;
if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
elr->lr_next_group = ext4_mb_prefetch(sb, group,
@@ -3302,14 +3309,13 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
ret = 1;
if (!ret) {
- timeout = jiffies;
+ start_time = ktime_get_real_ns();
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
trace_ext4_lazy_itable_init(sb, group);
if (elr->lr_timeout == 0) {
- timeout = (jiffies - timeout) *
- EXT4_SB(elr->lr_super)->s_li_wait_mult;
- elr->lr_timeout = timeout;
+ elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
+ EXT4_SB(elr->lr_super)->s_li_wait_mult);
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
elr->lr_next_group = group + 1;
@@ -4474,7 +4480,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto cantfind_ext4;
/* check blocks count against device size */
- blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+ blocks_count = sb_bdev_nr_blocks(sb);
if (blocks_count && ext4_blocks_count(es) > blocks_count) {
ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
"exceeds size of device (%llu blocks)",
@@ -5727,10 +5733,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned long old_sb_flags, vfs_flags;
struct ext4_mount_options old_opts;
- int enable_quota = 0;
ext4_group_t g;
int err = 0;
#ifdef CONFIG_QUOTA
+ int enable_quota = 0;
int i, j;
char *to_free[EXT4_MAXQUOTAS];
#endif
@@ -5821,7 +5827,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
- ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user");
+ ext4_abort(sb, ESHUTDOWN, "Abort forced by user");
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
@@ -5935,7 +5941,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
err = -EROFS;
goto restore_opts;
}
+#ifdef CONFIG_QUOTA
enable_quota = 1;
+#endif
}
}
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 83e9bc0f91ff..f1693d45bb78 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -653,7 +653,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
return PTR_ERR(inode);
}
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err) {
iput(inode);
goto err_out;
@@ -705,9 +705,6 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi)
}
#ifdef CONFIG_QUOTA
- /* Needed for iput() to work correctly and not trash data */
- sbi->sb->s_flags |= SB_ACTIVE;
-
/*
* Turn on quotas which were not enabled for read-only mounts if
	 * the filesystem has the quota feature, so that they are updated correctly.
@@ -1162,7 +1159,8 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi)
if (!is_journalled_quota(sbi))
return false;
- down_write(&sbi->quota_sem);
+ if (!down_write_trylock(&sbi->quota_sem))
+ return true;
if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) {
ret = false;
} else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) {
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index c1bf9ad4c220..a0d5cfab75e4 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -7,6 +7,7 @@
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
+#include <linux/moduleparam.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/lzo.h>
@@ -881,6 +882,25 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index)
return is_page_in_cluster(cc, index);
}
+bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec,
+ int index, int nr_pages)
+{
+ unsigned long pgidx;
+ int i;
+
+ if (nr_pages - index < cc->cluster_size)
+ return false;
+
+ pgidx = pvec->pages[index]->index;
+
+ for (i = 1; i < cc->cluster_size; i++) {
+ if (pvec->pages[index + i]->index != pgidx + i)
+ return false;
+ }
+
+ return true;
+}
+
static bool cluster_has_invalid_data(struct compress_ctx *cc)
{
loff_t i_size = i_size_read(cc->inode);
@@ -1530,6 +1550,7 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
if (cluster_may_compress(cc)) {
err = f2fs_compress_pages(cc);
if (err == -EAGAIN) {
+ add_compr_block_stat(cc->inode, cc->cluster_size);
goto write;
} else if (err) {
f2fs_put_rpages_wbc(cc, wbc, true, 1);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index f4fd6c246c9a..9f754aaef558 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1465,10 +1465,15 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
struct extent_info ei = {0, };
block_t blkaddr;
unsigned int start_pgofs;
+ int bidx = 0;
if (!maxblocks)
return 0;
+ map->m_bdev = inode->i_sb->s_bdev;
+ map->m_multidev_dio =
+ f2fs_allow_multi_device_dio(F2FS_I_SB(inode), flag);
+
map->m_len = 0;
map->m_flags = 0;
@@ -1491,6 +1496,21 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
if (flag == F2FS_GET_BLOCK_DIO)
f2fs_wait_on_block_writeback_range(inode,
map->m_pblk, map->m_len);
+
+ if (map->m_multidev_dio) {
+ block_t blk_addr = map->m_pblk;
+
+ bidx = f2fs_target_device_index(sbi, map->m_pblk);
+
+ map->m_bdev = FDEV(bidx).bdev;
+ map->m_pblk -= FDEV(bidx).start_blk;
+ map->m_len = min(map->m_len,
+ FDEV(bidx).end_blk + 1 - map->m_pblk);
+
+ if (map->m_may_create)
+ f2fs_update_device_state(sbi, inode->i_ino,
+ blk_addr, map->m_len);
+ }
goto out;
}
@@ -1609,6 +1629,9 @@ next_block:
if (flag == F2FS_GET_BLOCK_PRE_AIO)
goto skip;
+ if (map->m_multidev_dio)
+ bidx = f2fs_target_device_index(sbi, blkaddr);
+
if (map->m_len == 0) {
/* preallocated unwritten block should be mapped for fiemap. */
if (blkaddr == NEW_ADDR)
@@ -1617,10 +1640,15 @@ next_block:
map->m_pblk = blkaddr;
map->m_len = 1;
+
+ if (map->m_multidev_dio)
+ map->m_bdev = FDEV(bidx).bdev;
} else if ((map->m_pblk != NEW_ADDR &&
blkaddr == (map->m_pblk + ofs)) ||
(map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) ||
flag == F2FS_GET_BLOCK_PRE_DIO) {
+ if (map->m_multidev_dio && map->m_bdev != FDEV(bidx).bdev)
+ goto sync_out;
ofs++;
map->m_len++;
} else {
@@ -1673,10 +1701,32 @@ skip:
sync_out:
- /* for hardware encryption, but to avoid potential issue in future */
- if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED)
+ if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) {
+ /*
+		 * for hardware encryption, but also to avoid potential
+		 * issues in the future
+ */
f2fs_wait_on_block_writeback_range(inode,
map->m_pblk, map->m_len);
+ invalidate_mapping_pages(META_MAPPING(sbi),
+ map->m_pblk, map->m_pblk);
+
+ if (map->m_multidev_dio) {
+ block_t blk_addr = map->m_pblk;
+
+ bidx = f2fs_target_device_index(sbi, map->m_pblk);
+
+ map->m_bdev = FDEV(bidx).bdev;
+ map->m_pblk -= FDEV(bidx).start_blk;
+
+ if (map->m_may_create)
+ f2fs_update_device_state(sbi, inode->i_ino,
+ blk_addr, map->m_len);
+
+ f2fs_bug_on(sbi, blk_addr + map->m_len >
+ FDEV(bidx).end_blk + 1);
+ }
+ }
if (flag == F2FS_GET_BLOCK_PRECACHE) {
if (map->m_flags & F2FS_MAP_MAPPED) {
@@ -1696,7 +1746,7 @@ unlock_out:
f2fs_balance_fs(sbi, dn.node_changed);
}
out:
- trace_f2fs_map_blocks(inode, map, err);
+ trace_f2fs_map_blocks(inode, map, create, flag, err);
return err;
}
@@ -1755,6 +1805,9 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags;
bh->b_size = blks_to_bytes(inode, map.m_len);
+
+ if (map.m_multidev_dio)
+ bh->b_bdev = map.m_bdev;
}
return err;
}
@@ -2989,6 +3042,10 @@ readd:
need_readd = false;
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
+ void *fsdata = NULL;
+ struct page *pagep;
+ int ret2;
+
ret = f2fs_init_compress_ctx(&cc);
if (ret) {
done = 1;
@@ -3007,27 +3064,23 @@ readd:
if (unlikely(f2fs_cp_error(sbi)))
goto lock_page;
- if (f2fs_cluster_is_empty(&cc)) {
- void *fsdata = NULL;
- struct page *pagep;
- int ret2;
+ if (!f2fs_cluster_is_empty(&cc))
+ goto lock_page;
- ret2 = f2fs_prepare_compress_overwrite(
+ ret2 = f2fs_prepare_compress_overwrite(
inode, &pagep,
page->index, &fsdata);
- if (ret2 < 0) {
- ret = ret2;
- done = 1;
- break;
- } else if (ret2 &&
- !f2fs_compress_write_end(inode,
- fsdata, page->index,
- 1)) {
- retry = 1;
- break;
- }
- } else {
- goto lock_page;
+ if (ret2 < 0) {
+ ret = ret2;
+ done = 1;
+ break;
+ } else if (ret2 &&
+ (!f2fs_compress_write_end(inode,
+ fsdata, page->index, 1) ||
+ !f2fs_all_cluster_page_loaded(&cc,
+ &pvec, i, nr_pages))) {
+ retry = 1;
+ break;
}
}
#endif
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index b339ae89c1ad..ce9fc9f13000 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -55,6 +55,7 @@ enum {
FAULT_DISCARD,
FAULT_WRITE_IO,
FAULT_SLAB_ALLOC,
+ FAULT_DQUOT_INIT,
FAULT_MAX,
};
@@ -561,6 +562,9 @@ enum {
#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */
+/* dirty segments threshold for triggering CP */
+#define DEFAULT_DIRTY_THRESHOLD 4
+
/* for in-memory extent cache entry */
#define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */
@@ -617,6 +621,7 @@ struct extent_tree {
F2FS_MAP_UNWRITTEN)
struct f2fs_map_blocks {
+ struct block_device *m_bdev; /* for multi-device dio */
block_t m_pblk;
block_t m_lblk;
unsigned int m_len;
@@ -625,6 +630,7 @@ struct f2fs_map_blocks {
pgoff_t *m_next_extent; /* point to next possible extent */
int m_seg_type;
bool m_may_create; /* indicate it is from write path */
+ bool m_multidev_dio; /* indicate it allows multi-device dio */
};
/* for flag in get_data_block */
@@ -1284,8 +1290,10 @@ enum {
};
enum {
- FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */
- FS_MODE_LFS, /* use lfs allocation only */
+ FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */
+ FS_MODE_LFS, /* use lfs allocation only */
+ FS_MODE_FRAGMENT_SEG, /* segment fragmentation mode */
+ FS_MODE_FRAGMENT_BLK, /* block fragmentation mode */
};
enum {
@@ -1728,12 +1736,15 @@ struct f2fs_sb_info {
/* For shrinker support */
struct list_head s_list;
+ struct mutex umount_mutex;
+ unsigned int shrinker_run_no;
+
+ /* For multi devices */
int s_ndevs; /* number of devices */
struct f2fs_dev_info *devs; /* for device list */
unsigned int dirty_device; /* for checkpoint data flush */
spinlock_t dev_lock; /* protect dirty_device */
- struct mutex umount_mutex;
- unsigned int shrinker_run_no;
+	bool aligned_blksize;			/* all devices have the same logical blksize */
/* For write statistics */
u64 sectors_written_start;
@@ -1756,6 +1767,9 @@ struct f2fs_sb_info {
unsigned long seq_file_ra_mul; /* multiplier for ra_pages of seq. files in fadvise */
+ int max_fragment_chunk; /* max chunk size for block fragmentation mode */
+ int max_fragment_hole; /* max hole size for block fragmentation mode */
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
struct kmem_cache *page_array_slab; /* page array entry */
unsigned int page_array_slab_size; /* default page array slab size */
@@ -3363,6 +3377,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
*/
int f2fs_inode_dirtied(struct inode *inode, bool sync);
void f2fs_inode_synced(struct inode *inode);
+int f2fs_dquot_initialize(struct inode *inode);
int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly);
int f2fs_quota_sync(struct super_block *sb, int type);
loff_t max_file_blocks(struct inode *inode);
@@ -3492,6 +3507,8 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
block_t old_blkaddr, block_t *new_blkaddr,
struct f2fs_summary *sum, int type,
struct f2fs_io_info *fio);
+void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino,
+ block_t blkaddr, unsigned int blkcnt);
void f2fs_wait_on_page_writeback(struct page *page,
enum page_type type, bool ordered, bool locked);
void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr);
@@ -3516,6 +3533,16 @@ unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi,
unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi,
unsigned int segno);
+#define DEF_FRAGMENT_SIZE 4
+#define MIN_FRAGMENT_SIZE 1
+#define MAX_FRAGMENT_SIZE 512
+
+static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi)
+{
+ return F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG ||
+ F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK;
+}
+
/*
* checkpoint.c
*/
@@ -4027,6 +4054,8 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed,
block_t blkaddr);
bool f2fs_cluster_is_empty(struct compress_ctx *cc);
bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
+bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec,
+ int index, int nr_pages);
bool f2fs_sanity_check_cluster(struct dnode_of_data *dn);
void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page);
int f2fs_write_multi_pages(struct compress_ctx *cc,
@@ -4152,8 +4181,7 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode)
if (!f2fs_compressed_file(inode))
return true;
- if (S_ISREG(inode->i_mode) &&
- (get_dirty_pages(inode) || atomic_read(&fi->i_compr_blocks)))
+ if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))
return false;
fi->i_flags &= ~F2FS_COMPR_FL;
@@ -4302,6 +4330,16 @@ static inline int block_unaligned_IO(struct inode *inode,
return align & blocksize_mask;
}
+static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi,
+ int flag)
+{
+ if (!f2fs_is_multi_device(sbi))
+ return false;
+ if (flag != F2FS_GET_BLOCK_DIO)
+ return false;
+ return sbi->aligned_blksize;
+}
+
static inline bool f2fs_force_buffered_io(struct inode *inode,
struct kiocb *iocb, struct iov_iter *iter)
{
@@ -4310,7 +4348,9 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
if (f2fs_post_read_required(inode))
return true;
- if (f2fs_is_multi_device(sbi))
+
+	/* disallow direct IO if any of the devices has an unaligned blksize */
+ if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
return true;
/*
* for blkzoned device, fallback direct IO to buffered IO, so
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 9c8ef33bd8d3..92ec2699bc85 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -786,7 +786,7 @@ int f2fs_truncate(struct inode *inode)
return -EIO;
}
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err)
return err;
@@ -916,7 +916,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return err;
if (is_quota_modification(inode, attr)) {
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err)
return err;
}
@@ -3020,7 +3020,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
}
f2fs_put_page(ipage, 1);
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err)
return err;
@@ -4276,7 +4276,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
size_t target_size = 0;
int err;
- if (iov_iter_fault_in_readable(from, iov_iter_count(from)))
+ if (fault_in_iov_iter_readable(from, iov_iter_count(from)))
set_inode_flag(inode, FI_NO_PREALLOC);
if ((iocb->ki_flags & IOCB_NOWAIT)) {
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 77391e3b7d68..a946ce0ead34 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -14,6 +14,7 @@
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/sched/signal.h>
+#include <linux/random.h>
#include "f2fs.h"
#include "node.h"
@@ -257,7 +258,9 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
p->max_search = sbi->max_victim_search;
/* let's select beginning hot/small space first in no_heap mode*/
- if (test_opt(sbi, NOHEAP) &&
+ if (f2fs_need_rand_seg(sbi))
+ p->offset = prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec);
+ else if (test_opt(sbi, NOHEAP) &&
(type == CURSEG_HOT_DATA || IS_NODESEG(type)))
p->offset = 0;
else
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 56a20d5c15da..ea08f0dfa1bd 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -192,7 +192,7 @@ int f2fs_convert_inline_inode(struct inode *inode)
f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb))
return 0;
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err)
return err;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 9141147b5bb0..0f8b2df3e1e0 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -527,7 +527,7 @@ make_now:
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- inode_nohighmem(inode);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
} else if (S_ISLNK(inode->i_mode)) {
if (file_is_encrypt(inode))
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
@@ -754,7 +754,7 @@ void f2fs_evict_inode(struct inode *inode)
if (inode->i_nlink || is_bad_inode(inode))
goto no_delete;
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err) {
err = 0;
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 9c528e583c9d..a728a0af9ce0 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -74,7 +74,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
if (err)
goto fail_drop;
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err)
goto fail_drop;
@@ -345,7 +345,7 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir,
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
- err = dquot_initialize(dir);
+ err = f2fs_dquot_initialize(dir);
if (err)
return err;
@@ -404,7 +404,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
F2FS_I(old_dentry->d_inode)->i_projid)))
return -EXDEV;
- err = dquot_initialize(dir);
+ err = f2fs_dquot_initialize(dir);
if (err)
return err;
@@ -460,7 +460,7 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
return 0;
}
- err = dquot_initialize(dir);
+ err = f2fs_dquot_initialize(dir);
if (err)
return err;
@@ -598,10 +598,10 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
goto fail;
}
- err = dquot_initialize(dir);
+ err = f2fs_dquot_initialize(dir);
if (err)
goto fail;
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err)
goto fail;
@@ -675,7 +675,7 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- err = dquot_initialize(dir);
+ err = f2fs_dquot_initialize(dir);
if (err)
return err;
@@ -746,7 +746,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
- err = dquot_initialize(dir);
+ err = f2fs_dquot_initialize(dir);
if (err)
return err;
@@ -757,7 +757,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- inode_nohighmem(inode);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
set_inode_flag(inode, FI_INC_LINK);
f2fs_lock_op(sbi);
@@ -803,7 +803,7 @@ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
- err = dquot_initialize(dir);
+ err = f2fs_dquot_initialize(dir);
if (err)
return err;
@@ -841,7 +841,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
struct inode *inode;
int err;
- err = dquot_initialize(dir);
+ err = f2fs_dquot_initialize(dir);
if (err)
return err;
@@ -965,16 +965,16 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
return err;
}
- err = dquot_initialize(old_dir);
+ err = f2fs_dquot_initialize(old_dir);
if (err)
goto out;
- err = dquot_initialize(new_dir);
+ err = f2fs_dquot_initialize(new_dir);
if (err)
goto out;
if (new_inode) {
- err = dquot_initialize(new_inode);
+ err = f2fs_dquot_initialize(new_inode);
if (err)
goto out;
}
@@ -1138,11 +1138,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
F2FS_I(new_dentry->d_inode)->i_projid)))
return -EXDEV;
- err = dquot_initialize(old_dir);
+ err = f2fs_dquot_initialize(old_dir);
if (err)
goto out;
- err = dquot_initialize(new_dir);
+ err = f2fs_dquot_initialize(new_dir);
if (err)
goto out;
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index e863136081b4..556fcd8457f3 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1443,6 +1443,7 @@ page_hit:
nid, nid_of_node(page), ino_of_node(page),
ofs_of_node(page), cpver_of_node(page),
next_blkaddr_of_node(page));
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
err = -EINVAL;
out_err:
ClearPageUptodate(page);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index ff14a6e5ac1c..18b98cf0465b 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -138,11 +138,6 @@ static inline bool excess_cached_nats(struct f2fs_sb_info *sbi)
return NM_I(sbi)->nat_cnt[TOTAL_NAT] >= DEF_NAT_CACHE_THRESHOLD;
}
-static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi)
-{
- return get_pages(sbi, F2FS_DIRTY_NODES) >= sbi->blocks_per_seg * 8;
-}
-
enum mem_type {
FREE_NIDS, /* indicates the free nid list */
NAT_ENTRIES, /* indicates the cached nat entry */
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 04655511d7f5..6a1b4668d933 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -81,7 +81,7 @@ static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi,
if (IS_ERR(inode))
return ERR_CAST(inode);
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err)
goto err_out;
@@ -203,7 +203,7 @@ retry:
goto out_put;
}
- err = dquot_initialize(einode);
+ err = f2fs_dquot_initialize(einode);
if (err) {
iput(einode);
goto out_put;
@@ -508,7 +508,7 @@ got_it:
if (IS_ERR(inode))
return PTR_ERR(inode);
- ret = dquot_initialize(inode);
+ ret = f2fs_dquot_initialize(inode);
if (ret) {
iput(inode);
return ret;
@@ -787,8 +787,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
}
#ifdef CONFIG_QUOTA
- /* Needed for iput() to work correctly and not trash data */
- sbi->sb->s_flags |= SB_ACTIVE;
/* Turn on quotas so that they are updated correctly */
quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY);
#endif
@@ -816,10 +814,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list);
if (!err)
f2fs_bug_on(sbi, !list_empty(&inode_list));
- else {
- /* restore s_flags to let iput() trash data */
- sbi->sb->s_flags = s_flags;
- }
+ else
+ f2fs_bug_on(sbi, sbi->sb->s_flags & SB_ACTIVE);
skip:
fix_curseg_write_pointer = !check_only || list_empty(&inode_list);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index a135d2247415..df9ed75f0b7a 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -15,6 +15,7 @@
#include <linux/timer.h>
#include <linux/freezer.h>
#include <linux/sched/signal.h>
+#include <linux/random.h>
#include "f2fs.h"
#include "segment.h"
@@ -529,6 +530,25 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
}
}
+static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi)
+{
+ int factor = rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2;
+ unsigned int dents = get_pages(sbi, F2FS_DIRTY_DENTS);
+ unsigned int qdata = get_pages(sbi, F2FS_DIRTY_QDATA);
+ unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES);
+ unsigned int meta = get_pages(sbi, F2FS_DIRTY_META);
+ unsigned int imeta = get_pages(sbi, F2FS_DIRTY_IMETA);
+ unsigned int threshold = sbi->blocks_per_seg * factor *
+ DEFAULT_DIRTY_THRESHOLD;
+ unsigned int global_threshold = threshold * 3 / 2;
+
+ if (dents >= threshold || qdata >= threshold ||
+ nodes >= threshold || meta >= threshold ||
+ imeta >= threshold)
+ return true;
+ return dents + qdata + nodes + meta + imeta > global_threshold;
+}
+
void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
{
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
@@ -547,8 +567,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
else
f2fs_build_free_nids(sbi, false, false);
- if (excess_dirty_nats(sbi) || excess_dirty_nodes(sbi) ||
- excess_prefree_segs(sbi))
+ if (excess_dirty_nats(sbi) || excess_dirty_threshold(sbi) ||
+ excess_prefree_segs(sbi) || !f2fs_space_for_roll_forward(sbi))
goto do_sync;
/* there is background inflight IO or foreground operation recently */
@@ -561,7 +581,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
goto do_sync;
/* checkpoint is the only way to shrink partial cached entries */
- if (f2fs_available_free_memory(sbi, NAT_ENTRIES) ||
+ if (f2fs_available_free_memory(sbi, NAT_ENTRIES) &&
f2fs_available_free_memory(sbi, INO_ENTRIES))
return;
@@ -2630,6 +2650,8 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
unsigned short seg_type = curseg->seg_type;
sanity_check_seg_type(sbi, seg_type);
+ if (f2fs_need_rand_seg(sbi))
+ return prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec);
	/* if segs_per_sec is larger than 1, we need to keep the original policy. */
if (__is_large_section(sbi))
@@ -2681,6 +2703,9 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
curseg->next_segno = segno;
reset_curseg(sbi, type, 1);
curseg->alloc_type = LFS;
+ if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK)
+ curseg->fragment_remained_chunk =
+ prandom_u32() % sbi->max_fragment_chunk + 1;
}
static int __next_free_blkoff(struct f2fs_sb_info *sbi,
@@ -2707,12 +2732,22 @@ static int __next_free_blkoff(struct f2fs_sb_info *sbi,
static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
struct curseg_info *seg)
{
- if (seg->alloc_type == SSR)
+ if (seg->alloc_type == SSR) {
seg->next_blkoff =
__next_free_blkoff(sbi, seg->segno,
seg->next_blkoff + 1);
- else
+ } else {
seg->next_blkoff++;
+ if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) {
+			/* To allocate block chunks in different sizes, use a random number */
+ if (--seg->fragment_remained_chunk <= 0) {
+ seg->fragment_remained_chunk =
+ prandom_u32() % sbi->max_fragment_chunk + 1;
+ seg->next_blkoff +=
+ prandom_u32() % sbi->max_fragment_hole + 1;
+ }
+ }
+ }
}
bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno)
@@ -3485,24 +3520,30 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
up_read(&SM_I(sbi)->curseg_lock);
}
-static void update_device_state(struct f2fs_io_info *fio)
+void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino,
+ block_t blkaddr, unsigned int blkcnt)
{
- struct f2fs_sb_info *sbi = fio->sbi;
- unsigned int devidx;
-
if (!f2fs_is_multi_device(sbi))
return;
- devidx = f2fs_target_device_index(sbi, fio->new_blkaddr);
+ while (1) {
+ unsigned int devidx = f2fs_target_device_index(sbi, blkaddr);
+ unsigned int blks = FDEV(devidx).end_blk - blkaddr + 1;
- /* update device state for fsync */
- f2fs_set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO);
+ /* update device state for fsync */
+ f2fs_set_dirty_device(sbi, ino, devidx, FLUSH_INO);
- /* update device state for checkpoint */
- if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) {
- spin_lock(&sbi->dev_lock);
- f2fs_set_bit(devidx, (char *)&sbi->dirty_device);
- spin_unlock(&sbi->dev_lock);
+ /* update device state for checkpoint */
+ if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) {
+ spin_lock(&sbi->dev_lock);
+ f2fs_set_bit(devidx, (char *)&sbi->dirty_device);
+ spin_unlock(&sbi->dev_lock);
+ }
+
+ if (blkcnt <= blks)
+ break;
+ blkcnt -= blks;
+ blkaddr += blks;
}
}
@@ -3529,7 +3570,7 @@ reallocate:
goto reallocate;
}
- update_device_state(fio);
+ f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1);
if (keep_order)
up_read(&fio->sbi->io_order_lock);
@@ -3611,6 +3652,9 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
goto drop_bio;
}
+ invalidate_mapping_pages(META_MAPPING(sbi),
+ fio->new_blkaddr, fio->new_blkaddr);
+
stat_inc_inplace_blocks(fio->sbi);
if (fio->bio && !(SM_I(sbi)->ipu_policy & (1 << F2FS_IPU_NOCACHE)))
@@ -3618,7 +3662,8 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
else
err = f2fs_submit_page_bio(fio);
if (!err) {
- update_device_state(fio);
+ f2fs_update_device_state(fio->sbi, fio->ino,
+ fio->new_blkaddr, 1);
f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 89fff258727d..46fde9f3f28e 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -314,6 +314,7 @@ struct curseg_info {
unsigned short next_blkoff; /* next block offset to write */
unsigned int zone; /* current zone number */
unsigned int next_segno; /* preallocated segment */
+	int	fragment_remained_chunk;	/* remaining block size in a chunk for block fragmentation mode */
bool inited; /* indicate inmem log is inited */
};
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 78ebc306ee2b..7960ce066c1b 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -58,6 +58,7 @@ const char *f2fs_fault_name[FAULT_MAX] = {
[FAULT_DISCARD] = "discard error",
[FAULT_WRITE_IO] = "write IO error",
[FAULT_SLAB_ALLOC] = "slab alloc",
+ [FAULT_DQUOT_INIT] = "dquot initialize",
};
void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
@@ -817,6 +818,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE;
} else if (!strcmp(name, "lfs")) {
F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS;
+ } else if (!strcmp(name, "fragment:segment")) {
+ F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_SEG;
+ } else if (!strcmp(name, "fragment:block")) {
+ F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_BLK;
} else {
kfree(name);
return -EINVAL;
@@ -1292,7 +1297,7 @@ default_check:
	/* Don't pass down write hints if the number of active logs is less
	 * than NR_CURSEG_PERSIST_TYPE.
*/
- if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE)
+ if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_PERSIST_TYPE)
F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) {
@@ -1896,6 +1901,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, "adaptive");
else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS)
seq_puts(seq, "lfs");
+ else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG)
+ seq_puts(seq, "fragment:segment");
+ else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK)
+ seq_puts(seq, "fragment:block");
seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs);
if (test_opt(sbi, RESERVE_ROOT))
seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u",
@@ -2491,6 +2500,16 @@ retry:
return len - towrite;
}
+int f2fs_dquot_initialize(struct inode *inode)
+{
+ if (time_to_inject(F2FS_I_SB(inode), FAULT_DQUOT_INIT)) {
+ f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_DQUOT_INIT);
+ return -ESRCH;
+ }
+
+ return dquot_initialize(inode);
+}
+
static struct dquot **f2fs_get_dquots(struct inode *inode)
{
return F2FS_I(inode)->i_dquot;
@@ -2875,6 +2894,11 @@ static const struct quotactl_ops f2fs_quotactl_ops = {
.get_nextdqblk = dquot_get_next_dqblk,
};
#else
+int f2fs_dquot_initialize(struct inode *inode)
+{
+ return 0;
+}
+
int f2fs_quota_sync(struct super_block *sb, int type)
{
return 0;
@@ -2976,7 +3000,6 @@ static const struct fscrypt_operations f2fs_cryptops = {
.set_context = f2fs_set_context,
.get_dummy_policy = f2fs_get_dummy_policy,
.empty_dir = f2fs_empty_dir,
- .max_namelen = F2FS_NAME_LEN,
.has_stable_inodes = f2fs_has_stable_inodes,
.get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits,
.get_num_devices = f2fs_get_num_devices,
@@ -3487,7 +3510,7 @@ skip_cross:
NR_CURSEG_PERSIST_TYPE + nat_bits_blocks >= blocks_per_seg)) {
f2fs_warn(sbi, "Insane cp_payload: %u, nat_bits_blocks: %u)",
cp_payload, nat_bits_blocks);
- return -EFSCORRUPTED;
+ return 1;
}
if (unlikely(f2fs_cp_error(sbi))) {
@@ -3523,6 +3546,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
sbi->migration_granularity = sbi->segs_per_sec;
sbi->seq_file_ra_mul = MIN_RA_MUL;
+ sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE;
+ sbi->max_fragment_hole = DEF_FRAGMENT_SIZE;
sbi->dir_level = DEF_DIR_LEVEL;
sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL;
@@ -3747,6 +3772,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
unsigned int max_devices = MAX_DEVICES;
+ unsigned int logical_blksize;
int i;
/* Initialize single device information */
@@ -3767,6 +3793,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
if (!sbi->devs)
return -ENOMEM;
+ logical_blksize = bdev_logical_block_size(sbi->sb->s_bdev);
+ sbi->aligned_blksize = true;
+
for (i = 0; i < max_devices; i++) {
if (i > 0 && !RDEV(i).path[0])
@@ -3803,6 +3832,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
/* to release errored devices */
sbi->s_ndevs = i + 1;
+ if (logical_blksize != bdev_logical_block_size(FDEV(i).bdev))
+ sbi->aligned_blksize = false;
+
#ifdef CONFIG_BLK_DEV_ZONED
if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM &&
!f2fs_sb_has_blkzoned(sbi)) {
@@ -4352,6 +4384,8 @@ free_node_inode:
free_stats:
f2fs_destroy_stats(sbi);
free_nm:
+ /* stop discard thread before destroying node manager */
+ f2fs_stop_discard_thread(sbi);
f2fs_destroy_node_manager(sbi);
free_sm:
f2fs_destroy_segment_manager(sbi);
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index a32fe31c33b8..7d289249cd7e 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -196,7 +196,7 @@ static ssize_t encoding_show(struct f2fs_attr *a,
struct super_block *sb = sbi->sb;
if (f2fs_sb_has_casefold(sbi))
- return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n",
+ return sysfs_emit(buf, "%s (%d.%d.%d)\n",
sb->s_encoding->charset,
(sb->s_encoding->version >> 16) & 0xff,
(sb->s_encoding->version >> 8) & 0xff,
@@ -245,7 +245,7 @@ static ssize_t avg_vblocks_show(struct f2fs_attr *a,
static ssize_t main_blkaddr_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)MAIN_BLKADDR(sbi));
}
@@ -551,6 +551,22 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "max_fragment_chunk")) {
+ if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE)
+ sbi->max_fragment_chunk = t;
+ else
+ return -EINVAL;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "max_fragment_hole")) {
+ if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE)
+ sbi->max_fragment_hole = t;
+ else
+ return -EINVAL;
+ return count;
+ }
+
*ui = (unsigned int)t;
return count;
@@ -781,6 +797,8 @@ F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, seq_file_ra_mul, seq_file_ra_mul);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_segment_mode, gc_segment_mode);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_chunk, max_fragment_chunk);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_hole, max_fragment_hole);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -859,6 +877,8 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(seq_file_ra_mul),
ATTR_LIST(gc_segment_mode),
ATTR_LIST(gc_reclaimed_segments),
+ ATTR_LIST(max_fragment_chunk),
+ ATTR_LIST(max_fragment_hole),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 03549b5ba204..fe5acdccaae1 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -136,7 +136,7 @@ static int f2fs_begin_enable_verity(struct file *filp)
* here and not rely on ->open() doing it. This must be done before
* evicting the inline data.
*/
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err)
return err;
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 1d2d29dcd41c..e348f33bcb2b 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -773,7 +773,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
- err = dquot_initialize(inode);
+ err = f2fs_dquot_initialize(inode);
if (err)
return err;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index de0c9b013a85..a6f1c6d426d1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1536,14 +1536,11 @@ static int fat_read_static_bpb(struct super_block *sb,
struct fat_bios_param_block *bpb)
{
static const char *notdos1x = "This doesn't look like a DOS 1.x volume";
-
+ sector_t bd_sects = bdev_nr_sectors(sb->s_bdev);
struct fat_floppy_defaults *fdefaults = NULL;
int error = -EINVAL;
- sector_t bd_sects;
unsigned i;
- bd_sects = i_size_read(sb->s_bdev->bd_inode) / SECTOR_SIZE;
-
/* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */
if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) {
if (!silent)
@@ -1943,10 +1940,8 @@ int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2)
ret = writeback_inode(i1);
if (!ret && i2)
ret = writeback_inode(i2);
- if (!ret) {
- struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
- ret = filemap_flush(mapping);
- }
+ if (!ret)
+ ret = sync_blockdev_nowait(sb->s_bdev);
return ret;
}
EXPORT_SYMBOL_GPL(fat_flush_inodes);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 81ec192ce067..67f0e88eed01 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -566,7 +566,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
return;
- isw = kzalloc(sizeof(*isw) + 2 * sizeof(struct inode *), GFP_ATOMIC);
+ isw = kzalloc(struct_size(isw, inodes, 2), GFP_ATOMIC);
if (!isw)
return;
@@ -624,8 +624,8 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
int nr;
bool restart = false;
- isw = kzalloc(sizeof(*isw) + WB_MAX_INODES_PER_ISW *
- sizeof(struct inode *), GFP_KERNEL);
+ isw = kzalloc(struct_size(isw, inodes, WB_MAX_INODES_PER_ISW),
+ GFP_KERNEL);
if (!isw)
return restart;
@@ -1893,7 +1893,8 @@ static long writeback_sb_inodes(struct super_block *sb,
* unplug, so get our IOs out the door before we
* give up the CPU.
*/
- blk_flush_plug(current);
+ if (current->plug)
+ blk_flush_plug(current->plug, false);
cond_resched();
}
@@ -2291,7 +2292,7 @@ void wakeup_flusher_threads(enum wb_reason reason)
* If we are expecting writeback progress we must submit plugged IO.
*/
if (blk_needs_flush_plug(current))
- blk_schedule_flush_plug(current);
+ blk_flush_plug(current->plug, true);
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 281d79f8b3d3..713818d74de6 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -732,11 +732,8 @@ static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from)
ssize_t ret;
ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
- if (ret < 0)
- return ret;
- fuse_invalidate_attr(inode);
- fuse_write_update_size(inode, iocb->ki_pos);
+ fuse_write_update_attr(inode, iocb->ki_pos, ret);
return ret;
}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index dde341a6388a..79f7eda49e06 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -756,7 +756,7 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
{
unsigned ncpy = min(*size, cs->len);
if (val) {
- void *pgaddr = kmap_atomic(cs->pg);
+ void *pgaddr = kmap_local_page(cs->pg);
void *buf = pgaddr + cs->offset;
if (cs->write)
@@ -764,7 +764,7 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
else
memcpy(*val, buf, ncpy);
- kunmap_atomic(pgaddr);
+ kunmap_local(pgaddr);
*val += ncpy;
}
*size -= ncpy;
@@ -847,6 +847,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
replace_page_cache_page(oldpage, newpage);
+ /*
+ * Release while we have extra ref on stolen page. Otherwise
+ * anon_pipe_buf_release() might think the page can be reused.
+ */
+ pipe_buf_release(cs->pipe, buf);
+
get_page(newpage);
if (!(buf->flags & PIPE_BUF_FLAG_LRU))
@@ -949,10 +955,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
}
}
if (page) {
- void *mapaddr = kmap_atomic(page);
+ void *mapaddr = kmap_local_page(page);
void *buf = mapaddr + offset;
offset += fuse_copy_do(cs, &buf, &count);
- kunmap_atomic(mapaddr);
+ kunmap_local(mapaddr);
} else
offset += fuse_copy_do(cs, NULL, &count);
}
@@ -1591,7 +1597,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
end = outarg.offset + outarg.size;
if (end > file_size) {
file_size = end;
- fuse_write_update_size(inode, file_size);
+ fuse_write_update_attr(inode, file_size, outarg.size);
}
num = outarg.size;
@@ -2031,8 +2037,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
pipe_lock(pipe);
out_free:
- for (idx = 0; idx < nbuf; idx++)
- pipe_buf_release(pipe, &bufs[idx]);
+ for (idx = 0; idx < nbuf; idx++) {
+ struct pipe_buffer *buf = &bufs[idx];
+
+ if (buf->ops)
+ pipe_buf_release(pipe, buf);
+ }
pipe_unlock(pipe);
kvfree(bufs);
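The kmap_atomic()/kunmap_atomic() pairs in fuse_copy_do() and fuse_copy_page() above become kmap_local_page()/kunmap_local(), which map the page for the local CPU without disabling preemption or page faults. A minimal sketch of the conversion with a hypothetical helper:

#include <linux/highmem.h>
#include <linux/string.h>

/* Hypothetical helper, not from this patch: copy into a (possibly highmem) page. */
static void copy_to_page_sketch(struct page *page, unsigned int offset,
				const void *src, size_t len)
{
	void *addr = kmap_local_page(page);	/* was: kmap_atomic(page) */

	memcpy(addr + offset, src, len);
	kunmap_local(addr);			/* was: kunmap_atomic(addr) */
}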
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d9b977c0f38d..0654bfedcbb0 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -116,7 +116,7 @@ u64 entry_attr_timeout(struct fuse_entry_out *o)
return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
}
-static void fuse_invalidate_attr_mask(struct inode *inode, u32 mask)
+void fuse_invalidate_attr_mask(struct inode *inode, u32 mask)
{
set_mask_bits(&get_fuse_inode(inode)->inval_mask, 0, mask);
}
@@ -738,14 +738,51 @@ static int fuse_symlink(struct user_namespace *mnt_userns, struct inode *dir,
return create_new_entry(fm, &args, dir, entry, S_IFLNK);
}
-void fuse_update_ctime(struct inode *inode)
+void fuse_flush_time_update(struct inode *inode)
+{
+ int err = sync_inode_metadata(inode, 1);
+
+ mapping_set_error(inode->i_mapping, err);
+}
+
+static void fuse_update_ctime_in_cache(struct inode *inode)
{
if (!IS_NOCMTIME(inode)) {
inode->i_ctime = current_time(inode);
mark_inode_dirty_sync(inode);
+ fuse_flush_time_update(inode);
}
}
+void fuse_update_ctime(struct inode *inode)
+{
+ fuse_invalidate_attr_mask(inode, STATX_CTIME);
+ fuse_update_ctime_in_cache(inode);
+}
+
+static void fuse_entry_unlinked(struct dentry *entry)
+{
+ struct inode *inode = d_inode(entry);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ /*
+ * If i_nlink == 0 then unlink doesn't make sense, yet this can
+ * happen if userspace filesystem is careless. It would be
+ * difficult to enforce correct nlink usage so just ignore this
+ * condition here
+ */
+ if (S_ISDIR(inode->i_mode))
+ clear_nlink(inode);
+ else if (inode->i_nlink > 0)
+ drop_nlink(inode);
+ spin_unlock(&fi->lock);
+ fuse_invalidate_entry_cache(entry);
+ fuse_update_ctime(inode);
+}
+
static int fuse_unlink(struct inode *dir, struct dentry *entry)
{
int err;
@@ -762,24 +799,8 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
args.in_args[0].value = entry->d_name.name;
err = fuse_simple_request(fm, &args);
if (!err) {
- struct inode *inode = d_inode(entry);
- struct fuse_inode *fi = get_fuse_inode(inode);
-
- spin_lock(&fi->lock);
- fi->attr_version = atomic64_inc_return(&fm->fc->attr_version);
- /*
- * If i_nlink == 0 then unlink doesn't make sense, yet this can
- * happen if userspace filesystem is careless. It would be
- * difficult to enforce correct nlink usage so just ignore this
- * condition here
- */
- if (inode->i_nlink > 0)
- drop_nlink(inode);
- spin_unlock(&fi->lock);
- fuse_invalidate_attr(inode);
fuse_dir_changed(dir);
- fuse_invalidate_entry_cache(entry);
- fuse_update_ctime(inode);
+ fuse_entry_unlinked(entry);
} else if (err == -EINTR)
fuse_invalidate_entry(entry);
return err;
@@ -801,9 +822,8 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
args.in_args[0].value = entry->d_name.name;
err = fuse_simple_request(fm, &args);
if (!err) {
- clear_nlink(d_inode(entry));
fuse_dir_changed(dir);
- fuse_invalidate_entry_cache(entry);
+ fuse_entry_unlinked(entry);
} else if (err == -EINTR)
fuse_invalidate_entry(entry);
return err;
@@ -833,24 +853,18 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
err = fuse_simple_request(fm, &args);
if (!err) {
/* ctime changes */
- fuse_invalidate_attr(d_inode(oldent));
fuse_update_ctime(d_inode(oldent));
- if (flags & RENAME_EXCHANGE) {
- fuse_invalidate_attr(d_inode(newent));
+ if (flags & RENAME_EXCHANGE)
fuse_update_ctime(d_inode(newent));
- }
fuse_dir_changed(olddir);
if (olddir != newdir)
fuse_dir_changed(newdir);
/* newent will end up negative */
- if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) {
- fuse_invalidate_attr(d_inode(newent));
- fuse_invalidate_entry_cache(newent);
- fuse_update_ctime(d_inode(newent));
- }
+ if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent))
+ fuse_entry_unlinked(newent);
} else if (err == -EINTR) {
/* If request was interrupted, DEITY only knows if the
rename actually took place. If the invalidation
@@ -916,25 +930,11 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
args.in_args[1].size = newent->d_name.len + 1;
args.in_args[1].value = newent->d_name.name;
err = create_new_entry(fm, &args, newdir, newent, inode->i_mode);
- /* Contrary to "normal" filesystems it can happen that link
- makes two "logical" inodes point to the same "physical"
- inode. We invalidate the attributes of the old one, so it
- will reflect changes in the backing inode (link count,
- etc.)
- */
- if (!err) {
- struct fuse_inode *fi = get_fuse_inode(inode);
-
- spin_lock(&fi->lock);
- fi->attr_version = atomic64_inc_return(&fm->fc->attr_version);
- if (likely(inode->i_nlink < UINT_MAX))
- inc_nlink(inode);
- spin_unlock(&fi->lock);
- fuse_invalidate_attr(inode);
- fuse_update_ctime(inode);
- } else if (err == -EINTR) {
+ if (!err)
+ fuse_update_ctime_in_cache(inode);
+ else if (err == -EINTR)
fuse_invalidate_attr(inode);
- }
+
return err;
}
@@ -944,15 +944,6 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
unsigned int blkbits;
struct fuse_conn *fc = get_fuse_conn(inode);
- /* see the comment in fuse_change_attributes() */
- if (fc->writeback_cache && S_ISREG(inode->i_mode)) {
- attr->size = i_size_read(inode);
- attr->mtime = inode->i_mtime.tv_sec;
- attr->mtimensec = inode->i_mtime.tv_nsec;
- attr->ctime = inode->i_ctime.tv_sec;
- attr->ctimensec = inode->i_ctime.tv_nsec;
- }
-
stat->dev = inode->i_sb->s_dev;
stat->ino = attr->ino;
stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
@@ -1030,12 +1021,14 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file,
struct fuse_inode *fi = get_fuse_inode(inode);
int err = 0;
bool sync;
+ u32 inval_mask = READ_ONCE(fi->inval_mask);
+ u32 cache_mask = fuse_get_cache_mask(inode);
if (flags & AT_STATX_FORCE_SYNC)
sync = true;
else if (flags & AT_STATX_DONT_SYNC)
sync = false;
- else if (request_mask & READ_ONCE(fi->inval_mask))
+ else if (request_mask & inval_mask & ~cache_mask)
sync = true;
else
sync = time_before64(fi->i_time, get_jiffies_64());
@@ -1052,11 +1045,9 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file,
return err;
}
-int fuse_update_attributes(struct inode *inode, struct file *file)
+int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask)
{
- /* Do *not* need to get atime for internal purposes */
- return fuse_update_get_attr(inode, file, NULL,
- STATX_BASIC_STATS & ~STATX_ATIME, 0);
+ return fuse_update_get_attr(inode, file, NULL, mask, 0);
}
int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
@@ -1071,7 +1062,7 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
if (!parent)
return -ENOENT;
- inode_lock(parent);
+ inode_lock_nested(parent, I_MUTEX_PARENT);
if (!S_ISDIR(parent->i_mode))
goto unlock;
@@ -1561,10 +1552,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
struct fuse_setattr_in inarg;
struct fuse_attr_out outarg;
bool is_truncate = false;
- bool is_wb = fc->writeback_cache;
+ bool is_wb = fc->writeback_cache && S_ISREG(inode->i_mode);
loff_t oldsize;
int err;
- bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode);
+ bool trust_local_cmtime = is_wb;
bool fault_blocked = false;
if (!fc->default_permissions)
@@ -1608,7 +1599,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
}
/* Flush dirty data/metadata before non-truncate SETATTR */
- if (is_wb && S_ISREG(inode->i_mode) &&
+ if (is_wb &&
attr->ia_valid &
(ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET |
ATTR_TIMES_SET)) {
@@ -1676,10 +1667,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
}
fuse_change_attributes_common(inode, &outarg.attr,
- attr_timeout(&outarg));
+ attr_timeout(&outarg),
+ fuse_get_cache_mask(inode));
oldsize = inode->i_size;
/* see the comment in fuse_change_attributes() */
- if (!is_wb || is_truncate || !S_ISREG(inode->i_mode))
+ if (!is_wb || is_truncate)
i_size_write(inode, outarg.attr.size);
if (is_truncate) {
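One small change above, in fuse_reverse_inval_entry(), takes the parent directory lock with inode_lock_nested(parent, I_MUTEX_PARENT) so lockdep knows this is the parent side of a parent/child lock ordering. A hedged sketch of that annotation pattern (illustrative names, not fuse code):

#include <linux/fs.h>

/* Illustrative only: lock a directory and one of its children in the order
 * the VFS expects, telling lockdep which lock plays which role. */
static void lock_parent_then_child_sketch(struct inode *parent, struct inode *child)
{
	inode_lock_nested(parent, I_MUTEX_PARENT);
	inode_lock_nested(child, I_MUTEX_CHILD);
	/* ... work on both inodes ... */
	inode_unlock(child);
	inode_unlock(parent);
}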
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 11404f8c21c7..9d6c5f6361f7 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -211,9 +211,8 @@ void fuse_finish_open(struct inode *inode, struct file *file)
i_size_write(inode, 0);
spin_unlock(&fi->lock);
truncate_pagecache(inode, 0);
- fuse_invalidate_attr(inode);
- if (fc->writeback_cache)
- file_update_time(file);
+ file_update_time(file);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
} else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
invalidate_inode_pages2(inode->i_mapping);
}
@@ -339,12 +338,6 @@ static int fuse_open(struct inode *inode, struct file *file)
static int fuse_release(struct inode *inode, struct file *file)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
-
- /* see fuse_vma_close() for !writeback_cache case */
- if (fc->writeback_cache)
- write_inode_now(inode, 1);
-
fuse_release_common(file, false);
/* return value is ignored by VFS */
@@ -483,6 +476,9 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (fuse_is_bad(inode))
return -EIO;
+ if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache)
+ return 0;
+
err = write_inode_now(inode, 1);
if (err)
return err;
@@ -521,7 +517,7 @@ inval_attr_out:
* enabled, i_blocks from cached attr may not be accurate.
*/
if (!err && fm->fc->writeback_cache)
- fuse_invalidate_attr(inode);
+ fuse_invalidate_attr_mask(inode, STATX_BLOCKS);
return err;
}
@@ -687,7 +683,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
spin_unlock(&fi->lock);
}
- io->iocb->ki_complete(io->iocb, res, 0);
+ io->iocb->ki_complete(io->iocb, res);
}
kref_put(&io->refcnt, fuse_io_release);
@@ -793,7 +789,7 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
struct fuse_inode *fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
- if (attr_ver == fi->attr_version && size < inode->i_size &&
+ if (attr_ver >= fi->attr_version && size < inode->i_size &&
!test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, size);
@@ -1003,7 +999,7 @@ static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (fc->auto_inval_data ||
(iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
int err;
- err = fuse_update_attributes(inode, iocb->ki_filp);
+ err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE);
if (err)
return err;
}
@@ -1072,7 +1068,7 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
return err ?: ia->write.out.size;
}
-bool fuse_write_update_size(struct inode *inode, loff_t pos)
+bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -1080,12 +1076,14 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos)
spin_lock(&fi->lock);
fi->attr_version = atomic64_inc_return(&fc->attr_version);
- if (pos > inode->i_size) {
+ if (written > 0 && pos > inode->i_size) {
i_size_write(inode, pos);
ret = true;
}
spin_unlock(&fi->lock);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
+
return ret;
}
@@ -1164,7 +1162,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
again:
err = -EFAULT;
- if (iov_iter_fault_in_readable(ii, bytes))
+ if (fault_in_iov_iter_readable(ii, bytes))
break;
err = -ENOMEM;
@@ -1268,11 +1266,8 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
kfree(ap->pages);
} while (!err && iov_iter_count(ii));
- if (res > 0)
- fuse_write_update_size(inode, pos);
-
+ fuse_write_update_attr(inode, pos, res);
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
- fuse_invalidate_attr(inode);
return res > 0 ? res : err;
}
@@ -1290,7 +1285,8 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (fc->writeback_cache) {
/* Update size (EOF optimization) and mode (SUID clearing) */
- err = fuse_update_attributes(mapping->host, file);
+ err = fuse_update_attributes(mapping->host, file,
+ STATX_SIZE | STATX_MODE);
if (err)
return err;
@@ -1451,7 +1447,6 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
if (!ia)
return -ENOMEM;
- ia->io = io;
if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
if (!write)
inode_lock(inode);
@@ -1561,11 +1556,9 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
} else {
res = fuse_direct_io(&io, from, &iocb->ki_pos,
FUSE_DIO_WRITE);
+ fuse_write_update_attr(inode, iocb->ki_pos, res);
}
}
- fuse_invalidate_attr(inode);
- if (res > 0)
- fuse_write_update_size(inode, iocb->ki_pos);
inode_unlock(inode);
return res;
@@ -1776,7 +1769,7 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
* is enabled, we trust local ctime/mtime.
*/
if (!fc->writeback_cache)
- fuse_invalidate_attr(inode);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY);
spin_lock(&fi->lock);
rb_erase(&wpa->writepages_entry, &fi->writepages);
while (wpa->next) {
@@ -1822,14 +1815,13 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi)
{
- struct fuse_file *ff = NULL;
+ struct fuse_file *ff;
spin_lock(&fi->lock);
- if (!list_empty(&fi->write_files)) {
- ff = list_entry(fi->write_files.next, struct fuse_file,
- write_entry);
+ ff = list_first_entry_or_null(&fi->write_files, struct fuse_file,
+ write_entry);
+ if (ff)
fuse_file_get(ff);
- }
spin_unlock(&fi->lock);
return ff;
@@ -1848,6 +1840,17 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
struct fuse_file *ff;
int err;
+ /*
+ * Inode is always written before the last reference is dropped and
+ * hence this should not be reached from reclaim.
+ *
+ * Writing back the inode from reclaim can deadlock if the request
+ * processing itself needs an allocation. Allocations triggering
+ * reclaim while serving a request can't be prevented, because it can
+ * involve any number of unrelated userspace processes.
+ */
+ WARN_ON(wbc->for_reclaim);
+
ff = __fuse_write_file_get(fi);
err = fuse_flush_times(inode, ff);
if (ff)
@@ -2306,15 +2309,18 @@ static int fuse_write_end(struct file *file, struct address_space *mapping,
if (!copied)
goto unlock;
+ pos += copied;
if (!PageUptodate(page)) {
/* Zero any unwritten bytes at the end of the page */
- size_t endoff = (pos + copied) & ~PAGE_MASK;
+ size_t endoff = pos & ~PAGE_MASK;
if (endoff)
zero_user_segment(page, endoff, PAGE_SIZE);
SetPageUptodate(page);
}
- fuse_write_update_size(inode, pos + copied);
+ if (pos > inode->i_size)
+ i_size_write(inode, pos);
+
set_page_dirty(page);
unlock:
@@ -2340,12 +2346,15 @@ static int fuse_launder_page(struct page *page)
}
/*
- * Write back dirty pages now, because there may not be any suitable
- * open files later
+ * Write back dirty data/metadata now (there may not be any suitable
+ * open files later for data)
*/
static void fuse_vma_close(struct vm_area_struct *vma)
{
- filemap_write_and_wait(vma->vm_file->f_mapping);
+ int err;
+
+ err = write_inode_now(vma->vm_file->f_mapping->host, 1);
+ mapping_set_error(vma->vm_file->f_mapping, err);
}
/*
@@ -2628,7 +2637,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);
fallback:
- err = fuse_update_attributes(inode, file);
+ err = fuse_update_attributes(inode, file, STATX_SIZE);
if (!err)
return generic_file_llseek(file, offset, whence);
else
@@ -2648,7 +2657,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
break;
case SEEK_END:
inode_lock(inode);
- retval = fuse_update_attributes(inode, file);
+ retval = fuse_update_attributes(inode, file, STATX_SIZE);
if (!retval)
retval = generic_file_llseek(file, offset, whence);
inode_unlock(inode);
@@ -2869,7 +2878,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (iov_iter_rw(iter) == WRITE) {
ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
- fuse_invalidate_attr(inode);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
} else {
ret = __fuse_direct_read(io, iter, &pos);
}
@@ -2891,9 +2900,8 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
kref_put(&io->refcnt, fuse_io_release);
if (iov_iter_rw(iter) == WRITE) {
- if (ret > 0)
- fuse_write_update_size(inode, pos);
- else if (ret < 0 && offset + count > i_size)
+ fuse_write_update_attr(inode, pos, ret);
+ if (ret < 0 && offset + count > i_size)
fuse_do_truncate(file);
}
@@ -2981,16 +2989,14 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
/* we could have extended the file */
if (!(mode & FALLOC_FL_KEEP_SIZE)) {
- bool changed = fuse_write_update_size(inode, offset + length);
-
- if (changed && fm->fc->writeback_cache)
+ if (fuse_write_update_attr(inode, offset + length, length))
file_update_time(file);
}
if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
truncate_pagecache_range(inode, offset, offset + length - 1);
- fuse_invalidate_attr(inode);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
out:
if (!(mode & FALLOC_FL_KEEP_SIZE))
@@ -3002,6 +3008,8 @@ out:
if (lock_inode)
inode_unlock(inode);
+ fuse_flush_time_update(inode);
+
return err;
}
@@ -3096,12 +3104,8 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
ALIGN_DOWN(pos_out, PAGE_SIZE),
ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);
- if (fc->writeback_cache) {
- fuse_write_update_size(inode_out, pos_out + outarg.size);
- file_update_time(file_out);
- }
-
- fuse_invalidate_attr(inode_out);
+ file_update_time(file_out);
+ fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size);
err = outarg.size;
out:
@@ -3111,6 +3115,8 @@ out:
inode_unlock(inode_out);
file_accessed(file_in);
+ fuse_flush_time_update(inode_out);
+
return err;
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index f55f9f94b1a4..198637b41e19 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1031,7 +1031,9 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
u64 attr_valid, u64 attr_version);
void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
- u64 attr_valid);
+ u64 attr_valid, u32 cache_mask);
+
+u32 fuse_get_cache_mask(struct inode *inode);
/**
* Initialize the client device
@@ -1065,7 +1067,15 @@ void fuse_wait_aborted(struct fuse_conn *fc);
/**
* Invalidate inode attributes
*/
+
+/* Attributes possibly changed on data modification */
+#define FUSE_STATX_MODIFY (STATX_MTIME | STATX_CTIME | STATX_BLOCKS)
+
+/* Attributes possibly changed on data and/or size modification */
+#define FUSE_STATX_MODSIZE (FUSE_STATX_MODIFY | STATX_SIZE)
+
void fuse_invalidate_attr(struct inode *inode);
+void fuse_invalidate_attr_mask(struct inode *inode, u32 mask);
void fuse_invalidate_entry_cache(struct dentry *entry);
@@ -1148,9 +1158,10 @@ int fuse_allow_current_process(struct fuse_conn *fc);
u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
+void fuse_flush_time_update(struct inode *inode);
void fuse_update_ctime(struct inode *inode);
-int fuse_update_attributes(struct inode *inode, struct file *file);
+int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask);
void fuse_flush_writepages(struct inode *inode);
@@ -1208,7 +1219,7 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd,
__poll_t fuse_file_poll(struct file *file, poll_table *wait);
int fuse_dev_release(struct inode *inode, struct file *file);
-bool fuse_write_update_size(struct inode *inode, loff_t pos);
+bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written);
int fuse_flush_times(struct inode *inode, struct fuse_file *ff);
int fuse_write_inode(struct inode *inode, struct writeback_control *wbc);
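The header now pairs fuse_invalidate_attr_mask() with the FUSE_STATX_MODIFY / FUSE_STATX_MODSIZE masks so callers can invalidate only the attributes an operation may have changed. A hedged sketch of typical use from code that includes fuse_i.h (the helper names here are hypothetical):

/* Hypothetical callers, assuming the declarations from fuse_i.h above. */

/* After an operation that may have changed data and file size: */
static void sketch_after_extending_write(struct inode *inode)
{
	fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
}

/* After an operation that changed data in place, so the cached size stays valid: */
static void sketch_after_overwrite(struct inode *inode)
{
	fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY);
}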
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 12d49a1914e8..8b89e3ba7df3 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -118,6 +118,9 @@ static void fuse_evict_inode(struct inode *inode)
{
struct fuse_inode *fi = get_fuse_inode(inode);
+ /* Will write inode on close/munmap and in all other dirtiers */
+ WARN_ON(inode->i_state & I_DIRTY_INODE);
+
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (inode->i_sb->s_flags & SB_ACTIVE) {
@@ -161,7 +164,7 @@ static ino_t fuse_squash_ino(u64 ino64)
}
void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
- u64 attr_valid)
+ u64 attr_valid, u32 cache_mask)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -181,9 +184,11 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_atime.tv_sec = attr->atime;
inode->i_atime.tv_nsec = attr->atimensec;
/* mtime from server may be stale due to local buffered write */
- if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) {
+ if (!(cache_mask & STATX_MTIME)) {
inode->i_mtime.tv_sec = attr->mtime;
inode->i_mtime.tv_nsec = attr->mtimensec;
+ }
+ if (!(cache_mask & STATX_CTIME)) {
inode->i_ctime.tv_sec = attr->ctime;
inode->i_ctime.tv_nsec = attr->ctimensec;
}
@@ -215,16 +220,44 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_flags &= ~S_NOSEC;
}
+u32 fuse_get_cache_mask(struct inode *inode)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (!fc->writeback_cache || !S_ISREG(inode->i_mode))
+ return 0;
+
+ return STATX_MTIME | STATX_CTIME | STATX_SIZE;
+}
+
void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
u64 attr_valid, u64 attr_version)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
- bool is_wb = fc->writeback_cache;
+ u32 cache_mask;
loff_t oldsize;
struct timespec64 old_mtime;
spin_lock(&fi->lock);
+ /*
+	 * With writeback_cache enabled, writes update mtime, ctime and may
+	 * update i_size. In these cases, trust the cached values in the
+	 * inode.
+ */
+ cache_mask = fuse_get_cache_mask(inode);
+ if (cache_mask & STATX_SIZE)
+ attr->size = i_size_read(inode);
+
+ if (cache_mask & STATX_MTIME) {
+ attr->mtime = inode->i_mtime.tv_sec;
+ attr->mtimensec = inode->i_mtime.tv_nsec;
+ }
+ if (cache_mask & STATX_CTIME) {
+ attr->ctime = inode->i_ctime.tv_sec;
+ attr->ctimensec = inode->i_ctime.tv_nsec;
+ }
+
if ((attr_version != 0 && fi->attr_version > attr_version) ||
test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
spin_unlock(&fi->lock);
@@ -232,7 +265,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
}
old_mtime = inode->i_mtime;
- fuse_change_attributes_common(inode, attr, attr_valid);
+ fuse_change_attributes_common(inode, attr, attr_valid, cache_mask);
oldsize = inode->i_size;
/*
@@ -240,11 +273,11 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
* extend local i_size without keeping userspace server in sync. So,
* attr->size coming from server can be stale. We cannot trust it.
*/
- if (!is_wb || !S_ISREG(inode->i_mode))
+ if (!(cache_mask & STATX_SIZE))
i_size_write(inode, attr->size);
spin_unlock(&fi->lock);
- if (!is_wb && S_ISREG(inode->i_mode)) {
+ if (!cache_mask && S_ISREG(inode->i_mode)) {
bool inval = false;
if (oldsize != attr->size) {
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 546ea3d58fb4..fbc09dab1f85 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -286,11 +286,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
goto out;
- vaddr = kmap_atomic(ap.pages[0]);
+ vaddr = kmap_local_page(ap.pages[0]);
err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr,
transferred, in_iovs + out_iovs,
(flags & FUSE_IOCTL_COMPAT) != 0);
- kunmap_atomic(vaddr);
+ kunmap_local(vaddr);
if (err)
goto out;
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index bc267832310c..b4e565711045 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -76,11 +76,11 @@ static void fuse_add_dirent_to_cache(struct file *file,
WARN_ON(fi->rdc.pos != pos))
goto unlock;
- addr = kmap_atomic(page);
+ addr = kmap_local_page(page);
if (!offset)
clear_page(addr);
memcpy(addr + offset, dirent, reclen);
- kunmap_atomic(addr);
+ kunmap_local(addr);
fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen;
fi->rdc.pos = dirent->off;
unlock:
@@ -454,7 +454,7 @@ static int fuse_readdir_cached(struct file *file, struct dir_context *ctx)
* cache; both cases require an up-to-date mtime value.
*/
if (!ctx->pos && fc->auto_inval_data) {
- int err = fuse_update_attributes(inode, file);
+ int err = fuse_update_attributes(inode, file, STATX_MTIME);
if (err)
return err;
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 94fc874f5de7..4cfa4bc1f579 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -649,7 +649,7 @@ static void virtio_fs_vq_done(struct virtqueue *vq)
static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name,
int vq_type)
{
- strncpy(fsvq->name, name, VQ_NAME_LEN);
+ strscpy(fsvq->name, name, VQ_NAME_LEN);
spin_lock_init(&fsvq->lock);
INIT_LIST_HEAD(&fsvq->queued_reqs);
INIT_LIST_HEAD(&fsvq->end_reqs);
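virtio_fs_init_vq() above switches from strncpy() to strscpy(): strscpy() always NUL-terminates the destination and reports truncation with -E2BIG, whereas strncpy() can leave the buffer unterminated when the source is too long. A minimal sketch (buffer size handling and names are illustrative):

#include <linux/errno.h>
#include <linux/printk.h>
#include <linux/string.h>

static void copy_vq_name_sketch(char *dst, size_t dst_size, const char *name)
{
	ssize_t n = strscpy(dst, name, dst_size);	/* always NUL-terminates */

	if (n == -E2BIG)	/* source didn't fit; dst holds a truncated copy */
		pr_debug("vq name truncated\n");
}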
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 61dfaf7b7d20..0d3e7177fce0 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -42,10 +42,9 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value,
fm->fc->no_setxattr = 1;
err = -EOPNOTSUPP;
}
- if (!err) {
- fuse_invalidate_attr(inode);
+ if (!err)
fuse_update_ctime(inode);
- }
+
return err;
}
@@ -173,10 +172,9 @@ int fuse_removexattr(struct inode *inode, const char *name)
fm->fc->no_removexattr = 1;
err = -EOPNOTSUPP;
}
- if (!err) {
- fuse_invalidate_attr(inode);
+ if (!err)
fuse_update_ctime(inode);
- }
+
return err;
}
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5414c2c33580..7235d539e969 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -961,46 +961,6 @@ hole_found:
goto out;
}
-static int gfs2_write_lock(struct inode *inode)
-{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_sbd *sdp = GFS2_SB(inode);
- int error;
-
- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
- error = gfs2_glock_nq(&ip->i_gh);
- if (error)
- goto out_uninit;
- if (&ip->i_inode == sdp->sd_rindex) {
- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
-
- error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
- GL_NOCACHE, &m_ip->i_gh);
- if (error)
- goto out_unlock;
- }
- return 0;
-
-out_unlock:
- gfs2_glock_dq(&ip->i_gh);
-out_uninit:
- gfs2_holder_uninit(&ip->i_gh);
- return error;
-}
-
-static void gfs2_write_unlock(struct inode *inode)
-{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_sbd *sdp = GFS2_SB(inode);
-
- if (&ip->i_inode == sdp->sd_rindex) {
- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
-
- gfs2_glock_dq_uninit(&m_ip->i_gh);
- }
- gfs2_glock_dq_uninit(&ip->i_gh);
-}
-
static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
unsigned len)
{
@@ -1118,11 +1078,6 @@ out_qunlock:
return ret;
}
-static inline bool gfs2_iomap_need_write_lock(unsigned flags)
-{
- return (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT);
-}
-
static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
unsigned flags, struct iomap *iomap,
struct iomap *srcmap)
@@ -1135,12 +1090,6 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
iomap->flags |= IOMAP_F_BUFFER_HEAD;
trace_gfs2_iomap_start(ip, pos, length, flags);
- if (gfs2_iomap_need_write_lock(flags)) {
- ret = gfs2_write_lock(inode);
- if (ret)
- goto out;
- }
-
ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
if (ret)
goto out_unlock;
@@ -1168,10 +1117,7 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
out_unlock:
- if (ret && gfs2_iomap_need_write_lock(flags))
- gfs2_write_unlock(inode);
release_metapath(&mp);
-out:
trace_gfs2_iomap_end(ip, iomap, ret);
return ret;
}
@@ -1219,15 +1165,11 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
}
if (unlikely(!written))
- goto out_unlock;
+ return 0;
if (iomap->flags & IOMAP_F_SIZE_CHANGED)
mark_inode_dirty(inode);
set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
-
-out_unlock:
- if (gfs2_iomap_need_write_lock(flags))
- gfs2_write_unlock(inode);
return 0;
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index c559827cb6f9..adafaaf7d24d 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -213,11 +213,9 @@ void gfs2_set_inode_flags(struct inode *inode)
* @inode: The inode
* @reqflags: The flags to set
* @mask: Indicates which flags are valid
- * @fsflags: The FS_* inode flags passed in
*
*/
-static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask,
- const u32 fsflags)
+static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -236,11 +234,6 @@ static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask,
if ((new_flags ^ flags) == 0)
goto out;
- error = -EPERM;
- if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
- goto out;
- if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
- goto out;
if (!IS_IMMUTABLE(inode)) {
error = gfs2_permission(&init_user_ns, inode, MAY_WRITE);
if (error)
@@ -313,7 +306,7 @@ int gfs2_fileattr_set(struct user_namespace *mnt_userns,
mask &= ~(GFS2_DIF_TOPDIR | GFS2_DIF_INHERIT_JDATA);
}
- return do_gfs2_set_flags(inode, gfsflags, mask, fsflags);
+ return do_gfs2_set_flags(inode, gfsflags, mask);
}
static int gfs2_getlabel(struct file *filp, char __user *label)
@@ -776,27 +769,99 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
return ret ? ret : ret1;
}
+static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i,
+ size_t *prev_count,
+ size_t *window_size)
+{
+ char __user *p = i->iov[0].iov_base + i->iov_offset;
+ size_t count = iov_iter_count(i);
+ int pages = 1;
+
+ if (likely(!count))
+ return false;
+ if (ret <= 0 && ret != -EFAULT)
+ return false;
+ if (!iter_is_iovec(i))
+ return false;
+
+ if (*prev_count != count || !*window_size) {
+		int nr_dirtied;
+
+ pages = min_t(int, BIO_MAX_VECS,
+ DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE));
+ nr_dirtied = max(current->nr_dirtied_pause -
+ current->nr_dirtied, 1);
+ pages = min(pages, nr_dirtied);
+ }
+
+ *prev_count = count;
+ *window_size = (size_t)PAGE_SIZE * pages - offset_in_page(p);
+ return true;
+}
+
static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
struct gfs2_holder *gh)
{
struct file *file = iocb->ki_filp;
struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
- size_t count = iov_iter_count(to);
+ size_t prev_count = 0, window_size = 0;
+ size_t written = 0;
ssize_t ret;
- if (!count)
+ /*
+ * In this function, we disable page faults when we're holding the
+ * inode glock while doing I/O. If a page fault occurs, we indicate
+ * that the inode glock may be dropped, fault in the pages manually,
+ * and retry.
+ *
+ * Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger
+ * physical as well as manual page faults, and we need to disable both
+ * kinds.
+ *
+ * For direct I/O, gfs2 takes the inode glock in deferred mode. This
+ * locking mode is compatible with other deferred holders, so multiple
+ * processes and nodes can do direct I/O to a file at the same time.
+ * There's no guarantee that reads or writes will be atomic. Any
+ * coordination among readers and writers needs to happen externally.
+ */
+
+ if (!iov_iter_count(to))
return 0; /* skip atime */
gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+retry:
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
+retry_under_glock:
+ pagefault_disable();
+ to->nofault = true;
+ ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
+ IOMAP_DIO_PARTIAL, written);
+ to->nofault = false;
+ pagefault_enable();
+ if (ret > 0)
+ written = ret;
+
+ if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
+ size_t leftover;
- ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
- gfs2_glock_dq(gh);
+ gfs2_holder_allow_demote(gh);
+ leftover = fault_in_iov_iter_writeable(to, window_size);
+ gfs2_holder_disallow_demote(gh);
+ if (leftover != window_size) {
+ if (!gfs2_holder_queued(gh))
+ goto retry;
+ goto retry_under_glock;
+ }
+ }
+ if (gfs2_holder_queued(gh))
+ gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
- return ret;
+ if (ret < 0)
+ return ret;
+ return written;
}
static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
@@ -805,11 +870,21 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
- size_t len = iov_iter_count(from);
- loff_t offset = iocb->ki_pos;
+ size_t prev_count = 0, window_size = 0;
+ size_t read = 0;
ssize_t ret;
/*
+ * In this function, we disable page faults when we're holding the
+ * inode glock while doing I/O. If a page fault occurs, we indicate
+ * that the inode glock may be dropped, fault in the pages manually,
+ * and retry.
+ *
+ * For writes, iomap_dio_rw only triggers manual page faults, so we
+ * don't need to disable physical ones.
+ */
+
+ /*
 * Deferred lock, even if it's a write, since we do no allocation on
* this path. All we need to change is the atime, and this lock mode
* ensures that other nodes have flushed their buffered read caches
@@ -818,31 +893,62 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
* VFS does.
*/
gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+retry:
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
-
+retry_under_glock:
/* Silently fall back to buffered I/O when writing beyond EOF */
- if (offset + len > i_size_read(&ip->i_inode))
+ if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode))
goto out;
- ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0);
+ from->nofault = true;
+ ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
+ IOMAP_DIO_PARTIAL, read);
+ from->nofault = false;
+
if (ret == -ENOTBLK)
ret = 0;
+ if (ret > 0)
+ read = ret;
+
+ if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
+ size_t leftover;
+
+ gfs2_holder_allow_demote(gh);
+ leftover = fault_in_iov_iter_readable(from, window_size);
+ gfs2_holder_disallow_demote(gh);
+ if (leftover != window_size) {
+ if (!gfs2_holder_queued(gh))
+ goto retry;
+ goto retry_under_glock;
+ }
+ }
out:
- gfs2_glock_dq(gh);
+ if (gfs2_holder_queued(gh))
+ gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
- return ret;
+ if (ret < 0)
+ return ret;
+ return read;
}
static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct gfs2_inode *ip;
struct gfs2_holder gh;
+ size_t prev_count = 0, window_size = 0;
size_t written = 0;
ssize_t ret;
+ /*
+ * In this function, we disable page faults when we're holding the
+ * inode glock while doing I/O. If a page fault occurs, we indicate
+ * that the inode glock may be dropped, fault in the pages manually,
+ * and retry.
+ */
+
if (iocb->ki_flags & IOCB_DIRECT) {
ret = gfs2_file_direct_read(iocb, to, &gh);
if (likely(ret != -ENOTBLK))
@@ -864,18 +970,118 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
ip = GFS2_I(iocb->ki_filp->f_mapping->host);
gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+retry:
ret = gfs2_glock_nq(&gh);
if (ret)
goto out_uninit;
+retry_under_glock:
+ pagefault_disable();
ret = generic_file_read_iter(iocb, to);
+ pagefault_enable();
if (ret > 0)
written += ret;
- gfs2_glock_dq(&gh);
+
+ if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
+ size_t leftover;
+
+ gfs2_holder_allow_demote(&gh);
+ leftover = fault_in_iov_iter_writeable(to, window_size);
+ gfs2_holder_disallow_demote(&gh);
+ if (leftover != window_size) {
+ if (!gfs2_holder_queued(&gh)) {
+ if (written)
+ goto out_uninit;
+ goto retry;
+ }
+ goto retry_under_glock;
+ }
+ }
+ if (gfs2_holder_queued(&gh))
+ gfs2_glock_dq(&gh);
out_uninit:
gfs2_holder_uninit(&gh);
return written ? written : ret;
}
+static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
+ struct iov_iter *from,
+ struct gfs2_holder *gh)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_holder *statfs_gh = NULL;
+ size_t prev_count = 0, window_size = 0;
+ size_t read = 0;
+ ssize_t ret;
+
+ /*
+ * In this function, we disable page faults when we're holding the
+ * inode glock while doing I/O. If a page fault occurs, we indicate
+ * that the inode glock may be dropped, fault in the pages manually,
+ * and retry.
+ */
+
+ if (inode == sdp->sd_rindex) {
+ statfs_gh = kmalloc(sizeof(*statfs_gh), GFP_NOFS);
+ if (!statfs_gh)
+ return -ENOMEM;
+ }
+
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh);
+retry:
+ ret = gfs2_glock_nq(gh);
+ if (ret)
+ goto out_uninit;
+retry_under_glock:
+ if (inode == sdp->sd_rindex) {
+ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+
+ ret = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
+ GL_NOCACHE, statfs_gh);
+ if (ret)
+ goto out_unlock;
+ }
+
+ current->backing_dev_info = inode_to_bdi(inode);
+ pagefault_disable();
+ ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+ pagefault_enable();
+ current->backing_dev_info = NULL;
+ if (ret > 0) {
+ iocb->ki_pos += ret;
+ read += ret;
+ }
+
+ if (inode == sdp->sd_rindex)
+ gfs2_glock_dq_uninit(statfs_gh);
+
+ if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
+ size_t leftover;
+
+ gfs2_holder_allow_demote(gh);
+ leftover = fault_in_iov_iter_readable(from, window_size);
+ gfs2_holder_disallow_demote(gh);
+ if (leftover != window_size) {
+ if (!gfs2_holder_queued(gh)) {
+ if (read)
+ goto out_uninit;
+ goto retry;
+ }
+ goto retry_under_glock;
+ }
+ }
+out_unlock:
+ if (gfs2_holder_queued(gh))
+ gfs2_glock_dq(gh);
+out_uninit:
+ gfs2_holder_uninit(gh);
+ if (statfs_gh)
+ kfree(statfs_gh);
+ return read ? read : ret;
+}
+
/**
* gfs2_file_write_iter - Perform a write to a file
* @iocb: The io context
@@ -927,9 +1133,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out_unlock;
iocb->ki_flags |= IOCB_DSYNC;
- current->backing_dev_info = inode_to_bdi(inode);
- buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
- current->backing_dev_info = NULL;
+ buffered = gfs2_file_buffered_write(iocb, from, &gh);
if (unlikely(buffered <= 0)) {
if (!ret)
ret = buffered;
@@ -943,7 +1147,6 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
* the direct I/O range as we don't know if the buffered pages
* made it to disk.
*/
- iocb->ki_pos += buffered;
ret2 = generic_write_sync(iocb, buffered);
invalidate_mapping_pages(mapping,
(iocb->ki_pos - buffered) >> PAGE_SHIFT,
@@ -951,13 +1154,9 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (!ret || ret2 > 0)
ret += ret2;
} else {
- current->backing_dev_info = inode_to_bdi(inode);
- ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
- current->backing_dev_info = NULL;
- if (likely(ret > 0)) {
- iocb->ki_pos += ret;
+ ret = gfs2_file_buffered_write(iocb, from, &gh);
+ if (likely(ret > 0))
ret = generic_write_sync(iocb, ret);
- }
}
out_unlock:
@@ -1338,8 +1537,6 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
{
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
- if (fl->fl_type & LOCK_MAND)
- return -EOPNOTSUPP;
if (fl->fl_type == F_UNLCK) {
do_unflock(file, fl);
@@ -1353,7 +1550,7 @@ const struct file_operations gfs2_file_fops = {
.llseek = gfs2_llseek,
.read_iter = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.compat_ioctl = gfs2_compat_ioctl,
.mmap = gfs2_mmap,
@@ -1386,7 +1583,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.llseek = gfs2_llseek,
.read_iter = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.compat_ioctl = gfs2_compat_ioctl,
.mmap = gfs2_mmap,
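All three gfs2 I/O paths above follow the same shape: disable page faults while the glock is held, attempt the transfer, and if it stopped because the user buffer was not resident, allow the lock to be demoted, fault the pages in manually, and retry. Below is a simplified, hedged sketch of that loop; lock_shared()/unlock_shared() are placeholders standing in for the glock calls, and the real code additionally accumulates partial progress and sizes the fault-in window as in should_fault_in_pages():

#include <linux/fs.h>
#include <linux/minmax.h>
#include <linux/pagemap.h>
#include <linux/uaccess.h>
#include <linux/uio.h>

static void lock_shared(void) { }	/* placeholder for gfs2_glock_nq() */
static void unlock_shared(void) { }	/* placeholder for gfs2_glock_dq() */

static ssize_t read_with_faultin_retry_sketch(struct kiocb *iocb, struct iov_iter *to)
{
	size_t window;
	ssize_t ret;

retry:
	lock_shared();
	pagefault_disable();		/* faults must not block while the lock is held */
	ret = generic_file_read_iter(iocb, to);
	pagefault_enable();
	unlock_shared();

	if (ret == -EFAULT && iov_iter_count(to)) {
		/* Fault the user buffer in outside the lock, then try again. */
		window = min_t(size_t, iov_iter_count(to), PAGE_SIZE);
		if (fault_in_iov_iter_writeable(to, window) != window)
			goto retry;	/* some pages were faulted in; retry the read */
	}
	return ret;
}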
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index e0eaa9cf9fb6..19f38aee1b61 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -58,6 +58,7 @@ struct gfs2_glock_iter {
typedef void (*glock_examiner) (struct gfs2_glock * gl);
static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
+static void __gfs2_glock_dq(struct gfs2_holder *gh);
static struct dentry *gfs2_root;
static struct workqueue_struct *glock_workqueue;
@@ -197,6 +198,12 @@ static int demote_ok(const struct gfs2_glock *gl)
if (gl->gl_state == LM_ST_UNLOCKED)
return 0;
+ /*
+ * Note that demote_ok is used for the lru process of disposing of
+ * glocks. For this purpose, we don't care if the glock's holders
+ * have the HIF_MAY_DEMOTE flag set or not. If someone is using
+ * them, don't demote.
+ */
if (!list_empty(&gl->gl_holders))
return 0;
if (glops->go_demote_ok)
@@ -294,6 +301,9 @@ void gfs2_glock_queue_put(struct gfs2_glock *gl)
void gfs2_glock_put(struct gfs2_glock *gl)
{
+ /* last put could call sleepable dlm api */
+ might_sleep();
+
if (lockref_put_or_lock(&gl->gl_lockref))
return;
@@ -301,46 +311,59 @@ void gfs2_glock_put(struct gfs2_glock *gl)
}
/**
- * may_grant - check if its ok to grant a new lock
+ * may_grant - check if it's ok to grant a new lock
* @gl: The glock
+ * @current_gh: One of the current holders of @gl
* @gh: The lock request which we wish to grant
*
- * Returns: true if its ok to grant the lock
+ * With our current compatibility rules, if a glock has one or more active
+ * holders (HIF_HOLDER flag set), any of those holders can be passed in as
+ * @current_gh; they are all the same as far as compatibility with the new @gh
+ * goes.
+ *
+ * Returns true if it's ok to grant the lock.
*/
-static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
-{
- const struct gfs2_holder *gh_head = list_first_entry(&gl->gl_holders, const struct gfs2_holder, gh_list);
+static inline bool may_grant(struct gfs2_glock *gl,
+ struct gfs2_holder *current_gh,
+ struct gfs2_holder *gh)
+{
+ if (current_gh) {
+ GLOCK_BUG_ON(gl, !test_bit(HIF_HOLDER, &current_gh->gh_iflags));
+
+ switch(current_gh->gh_state) {
+ case LM_ST_EXCLUSIVE:
+ /*
+ * Here we make a special exception to grant holders
+ * who agree to share the EX lock with other holders
+ * who also have the bit set. If the original holder
+ * has the LM_FLAG_NODE_SCOPE bit set, we grant more
+ * holders with the bit set.
+ */
+ return gh->gh_state == LM_ST_EXCLUSIVE &&
+ (current_gh->gh_flags & LM_FLAG_NODE_SCOPE) &&
+ (gh->gh_flags & LM_FLAG_NODE_SCOPE);
- if (gh != gh_head) {
- /**
- * Here we make a special exception to grant holders who agree
- * to share the EX lock with other holders who also have the
- * bit set. If the original holder has the LM_FLAG_NODE_SCOPE bit
- * is set, we grant more holders with the bit set.
- */
- if (gh_head->gh_state == LM_ST_EXCLUSIVE &&
- (gh_head->gh_flags & LM_FLAG_NODE_SCOPE) &&
- gh->gh_state == LM_ST_EXCLUSIVE &&
- (gh->gh_flags & LM_FLAG_NODE_SCOPE))
- return 1;
- if ((gh->gh_state == LM_ST_EXCLUSIVE ||
- gh_head->gh_state == LM_ST_EXCLUSIVE))
- return 0;
+ case LM_ST_SHARED:
+ case LM_ST_DEFERRED:
+ return gh->gh_state == current_gh->gh_state;
+
+ default:
+ return false;
+ }
}
+
if (gl->gl_state == gh->gh_state)
- return 1;
+ return true;
if (gh->gh_flags & GL_EXACT)
- return 0;
+ return false;
if (gl->gl_state == LM_ST_EXCLUSIVE) {
- if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
- return 1;
- if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
- return 1;
+ return gh->gh_state == LM_ST_SHARED ||
+ gh->gh_state == LM_ST_DEFERRED;
}
- if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
- return 1;
- return 0;
+ if (gh->gh_flags & LM_FLAG_ANY)
+ return gl->gl_state != LM_ST_UNLOCKED;
+ return false;
}
static void gfs2_holder_wake(struct gfs2_holder *gh)
@@ -366,7 +389,7 @@ static void do_error(struct gfs2_glock *gl, const int ret)
struct gfs2_holder *gh, *tmp;
list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ if (!test_bit(HIF_WAIT, &gh->gh_iflags))
continue;
if (ret & LM_OUT_ERROR)
gh->gh_error = -EIO;
@@ -381,6 +404,123 @@ static void do_error(struct gfs2_glock *gl, const int ret)
}
/**
+ * demote_incompat_holders - demote incompatible demoteable holders
+ * @gl: the glock we want to promote
+ * @new_gh: the new holder to be promoted
+ */
+static void demote_incompat_holders(struct gfs2_glock *gl,
+ struct gfs2_holder *new_gh)
+{
+ struct gfs2_holder *gh;
+
+ /*
+ * Demote incompatible holders before we make ourselves eligible.
+ * (This holder may or may not allow auto-demoting, but we don't want
+ * to demote the new holder before it's even granted.)
+ */
+ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+ /*
+ * Since holders are at the front of the list, we stop when we
+ * find the first non-holder.
+ */
+ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+ return;
+ if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) &&
+ !may_grant(gl, new_gh, gh)) {
+ /*
+ * We should not recurse into do_promote because
+ * __gfs2_glock_dq only calls handle_callback,
+ * gfs2_glock_add_to_lru and __gfs2_glock_queue_work.
+ */
+ __gfs2_glock_dq(gh);
+ }
+ }
+}
+
+/**
+ * find_first_holder - find the first "holder" gh
+ * @gl: the glock
+ */
+
+static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+
+ if (!list_empty(&gl->gl_holders)) {
+ gh = list_first_entry(&gl->gl_holders, struct gfs2_holder,
+ gh_list);
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ return gh;
+ }
+ return NULL;
+}
+
+/**
+ * find_first_strong_holder - find the first non-demoteable holder
+ * @gl: the glock
+ *
+ * Find the first holder that doesn't have the HIF_MAY_DEMOTE flag set.
+ */
+static inline struct gfs2_holder *
+find_first_strong_holder(struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+
+ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+ return NULL;
+ if (!test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
+ return gh;
+ }
+ return NULL;
+}
+
+/**
+ * gfs2_instantiate - Call the glops instantiate function
+ * @gh: The glock holder
+ *
+ * Returns: 0 if instantiate was successful, 2 if type specific operation is
+ * underway, or error.
+ */
+int gfs2_instantiate(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ int ret;
+
+again:
+ if (!test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags))
+ return 0;
+
+ /*
+ * Since we unlock the lockref lock, we set a flag to indicate
+ * instantiate is in progress.
+ */
+ if (test_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags)) {
+ wait_on_bit(&gl->gl_flags, GLF_INSTANTIATE_IN_PROG,
+ TASK_UNINTERRUPTIBLE);
+ /*
+ * Here we just waited for a different instantiate to finish.
+ * But that may not have been successful, as when a process
+ * locks an inode glock _before_ it has an actual inode to
+ * instantiate into. So we check again. This process might
+ * have an inode to instantiate, so might be successful.
+ */
+ goto again;
+ }
+
+ set_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags);
+
+ ret = glops->go_instantiate(gh);
+ if (!ret)
+ clear_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags);
+ clear_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&gl->gl_flags, GLF_INSTANTIATE_IN_PROG);
+ return ret;
+}
+
+/**
* do_promote - promote as many requests as possible on the current queue
* @gl: The glock
*
@@ -392,44 +532,59 @@ static int do_promote(struct gfs2_glock *gl)
__releases(&gl->gl_lockref.lock)
__acquires(&gl->gl_lockref.lock)
{
- const struct gfs2_glock_operations *glops = gl->gl_ops;
- struct gfs2_holder *gh, *tmp;
+ struct gfs2_holder *gh, *tmp, *first_gh;
+ bool incompat_holders_demoted = false;
+ bool lock_released;
int ret;
restart:
+ first_gh = find_first_strong_holder(gl);
list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+ lock_released = false;
if (test_bit(HIF_HOLDER, &gh->gh_iflags))
continue;
- if (may_grant(gl, gh)) {
- if (gh->gh_list.prev == &gl->gl_holders &&
- glops->go_lock) {
- spin_unlock(&gl->gl_lockref.lock);
- /* FIXME: eliminate this eventually */
- ret = glops->go_lock(gh);
- spin_lock(&gl->gl_lockref.lock);
- if (ret) {
- if (ret == 1)
- return 2;
- gh->gh_error = ret;
- list_del_init(&gh->gh_list);
- trace_gfs2_glock_queue(gh, 0);
- gfs2_holder_wake(gh);
- goto restart;
- }
- set_bit(HIF_HOLDER, &gh->gh_iflags);
- trace_gfs2_promote(gh, 1);
+ if (!may_grant(gl, first_gh, gh)) {
+ /*
+ * If we get here, it means we may not grant this holder for
+ * some reason. If this holder is the head of the list, it
+ * means we have a blocked holder at the head, so return 1.
+ */
+ if (gh->gh_list.prev == &gl->gl_holders)
+ return 1;
+ do_error(gl, 0);
+ break;
+ }
+ if (!incompat_holders_demoted) {
+ demote_incompat_holders(gl, first_gh);
+ incompat_holders_demoted = true;
+ first_gh = gh;
+ }
+ if (test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags) &&
+ !(gh->gh_flags & GL_SKIP) && gl->gl_ops->go_instantiate) {
+ lock_released = true;
+ spin_unlock(&gl->gl_lockref.lock);
+ ret = gfs2_instantiate(gh);
+ spin_lock(&gl->gl_lockref.lock);
+ if (ret) {
+ if (ret == 1)
+ return 2;
+ gh->gh_error = ret;
+ list_del_init(&gh->gh_list);
+ trace_gfs2_glock_queue(gh, 0);
gfs2_holder_wake(gh);
goto restart;
}
- set_bit(HIF_HOLDER, &gh->gh_iflags);
- trace_gfs2_promote(gh, 0);
- gfs2_holder_wake(gh);
- continue;
}
- if (gh->gh_list.prev == &gl->gl_holders)
- return 1;
- do_error(gl, 0);
- break;
+ set_bit(HIF_HOLDER, &gh->gh_iflags);
+ trace_gfs2_promote(gh);
+ gfs2_holder_wake(gh);
+ /*
+ * If we released the gl_lockref.lock the holders list may have
+ * changed. For that reason, we start again at the start of
+ * the holders queue.
+ */
+ if (lock_released)
+ goto restart;
}
return 0;
}
@@ -723,23 +878,6 @@ out:
}
/**
- * find_first_holder - find the first "holder" gh
- * @gl: the glock
- */
-
-static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
-{
- struct gfs2_holder *gh;
-
- if (!list_empty(&gl->gl_holders)) {
- gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
- return gh;
- }
- return NULL;
-}
-
-/**
* run_queue - do all outstanding tasks related to a glock
* @gl: The glock in question
* @nonblock: True if we must not block in run_queue
@@ -822,7 +960,7 @@ static void gfs2_glock_poke(struct gfs2_glock *gl)
struct gfs2_holder gh;
int error;
- gfs2_holder_init(gl, LM_ST_SHARED, flags, &gh);
+ __gfs2_holder_init(gl, LM_ST_SHARED, flags, &gh, _RET_IP_);
error = gfs2_glock_nq(&gh);
if (!error)
gfs2_glock_dq(&gh);
@@ -1057,7 +1195,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
atomic_inc(&sdp->sd_glock_disposal);
gl->gl_node.next = NULL;
- gl->gl_flags = 0;
+ gl->gl_flags = glops->go_instantiate ? BIT(GLF_INSTANTIATE_NEEDED) : 0;
gl->gl_name = name;
lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass);
gl->gl_lockref.count = 1;
@@ -1119,12 +1257,12 @@ out:
*
*/
-void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags,
- struct gfs2_holder *gh)
+void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags,
+ struct gfs2_holder *gh, unsigned long ip)
{
INIT_LIST_HEAD(&gh->gh_list);
gh->gh_gl = gl;
- gh->gh_ip = _RET_IP_;
+ gh->gh_ip = ip;
gh->gh_owner_pid = get_pid(task_pid(current));
gh->gh_state = state;
gh->gh_flags = flags;
@@ -1354,15 +1492,20 @@ __acquires(&gl->gl_lockref.lock)
GLOCK_BUG_ON(gl, true);
if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
- if (test_bit(GLF_LOCK, &gl->gl_flags))
- try_futile = !may_grant(gl, gh);
+ if (test_bit(GLF_LOCK, &gl->gl_flags)) {
+ struct gfs2_holder *first_gh;
+
+ first_gh = find_first_strong_holder(gl);
+ try_futile = !may_grant(gl, first_gh, gh);
+ }
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
goto fail;
}
list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
- (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
+ (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) &&
+ !test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags)))
goto trap_recursive;
if (try_futile &&
!(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
@@ -1458,51 +1601,83 @@ int gfs2_glock_poll(struct gfs2_holder *gh)
return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
}
-/**
- * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
- * @gh: the glock holder
- *
- */
+static inline bool needs_demote(struct gfs2_glock *gl)
+{
+ return (test_bit(GLF_DEMOTE, &gl->gl_flags) ||
+ test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags));
+}
-void gfs2_glock_dq(struct gfs2_holder *gh)
+static void __gfs2_glock_dq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
unsigned delay = 0;
int fast_path = 0;
- spin_lock(&gl->gl_lockref.lock);
/*
- * If we're in the process of file system withdraw, we cannot just
- * dequeue any glocks until our journal is recovered, lest we
- * introduce file system corruption. We need two exceptions to this
- * rule: We need to allow unlocking of nondisk glocks and the glock
- * for our own journal that needs recovery.
+ * This while loop is similar to function demote_incompat_holders:
+ * If the glock is due to be demoted (which may be from another node
+ * or even if this holder is GL_NOCACHE), the weak holders are
+ * demoted as well, allowing the glock to be demoted.
*/
- if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
- glock_blocked_by_withdraw(gl) &&
- gh->gh_gl != sdp->sd_jinode_gl) {
- sdp->sd_glock_dqs_held++;
- spin_unlock(&gl->gl_lockref.lock);
- might_sleep();
- wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
- TASK_UNINTERRUPTIBLE);
- spin_lock(&gl->gl_lockref.lock);
- }
- if (gh->gh_flags & GL_NOCACHE)
- handle_callback(gl, LM_ST_UNLOCKED, 0, false);
+ while (gh) {
+ /*
+ * If we're in the process of file system withdraw, we cannot
+ * just dequeue any glocks until our journal is recovered, lest
+ * we introduce file system corruption. We need two exceptions
+ * to this rule: We need to allow unlocking of nondisk glocks
+ * and the glock for our own journal that needs recovery.
+ */
+ if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
+ glock_blocked_by_withdraw(gl) &&
+ gh->gh_gl != sdp->sd_jinode_gl) {
+ sdp->sd_glock_dqs_held++;
+ spin_unlock(&gl->gl_lockref.lock);
+ might_sleep();
+ wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
+ TASK_UNINTERRUPTIBLE);
+ spin_lock(&gl->gl_lockref.lock);
+ }
+
+ /*
+ * This holder should not be cached, so mark it for demote.
+ * Note: this should be done before the check for needs_demote
+ * below.
+ */
+ if (gh->gh_flags & GL_NOCACHE)
+ handle_callback(gl, LM_ST_UNLOCKED, 0, false);
+
+ list_del_init(&gh->gh_list);
+ clear_bit(HIF_HOLDER, &gh->gh_iflags);
+ trace_gfs2_glock_queue(gh, 0);
+
+ /*
+ * If there hasn't been a demote request we are done.
+ * (Let the remaining holders, if any, keep holding it.)
+ */
+ if (!needs_demote(gl)) {
+ if (list_empty(&gl->gl_holders))
+ fast_path = 1;
+ break;
+ }
+ /*
+ * If there is another strong holder (one we cannot auto-demote),
+ * we are done. That holder keeps the glock until it is done.
+ */
+ if (find_first_strong_holder(gl))
+ break;
- list_del_init(&gh->gh_list);
- clear_bit(HIF_HOLDER, &gh->gh_iflags);
- if (list_empty(&gl->gl_holders) &&
- !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
- !test_bit(GLF_DEMOTE, &gl->gl_flags))
- fast_path = 1;
+ /*
+ * If we have a weak holder at the head of the list, it
+ * (and all others like it) must be auto-demoted. If there
+ * are no more weak holders, we exit the while loop.
+ */
+ gh = find_first_holder(gl);
+ }
if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
gfs2_glock_add_to_lru(gl);
- trace_gfs2_glock_queue(gh, 0);
if (unlikely(!fast_path)) {
gl->gl_lockref.count++;
if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
@@ -1511,6 +1686,19 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
delay = gl->gl_hold_time;
__gfs2_glock_queue_work(gl, delay);
}
+}
+
+/**
+ * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
+ * @gh: the glock holder
+ *
+ */
+void gfs2_glock_dq(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+
+ spin_lock(&gl->gl_lockref.lock);
+ __gfs2_glock_dq(gh);
spin_unlock(&gl->gl_lockref.lock);
}
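
The rewritten dequeue path above loops so that, once a demote is pending and no strong holder remains, the weak (HIF_MAY_DEMOTE) holders are dropped as well. A self-contained userspace model of that decision loop (simplified types, no locking, purely illustrative):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct holder { bool held; bool may_demote; };

static struct holder *first_strong(struct holder *h, size_t n)
{
	for (size_t i = 0; i < n; i++)
		if (h[i].held && !h[i].may_demote)
			return &h[i];
	return NULL;
}

static struct holder *first_held(struct holder *h, size_t n)
{
	for (size_t i = 0; i < n; i++)
		if (h[i].held)
			return &h[i];
	return NULL;
}

static void dequeue(struct holder *gh, struct holder *all, size_t n,
		    bool demote_pending)
{
	while (gh) {
		gh->held = false;		/* drop this holder */
		if (!demote_pending)
			break;			/* remaining holders keep the lock */
		if (first_strong(all, n))
			break;			/* a strong holder blocks the demote */
		gh = first_held(all, n);	/* auto-demote the next weak holder */
	}
}

int main(void)
{
	struct holder h[2] = { { true, false }, { true, true } };

	dequeue(&h[0], h, 2, true);	/* drop the strong holder with a demote pending */
	printf("weak holder still held: %d\n", h[1].held);	/* 0: auto-demoted */
	return 0;
}
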
@@ -1673,6 +1861,7 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
{
+ struct gfs2_holder mock_gh = { .gh_gl = gl, .gh_state = state, };
unsigned long delay = 0;
unsigned long holdtime;
unsigned long now = jiffies;
@@ -1687,6 +1876,28 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
delay = gl->gl_hold_time;
}
+ /*
+ * Note 1: We cannot call demote_incompat_holders from handle_callback
+ * or gfs2_set_demote due to recursion problems like: gfs2_glock_dq ->
+ * handle_callback -> demote_incompat_holders -> gfs2_glock_dq
+ * Plus, we only want to demote the holders if the request comes from
+ * a remote cluster node because local holder conflicts are resolved
+ * elsewhere.
+ *
+ * Note 2: if a remote node wants this glock in EX mode, lock_dlm will
+ * request that we set our state to UNLOCKED. Here we mock up a holder
+ * to make it look like someone wants the lock EX locally. Any SH
+ * and DF requests should be able to share the lock without demoting.
+ *
+ * Note 3: We only want to demote the demotable holders when there
+ * are no more strong holders. The demotable holders might as well
+ * keep the glock until the last strong holder is done with it.
+ */
+ if (!find_first_strong_holder(gl)) {
+ if (state == LM_ST_UNLOCKED)
+ mock_gh.gh_state = LM_ST_EXCLUSIVE;
+ demote_incompat_holders(gl, &mock_gh);
+ }
handle_callback(gl, state, delay, true);
__gfs2_glock_queue_work(gl, delay);
spin_unlock(&gl->gl_lockref.lock);
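
Note 2 above explains why a remote LM_ST_UNLOCKED request is modelled as a local EXCLUSIVE holder before weak holders are demoted. A tiny sketch of that translation (hypothetical helper; state names mirror the glock code, the values are arbitrary):

#include <stdio.h>

enum lm_state { LM_ST_UNLOCKED, LM_ST_DEFERRED, LM_ST_SHARED, LM_ST_EXCLUSIVE };

/* A remote node that wants the glock in EX asks us to drop to UNLOCKED, so
 * the mock holder pretends a local task wants EX; SH and DF requests are
 * passed through unchanged and can be shared without demoting anyone. */
static enum lm_state mock_holder_state(enum lm_state remote_request)
{
	return remote_request == LM_ST_UNLOCKED ? LM_ST_EXCLUSIVE : remote_request;
}

int main(void)
{
	printf("%d %d\n", mock_holder_state(LM_ST_UNLOCKED) == LM_ST_EXCLUSIVE,
	       mock_holder_state(LM_ST_SHARED) == LM_ST_SHARED);	/* 1 1 */
	return 0;
}
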
@@ -1893,10 +2104,10 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
do {
rhashtable_walk_start(&iter);
- while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl))
- if (gl->gl_name.ln_sbd == sdp &&
- lockref_get_not_dead(&gl->gl_lockref))
+ while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) {
+ if (gl->gl_name.ln_sbd == sdp)
examiner(gl);
+ }
rhashtable_walk_stop(&iter);
} while (cond_resched(), gl == ERR_PTR(-EAGAIN));
@@ -1919,7 +2130,7 @@ bool gfs2_queue_delete_work(struct gfs2_glock *gl, unsigned long delay)
void gfs2_cancel_delete_work(struct gfs2_glock *gl)
{
- if (cancel_delayed_work_sync(&gl->gl_delete)) {
+ if (cancel_delayed_work(&gl->gl_delete)) {
clear_bit(GLF_PENDING_DELETE, &gl->gl_flags);
gfs2_glock_put(gl);
}
@@ -1938,7 +2149,6 @@ static void flush_delete_work(struct gfs2_glock *gl)
&gl->gl_delete, 0);
}
}
- gfs2_glock_queue_work(gl, 0);
}
void gfs2_flush_delete_work(struct gfs2_sbd *sdp)
@@ -1955,10 +2165,10 @@ void gfs2_flush_delete_work(struct gfs2_sbd *sdp)
static void thaw_glock(struct gfs2_glock *gl)
{
- if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) {
- gfs2_glock_put(gl);
+ if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
+ return;
+ if (!lockref_get_not_dead(&gl->gl_lockref))
return;
- }
set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
gfs2_glock_queue_work(gl, 0);
}
@@ -1974,9 +2184,12 @@ static void clear_glock(struct gfs2_glock *gl)
gfs2_glock_remove_from_lru(gl);
spin_lock(&gl->gl_lockref.lock);
- if (gl->gl_state != LM_ST_UNLOCKED)
- handle_callback(gl, LM_ST_UNLOCKED, 0, false);
- __gfs2_glock_queue_work(gl, 0);
+ if (!__lockref_is_dead(&gl->gl_lockref)) {
+ gl->gl_lockref.count++;
+ if (gl->gl_state != LM_ST_UNLOCKED)
+ handle_callback(gl, LM_ST_UNLOCKED, 0, false);
+ __gfs2_glock_queue_work(gl, 0);
+ }
spin_unlock(&gl->gl_lockref.lock);
}
@@ -2076,6 +2289,10 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
*p++ = 'H';
if (test_bit(HIF_WAIT, &iflags))
*p++ = 'W';
+ if (test_bit(HIF_MAY_DEMOTE, &iflags))
+ *p++ = 'D';
+ if (flags & GL_SKIP)
+ *p++ = 's';
*p = 0;
return buf;
}
@@ -2144,6 +2361,10 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
*p++ = 'P';
if (test_bit(GLF_FREEING, gflags))
*p++ = 'x';
+ if (test_bit(GLF_INSTANTIATE_NEEDED, gflags))
+ *p++ = 'n';
+ if (test_bit(GLF_INSTANTIATE_IN_PROG, gflags))
+ *p++ = 'N';
*p = 0;
return buf;
}
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 31a8f2f649b5..4f8642301801 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -150,6 +150,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
break;
+ if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
+ continue;
if (gh->gh_owner_pid == pid)
goto out;
}
@@ -188,13 +190,21 @@ extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
extern void gfs2_glock_hold(struct gfs2_glock *gl);
extern void gfs2_glock_put(struct gfs2_glock *gl);
extern void gfs2_glock_queue_put(struct gfs2_glock *gl);
-extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
- u16 flags, struct gfs2_holder *gh);
+
+extern void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
+ u16 flags, struct gfs2_holder *gh,
+ unsigned long ip);
+static inline void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
+ u16 flags, struct gfs2_holder *gh) {
+ __gfs2_holder_init(gl, state, flags, gh, _RET_IP_);
+}
+
extern void gfs2_holder_reinit(unsigned int state, u16 flags,
struct gfs2_holder *gh);
extern void gfs2_holder_uninit(struct gfs2_holder *gh);
extern int gfs2_glock_nq(struct gfs2_holder *gh);
extern int gfs2_glock_poll(struct gfs2_holder *gh);
+extern int gfs2_instantiate(struct gfs2_holder *gh);
extern int gfs2_glock_wait(struct gfs2_holder *gh);
extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs);
extern void gfs2_glock_dq(struct gfs2_holder *gh);
@@ -239,7 +249,7 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
{
int error;
- gfs2_holder_init(gl, state, flags, gh);
+ __gfs2_holder_init(gl, state, flags, gh, _RET_IP_);
error = gfs2_glock_nq(gh);
if (error)
@@ -325,6 +335,24 @@ static inline void glock_clear_object(struct gfs2_glock *gl, void *object)
spin_unlock(&gl->gl_lockref.lock);
}
+static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+
+ spin_lock(&gl->gl_lockref.lock);
+ set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
+ spin_unlock(&gl->gl_lockref.lock);
+}
+
+static inline void gfs2_holder_disallow_demote(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+
+ spin_lock(&gl->gl_lockref.lock);
+ clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
+ spin_unlock(&gl->gl_lockref.lock);
+}
+
extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation);
extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation);
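
gfs2_holder_allow_demote()/gfs2_holder_disallow_demote() added above toggle HIF_MAY_DEMOTE under the glock spinlock. The fragment below is illustrative only (not from this series, not compilable on its own, and do_the_copy() is hypothetical); it sketches the kind of caller the helpers enable, where a sleeping or faulting operation runs while the holder is demotable:

static ssize_t copy_under_demotable_glock(struct gfs2_holder *gh,
					  void *dst, const void *src, size_t len)
{
	ssize_t ret;

	gfs2_holder_allow_demote(gh);		/* sets HIF_MAY_DEMOTE */
	ret = do_the_copy(dst, src, len);	/* hypothetical, may fault/sleep */
	gfs2_holder_disallow_demote(gh);	/* clears HIF_MAY_DEMOTE */

	/*
	 * If a conflicting (remote) request arrived in between, this holder
	 * may have been dequeued; the caller must detect that and requeue
	 * the glock before trusting any state it guards.
	 */
	return ret;
}
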
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 79c621c7863d..650ad77c4d0b 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -228,7 +228,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
gfs2_rgrp_brelse(rgd);
WARN_ON_ONCE(!(flags & DIO_METADATA));
truncate_inode_pages_range(mapping, start, end);
- rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
+ set_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags);
}
static void gfs2_rgrp_go_dump(struct seq_file *seq, struct gfs2_glock *gl,
@@ -356,7 +356,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
struct address_space *mapping = gfs2_glock2aspace(gl);
truncate_inode_pages(mapping, 0);
if (ip) {
- set_bit(GIF_INVALID, &ip->i_flags);
+ set_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags);
forget_all_cached_acls(&ip->i_inode);
security_inode_invalidate_secctx(&ip->i_inode);
gfs2_dir_hash_inval(ip);
@@ -476,33 +476,29 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
error = gfs2_dinode_in(ip, dibh->b_data);
brelse(dibh);
- clear_bit(GIF_INVALID, &ip->i_flags);
-
return error;
}
/**
- * inode_go_lock - operation done after an inode lock is locked by a process
+ * inode_go_instantiate - read in an inode if necessary
* @gh: The glock holder
*
* Returns: errno
*/
-static int inode_go_lock(struct gfs2_holder *gh)
+static int inode_go_instantiate(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_inode *ip = gl->gl_object;
int error = 0;
- if (!ip || (gh->gh_flags & GL_SKIP))
- return 0;
+ if (!ip) /* no inode to populate - read it in later */
+ goto out;
- if (test_bit(GIF_INVALID, &ip->i_flags)) {
- error = gfs2_inode_refresh(ip);
- if (error)
- return error;
- }
+ error = gfs2_inode_refresh(ip);
+ if (error)
+ goto out;
if (gh->gh_state != LM_ST_DEFERRED)
inode_dio_wait(&ip->i_inode);
@@ -515,9 +511,10 @@ static int inode_go_lock(struct gfs2_holder *gh)
list_add(&ip->i_trunc_list, &sdp->sd_trunc_list);
spin_unlock(&sdp->sd_trunc_lock);
wake_up(&sdp->sd_quota_wait);
- return 1;
+ error = 1;
}
+out:
return error;
}
@@ -740,7 +737,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
.go_sync = inode_go_sync,
.go_inval = inode_go_inval,
.go_demote_ok = inode_go_demote_ok,
- .go_lock = inode_go_lock,
+ .go_instantiate = inode_go_instantiate,
.go_dump = inode_go_dump,
.go_type = LM_TYPE_INODE,
.go_flags = GLOF_ASPACE | GLOF_LRU | GLOF_LVB,
@@ -750,7 +747,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
const struct gfs2_glock_operations gfs2_rgrp_glops = {
.go_sync = rgrp_go_sync,
.go_inval = rgrp_go_inval,
- .go_lock = gfs2_rgrp_go_lock,
+ .go_instantiate = gfs2_rgrp_go_instantiate,
.go_dump = gfs2_rgrp_go_dump,
.go_type = LM_TYPE_RGRP,
.go_flags = GLOF_LVB,
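
The glops conversion above replaces per-lock go_lock work and the GIF_INVALID inode flag with a per-glock GLF_INSTANTIATE_NEEDED bit driving go_instantiate. A runnable userspace model of that lazy-instantiation pattern (assumed simplification; field and function names are stand-ins):

#include <stdbool.h>
#include <stdio.h>

struct lock_obj {
	bool instantiate_needed;	/* models GLF_INSTANTIATE_NEEDED */
	int  cached_value;
};

static int read_from_disk(void) { return 42; }	/* stand-in for the go_instantiate body */

static int instantiate(struct lock_obj *l)
{
	if (!l->instantiate_needed)
		return 0;			/* cached state is still valid */
	l->cached_value = read_from_disk();
	l->instantiate_needed = false;
	return 0;
}

int main(void)
{
	struct lock_obj l = { .instantiate_needed = true };

	instantiate(&l);			/* first lock: reads from "disk" */
	instantiate(&l);			/* relock: no-op */
	l.instantiate_needed = true;		/* e.g. go_inval() on a demote */
	instantiate(&l);			/* reread after invalidation */
	printf("%d\n", l.cached_value);
	return 0;
}
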
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 0fe49770166e..8c00fb389ae5 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -119,7 +119,6 @@ struct gfs2_rgrpd {
u32 rd_flags;
u32 rd_extfail_pt; /* extent failure point */
#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */
-#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */
#define GFS2_RDF_ERROR 0x40000000 /* error in rg */
#define GFS2_RDF_PREFERRED 0x80000000 /* This rgrp is preferred */
#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */
@@ -220,7 +219,7 @@ struct gfs2_glock_operations {
int (*go_xmote_bh)(struct gfs2_glock *gl);
void (*go_inval) (struct gfs2_glock *gl, int flags);
int (*go_demote_ok) (const struct gfs2_glock *gl);
- int (*go_lock) (struct gfs2_holder *gh);
+ int (*go_instantiate) (struct gfs2_holder *gh);
void (*go_dump)(struct seq_file *seq, struct gfs2_glock *gl,
const char *fs_id_buf);
void (*go_callback)(struct gfs2_glock *gl, bool remote);
@@ -252,6 +251,7 @@ struct gfs2_lkstats {
enum {
/* States */
+ HIF_MAY_DEMOTE = 1,
HIF_HOLDER = 6, /* Set for gh that "holds" the glock */
HIF_WAIT = 10,
};
@@ -315,6 +315,7 @@ struct gfs2_alloc_parms {
enum {
GLF_LOCK = 1,
+ GLF_INSTANTIATE_NEEDED = 2, /* needs instantiate */
GLF_DEMOTE = 3,
GLF_PENDING_DEMOTE = 4,
GLF_DEMOTE_IN_PROGRESS = 5,
@@ -324,6 +325,7 @@ enum {
GLF_REPLY_PENDING = 9,
GLF_INITIAL = 10,
GLF_FROZEN = 11,
+ GLF_INSTANTIATE_IN_PROG = 12, /* instantiate happening now */
GLF_LRU = 13,
GLF_OBJECT = 14, /* Used only for tracing */
GLF_BLOCKING = 15,
@@ -370,7 +372,6 @@ struct gfs2_glock {
};
enum {
- GIF_INVALID = 0,
GIF_QD_LOCKED = 1,
GIF_ALLOC_FAILED = 2,
GIF_SW_PAGED = 3,
@@ -386,9 +387,8 @@ struct gfs2_inode {
u64 i_generation;
u64 i_eattr;
unsigned long i_flags; /* GIF_... */
- struct gfs2_glock *i_gl; /* Move into i_gh? */
+ struct gfs2_glock *i_gl;
struct gfs2_holder i_iopen_gh;
- struct gfs2_holder i_gh; /* for prepare/commit_write only */
struct gfs2_qadata *i_qadata; /* quota allocation data */
struct gfs2_holder i_rgd_gh;
struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 3130f85d2b3f..6424b903e885 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -182,7 +182,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
}
glock_set_object(ip->i_gl, ip);
- set_bit(GIF_INVALID, &ip->i_flags);
+ set_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags);
error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
if (unlikely(error))
goto fail;
@@ -196,7 +196,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
if (type == DT_UNKNOWN) {
/* Inode glock must be locked already */
- error = gfs2_inode_refresh(GFS2_I(inode));
+ error = gfs2_instantiate(&i_gh);
if (error)
goto fail;
} else {
@@ -225,6 +225,10 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
return inode;
fail:
+ if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
+ glock_clear_object(ip->i_iopen_gh.gh_gl, ip);
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ }
if (io_gl)
gfs2_glock_put(io_gl);
if (gfs2_holder_initialized(&i_gh))
@@ -727,18 +731,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (error)
goto fail_free_inode;
flush_delayed_work(&ip->i_gl->gl_work);
- glock_set_object(ip->i_gl, ip);
error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
if (error)
goto fail_free_inode;
gfs2_cancel_delete_work(io_gl);
- glock_set_object(io_gl, ip);
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
if (error)
goto fail_gunlock2;
+ glock_set_object(ip->i_gl, ip);
error = gfs2_trans_begin(sdp, blocks, 0);
if (error)
goto fail_gunlock2;
@@ -754,6 +757,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (error)
goto fail_gunlock2;
+ glock_set_object(io_gl, ip);
gfs2_set_iop(inode);
insert_inode_hash(inode);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index c3b00ba92ed2..0fb3c01bc557 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -932,7 +932,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
goto fail;
rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
- rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
+ rgd->rd_flags &= ~GFS2_RDF_PREFERRED;
if (rgd->rd_data > sdp->sd_max_rg_data)
sdp->sd_max_rg_data = rgd->rd_data;
spin_lock(&sdp->sd_rindex_spin);
@@ -1185,8 +1185,8 @@ static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd)
}
/**
- * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
- * @rgd: the struct gfs2_rgrpd describing the RG to read in
+ * gfs2_rgrp_go_instantiate - Read in a RG's header and bitmaps
+ * @gh: the glock holder representing the rgrpd to read in
*
* Read in all of a Resource Group's header and bitmap blocks.
* Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
@@ -1194,10 +1194,11 @@ static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd)
* Returns: errno
*/
-static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
+int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh)
{
+ struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_rgrpd *rgd = gl->gl_object;
struct gfs2_sbd *sdp = rgd->rd_sbd;
- struct gfs2_glock *gl = rgd->rd_gl;
unsigned int length = rgd->rd_length;
struct gfs2_bitmap *bi;
unsigned int x, y;
@@ -1225,21 +1226,18 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
}
}
- if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
- gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
- rgrp_set_bitmap_flags(rgd);
- rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
- rgd->rd_free_clone = rgd->rd_free;
- BUG_ON(rgd->rd_reserved);
- /* max out the rgrp allocation failure point */
- rgd->rd_extfail_pt = rgd->rd_free;
- }
+ gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
+ rgrp_set_bitmap_flags(rgd);
+ rgd->rd_flags |= GFS2_RDF_CHECK;
+ rgd->rd_free_clone = rgd->rd_free;
+ GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved);
+ /* max out the rgrp allocation failure point */
+ rgd->rd_extfail_pt = rgd->rd_free;
if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
rgd->rd_bits[0].bi_bh->b_data);
- }
- else if (sdp->sd_args.ar_rgrplvb) {
+ } else if (sdp->sd_args.ar_rgrplvb) {
if (!gfs2_rgrp_lvb_valid(rgd)){
gfs2_consist_rgrpd(rgd);
error = -EIO;
@@ -1257,19 +1255,18 @@ fail:
bi->bi_bh = NULL;
gfs2_assert_warn(sdp, !bi->bi_clone);
}
-
return error;
}
-static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
+static int update_rgrp_lvb(struct gfs2_rgrpd *rgd, struct gfs2_holder *gh)
{
u32 rl_flags;
- if (rgd->rd_flags & GFS2_RDF_UPTODATE)
+ if (!test_bit(GLF_INSTANTIATE_NEEDED, &gh->gh_gl->gl_flags))
return 0;
if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
- return gfs2_rgrp_bh_get(rgd);
+ return gfs2_instantiate(gh);
rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
rl_flags &= ~GFS2_RDF_MASK;
@@ -1280,7 +1277,7 @@ static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
rgrp_set_bitmap_flags(rgd);
rgd->rd_free_clone = rgd->rd_free;
- BUG_ON(rgd->rd_reserved);
+ GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved);
/* max out the rgrp allocation failure point */
rgd->rd_extfail_pt = rgd->rd_free;
rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
@@ -1288,16 +1285,6 @@ static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
return 0;
}
-int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
-{
- struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
- struct gfs2_sbd *sdp = rgd->rd_sbd;
-
- if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
- return 0;
- return gfs2_rgrp_bh_get(rgd);
-}
-
/**
* gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
* @rgd: The resource group
@@ -1315,6 +1302,7 @@ void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
bi->bi_bh = NULL;
}
}
+ set_bit(GLF_INSTANTIATE_NEEDED, &rgd->rd_gl->gl_flags);
}
int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
@@ -2113,7 +2101,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
gfs2_rgrp_congested(rs->rs_rgd, loops))
goto skip_rgrp;
if (sdp->sd_args.ar_rgrplvb) {
- error = update_rgrp_lvb(rs->rs_rgd);
+ error = update_rgrp_lvb(rs->rs_rgd,
+ &ip->i_rgd_gh);
if (unlikely(error)) {
rgrp_unlock_local(rs->rs_rgd);
gfs2_glock_dq_uninit(&ip->i_rgd_gh);
@@ -2128,8 +2117,11 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
(loops == 0 && target > rs->rs_rgd->rd_extfail_pt))
goto skip_rgrp;
- if (sdp->sd_args.ar_rgrplvb)
- gfs2_rgrp_bh_get(rs->rs_rgd);
+ if (sdp->sd_args.ar_rgrplvb) {
+ error = gfs2_instantiate(&ip->i_rgd_gh);
+ if (error)
+ goto skip_rgrp;
+ }
/* Get a reservation if we don't already have one */
if (!gfs2_rs_active(rs))
@@ -2215,7 +2207,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
struct gfs2_rgrpd *rgd = rs->rs_rgd;
spin_lock(&rgd->rd_rsspin);
- BUG_ON(rgd->rd_reserved < rs->rs_reserved);
+ GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved < rs->rs_reserved);
rgd->rd_reserved -= rs->rs_reserved;
spin_unlock(&rgd->rd_rsspin);
rs->rs_reserved = 0;
@@ -2476,9 +2468,9 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
spin_unlock(&rbm.rgd->rd_rsspin);
goto rgrp_error;
}
- BUG_ON(rbm.rgd->rd_reserved < *nblocks);
- BUG_ON(rbm.rgd->rd_free_clone < *nblocks);
- BUG_ON(rbm.rgd->rd_free < *nblocks);
+ GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_reserved < *nblocks);
+ GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_free_clone < *nblocks);
+ GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_free < *nblocks);
rbm.rgd->rd_reserved -= *nblocks;
rbm.rgd->rd_free_clone -= *nblocks;
rbm.rgd->rd_free -= *nblocks;
@@ -2765,8 +2757,6 @@ void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
void rgrp_lock_local(struct gfs2_rgrpd *rgd)
{
- BUG_ON(!gfs2_glock_is_held_excl(rgd->rd_gl) &&
- !test_bit(SDF_NORECOVERY, &rgd->rd_sbd->sd_flags));
mutex_lock(&rgd->rd_mutex);
}
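
With GFS2_RDF_UPTODATE gone, update_rgrp_lvb() above keys off GLF_INSTANTIATE_NEEDED and falls back to gfs2_instantiate() when the lock value block looks unusable. A runnable model of that fast path (simplified, hypothetical fields; the magic constant is an arbitrary stand-in for GFS2_MAGIC):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAGIC 0x01161970u	/* arbitrary stand-in for GFS2_MAGIC */

struct rgrp { uint32_t lvb_magic; uint32_t lvb_free; uint32_t free; };

static int full_instantiate(struct rgrp *r)
{
	r->free = 100;		/* stand-in for reading the header and bitmaps */
	return 0;
}

static int update_from_lvb(struct rgrp *r, bool instantiate_needed)
{
	if (!instantiate_needed)
		return 0;			/* GLF_INSTANTIATE_NEEDED clear: nothing to do */
	if (r->lvb_magic != MAGIC)
		return full_instantiate(r);	/* LVB unusable: do the full read */
	r->free = r->lvb_free;			/* cheap refresh from the lock value block */
	return 0;
}

int main(void)
{
	struct rgrp r = { .lvb_magic = MAGIC, .lvb_free = 7 };

	update_from_lvb(&r, true);
	printf("free=%u\n", (unsigned)r.free);	/* 7, taken from the LVB */
	return 0;
}
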
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index a6855fd796e0..3e2ca1fb4305 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -31,7 +31,7 @@ extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
extern int gfs2_rindex_update(struct gfs2_sbd *sdp);
extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
-extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
+extern int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh);
extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd);
extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 6e00d15ef0a8..5b121371508a 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1244,8 +1244,8 @@ static enum dinode_demise evict_should_delete(struct inode *inode,
if (ret)
return SHOULD_NOT_DELETE_DINODE;
- if (test_bit(GIF_INVALID, &ip->i_flags)) {
- ret = gfs2_inode_refresh(ip);
+ if (test_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags)) {
+ ret = gfs2_instantiate(gh);
if (ret)
return SHOULD_NOT_DELETE_DINODE;
}
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index bd6c8e9e49db..a5deb9f86831 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -197,15 +197,14 @@ TRACE_EVENT(gfs2_demote_rq,
/* Promotion/grant of a glock */
TRACE_EVENT(gfs2_promote,
- TP_PROTO(const struct gfs2_holder *gh, int first),
+ TP_PROTO(const struct gfs2_holder *gh),
- TP_ARGS(gh, first),
+ TP_ARGS(gh),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( u64, glnum )
__field( u32, gltype )
- __field( int, first )
__field( u8, state )
),
@@ -213,14 +212,12 @@ TRACE_EVENT(gfs2_promote,
__entry->dev = gh->gh_gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->glnum = gh->gh_gl->gl_name.ln_number;
__entry->gltype = gh->gh_gl->gl_name.ln_type;
- __entry->first = first;
__entry->state = glock_trace_state(gh->gh_state);
),
- TP_printk("%u,%u glock %u:%llu promote %s %s",
+ TP_printk("%u,%u glock %u:%llu promote %s",
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
(unsigned long long)__entry->glnum,
- __entry->first ? "first": "other",
glock_trace_name(__entry->state))
);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index cf345a86ef67..8241029a2a5d 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -454,6 +454,7 @@ void gfs2_consist_inode_i(struct gfs2_inode *ip,
(unsigned long long)ip->i_no_formal_ino,
(unsigned long long)ip->i_no_addr,
function, file, line);
+ gfs2_dump_glock(NULL, ip->i_gl, 1);
gfs2_withdraw(sdp);
}
@@ -475,6 +476,7 @@ void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd,
" function = %s, file = %s, line = %u\n",
(unsigned long long)rgd->rd_addr,
function, file, line);
+ gfs2_dump_glock(NULL, rgd->rd_gl, 1);
gfs2_withdraw(sdp);
}
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 4a95a92546a0..2a5143246282 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -462,8 +462,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
goto out;
if (S_ISDIR(main_inode->i_mode)) {
- if (fd.entrylength < sizeof(struct hfs_cat_dir))
- /* panic? */;
+ WARN_ON(fd.entrylength < sizeof(struct hfs_cat_dir));
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_dir));
if (rec.type != HFS_CDR_DIR ||
@@ -483,8 +482,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
} else {
- if (fd.entrylength < sizeof(struct hfs_cat_file))
- /* panic? */;
+ WARN_ON(fd.entrylength < sizeof(struct hfs_cat_file));
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
if (rec.type != HFS_CDR_FIL ||
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index cdf0edeeb278..5beb82652435 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -36,7 +36,7 @@ static int hfs_get_last_session(struct super_block *sb,
/* default values */
*start = 0;
- *size = i_size_read(sb->s_bdev->bd_inode) >> 9;
+ *size = bdev_nr_sectors(sb->s_bdev);
if (HFS_SB(sb)->session >= 0) {
struct cdrom_tocentry te;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 6fef67c2a9f0..d08a8d1d40a4 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -509,8 +509,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
if (type == HFSPLUS_FOLDER) {
struct hfsplus_cat_folder *folder = &entry.folder;
- if (fd->entrylength < sizeof(struct hfsplus_cat_folder))
- /* panic? */;
+ WARN_ON(fd->entrylength < sizeof(struct hfsplus_cat_folder));
hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
sizeof(struct hfsplus_cat_folder));
hfsplus_get_perms(inode, &folder->permissions, 1);
@@ -530,8 +529,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
} else if (type == HFSPLUS_FILE) {
struct hfsplus_cat_file *file = &entry.file;
- if (fd->entrylength < sizeof(struct hfsplus_cat_file))
- /* panic? */;
+ WARN_ON(fd->entrylength < sizeof(struct hfsplus_cat_file));
hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
sizeof(struct hfsplus_cat_file));
@@ -588,8 +586,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
if (S_ISDIR(main_inode->i_mode)) {
struct hfsplus_cat_folder *folder = &entry.folder;
- if (fd.entrylength < sizeof(struct hfsplus_cat_folder))
- /* panic? */;
+ WARN_ON(fd.entrylength < sizeof(struct hfsplus_cat_folder));
hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
sizeof(struct hfsplus_cat_folder));
/* simple node checks? */
@@ -614,8 +611,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
} else {
struct hfsplus_cat_file *file = &entry.file;
- if (fd.entrylength < sizeof(struct hfsplus_cat_file))
- /* panic? */;
+ WARN_ON(fd.entrylength < sizeof(struct hfsplus_cat_file));
hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
sizeof(struct hfsplus_cat_file));
hfsplus_inode_write_fork(inode, &file->data_fork);
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 0350dc7821bf..51ae6f1eb4a5 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -131,7 +131,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
/* default values */
*start = 0;
- *size = i_size_read(sb->s_bdev->bd_inode) >> 9;
+ *size = bdev_nr_sectors(sb->s_bdev);
if (HFSPLUS_SB(sb)->session >= 0) {
struct cdrom_tocentry te;
diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h
index d92c4af3e1b4..281dec8f636b 100644
--- a/fs/hpfs/hpfs.h
+++ b/fs/hpfs/hpfs.h
@@ -409,10 +409,10 @@ struct bplus_header
__le16 first_free; /* offset from start of header to
first free node in array */
union {
- struct bplus_internal_node internal[0]; /* (internal) 2-word entries giving
- subtree pointers */
- struct bplus_leaf_node external[0]; /* (external) 3-word entries giving
- sector runs */
+ /* (internal) 2-word entries giving subtree pointers */
+ DECLARE_FLEX_ARRAY(struct bplus_internal_node, internal);
+ /* (external) 3-word entries giving sector runs */
+ DECLARE_FLEX_ARRAY(struct bplus_leaf_node, external);
} u;
};
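
The bplus_header change above swaps GNU zero-length arrays for DECLARE_FLEX_ARRAY(), since ISO C does not allow a flexible array member directly inside a union or as the sole member of a struct. A userspace sketch of the assumed wrapping trick (not the kernel's exact macro; types and names are illustrative):

#include <stdio.h>

struct subtree_ptr { int key, down; };

struct bplus_like_header {
	unsigned short first_free;
	union {
		/* wrapper structs: an empty member first, the flexible array last */
		struct { struct { } pad_i; struct subtree_ptr internal[]; };
		struct { struct { } pad_e; int external[]; };
	} u;
};

int main(void)
{
	struct bplus_like_header h;

	h.first_free = 0;
	/* both flexible arrays overlay the space following the header */
	printf("header size: %zu, same offset: %d\n", sizeof(h),
	       (void *)h.u.internal == (void *)h.u.external);
	return 0;
}
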
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index cdfb1ae78a3f..49d2e686be74 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1446,8 +1446,8 @@ static int get_hstate_idx(int page_size_log)
* otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
*/
struct file *hugetlb_file_setup(const char *name, size_t size,
- vm_flags_t acctflag, struct ucounts **ucounts,
- int creat_flags, int page_size_log)
+ vm_flags_t acctflag, int creat_flags,
+ int page_size_log)
{
struct inode *inode;
struct vfsmount *mnt;
@@ -1458,22 +1458,19 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
if (hstate_idx < 0)
return ERR_PTR(-ENODEV);
- *ucounts = NULL;
mnt = hugetlbfs_vfsmount[hstate_idx];
if (!mnt)
return ERR_PTR(-ENOENT);
if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
- *ucounts = current_ucounts();
- if (user_shm_lock(size, *ucounts)) {
- task_lock(current);
- pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
+ struct ucounts *ucounts = current_ucounts();
+
+ if (user_shm_lock(size, ucounts)) {
+ pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n",
current->comm, current->pid);
- task_unlock(current);
- } else {
- *ucounts = NULL;
- return ERR_PTR(-EPERM);
+ user_shm_unlock(size, ucounts);
}
+ return ERR_PTR(-EPERM);
}
file = ERR_PTR(-ENOSPC);
@@ -1498,10 +1495,6 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
iput(inode);
out:
- if (*ucounts) {
- user_shm_unlock(size, *ucounts);
- *ucounts = NULL;
- }
return file;
}
diff --git a/fs/inode.c b/fs/inode.c
index ed0cab8a32db..3eba0940ffcf 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -428,11 +428,20 @@ void ihold(struct inode *inode)
}
EXPORT_SYMBOL(ihold);
-static void inode_lru_list_add(struct inode *inode)
+static void __inode_add_lru(struct inode *inode, bool rotate)
{
+ if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
+ return;
+ if (atomic_read(&inode->i_count))
+ return;
+ if (!(inode->i_sb->s_flags & SB_ACTIVE))
+ return;
+ if (!mapping_shrinkable(&inode->i_data))
+ return;
+
if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_inc(nr_unused);
- else
+ else if (rotate)
inode->i_state |= I_REFERENCED;
}
@@ -443,16 +452,11 @@ static void inode_lru_list_add(struct inode *inode)
*/
void inode_add_lru(struct inode *inode)
{
- if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
- I_FREEING | I_WILL_FREE)) &&
- !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE)
- inode_lru_list_add(inode);
+ __inode_add_lru(inode, false);
}
-
static void inode_lru_list_del(struct inode *inode)
{
-
if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_dec(nr_unused);
}
@@ -728,10 +732,6 @@ again:
/*
* Isolate the inode from the LRU in preparation for freeing it.
*
- * Any inodes which are pinned purely because of attached pagecache have their
- * pagecache removed. If the inode has metadata buffers attached to
- * mapping->private_list then try to remove them.
- *
* If the inode has the I_REFERENCED flag set, then it means that it has been
* used recently - the flag is set in iput_final(). When we encounter such an
* inode, clear the flag and move it to the back of the LRU so it gets another
@@ -747,31 +747,39 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
struct inode *inode = container_of(item, struct inode, i_lru);
/*
- * we are inverting the lru lock/inode->i_lock here, so use a trylock.
- * If we fail to get the lock, just skip it.
+ * We are inverting the lru lock/inode->i_lock here, so use a
+ * trylock. If we fail to get the lock, just skip it.
*/
if (!spin_trylock(&inode->i_lock))
return LRU_SKIP;
/*
- * Referenced or dirty inodes are still in use. Give them another pass
- * through the LRU as we canot reclaim them now.
+ * Inodes can get referenced, redirtied, or repopulated while
+ * they're already on the LRU, and this can make them
+ * unreclaimable for a while. Remove them lazily here; iput,
+ * sync, or the last page cache deletion will requeue them.
*/
if (atomic_read(&inode->i_count) ||
- (inode->i_state & ~I_REFERENCED)) {
+ (inode->i_state & ~I_REFERENCED) ||
+ !mapping_shrinkable(&inode->i_data)) {
list_lru_isolate(lru, &inode->i_lru);
spin_unlock(&inode->i_lock);
this_cpu_dec(nr_unused);
return LRU_REMOVED;
}
- /* recently referenced inodes get one more pass */
+ /* Recently referenced inodes get one more pass */
if (inode->i_state & I_REFERENCED) {
inode->i_state &= ~I_REFERENCED;
spin_unlock(&inode->i_lock);
return LRU_ROTATE;
}
+ /*
+ * On highmem systems, mapping_shrinkable() permits dropping
+ * page cache in order to free up struct inodes: lowmem might
+ * be under pressure before the cache inside the highmem zone.
+ */
if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
__iget(inode);
spin_unlock(&inode->i_lock);
@@ -1638,7 +1646,7 @@ static void iput_final(struct inode *inode)
if (!drop &&
!(inode->i_state & I_DONTCACHE) &&
(sb->s_flags & SB_ACTIVE)) {
- inode_add_lru(inode);
+ __inode_add_lru(inode, true);
spin_unlock(&inode->i_lock);
return;
}
@@ -1782,12 +1790,13 @@ EXPORT_SYMBOL(generic_update_time);
* This does the actual work of updating an inodes time or version. Must have
* had called mnt_want_write() before calling this.
*/
-static int update_time(struct inode *inode, struct timespec64 *time, int flags)
+int inode_update_time(struct inode *inode, struct timespec64 *time, int flags)
{
if (inode->i_op->update_time)
return inode->i_op->update_time(inode, time, flags);
return generic_update_time(inode, time, flags);
}
+EXPORT_SYMBOL(inode_update_time);
/**
* atime_needs_update - update the access time
@@ -1857,7 +1866,7 @@ void touch_atime(const struct path *path)
* of the fs read only, e.g. subvolumes in Btrfs.
*/
now = current_time(inode);
- update_time(inode, &now, S_ATIME);
+ inode_update_time(inode, &now, S_ATIME);
__mnt_drop_write(mnt);
skip_update:
sb_end_write(inode->i_sb);
@@ -2002,7 +2011,7 @@ int file_update_time(struct file *file)
if (__mnt_want_write_file(file))
return 0;
- ret = update_time(inode, &now, sync_it);
+ ret = inode_update_time(inode, &now, sync_it);
__mnt_drop_write_file(file);
return ret;
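
The inode.c changes above gate LRU membership on mapping_shrinkable() and make only the iput_final() path rotate an already-listed inode. A runnable userspace model of the __inode_add_lru() guard (hypothetical flags, single-threaded, illustrative only):

#include <stdbool.h>
#include <stdio.h>

struct minode {
	bool dirty, freeing, referenced, on_lru;
	int  count;
	bool sb_active, shrinkable;
};

static bool lru_add(struct minode *i)		/* true if newly added to the LRU */
{
	if (i->on_lru)
		return false;
	i->on_lru = true;
	return true;
}

static void add_lru(struct minode *i, bool rotate)
{
	if (i->dirty || i->freeing || i->count || !i->sb_active || !i->shrinkable)
		return;
	if (!lru_add(i) && rotate)
		i->referenced = true;		/* already listed: give it another pass */
}

int main(void)
{
	struct minode i = { .sb_active = true, .shrinkable = true };

	add_lru(&i, false);			/* joins the LRU */
	add_lru(&i, true);			/* rotate caller: marks it referenced */
	printf("on_lru=%d referenced=%d\n", i.on_lru, i.referenced);
	return 0;
}
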
diff --git a/fs/internal.h b/fs/internal.h
index 3cd065c8a66b..7979ff8d168c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -23,22 +23,11 @@ struct pipe_inode_info;
#ifdef CONFIG_BLOCK
extern void __init bdev_cache_init(void);
-extern int __sync_blockdev(struct block_device *bdev, int wait);
-void iterate_bdevs(void (*)(struct block_device *, void *), void *);
void emergency_thaw_bdev(struct super_block *sb);
#else
static inline void bdev_cache_init(void)
{
}
-
-static inline int __sync_blockdev(struct block_device *bdev, int wait)
-{
- return 0;
-}
-static inline void iterate_bdevs(void (*f)(struct block_device *, void *),
- void *arg)
-{
-}
static inline int emergency_thaw_bdev(struct super_block *sb)
{
return 0;
@@ -149,7 +138,6 @@ extern int vfs_open(const struct path *, struct file *);
* inode.c
*/
extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
-extern void inode_add_lru(struct inode *inode);
extern int dentry_needs_remove_privs(struct dentry *dentry);
/*
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 422a7ed6a9bd..afd955d53db9 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -14,6 +14,7 @@
#include <linux/rculist_nulls.h>
#include <linux/cpu.h>
#include <linux/tracehook.h>
+#include <linux/audit.h>
#include <uapi/linux/io_uring.h>
#include "io-wq.h"
@@ -140,6 +141,7 @@ static void io_wqe_dec_running(struct io_worker *worker);
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
struct io_wqe_acct *acct,
struct io_cb_cancel_data *match);
+static void create_worker_cb(struct callback_head *cb);
static bool io_worker_get(struct io_worker *worker)
{
@@ -174,12 +176,46 @@ static void io_worker_ref_put(struct io_wq *wq)
complete(&wq->worker_done);
}
+static void io_worker_cancel_cb(struct io_worker *worker)
+{
+ struct io_wqe_acct *acct = io_wqe_get_acct(worker);
+ struct io_wqe *wqe = worker->wqe;
+ struct io_wq *wq = wqe->wq;
+
+ atomic_dec(&acct->nr_running);
+ raw_spin_lock(&worker->wqe->lock);
+ acct->nr_workers--;
+ raw_spin_unlock(&worker->wqe->lock);
+ io_worker_ref_put(wq);
+ clear_bit_unlock(0, &worker->create_state);
+ io_worker_release(worker);
+}
+
+static bool io_task_worker_match(struct callback_head *cb, void *data)
+{
+ struct io_worker *worker;
+
+ if (cb->func != create_worker_cb)
+ return false;
+ worker = container_of(cb, struct io_worker, create_work);
+ return worker == data;
+}
+
static void io_worker_exit(struct io_worker *worker)
{
struct io_wqe *wqe = worker->wqe;
+ struct io_wq *wq = wqe->wq;
- if (refcount_dec_and_test(&worker->ref))
- complete(&worker->ref_done);
+ while (1) {
+ struct callback_head *cb = task_work_cancel_match(wq->task,
+ io_task_worker_match, worker);
+
+ if (!cb)
+ break;
+ io_worker_cancel_cb(worker);
+ }
+
+ io_worker_release(worker);
wait_for_completion(&worker->ref_done);
raw_spin_lock(&wqe->lock);
@@ -323,8 +359,10 @@ static bool io_queue_worker_create(struct io_worker *worker,
init_task_work(&worker->create_work, func);
worker->create_index = acct->index;
- if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+ if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) {
+ clear_bit_unlock(0, &worker->create_state);
return true;
+ }
clear_bit_unlock(0, &worker->create_state);
fail_release:
io_worker_release(worker);
@@ -556,6 +594,8 @@ static int io_wqe_worker(void *data)
snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
set_task_comm(current, buf);
+ audit_alloc_kernel(current);
+
while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
long ret;
@@ -594,6 +634,7 @@ loop:
io_worker_handle_work(worker);
}
+ audit_free(current);
io_worker_exit(worker);
return 0;
}
@@ -716,11 +757,8 @@ static void io_workqueue_create(struct work_struct *work)
struct io_worker *worker = container_of(work, struct io_worker, work);
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- if (!io_queue_worker_create(worker, acct, create_worker_cont)) {
- clear_bit_unlock(0, &worker->create_state);
- io_worker_release(worker);
+ if (!io_queue_worker_create(worker, acct, create_worker_cont))
kfree(worker);
- }
}
static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
@@ -1150,17 +1188,9 @@ static void io_wq_exit_workers(struct io_wq *wq)
while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
struct io_worker *worker;
- struct io_wqe_acct *acct;
worker = container_of(cb, struct io_worker, create_work);
- acct = io_wqe_get_acct(worker);
- atomic_dec(&acct->nr_running);
- raw_spin_lock(&worker->wqe->lock);
- acct->nr_workers--;
- raw_spin_unlock(&worker->wqe->lock);
- io_worker_ref_put(wq);
- clear_bit_unlock(0, &worker->create_state);
- io_worker_release(worker);
+ io_worker_cancel_cb(worker);
}
rcu_read_lock();
@@ -1278,7 +1308,9 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask)
*/
int io_wq_max_workers(struct io_wq *wq, int *new_count)
{
- int i, node, prev = 0;
+ int prev[IO_WQ_ACCT_NR];
+ bool first_node = true;
+ int i, node;
BUILD_BUG_ON((int) IO_WQ_ACCT_BOUND != (int) IO_WQ_BOUND);
BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND);
@@ -1289,6 +1321,9 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count)
new_count[i] = task_rlimit(current, RLIMIT_NPROC);
}
+ for (i = 0; i < IO_WQ_ACCT_NR; i++)
+ prev[i] = 0;
+
rcu_read_lock();
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
@@ -1297,14 +1332,19 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count)
raw_spin_lock(&wqe->lock);
for (i = 0; i < IO_WQ_ACCT_NR; i++) {
acct = &wqe->acct[i];
- prev = max_t(int, acct->max_workers, prev);
+ if (first_node)
+ prev[i] = max_t(int, acct->max_workers, prev[i]);
if (new_count[i])
acct->max_workers = new_count[i];
- new_count[i] = prev;
}
raw_spin_unlock(&wqe->lock);
+ first_node = false;
}
rcu_read_unlock();
+
+ for (i = 0; i < IO_WQ_ACCT_NR; i++)
+ new_count[i] = prev[i];
+
return 0;
}
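
The io_wq_max_workers() change above tracks the previous limits per account (bounded/unbounded) and only from the first node, instead of folding everything into one scalar. A runnable sketch of that bookkeeping (arbitrary example values, two modelled "nodes"):

#include <stdio.h>

#define ACCT_NR 2

int main(void)
{
	int max_workers[2][ACCT_NR] = { { 4, 8 }, { 6, 2 } };	/* two "nodes" */
	int new_count[ACCT_NR] = { 0, 16 };			/* 0 = leave unchanged */
	int prev[ACCT_NR] = { 0, 0 };

	for (int node = 0; node < 2; node++) {
		for (int i = 0; i < ACCT_NR; i++) {
			if (node == 0 && max_workers[node][i] > prev[i])
				prev[i] = max_workers[node][i];	/* old limits from the first node */
			if (new_count[i])
				max_workers[node][i] = new_count[i];
		}
	}
	for (int i = 0; i < ACCT_NR; i++)
		new_count[i] = prev[i];				/* report the old limits */

	printf("old limits: %d %d\n", new_count[0], new_count[1]);	/* 4 8 */
	return 0;
}
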
diff --git a/fs/io-wq.h b/fs/io-wq.h
index bf5c4c533760..41bf37674a49 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -29,6 +29,17 @@ struct io_wq_work_list {
struct io_wq_work_node *last;
};
+#define wq_list_for_each(pos, prv, head) \
+ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
+
+#define wq_list_for_each_resume(pos, prv) \
+ for (; pos; prv = pos, pos = (pos)->next)
+
+#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL)
+#define INIT_WQ_LIST(list) do { \
+ (list)->first = NULL; \
+} while (0)
+
static inline void wq_list_add_after(struct io_wq_work_node *node,
struct io_wq_work_node *pos,
struct io_wq_work_list *list)
@@ -54,6 +65,15 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
}
}
+static inline void wq_list_add_head(struct io_wq_work_node *node,
+ struct io_wq_work_list *list)
+{
+ node->next = list->first;
+ if (!node->next)
+ list->last = node;
+ WRITE_ONCE(list->first, node);
+}
+
static inline void wq_list_cut(struct io_wq_work_list *list,
struct io_wq_work_node *last,
struct io_wq_work_node *prev)
@@ -69,6 +89,31 @@ static inline void wq_list_cut(struct io_wq_work_list *list,
last->next = NULL;
}
+static inline void __wq_list_splice(struct io_wq_work_list *list,
+ struct io_wq_work_node *to)
+{
+ list->last->next = to->next;
+ to->next = list->first;
+ INIT_WQ_LIST(list);
+}
+
+static inline bool wq_list_splice(struct io_wq_work_list *list,
+ struct io_wq_work_node *to)
+{
+ if (!wq_list_empty(list)) {
+ __wq_list_splice(list, to);
+ return true;
+ }
+ return false;
+}
+
+static inline void wq_stack_add_head(struct io_wq_work_node *node,
+ struct io_wq_work_node *stack)
+{
+ node->next = stack->next;
+ stack->next = node;
+}
+
static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work_node *node,
struct io_wq_work_node *prev)
@@ -76,14 +121,14 @@ static inline void wq_list_del(struct io_wq_work_list *list,
wq_list_cut(list, node, prev);
}
-#define wq_list_for_each(pos, prv, head) \
- for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
+static inline
+struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
+{
+ struct io_wq_work_node *node = stack->next;
-#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL)
-#define INIT_WQ_LIST(list) do { \
- (list)->first = NULL; \
- (list)->last = NULL; \
-} while (0)
+ stack->next = node->next;
+ return node;
+}
struct io_wq_work {
struct io_wq_work_node list;
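
The new wq_stack_add_head()/wq_stack_extract() helpers above turn the same singly linked io_wq_work_node shape into a LIFO, which the request free list relies on. A runnable userspace model of that stack (illustrative names, no locking):

#include <stddef.h>
#include <stdio.h>

struct node { struct node *next; int id; };

static void stack_add_head(struct node *n, struct node *stack)
{
	n->next = stack->next;
	stack->next = n;
}

static struct node *stack_extract(struct node *stack)
{
	struct node *n = stack->next;	/* caller guarantees the stack is non-empty */

	stack->next = n->next;
	return n;
}

int main(void)
{
	struct node stack = { .next = NULL };
	struct node a = { .id = 1 }, b = { .id = 2 };
	struct node *x, *y;

	stack_add_head(&a, &stack);
	stack_add_head(&b, &stack);
	x = stack_extract(&stack);
	y = stack_extract(&stack);
	printf("%d %d\n", x->id, y->id);	/* 2 1: last in, first out */
	return 0;
}
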
diff --git a/fs/io_uring.c b/fs/io_uring.c
index bc18af5e0a93..b07196b4511c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -79,6 +79,8 @@
#include <linux/pagemap.h>
#include <linux/io_uring.h>
#include <linux/tracehook.h>
+#include <linux/audit.h>
+#include <linux/security.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -103,11 +105,14 @@
#define IORING_MAX_REG_BUFFERS (1U << 14)
-#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
- IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
- IOSQE_BUFFER_SELECT)
+#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
+ IOSQE_IO_HARDLINK | IOSQE_ASYNC)
+
+#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN)
+
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
- REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)
+ REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
+ REQ_F_ASYNC_DATA)
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
@@ -195,8 +200,10 @@ struct io_rings {
};
enum io_uring_cmd_flags {
- IO_URING_F_NONBLOCK = 1,
- IO_URING_F_COMPLETE_DEFER = 2,
+ IO_URING_F_COMPLETE_DEFER = 1,
+ IO_URING_F_UNLOCKED = 2,
+ /* int's last bit, sign checks are usually faster than a bit test */
+ IO_URING_F_NONBLOCK = INT_MIN,
};
struct io_mapped_ubuf {
@@ -305,26 +312,16 @@ struct io_submit_link {
};
struct io_submit_state {
- struct blk_plug plug;
+ /* inline/task_work completion list, under ->uring_lock */
+ struct io_wq_work_node free_list;
+ /* batch completion logic */
+ struct io_wq_work_list compl_reqs;
struct io_submit_link link;
- /*
- * io_kiocb alloc cache
- */
- void *reqs[IO_REQ_CACHE_SIZE];
- unsigned int free_reqs;
-
bool plug_started;
-
- /*
- * Batch completion logic
- */
- struct io_kiocb *compl_reqs[IO_COMPL_BATCH];
- unsigned int compl_nr;
- /* inline/task_work completion list, under ->uring_lock */
- struct list_head free_list;
-
- unsigned int ios_left;
+ bool need_plug;
+ unsigned short submit_nr;
+ struct blk_plug plug;
};
struct io_ring_ctx {
@@ -368,6 +365,7 @@ struct io_ring_ctx {
* uring_lock, and updated through io_uring_register(2)
*/
struct io_rsrc_node *rsrc_node;
+ int rsrc_cached_refs;
struct io_file_table file_table;
unsigned nr_user_files;
unsigned nr_user_bufs;
@@ -384,7 +382,7 @@ struct io_ring_ctx {
} ____cacheline_aligned_in_smp;
/* IRQ completion list, under ->completion_lock */
- struct list_head locked_free_list;
+ struct io_wq_work_list locked_free_list;
unsigned int locked_free_nr;
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
@@ -399,7 +397,6 @@ struct io_ring_ctx {
unsigned cached_cq_tail;
unsigned cq_entries;
struct eventfd_ctx *cq_ev_fd;
- struct wait_queue_head poll_wait;
struct wait_queue_head cq_wait;
unsigned cq_extra;
atomic_t cq_timeouts;
@@ -417,7 +414,7 @@ struct io_ring_ctx {
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
- struct list_head iopoll_list;
+ struct io_wq_work_list iopoll_list;
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
bool poll_multi_queue;
@@ -580,7 +577,6 @@ struct io_sr_msg {
int msg_flags;
int bgid;
size_t len;
- struct io_buffer *kbuf;
};
struct io_open {
@@ -692,11 +688,6 @@ struct io_hardlink {
int flags;
};
-struct io_completion {
- struct file *file;
- u32 cflags;
-};
-
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -710,11 +701,15 @@ struct io_async_msghdr {
struct sockaddr_storage addr;
};
-struct io_async_rw {
- struct iovec fast_iov[UIO_FASTIOV];
- const struct iovec *free_iovec;
+struct io_rw_state {
struct iov_iter iter;
struct iov_iter_state iter_state;
+ struct iovec fast_iov[UIO_FASTIOV];
+};
+
+struct io_async_rw {
+ struct io_rw_state s;
+ const struct iovec *free_iovec;
size_t bytes_done;
struct wait_page_queue wpq;
};
@@ -741,9 +736,9 @@ enum {
REQ_F_CREDS_BIT,
REQ_F_REFCOUNT_BIT,
REQ_F_ARM_LTIMEOUT_BIT,
+ REQ_F_ASYNC_DATA_BIT,
/* keep async read/write and isreg together and in order */
- REQ_F_NOWAIT_READ_BIT,
- REQ_F_NOWAIT_WRITE_BIT,
+ REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
/* not a real bit, just to check we're not overflowing the space */
@@ -784,10 +779,8 @@ enum {
REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
/* caller should reissue async */
REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
- /* supports async reads */
- REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT),
- /* supports async writes */
- REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT),
+ /* supports async reads/writes */
+ REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
/* has creds assigned */
@@ -796,6 +789,8 @@ enum {
REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
/* there is a linked timeout that has to be armed */
REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
+ /* ->async_data allocated */
+ REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
};
struct async_poll {
@@ -852,39 +847,41 @@ struct io_kiocb {
struct io_mkdir mkdir;
struct io_symlink symlink;
struct io_hardlink hardlink;
- /* use only after cleaning per-op data, see io_clean_op() */
- struct io_completion compl;
};
- /* opcode allocated if it needs to store data for async defer */
- void *async_data;
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
-
u16 buf_index;
+ unsigned int flags;
+
+ u64 user_data;
u32 result;
+ u32 cflags;
struct io_ring_ctx *ctx;
- unsigned int flags;
- atomic_t refs;
struct task_struct *task;
- u64 user_data;
- struct io_kiocb *link;
struct percpu_ref *fixed_rsrc_refs;
+ /* store used ubuf, so we can prevent reloading */
+ struct io_mapped_ubuf *imu;
- /* used with ctx->iopoll_list with reads/writes */
- struct list_head inflight_entry;
+ /* used by request caches, completion batching and iopoll */
+ struct io_wq_work_node comp_list;
+ atomic_t refs;
+ struct io_kiocb *link;
struct io_task_work io_task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
+ /* internal polling, see IORING_FEAT_FAST_POLL */
struct async_poll *apoll;
+ /* opcode allocated if it needs to store data for async defer */
+ void *async_data;
struct io_wq_work work;
+ /* custom credentials, valid IFF REQ_F_CREDS is set */
const struct cred *creds;
-
- /* store used ubuf, so we can prevent reloading */
- struct io_mapped_ubuf *imu;
+ /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
+ struct io_buffer *kbuf;
};
struct io_tctx_node {
@@ -902,12 +899,12 @@ struct io_defer_entry {
struct io_op_def {
/* needs req->file assigned */
unsigned needs_file : 1;
+ /* should block plug */
+ unsigned plug : 1;
/* hash wq insertion if file is a regular file */
unsigned hash_reg_file : 1;
/* unbound wq insertion if file is a non-regular file */
unsigned unbound_nonreg_file : 1;
- /* opcode is not supported by this kernel */
- unsigned not_supported : 1;
/* set if opcode supports polled "wait" */
unsigned pollin : 1;
unsigned pollout : 1;
@@ -915,8 +912,10 @@ struct io_op_def {
unsigned buffer_select : 1;
/* do prep async if is going to be punted */
unsigned needs_async_setup : 1;
- /* should block plug */
- unsigned plug : 1;
+ /* opcode is not supported by this kernel */
+ unsigned not_supported : 1;
+ /* skip auditing */
+ unsigned audit_skip : 1;
/* size of async data needed, if any */
unsigned short async_size;
};
@@ -930,6 +929,7 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.needs_async_setup = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITEV] = {
@@ -939,16 +939,19 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.needs_async_setup = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE_FIXED] = {
@@ -957,15 +960,20 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .audit_skip = 1,
+ },
+ [IORING_OP_POLL_REMOVE] = {
+ .audit_skip = 1,
},
- [IORING_OP_POLL_REMOVE] = {},
[IORING_OP_SYNC_FILE_RANGE] = {
.needs_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_SENDMSG] = {
.needs_file = 1,
@@ -983,18 +991,23 @@ static const struct io_op_def io_op_defs[] = {
.async_size = sizeof(struct io_async_msghdr),
},
[IORING_OP_TIMEOUT] = {
+ .audit_skip = 1,
.async_size = sizeof(struct io_timeout_data),
},
[IORING_OP_TIMEOUT_REMOVE] = {
/* used by timeout updates' prep() */
+ .audit_skip = 1,
},
[IORING_OP_ACCEPT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
},
- [IORING_OP_ASYNC_CANCEL] = {},
+ [IORING_OP_ASYNC_CANCEL] = {
+ .audit_skip = 1,
+ },
[IORING_OP_LINK_TIMEOUT] = {
+ .audit_skip = 1,
.async_size = sizeof(struct io_timeout_data),
},
[IORING_OP_CONNECT] = {
@@ -1009,14 +1022,19 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_OPENAT] = {},
[IORING_OP_CLOSE] = {},
- [IORING_OP_FILES_UPDATE] = {},
- [IORING_OP_STATX] = {},
+ [IORING_OP_FILES_UPDATE] = {
+ .audit_skip = 1,
+ },
+ [IORING_OP_STATX] = {
+ .audit_skip = 1,
+ },
[IORING_OP_READ] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE] = {
@@ -1025,39 +1043,50 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_MADVISE] = {},
[IORING_OP_SEND] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .audit_skip = 1,
},
[IORING_OP_RECV] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
+ .audit_skip = 1,
},
[IORING_OP_OPENAT2] = {
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_SPLICE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
+ .audit_skip = 1,
+ },
+ [IORING_OP_PROVIDE_BUFFERS] = {
+ .audit_skip = 1,
+ },
+ [IORING_OP_REMOVE_BUFFERS] = {
+ .audit_skip = 1,
},
- [IORING_OP_PROVIDE_BUFFERS] = {},
- [IORING_OP_REMOVE_BUFFERS] = {},
[IORING_OP_TEE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_SHUTDOWN] = {
.needs_file = 1,
@@ -1080,7 +1109,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags);
+ s32 res, u32 cflags);
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req);
static void io_dismantle_req(struct io_kiocb *req);
@@ -1095,7 +1124,7 @@ static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);
static void io_req_task_queue(struct io_kiocb *req);
-static void io_submit_flush_completions(struct io_ring_ctx *ctx);
+static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
static int io_req_prep_async(struct io_kiocb *req);
static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
@@ -1167,6 +1196,12 @@ static inline void req_ref_get(struct io_kiocb *req)
atomic_inc(&req->refs);
}
+static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
+{
+ if (!wq_list_empty(&ctx->submit_state.compl_reqs))
+ __io_submit_flush_completions(ctx);
+}
+
static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
{
if (!(req->flags & REQ_F_REFCOUNT)) {
@@ -1180,13 +1215,52 @@ static inline void io_req_set_refcount(struct io_kiocb *req)
__io_req_set_refcount(req, 1);
}
-static inline void io_req_set_rsrc_node(struct io_kiocb *req)
+#define IO_RSRC_REF_BATCH 100
+
+static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
+ struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
{
- struct io_ring_ctx *ctx = req->ctx;
+ struct percpu_ref *ref = req->fixed_rsrc_refs;
+ if (ref) {
+ if (ref == &ctx->rsrc_node->refs)
+ ctx->rsrc_cached_refs++;
+ else
+ percpu_ref_put(ref);
+ }
+}
+
+static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
+{
+ if (req->fixed_rsrc_refs)
+ percpu_ref_put(req->fixed_rsrc_refs);
+}
+
+static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
+{
+ if (ctx->rsrc_cached_refs) {
+ percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
+ ctx->rsrc_cached_refs = 0;
+ }
+}
+
+static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
+{
+ ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
+ percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
+}
+
+static inline void io_req_set_rsrc_node(struct io_kiocb *req,
+ struct io_ring_ctx *ctx)
+{
if (!req->fixed_rsrc_refs) {
req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
- percpu_ref_get(req->fixed_rsrc_refs);
+ ctx->rsrc_cached_refs--;
+ if (unlikely(ctx->rsrc_cached_refs < 0))
+ io_rsrc_refs_refill(ctx);
}
}
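
The hunk above caches rsrc node references in batches of IO_RSRC_REF_BATCH under uring_lock, so a request normally consumes a cached reference instead of touching the percpu ref. A runnable single-threaded model of that amortisation (simplified counters, illustrative only):

#include <stdio.h>

#define BATCH 100

struct ctx { long shared_refs; int cached; };

static void refill(struct ctx *c)
{
	c->cached += BATCH;
	c->shared_refs += BATCH;	/* one bulk "percpu_ref_get_many" */
}

static void req_set_node(struct ctx *c)
{
	if (--c->cached < 0)		/* cache ran dry: take a whole batch */
		refill(c);
}

int main(void)
{
	struct ctx c = { 0, 0 };

	for (int i = 0; i < 250; i++)
		req_set_node(&c);
	printf("shared refs taken: %ld (3 refills)\n", c.shared_refs);	/* 300 */
	return 0;
}
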
@@ -1219,6 +1293,11 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
return false;
}
+static inline bool req_has_async_data(struct io_kiocb *req)
+{
+ return req->flags & REQ_F_ASYNC_DATA;
+}
+
static inline void req_set_fail(struct io_kiocb *req)
{
req->flags |= REQ_F_FAIL;
@@ -1230,7 +1309,7 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res)
req->result = res;
}
-static void io_ring_ctx_ref_free(struct percpu_ref *ref)
+static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
@@ -1242,7 +1321,7 @@ static inline bool io_is_timeout_noseq(struct io_kiocb *req)
return !req->timeout.off;
}
-static void io_fallback_req_func(struct work_struct *work)
+static __cold void io_fallback_req_func(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
fallback_work.work);
@@ -1255,15 +1334,13 @@ static void io_fallback_req_func(struct work_struct *work)
req->io_task_work.func(req, &locked);
if (locked) {
- if (ctx->submit_state.compl_nr)
- io_submit_flush_completions(ctx);
+ io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
}
percpu_ref_put(&ctx->refs);
-
}
-static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
+static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
int hash_bits;
@@ -1300,7 +1377,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
ctx->flags = p->flags;
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
- init_waitqueue_head(&ctx->poll_wait);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->ref_comp);
xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
@@ -1309,7 +1385,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
init_waitqueue_head(&ctx->cq_wait);
spin_lock_init(&ctx->completion_lock);
spin_lock_init(&ctx->timeout_lock);
- INIT_LIST_HEAD(&ctx->iopoll_list);
+ INIT_WQ_LIST(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
INIT_LIST_HEAD(&ctx->ltimeout_list);
@@ -1318,9 +1394,10 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
init_llist_head(&ctx->rsrc_put_llist);
INIT_LIST_HEAD(&ctx->tctx_list);
- INIT_LIST_HEAD(&ctx->submit_state.free_list);
- INIT_LIST_HEAD(&ctx->locked_free_list);
+ ctx->submit_state.free_list.next = NULL;
+ INIT_WQ_LIST(&ctx->locked_free_list);
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
+ INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
return ctx;
err:
kfree(ctx->dummy_ubuf);
@@ -1348,21 +1425,16 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
return false;
}
-#define FFS_ASYNC_READ 0x1UL
-#define FFS_ASYNC_WRITE 0x2UL
-#ifdef CONFIG_64BIT
-#define FFS_ISREG 0x4UL
-#else
-#define FFS_ISREG 0x0UL
-#endif
-#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
+#define FFS_NOWAIT 0x1UL
+#define FFS_ISREG 0x2UL
+#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG)
static inline bool io_req_ffs_set(struct io_kiocb *req)
{
- return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
+ return req->flags & REQ_F_FIXED_FILE;
}
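
The new FFS_NOWAIT/FFS_ISREG definitions rely on struct file pointers being at least 8-byte aligned, so per-file flags can ride in the low pointer bits and FFS_MASK strips them off again. Below is a standalone userspace sketch of that tagged-pointer trick; "struct file" here is a dummy stand-in and the pack/unpack helpers are hypothetical, not kernel functions.

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define FFS_NOWAIT	0x1UL
	#define FFS_ISREG	0x2UL
	#define FFS_MASK	~(FFS_NOWAIT | FFS_ISREG)

	struct file { _Alignas(8) int dummy; };	/* 8-byte aligned like real files */

	static uintptr_t file_ptr_pack(struct file *f, unsigned long flags)
	{
		return (uintptr_t)f | flags;	/* flags fit in the free low bits */
	}

	static struct file *file_ptr_unpack(uintptr_t v, unsigned long *flags)
	{
		*flags = v & ~FFS_MASK;
		return (struct file *)(v & FFS_MASK);
	}

	int main(void)
	{
		struct file f;
		unsigned long flags;
		uintptr_t slot = file_ptr_pack(&f, FFS_NOWAIT | FFS_ISREG);
		struct file *back = file_ptr_unpack(slot, &flags);

		assert(back == &f);
		printf("nowait=%lu isreg=%lu\n", flags & FFS_NOWAIT,
		       (unsigned long)!!(flags & FFS_ISREG));
		return 0;
	}
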
-static void io_req_track_inflight(struct io_kiocb *req)
+static inline void io_req_track_inflight(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_INFLIGHT)) {
req->flags |= REQ_F_INFLIGHT;
@@ -1440,15 +1512,19 @@ static void io_prep_async_link(struct io_kiocb *req)
}
}
-static void io_queue_async_work(struct io_kiocb *req, bool *locked)
+static inline void io_req_add_compl_list(struct io_kiocb *req)
+{
+ struct io_submit_state *state = &req->ctx->submit_state;
+
+ wq_list_add_tail(&req->comp_list, &state->compl_reqs);
+}
+
+static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link = io_prep_linked_timeout(req);
struct io_uring_task *tctx = req->task->io_uring;
- /* must not take the lock, NULL it as a precaution */
- locked = NULL;
-
BUG_ON(!tctx);
BUG_ON(!tctx->io_wq);
@@ -1489,7 +1565,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
}
}
-static void io_queue_deferred(struct io_ring_ctx *ctx)
+static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{
while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
@@ -1503,7 +1579,7 @@ static void io_queue_deferred(struct io_ring_ctx *ctx)
}
}
-static void io_flush_timeouts(struct io_ring_ctx *ctx)
+static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
__must_hold(&ctx->completion_lock)
{
u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
@@ -1536,7 +1612,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
spin_unlock_irq(&ctx->timeout_lock);
}
-static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
+static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
if (ctx->off_timeout_used)
io_flush_timeouts(ctx);
@@ -1606,12 +1682,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
*/
if (wq_has_sleeper(&ctx->cq_wait))
wake_up_all(&ctx->cq_wait);
- if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
- wake_up(&ctx->sq_data->wait);
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
- if (waitqueue_active(&ctx->poll_wait))
- wake_up_interruptible(&ctx->poll_wait);
}
static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
@@ -1625,8 +1697,6 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
}
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
- if (waitqueue_active(&ctx->poll_wait))
- wake_up_interruptible(&ctx->poll_wait);
}
/* Returns true if there are no backlogged entries after the flush */
@@ -1722,7 +1792,7 @@ static inline void io_get_task_refs(int nr)
}
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags)
+ s32 res, u32 cflags)
{
struct io_overflow_cqe *ocqe;
@@ -1750,7 +1820,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
}
static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags)
+ s32 res, u32 cflags)
{
struct io_uring_cqe *cqe;
@@ -1773,13 +1843,13 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data
/* not as hot to bloat with inlining */
static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags)
+ s32 res, u32 cflags)
{
return __io_cqring_fill_event(ctx, user_data, res, cflags);
}
-static void io_req_complete_post(struct io_kiocb *req, long res,
- unsigned int cflags)
+static void io_req_complete_post(struct io_kiocb *req, s32 res,
+ u32 cflags)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -1798,40 +1868,27 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
req->link = NULL;
}
}
+ io_req_put_rsrc(req, ctx);
io_dismantle_req(req);
io_put_task(req->task, 1);
- list_add(&req->inflight_entry, &ctx->locked_free_list);
+ wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
ctx->locked_free_nr++;
- } else {
- if (!percpu_ref_tryget(&ctx->refs))
- req = NULL;
}
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
-
- if (req) {
- io_cqring_ev_posted(ctx);
- percpu_ref_put(&ctx->refs);
- }
-}
-
-static inline bool io_req_needs_clean(struct io_kiocb *req)
-{
- return req->flags & IO_REQ_CLEAN_FLAGS;
+ io_cqring_ev_posted(ctx);
}
-static void io_req_complete_state(struct io_kiocb *req, long res,
- unsigned int cflags)
+static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
+ u32 cflags)
{
- if (io_req_needs_clean(req))
- io_clean_op(req);
req->result = res;
- req->compl.cflags = cflags;
+ req->cflags = cflags;
req->flags |= REQ_F_COMPLETE_INLINE;
}
static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
- long res, unsigned cflags)
+ s32 res, u32 cflags)
{
if (issue_flags & IO_URING_F_COMPLETE_DEFER)
io_req_complete_state(req, res, cflags);
@@ -1839,12 +1896,12 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
io_req_complete_post(req, res, cflags);
}
-static inline void io_req_complete(struct io_kiocb *req, long res)
+static inline void io_req_complete(struct io_kiocb *req, s32 res)
{
__io_req_complete(req, 0, res, 0);
}
-static void io_req_complete_failed(struct io_kiocb *req, long res)
+static void io_req_complete_failed(struct io_kiocb *req, s32 res)
{
req_set_fail(req);
io_req_complete_post(req, res, 0);
@@ -1878,7 +1935,7 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
struct io_submit_state *state)
{
spin_lock(&ctx->completion_lock);
- list_splice_init(&ctx->locked_free_list, &state->free_list);
+ wq_list_splice(&ctx->locked_free_list, &state->free_list);
ctx->locked_free_nr = 0;
spin_unlock(&ctx->completion_lock);
}
@@ -1887,7 +1944,6 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
- int nr;
/*
* If we have more than a batch's worth of requests in our IRQ side
@@ -1896,20 +1952,7 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
*/
if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
io_flush_cached_locked_reqs(ctx, state);
-
- nr = state->free_reqs;
- while (!list_empty(&state->free_list)) {
- struct io_kiocb *req = list_first_entry(&state->free_list,
- struct io_kiocb, inflight_entry);
-
- list_del(&req->inflight_entry);
- state->reqs[nr++] = req;
- if (nr == ARRAY_SIZE(state->reqs))
- break;
- }
-
- state->free_reqs = nr;
- return nr != 0;
+ return !!state->free_list.next;
}
/*
@@ -1918,38 +1961,54 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
* Because of that, io_alloc_req() should be called only under ->uring_lock
* and with extra caution to not get a request that is still worked on.
*/
-static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
+static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
struct io_submit_state *state = &ctx->submit_state;
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
+ void *reqs[IO_REQ_ALLOC_BATCH];
+ struct io_kiocb *req;
int ret, i;
- BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);
-
- if (likely(state->free_reqs || io_flush_cached_reqs(ctx)))
- goto got_req;
+ if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
+ return true;
- ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
- state->reqs);
+ ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
/*
* Bulk alloc is all-or-nothing. If we fail to get a batch,
* retry single alloc to be on the safe side.
*/
if (unlikely(ret <= 0)) {
- state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
- if (!state->reqs[0])
- return NULL;
+ reqs[0] = kmem_cache_alloc(req_cachep, gfp);
+ if (!reqs[0])
+ return false;
ret = 1;
}
- for (i = 0; i < ret; i++)
- io_preinit_req(state->reqs[i], ctx);
- state->free_reqs = ret;
-got_req:
- state->free_reqs--;
- return state->reqs[state->free_reqs];
+ percpu_ref_get_many(&ctx->refs, ret);
+ for (i = 0; i < ret; i++) {
+ req = reqs[i];
+
+ io_preinit_req(req, ctx);
+ wq_stack_add_head(&req->comp_list, &state->free_list);
+ }
+ return true;
+}
+
+static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
+{
+ if (unlikely(!ctx->submit_state.free_list.next))
+ return __io_alloc_req_refill(ctx);
+ return true;
+}
+
+static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
+{
+ struct io_wq_work_node *node;
+
+ node = wq_stack_extract(&ctx->submit_state.free_list);
+ return container_of(node, struct io_kiocb, comp_list);
}
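
The request free list is now an intrusive singly-linked stack: nodes live inside the request, wq_stack_add_head() pushes, wq_stack_extract() pops, and container_of() recovers the request. Here is a self-contained userspace model of that shape; the helper names mirror the kernel's but this is an illustration, not the kernel list implementation.

	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct wq_work_node { struct wq_work_node *next; };
	struct wq_work_list { struct wq_work_node *first; };

	struct request {
		int id;
		struct wq_work_node comp_list;	/* intrusive link, no allocation */
	};

	static void wq_stack_add_head(struct wq_work_node *node,
				      struct wq_work_list *stack)
	{
		node->next = stack->first;
		stack->first = node;
	}

	static struct wq_work_node *wq_stack_extract(struct wq_work_list *stack)
	{
		struct wq_work_node *node = stack->first;

		stack->first = node->next;	/* caller checks for emptiness first */
		return node;
	}

	int main(void)
	{
		struct wq_work_list free_list = { .first = NULL };
		struct request reqs[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };

		for (int i = 0; i < 3; i++)
			wq_stack_add_head(&reqs[i].comp_list, &free_list);

		while (free_list.first) {
			struct wq_work_node *node = wq_stack_extract(&free_list);
			struct request *req = container_of(node, struct request, comp_list);

			printf("popped request %d\n", req->id);	/* LIFO: 2, 1, 0 */
		}
		return 0;
	}
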
static inline void io_put_file(struct file *file)
@@ -1958,35 +2017,28 @@ static inline void io_put_file(struct file *file)
fput(file);
}
-static void io_dismantle_req(struct io_kiocb *req)
+static inline void io_dismantle_req(struct io_kiocb *req)
{
unsigned int flags = req->flags;
- if (io_req_needs_clean(req))
+ if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
io_clean_op(req);
if (!(flags & REQ_F_FIXED_FILE))
io_put_file(req->file);
- if (req->fixed_rsrc_refs)
- percpu_ref_put(req->fixed_rsrc_refs);
- if (req->async_data) {
- kfree(req->async_data);
- req->async_data = NULL;
- }
}
-static void __io_free_req(struct io_kiocb *req)
+static __cold void __io_free_req(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ io_req_put_rsrc(req, ctx);
io_dismantle_req(req);
io_put_task(req->task, 1);
spin_lock(&ctx->completion_lock);
- list_add(&req->inflight_entry, &ctx->locked_free_list);
+ wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
ctx->locked_free_nr++;
spin_unlock(&ctx->completion_lock);
-
- percpu_ref_put(&ctx->refs);
}
static inline void io_remove_next_linked(struct io_kiocb *req)
@@ -2072,47 +2124,45 @@ static bool io_disarm_next(struct io_kiocb *req)
return posted;
}
-static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
+static void __io_req_find_next_prep(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ bool posted;
+
+ spin_lock(&ctx->completion_lock);
+ posted = io_disarm_next(req);
+ if (posted)
+ io_commit_cqring(req->ctx);
+ spin_unlock(&ctx->completion_lock);
+ if (posted)
+ io_cqring_ev_posted(ctx);
+}
+
+static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
struct io_kiocb *nxt;
+ if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
+ return NULL;
/*
* If LINK is set, we have dependent requests in this chain. If we
* didn't fail this request, queue the first one up, moving any other
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (req->flags & IO_DISARM_MASK) {
- struct io_ring_ctx *ctx = req->ctx;
- bool posted;
-
- spin_lock(&ctx->completion_lock);
- posted = io_disarm_next(req);
- if (posted)
- io_commit_cqring(req->ctx);
- spin_unlock(&ctx->completion_lock);
- if (posted)
- io_cqring_ev_posted(ctx);
- }
+ if (unlikely(req->flags & IO_DISARM_MASK))
+ __io_req_find_next_prep(req);
nxt = req->link;
req->link = NULL;
return nxt;
}
-static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
-{
- if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
- return NULL;
- return __io_req_find_next(req);
-}
-
static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
if (!ctx)
return;
if (*locked) {
- if (ctx->submit_state.compl_nr)
- io_submit_flush_completions(ctx);
+ io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
*locked = false;
}
@@ -2129,7 +2179,7 @@ static void tctx_task_work(struct callback_head *cb)
while (1) {
struct io_wq_work_node *node;
- if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr)
+ if (!tctx->task_list.first && locked)
io_submit_flush_completions(ctx);
spin_lock_irq(&tctx->task_lock);
@@ -2192,8 +2242,9 @@ static void io_req_task_work_add(struct io_kiocb *req)
* will do the job.
*/
notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
- if (!task_work_add(tsk, &tctx->task_work, notify)) {
- wake_up_process(tsk);
+ if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
+ if (notify == TWA_NONE)
+ wake_up_process(tsk);
return;
}
@@ -2271,77 +2322,62 @@ static void io_free_req_work(struct io_kiocb *req, bool *locked)
io_free_req(req);
}
-struct req_batch {
- struct task_struct *task;
- int task_refs;
- int ctx_refs;
-};
-
-static inline void io_init_req_batch(struct req_batch *rb)
+static void io_free_batch_list(struct io_ring_ctx *ctx,
+ struct io_wq_work_node *node)
+ __must_hold(&ctx->uring_lock)
{
- rb->task_refs = 0;
- rb->ctx_refs = 0;
- rb->task = NULL;
-}
+ struct task_struct *task = NULL;
+ int task_refs = 0;
-static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
- struct req_batch *rb)
-{
- if (rb->ctx_refs)
- percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
- if (rb->task)
- io_put_task(rb->task, rb->task_refs);
-}
+ do {
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ comp_list);
-static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
- struct io_submit_state *state)
-{
- io_queue_next(req);
- io_dismantle_req(req);
+ if (unlikely(req->flags & REQ_F_REFCOUNT)) {
+ node = req->comp_list.next;
+ if (!req_ref_put_and_test(req))
+ continue;
+ }
- if (req->task != rb->task) {
- if (rb->task)
- io_put_task(rb->task, rb->task_refs);
- rb->task = req->task;
- rb->task_refs = 0;
- }
- rb->task_refs++;
- rb->ctx_refs++;
+ io_req_put_rsrc_locked(req, ctx);
+ io_queue_next(req);
+ io_dismantle_req(req);
- if (state->free_reqs != ARRAY_SIZE(state->reqs))
- state->reqs[state->free_reqs++] = req;
- else
- list_add(&req->inflight_entry, &state->free_list);
+ if (req->task != task) {
+ if (task)
+ io_put_task(task, task_refs);
+ task = req->task;
+ task_refs = 0;
+ }
+ task_refs++;
+ node = req->comp_list.next;
+ wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
+ } while (node);
+
+ if (task)
+ io_put_task(task, task_refs);
}
-static void io_submit_flush_completions(struct io_ring_ctx *ctx)
+static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
+ struct io_wq_work_node *node, *prev;
struct io_submit_state *state = &ctx->submit_state;
- int i, nr = state->compl_nr;
- struct req_batch rb;
spin_lock(&ctx->completion_lock);
- for (i = 0; i < nr; i++) {
- struct io_kiocb *req = state->compl_reqs[i];
+ wq_list_for_each(node, prev, &state->compl_reqs) {
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ comp_list);
__io_cqring_fill_event(ctx, req->user_data, req->result,
- req->compl.cflags);
+ req->cflags);
}
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
- io_init_req_batch(&rb);
- for (i = 0; i < nr; i++) {
- struct io_kiocb *req = state->compl_reqs[i];
-
- if (req_ref_put_and_test(req))
- io_req_free_batch(&rb, req, &ctx->submit_state);
- }
-
- io_req_free_batch_finish(ctx, &rb);
- state->compl_nr = 0;
+ io_free_batch_list(ctx, state->compl_reqs.first);
+ INIT_WQ_LIST(&state->compl_reqs);
}
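
io_free_batch_list() above batches the task reference drops: while consecutive requests on the completion list belong to the same task it only bumps a local count, and flushes that count when the owner changes. A small userspace model of that batching shape, assuming plain structs and a hypothetical put_task() in place of the kernel's refcounted task_struct:

	#include <stdio.h>

	struct task { const char *name; int refs; };
	struct request { struct task *task; };

	static void put_task(struct task *t, int nr)
	{
		t->refs -= nr;
		printf("put %d refs on %s (now %d)\n", nr, t->name, t->refs);
	}

	static void free_batch(struct request *reqs, int nr)
	{
		struct task *task = NULL;
		int task_refs = 0;

		for (int i = 0; i < nr; i++) {
			if (reqs[i].task != task) {
				if (task)
					put_task(task, task_refs);
				task = reqs[i].task;
				task_refs = 0;
			}
			task_refs++;
		}
		if (task)
			put_task(task, task_refs);
	}

	int main(void)
	{
		struct task a = { "task-a", 5 }, b = { "task-b", 3 };
		struct request reqs[] = {
			{ &a }, { &a }, { &a }, { &b }, { &b }, { &a },
		};

		/* 6 requests, but only 3 put operations: 3 on a, 2 on b, 1 on a */
		free_batch(reqs, 6);
		return 0;
	}
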
/*
@@ -2401,12 +2437,9 @@ static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
{
- struct io_buffer *kbuf;
-
if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
return 0;
- kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
- return io_put_kbuf(req, kbuf);
+ return io_put_kbuf(req, req->kbuf);
}
static inline bool io_run_task_work(void)
@@ -2420,50 +2453,22 @@ static inline bool io_run_task_work(void)
return false;
}
-/*
- * Find and free completed poll iocbs
- */
-static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
- struct list_head *done)
-{
- struct req_batch rb;
- struct io_kiocb *req;
-
- /* order with ->result store in io_complete_rw_iopoll() */
- smp_rmb();
-
- io_init_req_batch(&rb);
- while (!list_empty(done)) {
- req = list_first_entry(done, struct io_kiocb, inflight_entry);
- list_del(&req->inflight_entry);
-
- __io_cqring_fill_event(ctx, req->user_data, req->result,
- io_put_rw_kbuf(req));
- (*nr_events)++;
-
- if (req_ref_put_and_test(req))
- io_req_free_batch(&rb, req, &ctx->submit_state);
- }
-
- io_commit_cqring(ctx);
- io_cqring_ev_posted_iopoll(ctx);
- io_req_free_batch_finish(ctx, &rb);
-}
-
-static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
- long min)
+static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
- struct io_kiocb *req, *tmp;
- LIST_HEAD(done);
- bool spin;
+ struct io_wq_work_node *pos, *start, *prev;
+ unsigned int poll_flags = BLK_POLL_NOSLEEP;
+ DEFINE_IO_COMP_BATCH(iob);
+ int nr_events = 0;
/*
* Only spin for completions if we don't have multiple devices hanging
- * off our complete list, and we're under the requested amount.
+ * off our complete list.
*/
- spin = !ctx->poll_multi_queue && *nr_events < min;
+ if (ctx->poll_multi_queue || force_nonspin)
+ poll_flags |= BLK_POLL_ONESHOT;
- list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
+ wq_list_for_each(pos, start, &ctx->iopoll_list) {
+ struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
struct kiocb *kiocb = &req->rw.kiocb;
int ret;
@@ -2472,47 +2477,62 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
* If we find a request that requires polling, break out
* and complete those lists first, if we have entries there.
*/
- if (READ_ONCE(req->iopoll_completed)) {
- list_move_tail(&req->inflight_entry, &done);
- continue;
- }
- if (!list_empty(&done))
+ if (READ_ONCE(req->iopoll_completed))
break;
- ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
+ ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
if (unlikely(ret < 0))
return ret;
else if (ret)
- spin = false;
+ poll_flags |= BLK_POLL_ONESHOT;
/* iopoll may have completed current req */
- if (READ_ONCE(req->iopoll_completed))
- list_move_tail(&req->inflight_entry, &done);
+ if (!rq_list_empty(iob.req_list) ||
+ READ_ONCE(req->iopoll_completed))
+ break;
}
- if (!list_empty(&done))
- io_iopoll_complete(ctx, nr_events, &done);
+ if (!rq_list_empty(iob.req_list))
+ iob.complete(&iob);
+ else if (!pos)
+ return 0;
- return 0;
+ prev = start;
+ wq_list_for_each_resume(pos, prev) {
+ struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
+
+ /* order with io_complete_rw_iopoll(), e.g. ->result updates */
+ if (!smp_load_acquire(&req->iopoll_completed))
+ break;
+ __io_cqring_fill_event(ctx, req->user_data, req->result,
+ io_put_rw_kbuf(req));
+ nr_events++;
+ }
+
+ if (unlikely(!nr_events))
+ return 0;
+
+ io_commit_cqring(ctx);
+ io_cqring_ev_posted_iopoll(ctx);
+ pos = start ? start->next : ctx->iopoll_list.first;
+ wq_list_cut(&ctx->iopoll_list, prev, start);
+ io_free_batch_list(ctx, pos);
+ return nr_events;
}
/*
* We can't just wait for polled events to come to us, we have to actively
* find and complete them.
*/
-static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
+static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
{
if (!(ctx->flags & IORING_SETUP_IOPOLL))
return;
mutex_lock(&ctx->uring_lock);
- while (!list_empty(&ctx->iopoll_list)) {
- unsigned int nr_events = 0;
-
- io_do_iopoll(ctx, &nr_events, 0);
-
+ while (!wq_list_empty(&ctx->iopoll_list)) {
/* let it sleep and repeat later if can't complete a request */
- if (nr_events == 0)
+ if (io_do_iopoll(ctx, true) == 0)
break;
/*
* Ensure we allow local-to-the-cpu processing to take place,
@@ -2559,7 +2579,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
* forever, while the workqueue is stuck trying to acquire the
* very same mutex.
*/
- if (list_empty(&ctx->iopoll_list)) {
+ if (wq_list_empty(&ctx->iopoll_list)) {
u32 tail = ctx->cached_cq_tail;
mutex_unlock(&ctx->uring_lock);
@@ -2568,11 +2588,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
/* some requests don't go through iopoll_list */
if (tail != ctx->cached_cq_tail ||
- list_empty(&ctx->iopoll_list))
+ wq_list_empty(&ctx->iopoll_list))
break;
}
- ret = io_do_iopoll(ctx, &nr_events, min);
- } while (!ret && nr_events < min && !need_resched());
+ ret = io_do_iopoll(ctx, !min);
+ if (ret < 0)
+ break;
+ nr_events += ret;
+ ret = 0;
+ } while (nr_events < min && !need_resched());
out:
mutex_unlock(&ctx->uring_lock);
return ret;
@@ -2597,9 +2621,9 @@ static bool io_resubmit_prep(struct io_kiocb *req)
{
struct io_async_rw *rw = req->async_data;
- if (!rw)
+ if (!req_has_async_data(req))
return !io_req_prep_async(req);
- iov_iter_restore(&rw->iter, &rw->iter_state);
+ iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
return true;
}
@@ -2643,7 +2667,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
if (req->rw.kiocb.ki_flags & IOCB_WRITE)
kiocb_end_write(req);
- if (res != req->result) {
+ if (unlikely(res != req->result)) {
if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
io_rw_should_reissue(req)) {
req->flags |= REQ_F_REISSUE;
@@ -2658,16 +2682,11 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
static void io_req_task_complete(struct io_kiocb *req, bool *locked)
{
unsigned int cflags = io_put_rw_kbuf(req);
- long res = req->result;
+ int res = req->result;
if (*locked) {
- struct io_ring_ctx *ctx = req->ctx;
- struct io_submit_state *state = &ctx->submit_state;
-
io_req_complete_state(req, res, cflags);
- state->compl_reqs[state->compl_nr++] = req;
- if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
- io_submit_flush_completions(ctx);
+ io_req_add_compl_list(req);
} else {
io_req_complete_post(req, res, cflags);
}
@@ -2681,7 +2700,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
__io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req));
}
-static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+static void io_complete_rw(struct kiocb *kiocb, long res)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
@@ -2692,7 +2711,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
io_req_task_work_add(req);
}
-static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
+static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
@@ -2703,12 +2722,11 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
req->flags |= REQ_F_REISSUE;
return;
}
+ req->result = res;
}
- WRITE_ONCE(req->result, res);
- /* order with io_iopoll_complete() checking ->result */
- smp_wmb();
- WRITE_ONCE(req->iopoll_completed, 1);
+ /* order with io_iopoll_complete() checking ->iopoll_completed */
+ smp_store_release(&req->iopoll_completed, 1);
}
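
The hunk above swaps the explicit smp_wmb()/smp_rmb() pair for a store-release publishing ->iopoll_completed and a matching load-acquire on the reaping side. A tiny C11 model of that pairing, using <stdatomic.h>/<threads.h> rather than the kernel's smp_store_release()/smp_load_acquire(); the struct and field names are only meant to echo the diff.

	#include <assert.h>
	#include <stdatomic.h>
	#include <threads.h>

	struct req {
		long result;
		atomic_int iopoll_completed;
	};

	static int completer(void *arg)
	{
		struct req *r = arg;

		r->result = 42;				/* plain store ... */
		atomic_store_explicit(&r->iopoll_completed, 1,
				      memory_order_release);	/* ... published by release */
		return 0;
	}

	int main(void)
	{
		struct req r = { .result = 0, .iopoll_completed = 0 };
		thrd_t t;

		thrd_create(&t, completer, &r);

		/* acquire load: once we observe 1, the ->result store is visible too */
		while (!atomic_load_explicit(&r.iopoll_completed, memory_order_acquire))
			;
		assert(r.result == 42);

		thrd_join(t, NULL);
		return 0;
	}
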
/*
@@ -2717,13 +2735,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
* find it from a io_do_iopoll() thread before the issuer is done
* accessing the kiocb cookie.
*/
-static void io_iopoll_req_issued(struct io_kiocb *req)
+static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
- const bool in_async = io_wq_current_is_worker();
+ const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
/* workqueue context doesn't hold uring_lock, grab it now */
- if (unlikely(in_async))
+ if (unlikely(needs_lock))
mutex_lock(&ctx->uring_lock);
/*
@@ -2731,23 +2749,15 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
* how we do polling eventually, not spinning if we're on potentially
* different devices.
*/
- if (list_empty(&ctx->iopoll_list)) {
+ if (wq_list_empty(&ctx->iopoll_list)) {
ctx->poll_multi_queue = false;
} else if (!ctx->poll_multi_queue) {
struct io_kiocb *list_req;
- unsigned int queue_num0, queue_num1;
-
- list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
- inflight_entry);
- if (list_req->file != req->file) {
+ list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
+ comp_list);
+ if (list_req->file != req->file)
ctx->poll_multi_queue = true;
- } else {
- queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie);
- queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie);
- if (queue_num0 != queue_num1)
- ctx->poll_multi_queue = true;
- }
}
/*
@@ -2755,11 +2765,11 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
* it to the front so we find it first.
*/
if (READ_ONCE(req->iopoll_completed))
- list_add(&req->inflight_entry, &ctx->iopoll_list);
+ wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
else
- list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
+ wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
- if (unlikely(in_async)) {
+ if (unlikely(needs_lock)) {
/*
 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
* in sq thread task context or in io worker task context. If
@@ -2784,10 +2794,8 @@ static bool io_bdev_nowait(struct block_device *bdev)
* any file. For now, just ensure that anything potentially problematic is done
* inline.
*/
-static bool __io_file_supports_nowait(struct file *file, int rw)
+static bool __io_file_supports_nowait(struct file *file, umode_t mode)
{
- umode_t mode = file_inode(file)->i_mode;
-
if (S_ISBLK(mode)) {
if (IS_ENABLED(CONFIG_BLOCK) &&
io_bdev_nowait(I_BDEV(file->f_mapping->host)))
@@ -2807,28 +2815,32 @@ static bool __io_file_supports_nowait(struct file *file, int rw)
/* any ->read/write should understand O_NONBLOCK */
if (file->f_flags & O_NONBLOCK)
return true;
+ return file->f_mode & FMODE_NOWAIT;
+}
- if (!(file->f_mode & FMODE_NOWAIT))
- return false;
-
- if (rw == READ)
- return file->f_op->read_iter != NULL;
+/*
+ * If we tracked the file through the SCM inflight mechanism, we could support
+ * any file. For now, just ensure that anything potentially problematic is done
+ * inline.
+ */
+static unsigned int io_file_get_flags(struct file *file)
+{
+ umode_t mode = file_inode(file)->i_mode;
+ unsigned int res = 0;
- return file->f_op->write_iter != NULL;
+ if (S_ISREG(mode))
+ res |= FFS_ISREG;
+ if (__io_file_supports_nowait(file, mode))
+ res |= FFS_NOWAIT;
+ return res;
}
-static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
+static inline bool io_file_supports_nowait(struct io_kiocb *req)
{
- if (rw == READ && (req->flags & REQ_F_NOWAIT_READ))
- return true;
- else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE))
- return true;
-
- return __io_file_supports_nowait(req->file, rw);
+ return req->flags & REQ_F_SUPPORT_NOWAIT;
}
-static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- int rw)
+static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
struct kiocb *kiocb = &req->rw.kiocb;
@@ -2836,16 +2848,15 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
unsigned ioprio;
int ret;
- if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
- req->flags |= REQ_F_ISREG;
+ if (!io_req_ffs_set(req))
+ req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
kiocb->ki_pos = READ_ONCE(sqe->off);
if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
req->flags |= REQ_F_CUR_POS;
kiocb->ki_pos = file->f_pos;
}
- kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
- kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
+ kiocb->ki_flags = iocb_flags(file);
ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
if (unlikely(ret))
return ret;
@@ -2856,22 +2867,11 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
*/
if ((kiocb->ki_flags & IOCB_NOWAIT) ||
- ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw)))
+ ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
req->flags |= REQ_F_NOWAIT;
- ioprio = READ_ONCE(sqe->ioprio);
- if (ioprio) {
- ret = ioprio_check_cap(ioprio);
- if (ret)
- return ret;
-
- kiocb->ki_ioprio = ioprio;
- } else
- kiocb->ki_ioprio = get_current_ioprio();
-
if (ctx->flags & IORING_SETUP_IOPOLL) {
- if (!(kiocb->ki_flags & IOCB_DIRECT) ||
- !kiocb->ki_filp->f_op->iopoll)
+ if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
return -EOPNOTSUPP;
kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
@@ -2883,12 +2883,18 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
kiocb->ki_complete = io_complete_rw;
}
- if (req->opcode == IORING_OP_READ_FIXED ||
- req->opcode == IORING_OP_WRITE_FIXED) {
- req->imu = NULL;
- io_req_set_rsrc_node(req);
+ ioprio = READ_ONCE(sqe->ioprio);
+ if (ioprio) {
+ ret = ioprio_check_cap(ioprio);
+ if (ret)
+ return ret;
+
+ kiocb->ki_ioprio = ioprio;
+ } else {
+ kiocb->ki_ioprio = get_current_ioprio();
}
+ req->imu = NULL;
req->rw.addr = READ_ONCE(sqe->addr);
req->rw.len = READ_ONCE(sqe->len);
req->buf_index = READ_ONCE(sqe->buf_index);
@@ -2912,7 +2918,7 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
ret = -EINTR;
fallthrough;
default:
- kiocb->ki_complete(kiocb, ret, 0);
+ kiocb->ki_complete(kiocb, ret);
}
}
@@ -2923,7 +2929,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
struct io_async_rw *io = req->async_data;
/* add previously done IO, if any */
- if (io && io->bytes_done > 0) {
+ if (req_has_async_data(req) && io->bytes_done > 0) {
if (ret < 0)
ret = io->bytes_done;
else
@@ -2946,7 +2952,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
struct io_ring_ctx *ctx = req->ctx;
req_set_fail(req);
- if (!(issue_flags & IO_URING_F_NONBLOCK)) {
+ if (issue_flags & IO_URING_F_UNLOCKED) {
mutex_lock(&ctx->uring_lock);
__io_req_complete(req, issue_flags, ret, cflags);
mutex_unlock(&ctx->uring_lock);
@@ -3017,13 +3023,15 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter
static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
{
- struct io_ring_ctx *ctx = req->ctx;
struct io_mapped_ubuf *imu = req->imu;
u16 index, buf_index = req->buf_index;
if (likely(!imu)) {
+ struct io_ring_ctx *ctx = req->ctx;
+
if (unlikely(buf_index >= ctx->nr_user_bufs))
return -EFAULT;
+ io_req_set_rsrc_node(req, ctx);
index = array_index_nospec(buf_index, ctx->nr_user_bufs);
imu = READ_ONCE(ctx->user_bufs[index]);
req->imu = imu;
@@ -3050,10 +3058,11 @@ static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
}
static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
- int bgid, struct io_buffer *kbuf,
- bool needs_lock)
+ int bgid, unsigned int issue_flags)
{
+ struct io_buffer *kbuf = req->kbuf;
struct io_buffer *head;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
if (req->flags & REQ_F_BUFFER_SELECTED)
return kbuf;
@@ -3074,34 +3083,32 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
}
if (*len > kbuf->len)
*len = kbuf->len;
+ req->flags |= REQ_F_BUFFER_SELECTED;
+ req->kbuf = kbuf;
} else {
kbuf = ERR_PTR(-ENOBUFS);
}
io_ring_submit_unlock(req->ctx, needs_lock);
-
return kbuf;
}
static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct io_buffer *kbuf;
u16 bgid;
- kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
bgid = req->buf_index;
- kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
+ kbuf = io_buffer_select(req, len, bgid, issue_flags);
if (IS_ERR(kbuf))
return kbuf;
- req->rw.addr = (u64) (unsigned long) kbuf;
- req->flags |= REQ_F_BUFFER_SELECTED;
return u64_to_user_ptr(kbuf->addr);
}
#ifdef CONFIG_COMPAT
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct compat_iovec __user *uiov;
compat_ssize_t clen;
@@ -3117,7 +3124,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
return -EINVAL;
len = clen;
- buf = io_rw_buffer_select(req, &len, needs_lock);
+ buf = io_rw_buffer_select(req, &len, issue_flags);
if (IS_ERR(buf))
return PTR_ERR(buf);
iov[0].iov_base = buf;
@@ -3127,7 +3134,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
#endif
static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
void __user *buf;
@@ -3139,7 +3146,7 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
len = iov[0].iov_len;
if (len < 0)
return -EINVAL;
- buf = io_rw_buffer_select(req, &len, needs_lock);
+ buf = io_rw_buffer_select(req, &len, issue_flags);
if (IS_ERR(buf))
return PTR_ERR(buf);
iov[0].iov_base = buf;
@@ -3148,12 +3155,11 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
}
static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
- bool needs_lock)
+ unsigned int issue_flags)
{
if (req->flags & REQ_F_BUFFER_SELECTED) {
- struct io_buffer *kbuf;
+ struct io_buffer *kbuf = req->kbuf;
- kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
iov[0].iov_len = kbuf->len;
return 0;
@@ -3163,52 +3169,72 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
- return io_compat_import(req, iov, needs_lock);
+ return io_compat_import(req, iov, issue_flags);
#endif
- return __io_iov_buffer_select(req, iov, needs_lock);
+ return __io_iov_buffer_select(req, iov, issue_flags);
}
-static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
- struct iov_iter *iter, bool needs_lock)
+static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
+ struct io_rw_state *s,
+ unsigned int issue_flags)
{
- void __user *buf = u64_to_user_ptr(req->rw.addr);
- size_t sqe_len = req->rw.len;
+ struct iov_iter *iter = &s->iter;
u8 opcode = req->opcode;
+ struct iovec *iovec;
+ void __user *buf;
+ size_t sqe_len;
ssize_t ret;
- if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
- *iovec = NULL;
- return io_import_fixed(req, rw, iter);
- }
+ BUILD_BUG_ON(ERR_PTR(0) != NULL);
+
+ if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED)
+ return ERR_PTR(io_import_fixed(req, rw, iter));
/* buffer index only valid with fixed read/write, or buffer select */
- if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
- return -EINVAL;
+ if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
+ return ERR_PTR(-EINVAL);
+
+ buf = u64_to_user_ptr(req->rw.addr);
+ sqe_len = req->rw.len;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
if (req->flags & REQ_F_BUFFER_SELECT) {
- buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
+ buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
if (IS_ERR(buf))
- return PTR_ERR(buf);
+ return ERR_CAST(buf);
req->rw.len = sqe_len;
}
- ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
- *iovec = NULL;
- return ret;
+ ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
+ return ERR_PTR(ret);
}
+ iovec = s->fast_iov;
if (req->flags & REQ_F_BUFFER_SELECT) {
- ret = io_iov_buffer_select(req, *iovec, needs_lock);
+ ret = io_iov_buffer_select(req, iovec, issue_flags);
if (!ret)
- iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
- *iovec = NULL;
- return ret;
+ iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
+ return ERR_PTR(ret);
}
- return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
+ ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
req->ctx->compat);
+ if (unlikely(ret < 0))
+ return ERR_PTR(ret);
+ return iovec;
+}
+
+static inline int io_import_iovec(int rw, struct io_kiocb *req,
+ struct iovec **iovec, struct io_rw_state *s,
+ unsigned int issue_flags)
+{
+ *iovec = __io_import_iovec(rw, req, s, issue_flags);
+ if (unlikely(IS_ERR(*iovec)))
+ return PTR_ERR(*iovec);
+
+ iov_iter_save_state(&s->iter, &s->iter_state);
+ return 0;
}
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
@@ -3233,7 +3259,8 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
*/
if (kiocb->ki_flags & IOCB_HIPRI)
return -EOPNOTSUPP;
- if (kiocb->ki_flags & IOCB_NOWAIT)
+ if ((kiocb->ki_flags & IOCB_NOWAIT) &&
+ !(kiocb->ki_filp->f_flags & O_NONBLOCK))
return -EAGAIN;
while (iov_iter_count(iter)) {
@@ -3279,7 +3306,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
{
struct io_async_rw *rw = req->async_data;
- memcpy(&rw->iter, iter, sizeof(*iter));
+ memcpy(&rw->s.iter, iter, sizeof(*iter));
rw->free_iovec = iovec;
rw->bytes_done = 0;
/* can only be fixed buffers, no need to do anything */
@@ -3288,33 +3315,36 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
if (!iovec) {
unsigned iov_off = 0;
- rw->iter.iov = rw->fast_iov;
+ rw->s.iter.iov = rw->s.fast_iov;
if (iter->iov != fast_iov) {
iov_off = iter->iov - fast_iov;
- rw->iter.iov += iov_off;
+ rw->s.iter.iov += iov_off;
}
- if (rw->fast_iov != fast_iov)
- memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
+ if (rw->s.fast_iov != fast_iov)
+ memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
sizeof(struct iovec) * iter->nr_segs);
} else {
req->flags |= REQ_F_NEED_CLEANUP;
}
}
-static inline int io_alloc_async_data(struct io_kiocb *req)
+static inline bool io_alloc_async_data(struct io_kiocb *req)
{
WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
- return req->async_data == NULL;
+ if (req->async_data) {
+ req->flags |= REQ_F_ASYNC_DATA;
+ return false;
+ }
+ return true;
}
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
- const struct iovec *fast_iov,
- struct iov_iter *iter, bool force)
+ struct io_rw_state *s, bool force)
{
if (!force && !io_op_defs[req->opcode].needs_async_setup)
return 0;
- if (!req->async_data) {
+ if (!req_has_async_data(req)) {
struct io_async_rw *iorw;
if (io_alloc_async_data(req)) {
@@ -3322,10 +3352,10 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
return -ENOMEM;
}
- io_req_map_rw(req, iovec, fast_iov, iter);
+ io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
iorw = req->async_data;
/* we've copied and mapped the iter, ensure state is saved */
- iov_iter_save_state(&iorw->iter, &iorw->iter_state);
+ iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
}
return 0;
}
@@ -3333,10 +3363,11 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
{
struct io_async_rw *iorw = req->async_data;
- struct iovec *iov = iorw->fast_iov;
+ struct iovec *iov;
int ret;
- ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
+ /* submission path, ->uring_lock should already be taken */
+ ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
if (unlikely(ret < 0))
return ret;
@@ -3344,7 +3375,6 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
iorw->free_iovec = iov;
if (iov)
req->flags |= REQ_F_NEED_CLEANUP;
- iov_iter_save_state(&iorw->iter, &iorw->iter_state);
return 0;
}
@@ -3352,11 +3382,11 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
if (unlikely(!(req->file->f_mode & FMODE_READ)))
return -EBADF;
- return io_prep_rw(req, sqe, READ);
+ return io_prep_rw(req, sqe);
}
/*
- * This is our waitqueue callback handler, registered through lock_page_async()
+ * This is our waitqueue callback handler, registered through __folio_lock_async()
* when we initially tried to do the IO with the iocb armed our waitqueue.
* This gets called when the page is unlocked, and we generally expect that to
* happen when the page IO is completed and the page is now uptodate. This will
@@ -3428,7 +3458,7 @@ static bool io_rw_should_retry(struct io_kiocb *req)
static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
{
- if (req->file->f_op->read_iter)
+ if (likely(req->file->f_op->read_iter))
return call_read_iter(req->file, &req->rw.kiocb, iter);
else if (req->file->f_op->read)
return loop_rw_iter(READ, req, iter);
@@ -3444,43 +3474,40 @@ static bool need_read_all(struct io_kiocb *req)
static int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
- struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct io_rw_state __s, *s = &__s;
+ struct iovec *iovec;
struct kiocb *kiocb = &req->rw.kiocb;
- struct iov_iter __iter, *iter = &__iter;
- struct io_async_rw *rw = req->async_data;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- struct iov_iter_state __state, *state;
+ struct io_async_rw *rw;
ssize_t ret, ret2;
- if (rw) {
- iter = &rw->iter;
- state = &rw->iter_state;
+ if (!req_has_async_data(req)) {
+ ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
+ if (unlikely(ret < 0))
+ return ret;
+ } else {
+ rw = req->async_data;
+ s = &rw->s;
/*
* We come here from an earlier attempt, restore our state to
* match in case it doesn't. It's cheap enough that we don't
* need to make this conditional.
*/
- iov_iter_restore(iter, state);
+ iov_iter_restore(&s->iter, &s->iter_state);
iovec = NULL;
- } else {
- ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
- return ret;
- state = &__state;
- iov_iter_save_state(iter, state);
}
- req->result = iov_iter_count(iter);
+ req->result = iov_iter_count(&s->iter);
- /* Ensure we clear previously set non-block flag */
- if (!force_nonblock)
- kiocb->ki_flags &= ~IOCB_NOWAIT;
- else
+ if (force_nonblock) {
+ /* If the file doesn't support async, just async punt */
+ if (unlikely(!io_file_supports_nowait(req))) {
+ ret = io_setup_async_rw(req, iovec, s, true);
+ return ret ?: -EAGAIN;
+ }
kiocb->ki_flags |= IOCB_NOWAIT;
-
- /* If the file doesn't support async, just async punt */
- if (force_nonblock && !io_file_supports_nowait(req, READ)) {
- ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
- return ret ?: -EAGAIN;
+ } else {
+ /* Ensure we clear previously set non-block flag */
+ kiocb->ki_flags &= ~IOCB_NOWAIT;
}
ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
@@ -3489,7 +3516,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
return ret;
}
- ret = io_iter_do_read(req, iter);
+ ret = io_iter_do_read(req, &s->iter);
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
@@ -3502,7 +3529,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
ret = 0;
} else if (ret == -EIOCBQUEUED) {
goto out_free;
- } else if (ret <= 0 || ret == req->result || !force_nonblock ||
+ } else if (ret == req->result || ret <= 0 || !force_nonblock ||
(req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
/* read all, failed, already did sync or don't want to retry */
goto done;
@@ -3513,22 +3540,19 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
* untouched in case of error. Restore it and we'll advance it
* manually if we need to.
*/
- iov_iter_restore(iter, state);
+ iov_iter_restore(&s->iter, &s->iter_state);
- ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
+ ret2 = io_setup_async_rw(req, iovec, s, true);
if (ret2)
return ret2;
iovec = NULL;
rw = req->async_data;
+ s = &rw->s;
/*
* Now use our persistent iterator and state, if we aren't already.
* We've restored and mapped the iter to match.
*/
- if (iter != &rw->iter) {
- iter = &rw->iter;
- state = &rw->iter_state;
- }
do {
/*
@@ -3536,11 +3560,11 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
* above or inside this loop. Advance the iter by the bytes
* that were consumed.
*/
- iov_iter_advance(iter, ret);
- if (!iov_iter_count(iter))
+ iov_iter_advance(&s->iter, ret);
+ if (!iov_iter_count(&s->iter))
break;
rw->bytes_done += ret;
- iov_iter_save_state(iter, state);
+ iov_iter_save_state(&s->iter, &s->iter_state);
/* if we can retry, do so with the callbacks armed */
if (!io_rw_should_retry(req)) {
@@ -3554,12 +3578,12 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
* desired page gets unlocked. We can also get a partial read
* here, and if we do, then just retry at the new offset.
*/
- ret = io_iter_do_read(req, iter);
+ ret = io_iter_do_read(req, &s->iter);
if (ret == -EIOCBQUEUED)
return 0;
/* we got some bytes, but not all. retry. */
kiocb->ki_flags &= ~IOCB_WAITQ;
- iov_iter_restore(iter, state);
+ iov_iter_restore(&s->iter, &s->iter_state);
} while (ret > 0);
done:
kiocb_done(kiocb, ret, issue_flags);
@@ -3574,47 +3598,46 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
return -EBADF;
- return io_prep_rw(req, sqe, WRITE);
+ req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file));
+ return io_prep_rw(req, sqe);
}
static int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
- struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct io_rw_state __s, *s = &__s;
+ struct iovec *iovec;
struct kiocb *kiocb = &req->rw.kiocb;
- struct iov_iter __iter, *iter = &__iter;
- struct io_async_rw *rw = req->async_data;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- struct iov_iter_state __state, *state;
ssize_t ret, ret2;
- if (rw) {
- iter = &rw->iter;
- state = &rw->iter_state;
- iov_iter_restore(iter, state);
- iovec = NULL;
- } else {
- ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
+ if (!req_has_async_data(req)) {
+ ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
+ if (unlikely(ret < 0))
return ret;
- state = &__state;
- iov_iter_save_state(iter, state);
+ } else {
+ struct io_async_rw *rw = req->async_data;
+
+ s = &rw->s;
+ iov_iter_restore(&s->iter, &s->iter_state);
+ iovec = NULL;
}
- req->result = iov_iter_count(iter);
+ req->result = iov_iter_count(&s->iter);
- /* Ensure we clear previously set non-block flag */
- if (!force_nonblock)
- kiocb->ki_flags &= ~IOCB_NOWAIT;
- else
- kiocb->ki_flags |= IOCB_NOWAIT;
+ if (force_nonblock) {
+ /* If the file doesn't support async, just async punt */
+ if (unlikely(!io_file_supports_nowait(req)))
+ goto copy_iov;
- /* If the file doesn't support async, just async punt */
- if (force_nonblock && !io_file_supports_nowait(req, WRITE))
- goto copy_iov;
+ /* file path doesn't support NOWAIT for non-direct_IO */
+ if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
+ (req->flags & REQ_F_ISREG))
+ goto copy_iov;
- /* file path doesn't support NOWAIT for non-direct_IO */
- if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
- (req->flags & REQ_F_ISREG))
- goto copy_iov;
+ kiocb->ki_flags |= IOCB_NOWAIT;
+ } else {
+ /* Ensure we clear previously set non-block flag */
+ kiocb->ki_flags &= ~IOCB_NOWAIT;
+ }
ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
if (unlikely(ret))
@@ -3634,10 +3657,10 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
}
kiocb->ki_flags |= IOCB_WRITE;
- if (req->file->f_op->write_iter)
- ret2 = call_write_iter(req->file, kiocb, iter);
+ if (likely(req->file->f_op->write_iter))
+ ret2 = call_write_iter(req->file, kiocb, &s->iter);
else if (req->file->f_op->write)
- ret2 = loop_rw_iter(WRITE, req, iter);
+ ret2 = loop_rw_iter(WRITE, req, &s->iter);
else
ret2 = -EINVAL;
@@ -3657,14 +3680,14 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
goto done;
if (!force_nonblock || ret2 != -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
- if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
+ if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
goto copy_iov;
done:
kiocb_done(kiocb, ret2, issue_flags);
} else {
copy_iov:
- iov_iter_restore(iter, state);
- ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
+ iov_iter_restore(&s->iter, &s->iter_state);
+ ret = io_setup_async_rw(req, iovec, s, false);
return ret ?: -EAGAIN;
}
out_free:
@@ -3800,7 +3823,7 @@ static int io_mkdirat_prep(struct io_kiocb *req,
return 0;
}
-static int io_mkdirat(struct io_kiocb *req, int issue_flags)
+static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_mkdir *mkd = &req->mkdir;
int ret;
@@ -3849,7 +3872,7 @@ static int io_symlinkat_prep(struct io_kiocb *req,
return 0;
}
-static int io_symlinkat(struct io_kiocb *req, int issue_flags)
+static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_symlink *sl = &req->symlink;
int ret;
@@ -3899,7 +3922,7 @@ static int io_linkat_prep(struct io_kiocb *req,
return 0;
}
-static int io_linkat(struct io_kiocb *req, int issue_flags)
+static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_hardlink *lnk = &req->hardlink;
int ret;
@@ -4318,9 +4341,9 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer *head;
int ret = 0;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- io_ring_submit_lock(ctx, !force_nonblock);
+ io_ring_submit_lock(ctx, needs_lock);
lockdep_assert_held(&ctx->uring_lock);
@@ -4333,7 +4356,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
/* complete before unlock, IOPOLL may need the lock */
__io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, !force_nonblock);
+ io_ring_submit_unlock(ctx, needs_lock);
return 0;
}
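
The IO_URING_F_UNLOCKED conversions above all follow one pattern: the issue path tells the helper whether the ring lock is already held, and the helper only takes and releases it when needed. A minimal userspace sketch of that conditional-locking pattern, with a pthread mutex standing in for ctx->uring_lock; the function names echo the kernel helpers but are illustrative only.

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct ring_ctx {
		pthread_mutex_t uring_lock;
		int buffers;
	};

	static void ring_submit_lock(struct ring_ctx *ctx, bool needs_lock)
	{
		if (needs_lock)
			pthread_mutex_lock(&ctx->uring_lock);
	}

	static void ring_submit_unlock(struct ring_ctx *ctx, bool needs_lock)
	{
		if (needs_lock)
			pthread_mutex_unlock(&ctx->uring_lock);
	}

	/* needs_lock == true models IO_URING_F_UNLOCKED: caller doesn't hold it */
	static int remove_buffers(struct ring_ctx *ctx, int nr, bool needs_lock)
	{
		int removed;

		ring_submit_lock(ctx, needs_lock);
		removed = nr < ctx->buffers ? nr : ctx->buffers;
		ctx->buffers -= removed;
		ring_submit_unlock(ctx, needs_lock);
		return removed;
	}

	int main(void)
	{
		struct ring_ctx ctx = {
			.uring_lock = PTHREAD_MUTEX_INITIALIZER,
			.buffers = 8,
		};

		/* unlocked caller (e.g. io-wq): helper takes the lock itself */
		printf("removed %d\n", remove_buffers(&ctx, 3, true));

		/* locked caller (submission path): lock already held outside */
		pthread_mutex_lock(&ctx.uring_lock);
		printf("removed %d\n", remove_buffers(&ctx, 3, false));
		pthread_mutex_unlock(&ctx.uring_lock);
		return 0;
	}
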
@@ -4405,9 +4428,9 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer *head, *list;
int ret = 0;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- io_ring_submit_lock(ctx, !force_nonblock);
+ io_ring_submit_lock(ctx, needs_lock);
lockdep_assert_held(&ctx->uring_lock);
@@ -4423,7 +4446,7 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
__io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, !force_nonblock);
+ io_ring_submit_unlock(ctx, needs_lock);
return 0;
}
@@ -4756,8 +4779,9 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
- kmsg = req->async_data;
- if (!kmsg) {
+ if (req_has_async_data(req)) {
+ kmsg = req->async_data;
+ } else {
ret = io_sendmsg_copy_hdr(req, &iomsg);
if (ret)
return ret;
@@ -4916,23 +4940,16 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
}
static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct io_sr_msg *sr = &req->sr_msg;
- struct io_buffer *kbuf;
- kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
- if (IS_ERR(kbuf))
- return kbuf;
-
- sr->kbuf = kbuf;
- req->flags |= REQ_F_BUFFER_SELECTED;
- return kbuf;
+ return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
}
static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
{
- return io_put_kbuf(req, req->sr_msg.kbuf);
+ return io_put_kbuf(req, req->kbuf);
}
static int io_recvmsg_prep_async(struct io_kiocb *req)
@@ -4980,8 +4997,9 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
- kmsg = req->async_data;
- if (!kmsg) {
+ if (req_has_async_data(req)) {
+ kmsg = req->async_data;
+ } else {
ret = io_recvmsg_copy_hdr(req, &iomsg);
if (ret)
return ret;
@@ -4989,7 +5007,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
}
if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, !force_nonblock);
+ kbuf = io_recv_buffer_select(req, issue_flags);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
@@ -5041,7 +5059,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
return -ENOTSOCK;
if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, !force_nonblock);
+ kbuf = io_recv_buffer_select(req, issue_flags);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
buf = u64_to_user_ptr(kbuf->addr);
@@ -5172,7 +5190,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
int ret;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- if (req->async_data) {
+ if (req_has_async_data(req)) {
io = req->async_data;
} else {
ret = move_addr_to_kernel(req->connect.addr,
@@ -5188,7 +5206,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
ret = __sys_connect_file(req->file, &io->address,
req->connect.addr_len, file_flags);
if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
- if (req->async_data)
+ if (req_has_async_data(req))
return -EAGAIN;
if (io_alloc_async_data(req)) {
ret = -ENOMEM;
@@ -5348,16 +5366,6 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
return !(flags & IORING_CQE_F_MORE);
}
-static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
- __must_hold(&req->ctx->completion_lock)
-{
- bool done;
-
- done = __io_poll_complete(req, mask);
- io_commit_cqring(req->ctx);
- return done;
-}
-
static void io_poll_task_func(struct io_kiocb *req, bool *locked)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -5479,7 +5487,10 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
req_ref_get(req);
poll->wait.private = req;
+
*poll_ptr = poll;
+ if (req->opcode == IORING_OP_POLL_ADD)
+ req->flags |= REQ_F_ASYNC_DATA;
}
pt->nr_entries++;
@@ -5603,17 +5614,13 @@ static int io_arm_poll_handler(struct io_kiocb *req)
struct async_poll *apoll;
struct io_poll_table ipt;
__poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI;
- int rw;
- if (!req->file || !file_can_poll(req->file))
- return IO_APOLL_ABORTED;
- if (req->flags & REQ_F_POLLED)
- return IO_APOLL_ABORTED;
if (!def->pollin && !def->pollout)
return IO_APOLL_ABORTED;
+ if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED))
+ return IO_APOLL_ABORTED;
if (def->pollin) {
- rw = READ;
mask |= POLLIN | POLLRDNORM;
/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
@@ -5621,14 +5628,9 @@ static int io_arm_poll_handler(struct io_kiocb *req)
(req->sr_msg.msg_flags & MSG_ERRQUEUE))
mask &= ~POLLIN;
} else {
- rw = WRITE;
mask |= POLLOUT | POLLWRNORM;
}
- /* if we can't nonblock try, then no point in arming a poll handler */
- if (!io_file_supports_nowait(req, rw))
- return IO_APOLL_ABORTED;
-
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
if (unlikely(!apoll))
return IO_APOLL_ABORTED;
@@ -5689,8 +5691,8 @@ static bool io_poll_remove_one(struct io_kiocb *req)
/*
* Returns true if we found and killed one or more poll requests
*/
-static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
- bool cancel_all)
+static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
+ struct task_struct *tsk, bool cancel_all)
{
struct hlist_node *tmp;
struct io_kiocb *req;
@@ -5844,7 +5846,8 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
if (mask) { /* no async, we'd stolen it */
ipt.error = 0;
- done = io_poll_complete(req, mask);
+ done = __io_poll_complete(req, mask);
+ io_commit_cqring(req->ctx);
}
spin_unlock(&ctx->completion_lock);
@@ -5920,7 +5923,10 @@ err:
static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
{
- req_set_fail(req);
+ struct io_timeout_data *data = req->async_data;
+
+ if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
+ req_set_fail(req);
io_req_complete_post(req, -ETIME, 0);
}
@@ -6126,7 +6132,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (off && is_timeout_link)
return -EINVAL;
flags = READ_ONCE(sqe->timeout_flags);
- if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK))
+ if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
+ IORING_TIMEOUT_ETIME_SUCCESS))
return -EINVAL;
/* more than one clock specified is invalid, obviously */
if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
@@ -6137,7 +6144,9 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (unlikely(off && !req->ctx->off_timeout_used))
req->ctx->off_timeout_used = true;
- if (!req->async_data && io_alloc_async_data(req))
+ if (WARN_ON_ONCE(req_has_async_data(req)))
+ return -EFAULT;
+ if (io_alloc_async_data(req))
return -ENOMEM;
data = req->async_data;
@@ -6294,6 +6303,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
u64 sqe_addr = req->cancel.addr;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_tctx_node *node;
int ret;
@@ -6302,7 +6312,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
goto done;
/* slow path, try all io-wq's */
- io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_lock(ctx, needs_lock);
ret = -ENOENT;
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
struct io_uring_task *tctx = node->task->io_uring;
@@ -6311,7 +6321,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
if (ret != -ENOENT)
break;
}
- io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_unlock(ctx, needs_lock);
done:
if (ret < 0)
req_set_fail(req);
@@ -6338,6 +6348,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_uring_rsrc_update2 up;
int ret;
@@ -6347,10 +6358,10 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
up.tags = 0;
up.resv = 0;
- io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_lock(ctx, needs_lock);
ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
&up, req->rsrc_update.nr_args);
- io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_unlock(ctx, needs_lock);
if (ret < 0)
req_set_fail(req);
@@ -6446,7 +6457,7 @@ static int io_req_prep_async(struct io_kiocb *req)
{
if (!io_op_defs[req->opcode].needs_async_setup)
return 0;
- if (WARN_ON_ONCE(req->async_data))
+ if (WARN_ON_ONCE(req_has_async_data(req)))
return -EFAULT;
if (io_alloc_async_data(req))
return -EAGAIN;
@@ -6478,68 +6489,39 @@ static u32 io_get_sequence(struct io_kiocb *req)
return seq;
}
-static bool io_drain_req(struct io_kiocb *req)
+static __cold void io_drain_req(struct io_kiocb *req)
{
- struct io_kiocb *pos;
struct io_ring_ctx *ctx = req->ctx;
struct io_defer_entry *de;
int ret;
- u32 seq;
-
- if (req->flags & REQ_F_FAIL) {
- io_req_complete_fail_submit(req);
- return true;
- }
-
- /*
- * If we need to drain a request in the middle of a link, drain the
- * head request and the next request/link after the current link.
- * Considering sequential execution of links, IOSQE_IO_DRAIN will be
- * maintained for every request of our link.
- */
- if (ctx->drain_next) {
- req->flags |= REQ_F_IO_DRAIN;
- ctx->drain_next = false;
- }
- /* not interested in head, start from the first linked */
- io_for_each_link(pos, req->link) {
- if (pos->flags & REQ_F_IO_DRAIN) {
- ctx->drain_next = true;
- req->flags |= REQ_F_IO_DRAIN;
- break;
- }
- }
+ u32 seq = io_get_sequence(req);
/* Still need defer if there is pending req in defer list. */
- if (likely(list_empty_careful(&ctx->defer_list) &&
- !(req->flags & REQ_F_IO_DRAIN))) {
+ if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
+queue:
ctx->drain_active = false;
- return false;
+ io_req_task_queue(req);
+ return;
}
- seq = io_get_sequence(req);
- /* Still a chance to pass the sequence check */
- if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
- return false;
-
ret = io_req_prep_async(req);
- if (ret)
- goto fail;
+ if (ret) {
+fail:
+ io_req_complete_failed(req, ret);
+ return;
+ }
io_prep_async_link(req);
de = kmalloc(sizeof(*de), GFP_KERNEL);
if (!de) {
ret = -ENOMEM;
-fail:
- io_req_complete_failed(req, ret);
- return true;
+ goto fail;
}
spin_lock(&ctx->completion_lock);
if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
spin_unlock(&ctx->completion_lock);
kfree(de);
- io_queue_async_work(req, NULL);
- return true;
+ goto queue;
}
trace_io_uring_defer(ctx, req, req->user_data);
@@ -6547,23 +6529,13 @@ fail:
de->seq = seq;
list_add_tail(&de->list, &ctx->defer_list);
spin_unlock(&ctx->completion_lock);
- return true;
}
static void io_clean_op(struct io_kiocb *req)
{
if (req->flags & REQ_F_BUFFER_SELECTED) {
- switch (req->opcode) {
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- case IORING_OP_READ:
- kfree((void *)(unsigned long)req->rw.addr);
- break;
- case IORING_OP_RECVMSG:
- case IORING_OP_RECV:
- kfree(req->sr_msg.kbuf);
- break;
- }
+ kfree(req->kbuf);
+ req->kbuf = NULL;
}
if (req->flags & REQ_F_NEED_CLEANUP) {
@@ -6628,19 +6600,24 @@ static void io_clean_op(struct io_kiocb *req)
}
if (req->flags & REQ_F_CREDS)
put_cred(req->creds);
-
+ if (req->flags & REQ_F_ASYNC_DATA) {
+ kfree(req->async_data);
+ req->async_data = NULL;
+ }
req->flags &= ~IO_REQ_CLEAN_FLAGS;
}
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
- struct io_ring_ctx *ctx = req->ctx;
const struct cred *creds = NULL;
int ret;
- if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
+ if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
creds = override_creds(req->creds);
+ if (!io_op_defs[req->opcode].audit_skip)
+ audit_uring_entry(req->opcode);
+
switch (req->opcode) {
case IORING_OP_NOP:
ret = io_nop(req, issue_flags);
@@ -6756,13 +6733,16 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
break;
}
+ if (!io_op_defs[req->opcode].audit_skip)
+ audit_uring_exit(!ret, ret);
+
if (creds)
revert_creds(creds);
if (ret)
return ret;
/* If the op doesn't have a file, we're not polling for it */
- if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
- io_iopoll_req_issued(req);
+ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
+ io_iopoll_req_issued(req, issue_flags);
return 0;
}
@@ -6778,6 +6758,8 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
static void io_wq_submit_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ unsigned int issue_flags = IO_URING_F_UNLOCKED;
+ bool needs_poll = false;
struct io_kiocb *timeout;
int ret = 0;
@@ -6792,23 +6774,42 @@ static void io_wq_submit_work(struct io_wq_work *work)
io_queue_linked_timeout(timeout);
/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
- if (work->flags & IO_WQ_WORK_CANCEL)
- ret = -ECANCELED;
+ if (work->flags & IO_WQ_WORK_CANCEL) {
+ io_req_task_queue_fail(req, -ECANCELED);
+ return;
+ }
- if (!ret) {
- do {
- ret = io_issue_sqe(req, 0);
- /*
- * We can get EAGAIN for polled IO even though we're
- * forcing a sync submission from here, since we can't
- * wait for request slots on the block side.
- */
- if (ret != -EAGAIN)
- break;
- cond_resched();
- } while (1);
+ if (req->flags & REQ_F_FORCE_ASYNC) {
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+ bool opcode_poll = def->pollin || def->pollout;
+
+ if (opcode_poll && file_can_poll(req->file)) {
+ needs_poll = true;
+ issue_flags |= IO_URING_F_NONBLOCK;
+ }
}
+ do {
+ ret = io_issue_sqe(req, issue_flags);
+ if (ret != -EAGAIN)
+ break;
+ /*
+ * We can get EAGAIN for iopolled IO even though we're
+ * forcing a sync submission from here, since we can't
+ * wait for request slots on the block side.
+ */
+ if (!needs_poll) {
+ cond_resched();
+ continue;
+ }
+
+ if (io_arm_poll_handler(req) == IO_APOLL_OK)
+ return;
+ /* aborted or ready, in either case retry blocking */
+ needs_poll = false;
+ issue_flags &= ~IO_URING_F_NONBLOCK;
+ } while (1);
+
/* avoid locking problems by failing it from a clean context */
if (ret)
io_req_task_queue_fail(req, ret);
@@ -6832,12 +6833,7 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file
{
unsigned long file_ptr = (unsigned long) file;
- if (__io_file_supports_nowait(file, READ))
- file_ptr |= FFS_ASYNC_READ;
- if (__io_file_supports_nowait(file, WRITE))
- file_ptr |= FFS_ASYNC_WRITE;
- if (S_ISREG(file_inode(file)->i_mode))
- file_ptr |= FFS_ISREG;
+ file_ptr |= io_file_get_flags(file);
file_slot->file_ptr = file_ptr;
}
@@ -6854,8 +6850,8 @@ static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
file = (struct file *) (file_ptr & FFS_MASK);
file_ptr &= ~FFS_MASK;
/* mask in overlapping REQ_F and FFS bits */
- req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
- io_req_set_rsrc_node(req);
+ req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
+ io_req_set_rsrc_node(req, ctx);
return file;
}
@@ -6947,67 +6943,62 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
io_put_req(req);
}
-static void __io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
+ __must_hold(&req->ctx->uring_lock)
+{
+ struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+
+ switch (io_arm_poll_handler(req)) {
+ case IO_APOLL_READY:
+ io_req_task_queue(req);
+ break;
+ case IO_APOLL_ABORTED:
+ /*
+ * Queued up for async execution, worker will release
+ * submit reference when the iocb is actually submitted.
+ */
+ io_queue_async_work(req, NULL);
+ break;
+ }
+
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
+}
+
+static inline void __io_queue_sqe(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
struct io_kiocb *linked_timeout;
int ret;
-issue_sqe:
ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
+ if (req->flags & REQ_F_COMPLETE_INLINE) {
+ io_req_add_compl_list(req);
+ return;
+ }
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
if (likely(!ret)) {
- if (req->flags & REQ_F_COMPLETE_INLINE) {
- struct io_ring_ctx *ctx = req->ctx;
- struct io_submit_state *state = &ctx->submit_state;
-
- state->compl_reqs[state->compl_nr++] = req;
- if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
- io_submit_flush_completions(ctx);
- return;
- }
-
linked_timeout = io_prep_linked_timeout(req);
if (linked_timeout)
io_queue_linked_timeout(linked_timeout);
} else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
- linked_timeout = io_prep_linked_timeout(req);
-
- switch (io_arm_poll_handler(req)) {
- case IO_APOLL_READY:
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
- goto issue_sqe;
- case IO_APOLL_ABORTED:
- /*
- * Queued up for async execution, worker will release
- * submit reference when the iocb is actually submitted.
- */
- io_queue_async_work(req, NULL);
- break;
- }
-
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
+ io_queue_sqe_arm_apoll(req);
} else {
io_req_complete_failed(req, ret);
}
}
-static inline void io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe_fallback(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
- if (unlikely(req->ctx->drain_active) && io_drain_req(req))
- return;
-
- if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
- __io_queue_sqe(req);
- } else if (req->flags & REQ_F_FAIL) {
+ if (req->flags & REQ_F_FAIL) {
io_req_complete_fail_submit(req);
+ } else if (unlikely(req->ctx->drain_active)) {
+ io_drain_req(req);
} else {
int ret = io_req_prep_async(req);
@@ -7018,6 +7009,15 @@ static inline void io_queue_sqe(struct io_kiocb *req)
}
}
+static inline void io_queue_sqe(struct io_kiocb *req)
+ __must_hold(&req->ctx->uring_lock)
+{
+ if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
+ __io_queue_sqe(req);
+ else
+ io_queue_sqe_fallback(req);
+}
+
/*
* Check SQE restrictions (opcode and flags).
*
@@ -7027,9 +7027,6 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
struct io_kiocb *req,
unsigned int sqe_flags)
{
- if (likely(!ctx->restricted))
- return true;
-
if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
return false;
@@ -7044,16 +7041,35 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
return true;
}
+static void io_init_req_drain(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *head = ctx->submit_state.link.head;
+
+ ctx->drain_active = true;
+ if (head) {
+ /*
+ * If we need to drain a request in the middle of a link, drain
+ * the head request and the next request/link after the current
+ * link. Considering sequential execution of links,
+ * IOSQE_IO_DRAIN will be maintained for every request of our
+ * link.
+ */
+ head->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
+ ctx->drain_next = true;
+ }
+}
+
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
__must_hold(&ctx->uring_lock)
{
- struct io_submit_state *state;
unsigned int sqe_flags;
- int personality, ret = 0;
+ int personality;
+ u8 opcode;
/* req is partially pre-initialised, see io_preinit_req() */
- req->opcode = READ_ONCE(sqe->opcode);
+ req->opcode = opcode = READ_ONCE(sqe->opcode);
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags = sqe_flags = READ_ONCE(sqe->flags);
req->user_data = READ_ONCE(sqe->user_data);
@@ -7061,49 +7077,70 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->fixed_rsrc_refs = NULL;
req->task = current;
- /* enforce forwards compatibility on users */
- if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
- return -EINVAL;
- if (unlikely(req->opcode >= IORING_OP_LAST))
+ if (unlikely(opcode >= IORING_OP_LAST)) {
+ req->opcode = 0;
return -EINVAL;
- if (!io_check_restriction(ctx, req, sqe_flags))
- return -EACCES;
+ }
+ if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
+ /* enforce forwards compatibility on users */
+ if (sqe_flags & ~SQE_VALID_FLAGS)
+ return -EINVAL;
+ if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+ !io_op_defs[opcode].buffer_select)
+ return -EOPNOTSUPP;
+ if (sqe_flags & IOSQE_IO_DRAIN)
+ io_init_req_drain(req);
+ }
+ if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
+ if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
+ return -EACCES;
+ /* knock it to the slow queue path, will be drained there */
+ if (ctx->drain_active)
+ req->flags |= REQ_F_FORCE_ASYNC;
+ /* if there is no link, we're at "next" request and need to drain */
+ if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
+ ctx->drain_next = false;
+ ctx->drain_active = true;
+ req->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
+ }
+ }
- if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
- !io_op_defs[req->opcode].buffer_select)
- return -EOPNOTSUPP;
- if (unlikely(sqe_flags & IOSQE_IO_DRAIN))
- ctx->drain_active = true;
+ if (io_op_defs[opcode].needs_file) {
+ struct io_submit_state *state = &ctx->submit_state;
+
+ /*
+ * Plug now if we have more than 2 IO left after this, and the
+ * target is potentially a read/write to block based storage.
+ */
+ if (state->need_plug && io_op_defs[opcode].plug) {
+ state->plug_started = true;
+ state->need_plug = false;
+ blk_start_plug_nr_ios(&state->plug, state->submit_nr);
+ }
+
+ req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
+ (sqe_flags & IOSQE_FIXED_FILE));
+ if (unlikely(!req->file))
+ return -EBADF;
+ }
personality = READ_ONCE(sqe->personality);
if (personality) {
+ int ret;
+
req->creds = xa_load(&ctx->personalities, personality);
if (!req->creds)
return -EINVAL;
get_cred(req->creds);
+ ret = security_uring_override_creds(req->creds);
+ if (ret) {
+ put_cred(req->creds);
+ return ret;
+ }
req->flags |= REQ_F_CREDS;
}
- state = &ctx->submit_state;
-
- /*
- * Plug now if we have more than 1 IO left after this, and the target
- * is potentially a read/write to block based storage.
- */
- if (!state->plug_started && state->ios_left > 1 &&
- io_op_defs[req->opcode].plug) {
- blk_start_plug(&state->plug);
- state->plug_started = true;
- }
- if (io_op_defs[req->opcode].needs_file) {
- req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
- (sqe_flags & IOSQE_FIXED_FILE));
- if (unlikely(!req->file))
- ret = -EBADF;
- }
-
- state->ios_left--;
- return ret;
+ return io_req_prep(req, sqe);
}
static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
@@ -7115,7 +7152,8 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
ret = io_init_req(ctx, req, sqe);
if (unlikely(ret)) {
-fail_req:
+ trace_io_uring_req_failed(sqe, ret);
+
/* fail even hard links since we don't submit */
if (link->head) {
/*
@@ -7138,10 +7176,6 @@ fail_req:
return ret;
}
req_fail_link_node(req, ret);
- } else {
- ret = io_req_prep(req, sqe);
- if (unlikely(ret))
- goto fail_req;
}
/* don't need @sqe from now on */
@@ -7171,33 +7205,32 @@ fail_req:
link->last->link = req;
link->last = req;
+ if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
+ return 0;
/* last request of a link, enqueue the link */
- if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
- link->head = NULL;
- io_queue_sqe(head);
- }
- } else {
- if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- link->head = req;
- link->last = req;
- } else {
- io_queue_sqe(req);
- }
+ link->head = NULL;
+ req = head;
+ } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+ link->head = req;
+ link->last = req;
+ return 0;
}
+ io_queue_sqe(req);
return 0;
}
/*
* Batched submission is done, ensure local IO is flushed out.
*/
-static void io_submit_state_end(struct io_submit_state *state,
- struct io_ring_ctx *ctx)
+static void io_submit_state_end(struct io_ring_ctx *ctx)
{
+ struct io_submit_state *state = &ctx->submit_state;
+
if (state->link.head)
io_queue_sqe(state->link.head);
- if (state->compl_nr)
- io_submit_flush_completions(ctx);
+ /* flush only after queuing links as they can generate completions */
+ io_submit_flush_completions(ctx);
if (state->plug_started)
blk_finish_plug(&state->plug);
}
@@ -7209,7 +7242,8 @@ static void io_submit_state_start(struct io_submit_state *state,
unsigned int max_ios)
{
state->plug_started = false;
- state->ios_left = max_ios;
+ state->need_plug = max_ios > 2;
+ state->submit_nr = max_ios;
/* set only head, no need to init link_last in advance */
state->link.head = NULL;
}
@@ -7261,45 +7295,45 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
__must_hold(&ctx->uring_lock)
{
+ unsigned int entries = io_sqring_entries(ctx);
int submitted = 0;
+ if (unlikely(!entries))
+ return 0;
/* make sure SQ entry isn't read before tail */
- nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
- if (!percpu_ref_tryget_many(&ctx->refs, nr))
- return -EAGAIN;
+ nr = min3(nr, ctx->sq_entries, entries);
io_get_task_refs(nr);
io_submit_state_start(&ctx->submit_state, nr);
- while (submitted < nr) {
+ do {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
- req = io_alloc_req(ctx);
- if (unlikely(!req)) {
+ if (unlikely(!io_alloc_req_refill(ctx))) {
if (!submitted)
submitted = -EAGAIN;
break;
}
+ req = io_alloc_req(ctx);
sqe = io_get_sqe(ctx);
if (unlikely(!sqe)) {
- list_add(&req->inflight_entry, &ctx->submit_state.free_list);
+ wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
break;
}
/* will complete beyond this point, count as submitted */
submitted++;
if (io_submit_sqe(ctx, req, sqe))
break;
- }
+ } while (submitted < nr);
if (unlikely(submitted != nr)) {
int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
int unused = nr - ref_used;
current->io_uring->cached_refs += unused;
- percpu_ref_put_many(&ctx->refs, unused);
}
- io_submit_state_end(&ctx->submit_state, ctx);
+ io_submit_state_end(ctx);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
io_commit_sqring(ctx);
@@ -7338,16 +7372,15 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
- if (!list_empty(&ctx->iopoll_list) || to_submit) {
- unsigned nr_events = 0;
+ if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
const struct cred *creds = NULL;
if (ctx->sq_creds != current_cred())
creds = override_creds(ctx->sq_creds);
mutex_lock(&ctx->uring_lock);
- if (!list_empty(&ctx->iopoll_list))
- io_do_iopoll(ctx, &nr_events, 0);
+ if (!wq_list_empty(&ctx->iopoll_list))
+ io_do_iopoll(ctx, true);
/*
* Don't submit if refs are dying, good for io_uring_register(),
@@ -7367,7 +7400,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
return ret;
}
-static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
+static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
{
struct io_ring_ctx *ctx;
unsigned sq_thread_idle = 0;
@@ -7410,6 +7443,8 @@ static int io_sq_thread(void *data)
set_cpus_allowed_ptr(current, cpu_online_mask);
current->flags |= PF_NO_SETAFFINITY;
+ audit_alloc_kernel(current);
+
mutex_lock(&sqd->lock);
while (1) {
bool cap_entries, sqt_spin = false;
@@ -7424,7 +7459,7 @@ static int io_sq_thread(void *data)
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
int ret = __io_sq_thread(ctx, cap_entries);
- if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
+ if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
sqt_spin = true;
}
if (io_run_task_work())
@@ -7445,7 +7480,7 @@ static int io_sq_thread(void *data)
io_ring_set_wakeup_flag(ctx);
if ((ctx->flags & IORING_SETUP_IOPOLL) &&
- !list_empty_careful(&ctx->iopoll_list)) {
+ !wq_list_empty(&ctx->iopoll_list)) {
needs_sched = false;
break;
}
@@ -7475,6 +7510,8 @@ static int io_sq_thread(void *data)
io_run_task_work();
mutex_unlock(&sqd->lock);
+ audit_free(current);
+
complete(&sqd->exited);
do_exit(0);
}
@@ -7621,7 +7658,7 @@ static void io_free_page_table(void **table, size_t size)
kfree(table);
}
-static void **io_alloc_page_table(size_t size)
+static __cold void **io_alloc_page_table(size_t size)
{
unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
size_t init_size = size;
@@ -7650,7 +7687,7 @@ static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
kfree(ref_node);
}
-static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
+static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
struct io_ring_ctx *ctx = node->rsrc_data->ctx;
@@ -7696,10 +7733,13 @@ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
struct io_rsrc_data *data_to_kill)
+ __must_hold(&ctx->uring_lock)
{
WARN_ON_ONCE(!ctx->rsrc_backup_node);
WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
+ io_rsrc_refs_drop(ctx);
+
if (data_to_kill) {
struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
@@ -7727,7 +7767,8 @@ static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
return ctx->rsrc_backup_node ? 0 : -ENOMEM;
}
-static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx)
+static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
+ struct io_ring_ctx *ctx)
{
int ret;
@@ -7783,9 +7824,9 @@ static void io_rsrc_data_free(struct io_rsrc_data *data)
kfree(data);
}
-static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
- u64 __user *utags, unsigned nr,
- struct io_rsrc_data **pdata)
+static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
+ u64 __user *utags, unsigned nr,
+ struct io_rsrc_data **pdata)
{
struct io_rsrc_data *data;
int ret = -ENOMEM;
@@ -8353,12 +8394,12 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
unsigned int issue_flags, u32 slot_index)
{
struct io_ring_ctx *ctx = req->ctx;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
bool needs_switch = false;
struct io_fixed_file *file_slot;
int ret = -EBADF;
- io_ring_submit_lock(ctx, !force_nonblock);
+ io_ring_submit_lock(ctx, needs_lock);
if (file->f_op == &io_uring_fops)
goto err;
ret = -ENXIO;
@@ -8399,7 +8440,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
err:
if (needs_switch)
io_rsrc_node_switch(ctx, ctx->file_data);
- io_ring_submit_unlock(ctx, !force_nonblock);
+ io_ring_submit_unlock(ctx, needs_lock);
if (ret)
fput(file);
return ret;
@@ -8409,11 +8450,12 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
{
unsigned int offset = req->close.file_slot - 1;
struct io_ring_ctx *ctx = req->ctx;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_fixed_file *file_slot;
struct file *file;
int ret, i;
- io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_lock(ctx, needs_lock);
ret = -ENXIO;
if (unlikely(!ctx->file_data))
goto out;
@@ -8439,7 +8481,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
io_rsrc_node_switch(ctx, ctx->file_data);
ret = 0;
out:
- io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_unlock(ctx, needs_lock);
return ret;
}
@@ -8555,8 +8597,8 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
return io_wq_create(concurrency, &data);
}
-static int io_uring_alloc_task_context(struct task_struct *task,
- struct io_ring_ctx *ctx)
+static __cold int io_uring_alloc_task_context(struct task_struct *task,
+ struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx;
int ret;
@@ -8603,8 +8645,8 @@ void __io_uring_free(struct task_struct *tsk)
tsk->io_uring = NULL;
}
-static int io_sq_offload_create(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
+static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
{
int ret;
@@ -8627,6 +8669,10 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
struct io_sq_data *sqd;
bool attached;
+ ret = security_uring_sqpoll();
+ if (ret)
+ return ret;
+
sqd = io_get_sq_data(p, &attached);
if (IS_ERR(sqd)) {
ret = PTR_ERR(sqd);
@@ -9215,29 +9261,25 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
}
}
-static void io_req_cache_free(struct list_head *list)
-{
- struct io_kiocb *req, *nxt;
-
- list_for_each_entry_safe(req, nxt, list, inflight_entry) {
- list_del(&req->inflight_entry);
- kmem_cache_free(req_cachep, req);
- }
-}
-
static void io_req_caches_free(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
+ int nr = 0;
mutex_lock(&ctx->uring_lock);
+ io_flush_cached_locked_reqs(ctx, state);
- if (state->free_reqs) {
- kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
- state->free_reqs = 0;
- }
+ while (state->free_list.next) {
+ struct io_wq_work_node *node;
+ struct io_kiocb *req;
- io_flush_cached_locked_reqs(ctx, state);
- io_req_cache_free(&state->free_list);
+ node = wq_stack_extract(&state->free_list);
+ req = container_of(node, struct io_kiocb, comp_list);
+ kmem_cache_free(req_cachep, req);
+ nr++;
+ }
+ if (nr)
+ percpu_ref_put_many(&ctx->refs, nr);
mutex_unlock(&ctx->uring_lock);
}
@@ -9247,7 +9289,7 @@ static void io_wait_rsrc_data(struct io_rsrc_data *data)
wait_for_completion(&data->done);
}
-static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_sq_thread_finish(ctx);
@@ -9256,6 +9298,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
ctx->mm_account = NULL;
}
+ io_rsrc_refs_drop(ctx);
/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
io_wait_rsrc_data(ctx->buf_data);
io_wait_rsrc_data(ctx->file_data);
@@ -9279,6 +9322,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
if (ctx->rsrc_backup_node)
io_rsrc_node_destroy(ctx->rsrc_backup_node);
flush_delayed_work(&ctx->rsrc_put_work);
+ flush_delayed_work(&ctx->fallback_work);
WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
@@ -9309,7 +9353,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
struct io_ring_ctx *ctx = file->private_data;
__poll_t mask = 0;
- poll_wait(file, &ctx->poll_wait, wait);
+ poll_wait(file, &ctx->cq_wait, wait);
/*
* synchronizes with barrier from wq_has_sleeper call in
* io_commit_cqring
@@ -9356,7 +9400,7 @@ struct io_tctx_exit {
struct io_ring_ctx *ctx;
};
-static void io_tctx_exit_cb(struct callback_head *cb)
+static __cold void io_tctx_exit_cb(struct callback_head *cb)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_exit *work;
@@ -9371,14 +9415,14 @@ static void io_tctx_exit_cb(struct callback_head *cb)
complete(&work->completion);
}
-static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
+static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
return req->ctx == data;
}
-static void io_ring_exit_work(struct work_struct *work)
+static __cold void io_ring_exit_work(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
unsigned long timeout = jiffies + HZ * 60 * 5;
@@ -9407,6 +9451,8 @@ static void io_ring_exit_work(struct work_struct *work)
io_sq_thread_unpark(sqd);
}
+ io_req_caches_free(ctx);
+
if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
/* there is little hope left, don't run it too often */
interval = HZ * 60;
@@ -9433,7 +9479,6 @@ static void io_ring_exit_work(struct work_struct *work)
ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
if (WARN_ON_ONCE(ret))
continue;
- wake_up_process(node->task);
mutex_unlock(&ctx->uring_lock);
wait_for_completion(&exit.completion);
@@ -9447,8 +9492,8 @@ static void io_ring_exit_work(struct work_struct *work)
}
/* Returns true if we found and killed one or more timeouts */
-static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
- bool cancel_all)
+static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
+ struct task_struct *tsk, bool cancel_all)
{
struct io_kiocb *req, *tmp;
int canceled = 0;
@@ -9470,7 +9515,7 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
return canceled != 0;
}
-static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
+static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
unsigned long index;
struct creds *creds;
@@ -9532,8 +9577,9 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
return ret;
}
-static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
- struct task_struct *task, bool cancel_all)
+static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
+ struct task_struct *task,
+ bool cancel_all)
{
struct io_defer_entry *de;
LIST_HEAD(list);
@@ -9558,7 +9604,7 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
return true;
}
-static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
+static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
{
struct io_tctx_node *node;
enum io_wq_cancel cret;
@@ -9582,9 +9628,9 @@ static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
return ret;
}
-static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
- struct task_struct *task,
- bool cancel_all)
+static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
+ struct task_struct *task,
+ bool cancel_all)
{
struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
struct io_uring_task *tctx = task ? task->io_uring : NULL;
@@ -9608,7 +9654,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
/* SQPOLL thread does its own polling */
if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
(ctx->sq_data && ctx->sq_data->thread == current)) {
- while (!list_empty_careful(&ctx->iopoll_list)) {
+ while (!wq_list_empty(&ctx->iopoll_list)) {
io_iopoll_try_reap_events(ctx);
ret = true;
}
@@ -9683,7 +9729,7 @@ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
/*
* Remove this io_uring_file -> task mapping.
*/
-static void io_uring_del_tctx_node(unsigned long index)
+static __cold void io_uring_del_tctx_node(unsigned long index)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_node *node;
@@ -9706,7 +9752,7 @@ static void io_uring_del_tctx_node(unsigned long index)
kfree(node);
}
-static void io_uring_clean_tctx(struct io_uring_task *tctx)
+static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
{
struct io_wq *wq = tctx->io_wq;
struct io_tctx_node *node;
@@ -9733,7 +9779,7 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
return percpu_counter_sum(&tctx->inflight);
}
-static void io_uring_drop_tctx_refs(struct task_struct *task)
+static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
{
struct io_uring_task *tctx = task->io_uring;
unsigned int refs = tctx->cached_refs;
@@ -9749,7 +9795,8 @@ static void io_uring_drop_tctx_refs(struct task_struct *task)
* Find any io_uring ctx that this task has registered or done IO on, and cancel
* requests. @sqd should be non-NULL iff it's an SQPOLL thread cancellation.
*/
-static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
+static __cold void io_uring_cancel_generic(bool cancel_all,
+ struct io_sq_data *sqd)
{
struct io_uring_task *tctx = current->io_uring;
struct io_ring_ctx *ctx;
@@ -9842,7 +9889,7 @@ static void *io_uring_validate_mmap_request(struct file *file,
#ifdef CONFIG_MMU
-static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
size_t sz = vma->vm_end - vma->vm_start;
unsigned long pfn;
@@ -10027,7 +10074,7 @@ out_fput:
}
#ifdef CONFIG_PROC_FS
-static int io_uring_show_cred(struct seq_file *m, unsigned int id,
+static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
const struct cred *cred)
{
struct user_namespace *uns = seq_user_ns(m);
@@ -10059,11 +10106,59 @@ static int io_uring_show_cred(struct seq_file *m, unsigned int id,
return 0;
}
-static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
+static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
+ struct seq_file *m)
{
struct io_sq_data *sq = NULL;
+ struct io_overflow_cqe *ocqe;
+ struct io_rings *r = ctx->rings;
+ unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
+ unsigned int sq_head = READ_ONCE(r->sq.head);
+ unsigned int sq_tail = READ_ONCE(r->sq.tail);
+ unsigned int cq_head = READ_ONCE(r->cq.head);
+ unsigned int cq_tail = READ_ONCE(r->cq.tail);
+ unsigned int sq_entries, cq_entries;
bool has_lock;
- int i;
+ unsigned int i;
+
+ /*
+ * we may get imprecise sqe and cqe info if the ring is actively running,
+ * since we read cached_sq_head and cached_cq_tail without uring_lock,
+ * and sq_tail and cq_head are changed by userspace. But that's fine,
+ * since this info is usually only consulted when the ring is stuck.
+ */
+ seq_printf(m, "SqMask:\t\t0x%x\n", sq_mask);
+ seq_printf(m, "SqHead:\t%u\n", sq_head);
+ seq_printf(m, "SqTail:\t%u\n", sq_tail);
+ seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
+ seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
+ seq_printf(m, "CqHead:\t%u\n", cq_head);
+ seq_printf(m, "CqTail:\t%u\n", cq_tail);
+ seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
+ seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
+ sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
+ for (i = 0; i < sq_entries; i++) {
+ unsigned int entry = i + sq_head;
+ unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
+ struct io_uring_sqe *sqe;
+
+ if (sq_idx > sq_mask)
+ continue;
+ sqe = &ctx->sq_sqes[sq_idx];
+ seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
+ sq_idx, sqe->opcode, sqe->fd, sqe->flags,
+ sqe->user_data);
+ }
+ seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
+ cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
+ for (i = 0; i < cq_entries; i++) {
+ unsigned int entry = i + cq_head;
+ struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
+
+ seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
+ entry & cq_mask, cqe->user_data, cqe->res,
+ cqe->flags);
+ }
/*
* Avoid ABBA deadlock between the seq lock and the io_uring mutex,
@@ -10105,7 +10200,10 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
xa_for_each(&ctx->personalities, index, cred)
io_uring_show_cred(m, index, cred);
}
- seq_printf(m, "PollList:\n");
+ if (has_lock)
+ mutex_unlock(&ctx->uring_lock);
+
+ seq_puts(m, "PollList:\n");
spin_lock(&ctx->completion_lock);
for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
struct hlist_head *list = &ctx->cancel_hash[i];
@@ -10115,12 +10213,20 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
req->task->task_works != NULL);
}
+
+ seq_puts(m, "CqOverflowList:\n");
+ list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
+ struct io_uring_cqe *cqe = &ocqe->cqe;
+
+ seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
+ cqe->user_data, cqe->res, cqe->flags);
+
+ }
+
spin_unlock(&ctx->completion_lock);
- if (has_lock)
- mutex_unlock(&ctx->uring_lock);
}
-static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
+static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
{
struct io_ring_ctx *ctx = f->private_data;
@@ -10144,8 +10250,8 @@ static const struct file_operations io_uring_fops = {
#endif
};
-static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
+static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
{
struct io_rings *rings;
size_t size, sq_array_offset;
@@ -10221,8 +10327,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
return ERR_PTR(ret);
#endif
- file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
- O_RDWR | O_CLOEXEC);
+ file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
+ O_RDWR | O_CLOEXEC, NULL);
#if defined(CONFIG_UNIX)
if (IS_ERR(file)) {
sock_release(ctx->ring_sock);
@@ -10234,8 +10340,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
return file;
}
-static int io_uring_create(unsigned entries, struct io_uring_params *p,
- struct io_uring_params __user *params)
+static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
+ struct io_uring_params __user *params)
{
struct io_ring_ctx *ctx;
struct file *file;
@@ -10393,7 +10499,8 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
return io_uring_setup(entries, params);
}
-static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
+static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
{
struct io_uring_probe *p;
size_t size;
@@ -10449,8 +10556,8 @@ static int io_register_personality(struct io_ring_ctx *ctx)
return id;
}
-static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int nr_args)
+static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned int nr_args)
{
struct io_uring_restriction *res;
size_t size;
@@ -10584,7 +10691,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
return __io_register_rsrc_update(ctx, type, &up, up.nr);
}
-static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
+static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type)
{
struct io_uring_rsrc_register rr;
@@ -10610,8 +10717,8 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
return -EINVAL;
}
-static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
- unsigned len)
+static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned len)
{
struct io_uring_task *tctx = current->io_uring;
cpumask_var_t new_mask;
@@ -10637,7 +10744,7 @@ static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
-static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
+static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx = current->io_uring;
@@ -10647,8 +10754,8 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
return io_wq_cpu_affinity(tctx->io_wq, NULL);
}
-static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
- void __user *arg)
+static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
+ void __user *arg)
__must_hold(&ctx->uring_lock)
{
struct io_tctx_node *node;
@@ -10684,10 +10791,11 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
- memcpy(ctx->iowq_limits, new_count, sizeof(new_count));
+ for (i = 0; i < ARRAY_SIZE(new_count); i++)
+ if (new_count[i])
+ ctx->iowq_limits[i] = new_count[i];
ctx->iowq_limits_set = true;
- ret = -EINVAL;
if (tctx && tctx->io_wq) {
ret = io_wq_max_workers(tctx->io_wq, new_count);
if (ret)
@@ -10753,7 +10861,7 @@ static bool io_register_op_must_quiesce(int op)
}
}
-static int io_ctx_quiesce(struct io_ring_ctx *ctx)
+static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx)
{
long ret;
@@ -10768,10 +10876,14 @@ static int io_ctx_quiesce(struct io_ring_ctx *ctx)
*/
mutex_unlock(&ctx->uring_lock);
do {
- ret = wait_for_completion_interruptible(&ctx->ref_comp);
- if (!ret)
+ ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ);
+ if (ret) {
+ ret = min(0L, ret);
break;
+ }
+
ret = io_run_task_work_sig();
+ io_req_caches_free(ctx);
} while (ret >= 0);
mutex_lock(&ctx->uring_lock);
@@ -11002,6 +11114,8 @@ static int __init io_uring_init(void)
/* should fit into one byte */
BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
+ BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
+ BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
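
For context on the IORING_TIMEOUT_ETIME_SUCCESS hunks above, here is a minimal userspace sketch, assuming liburing and uapi headers recent enough to define the flag; the queue depth and user_data values are arbitrary. As I read the change, marking the -ETIME expiration as a success means the timeout no longer fails its link chain, so the NOP linked behind it should complete normally instead of being cancelled.

#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	int i;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	/* a timeout that is expected to expire, with a NOP linked behind it */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_timeout(sqe, &ts, 0, IORING_TIMEOUT_ETIME_SUCCESS);
	sqe->flags |= IOSQE_IO_LINK;
	sqe->user_data = 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	sqe->user_data = 2;

	io_uring_submit(&ring);

	for (i = 0; i < 2; i++) {
		if (io_uring_wait_cqe(&ring, &cqe))
			break;
		/* expected: user_data=1 completes with -ETIME, user_data=2 with 0 */
		printf("user_data=%llu res=%d\n",
		       (unsigned long long)cqe->user_data, cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}

	io_uring_queue_exit(&ring);
	return 0;
}

Without the flag, the expectation is that the same chain completes the NOP with -ECANCELED, since the -ETIME result counts as a failed link head.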
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 9cc5798423d1..1753c26c8e76 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -750,7 +750,7 @@ again:
* same page as we're writing to, without it being marked
* up-to-date.
*/
- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
status = -EFAULT;
break;
}
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 4ecd255e0511..b4dc51063d36 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -31,6 +31,7 @@ struct iomap_dio {
atomic_t ref;
unsigned flags;
int error;
+ size_t done_before;
bool wait_for_completion;
union {
@@ -38,8 +39,7 @@ struct iomap_dio {
struct {
struct iov_iter *iter;
struct task_struct *waiter;
- struct request_queue *last_queue;
- blk_qc_t cookie;
+ struct bio *poll_bio;
} submit;
/* used for aio completion: */
@@ -49,29 +49,20 @@ struct iomap_dio {
};
};
-int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
-{
- struct request_queue *q = READ_ONCE(kiocb->private);
-
- if (!q)
- return 0;
- return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
-}
-EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
-
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
atomic_inc(&dio->ref);
- if (dio->iocb->ki_flags & IOCB_HIPRI)
+ if (dio->iocb->ki_flags & IOCB_HIPRI) {
bio_set_polled(bio, dio->iocb);
+ dio->submit.poll_bio = bio;
+ }
- dio->submit.last_queue = bdev_get_queue(iter->iomap.bdev);
if (dio->dops && dio->dops->submit_io)
- dio->submit.cookie = dio->dops->submit_io(iter, bio, pos);
+ dio->dops->submit_io(iter, bio, pos);
else
- dio->submit.cookie = submit_bio(bio);
+ submit_bio(bio);
}
ssize_t iomap_dio_complete(struct iomap_dio *dio)
@@ -124,6 +115,9 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
ret = generic_write_sync(iocb, ret);
+ if (ret > 0)
+ ret += dio->done_before;
+
kfree(dio);
return ret;
@@ -135,7 +129,7 @@ static void iomap_dio_complete_work(struct work_struct *work)
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
struct kiocb *iocb = dio->iocb;
- iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
+ iocb->ki_complete(iocb, iomap_dio_complete(dio));
}
/*
@@ -164,9 +158,11 @@ static void iomap_dio_bio_end_io(struct bio *bio)
} else if (dio->flags & IOMAP_DIO_WRITE) {
struct inode *inode = file_inode(dio->iocb->ki_filp);
+ WRITE_ONCE(dio->iocb->private, NULL);
INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
} else {
+ WRITE_ONCE(dio->iocb->private, NULL);
iomap_dio_complete_work(&dio->aio.work);
}
}
@@ -282,6 +278,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
if (!iov_iter_count(dio->submit.iter))
goto out;
+ /*
+ * We can only poll for single bio I/Os.
+ */
+ if (need_zeroout ||
+ ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
+ dio->iocb->ki_flags &= ~IOCB_HIPRI;
+
if (need_zeroout) {
/* zero out from the start of the block to the write offset */
pad = pos & (fs_block_size - 1);
@@ -339,6 +342,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
BIO_MAX_VECS);
+ /*
+ * We can only poll for single bio I/Os.
+ */
+ if (nr_pages)
+ dio->iocb->ki_flags &= ~IOCB_HIPRI;
iomap_dio_submit_bio(iter, dio, bio, pos);
pos += n;
} while (nr_pages);
@@ -371,6 +379,8 @@ static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter,
loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);
dio->size += length;
+ if (!length)
+ return -EFAULT;
return length;
}
@@ -402,6 +412,8 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
copied = copy_to_iter(inline_data, length, iter);
}
dio->size += copied;
+ if (!copied)
+ return -EFAULT;
return copied;
}
@@ -446,13 +458,21 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter,
* may be pure data writes. In that case, we still need to do a full data sync
* completion.
*
+ * When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL,
+ * __iomap_dio_rw can return a partial result if it encounters a non-resident
+ * page in @iter after preparing a transfer. In that case, the non-resident
+ * pages can be faulted in and the request resumed with @done_before set to the
+ * number of bytes previously transferred. The request will then complete with
+ * the correct total number of bytes transferred; this is essential for
+ * completing partial requests asynchronously.
+ *
* Returns -ENOTBLK in case of a page invalidation failure for writes.
* The caller needs to fall back to buffered I/O in this case.
*/
struct iomap_dio *
__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
- unsigned int dio_flags)
+ unsigned int dio_flags, size_t done_before)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = file_inode(iocb->ki_filp);
@@ -482,11 +502,11 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->dops = dops;
dio->error = 0;
dio->flags = 0;
+ dio->done_before = done_before;
dio->submit.iter = iter;
dio->submit.waiter = current;
- dio->submit.cookie = BLK_QC_T_NONE;
- dio->submit.last_queue = NULL;
+ dio->submit.poll_bio = NULL;
if (iov_iter_rw(iter) == READ) {
if (iomi.pos >= dio->i_size)
@@ -565,8 +585,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
inode_dio_begin(inode);
blk_start_plug(&plug);
- while ((ret = iomap_iter(&iomi, ops)) > 0)
+ while ((ret = iomap_iter(&iomi, ops)) > 0) {
iomi.processed = iomap_dio_iter(&iomi, dio);
+
+ /*
+ * We can only poll for single bio I/Os.
+ */
+ iocb->ki_flags &= ~IOCB_HIPRI;
+ }
+
blk_finish_plug(&plug);
/*
@@ -577,6 +604,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size)
iov_iter_revert(iter, iomi.pos - dio->i_size);
+ if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) {
+ if (!(iocb->ki_flags & IOCB_NOWAIT))
+ wait_for_completion = true;
+ ret = 0;
+ }
+
/* magic error code to fall back to buffered I/O */
if (ret == -ENOTBLK) {
wait_for_completion = true;
@@ -592,8 +625,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (dio->flags & IOMAP_DIO_WRITE_FUA)
dio->flags &= ~IOMAP_DIO_NEED_SYNC;
- WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
- WRITE_ONCE(iocb->private, dio->submit.last_queue);
+ WRITE_ONCE(iocb->private, dio->submit.poll_bio);
/*
* We are about to drop our additional submission reference, which
@@ -620,10 +652,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (!READ_ONCE(dio->submit.waiter))
break;
- if (!(iocb->ki_flags & IOCB_HIPRI) ||
- !dio->submit.last_queue ||
- !blk_poll(dio->submit.last_queue,
- dio->submit.cookie, true))
+ if (!dio->submit.poll_bio ||
+ !bio_poll(dio->submit.poll_bio, NULL, 0))
blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
@@ -642,11 +672,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw);
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
- unsigned int dio_flags)
+ unsigned int dio_flags, size_t done_before)
{
struct iomap_dio *dio;
- dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags);
+ dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before);
if (IS_ERR_OR_NULL(dio))
return PTR_ERR_OR_ZERO(dio);
return iomap_dio_complete(dio);
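
To go with the done_before/IOMAP_DIO_PARTIAL kernel-doc above, a minimal caller sketch of how a filesystem's direct read path might resume a partially completed request after faulting the destination pages back in. my_file_read and my_iomap_ops are placeholder names, not part of this patch, and the retry condition is deliberately simpler than in real users such as gfs2.

#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/uio.h>
#include <linux/uaccess.h>

/* placeholder: a real filesystem supplies its mapping callbacks here */
static const struct iomap_ops my_iomap_ops;

static ssize_t my_file_read(struct kiocb *iocb, struct iov_iter *to)
{
	size_t done = 0;
	ssize_t ret;

retry:
	pagefault_disable();
	ret = iomap_dio_rw(iocb, to, &my_iomap_ops, NULL,
			   IOMAP_DIO_PARTIAL, done);
	pagefault_enable();

	if (ret > 0)
		done = ret;	/* running total, already includes done_before */

	/*
	 * -EFAULT (or a short positive result with bytes left in the iter)
	 * means we hit a non-resident page with faults disabled; fault the
	 * remaining destination pages in and resume where we left off.
	 */
	if (iov_iter_count(to) && (ret > 0 || ret == -EFAULT) &&
	    fault_in_iov_iter_writeable(to, iov_iter_count(to)) == 0)
		goto retry;

	/* report what was transferred overall, or the final error */
	return done ? (ssize_t)done : ret;
}

Passing done as done_before on the retry is what lets the completion path report the correct total, which matters in particular when the resumed request finishes asynchronously.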
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 678e2c51b855..0c6eacfcbeef 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1322,6 +1322,8 @@ static int isofs_read_inode(struct inode *inode, int relocated)
de = (struct iso_directory_record *) (bh->b_data + offset);
de_len = *(unsigned char *) de;
+ if (de_len < sizeof(struct iso_directory_record))
+ goto fail;
if (offset + de_len > bufsize) {
int frag1 = bufsize - offset;
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 176580f54af9..104ae698443e 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -13,6 +13,7 @@
#include <linux/buffer_head.h>
#include <linux/mempool.h>
#include <linux/seq_file.h>
+#include <linux/writeback.h>
#include "jfs_incore.h"
#include "jfs_superblock.h"
#include "jfs_filsys.h"
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 5d7d7170c03c..aa4ff7bcaff2 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -81,14 +81,14 @@ int jfs_mount(struct super_block *sb)
* (initialize mount inode from the superblock)
*/
if ((rc = chkSuper(sb))) {
- goto errout20;
+ goto out;
}
ipaimap = diReadSpecial(sb, AGGREGATE_I, 0);
if (ipaimap == NULL) {
jfs_err("jfs_mount: Failed to read AGGREGATE_I");
rc = -EIO;
- goto errout20;
+ goto out;
}
sbi->ipaimap = ipaimap;
@@ -99,7 +99,7 @@ int jfs_mount(struct super_block *sb)
*/
if ((rc = diMount(ipaimap))) {
jfs_err("jfs_mount: diMount(ipaimap) failed w/rc = %d", rc);
- goto errout21;
+ goto err_ipaimap;
}
/*
@@ -108,7 +108,7 @@ int jfs_mount(struct super_block *sb)
ipbmap = diReadSpecial(sb, BMAP_I, 0);
if (ipbmap == NULL) {
rc = -EIO;
- goto errout22;
+ goto err_umount_ipaimap;
}
jfs_info("jfs_mount: ipbmap:0x%p", ipbmap);
@@ -120,7 +120,7 @@ int jfs_mount(struct super_block *sb)
*/
if ((rc = dbMount(ipbmap))) {
jfs_err("jfs_mount: dbMount failed w/rc = %d", rc);
- goto errout22;
+ goto err_ipbmap;
}
/*
@@ -139,7 +139,7 @@ int jfs_mount(struct super_block *sb)
if (!ipaimap2) {
jfs_err("jfs_mount: Failed to read AGGREGATE_I");
rc = -EIO;
- goto errout35;
+ goto err_umount_ipbmap;
}
sbi->ipaimap2 = ipaimap2;
@@ -151,7 +151,7 @@ int jfs_mount(struct super_block *sb)
if ((rc = diMount(ipaimap2))) {
jfs_err("jfs_mount: diMount(ipaimap2) failed, rc = %d",
rc);
- goto errout35;
+ goto err_ipaimap2;
}
} else
/* Secondary aggregate inode table is not valid */
@@ -168,7 +168,7 @@ int jfs_mount(struct super_block *sb)
jfs_err("jfs_mount: Failed to read FILESYSTEM_I");
/* open fileset secondary inode allocation map */
rc = -EIO;
- goto errout40;
+ goto err_umount_ipaimap2;
}
jfs_info("jfs_mount: ipimap:0x%p", ipimap);
@@ -178,41 +178,34 @@ int jfs_mount(struct super_block *sb)
/* initialize fileset inode allocation map */
if ((rc = diMount(ipimap))) {
jfs_err("jfs_mount: diMount failed w/rc = %d", rc);
- goto errout41;
+ goto err_ipimap;
}
- goto out;
+ return rc;
/*
* unwind on error
*/
- errout41: /* close fileset inode allocation map inode */
+err_ipimap:
+ /* close fileset inode allocation map inode */
diFreeSpecial(ipimap);
-
- errout40: /* fileset closed */
-
+err_umount_ipaimap2:
/* close secondary aggregate inode allocation map */
- if (ipaimap2) {
+ if (ipaimap2)
diUnmount(ipaimap2, 1);
+err_ipaimap2:
+ /* close aggregate inodes */
+ if (ipaimap2)
diFreeSpecial(ipaimap2);
- }
-
- errout35:
-
- /* close aggregate block allocation map */
+err_umount_ipbmap: /* close aggregate block allocation map */
dbUnmount(ipbmap, 1);
+err_ipbmap: /* close aggregate inodes */
diFreeSpecial(ipbmap);
-
- errout22: /* close aggregate inode allocation map */
-
+err_umount_ipaimap: /* close aggregate inode allocation map */
diUnmount(ipaimap, 1);
-
- errout21: /* close aggregate inodes */
+err_ipaimap: /* close aggregate inodes */
diFreeSpecial(ipaimap);
- errout20: /* aggregate closed */
-
- out:
-
+out:
if (rc)
jfs_err("Mount JFS Failure: %d", rc);
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index bde787c354fc..8b9a72ae5efa 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -86,8 +86,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
goto out;
}
- VolumeSize = i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits;
-
+ VolumeSize = sb_bdev_nr_blocks(sb);
if (VolumeSize) {
if (newLVSize > VolumeSize) {
printk(KERN_WARNING "jfs_extendfs: invalid size\n");
@@ -199,7 +198,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
txQuiesce(sb);
/* Reset size of direct inode */
- sbi->direct_inode->i_size = i_size_read(sb->s_bdev->bd_inode);
+ sbi->direct_inode->i_size = bdev_nr_bytes(sb->s_bdev);
if (sbi->mntflag & JFS_INLINELOG) {
/*
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 9030aeaf0f88..24cbc9946e01 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -284,8 +284,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
}
case Opt_resize_nosize:
{
- *newLVSize = i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits;
+ *newLVSize = sb_bdev_nr_blocks(sb);
if (*newLVSize == 0)
pr_err("JFS: Cannot determine volume size\n");
break;
@@ -551,7 +550,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
ret = -ENOMEM;
goto out_unload;
}
- inode->i_size = i_size_read(sb->s_bdev->bd_inode);
+ inode->i_size = bdev_nr_bytes(sb->s_bdev);
inode->i_mapping->a_ops = &jfs_metapage_aops;
inode_fake_hash(inode);
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index c8f8e41b8411..19a6c71c6ff5 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -36,8 +36,7 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
gid = target->iattr->ia_gid;
}
- kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, uid, gid,
- KERNFS_LINK);
+ kn = kernfs_new_node(parent, name, S_IFLNK|0777, uid, gid, KERNFS_LINK);
if (!kn)
return ERR_PTR(-ENOMEM);
diff --git a/fs/libfs.c b/fs/libfs.c
index 51b4de3b3447..ba7438ab9371 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -448,6 +448,30 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry)
}
EXPORT_SYMBOL(simple_rmdir);
+int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ bool old_is_dir = d_is_dir(old_dentry);
+ bool new_is_dir = d_is_dir(new_dentry);
+
+ if (old_dir != new_dir && old_is_dir != new_is_dir) {
+ if (old_is_dir) {
+ drop_nlink(old_dir);
+ inc_nlink(new_dir);
+ } else {
+ drop_nlink(new_dir);
+ inc_nlink(old_dir);
+ }
+ }
+ old_dir->i_ctime = old_dir->i_mtime =
+ new_dir->i_ctime = new_dir->i_mtime =
+ d_inode(old_dentry)->i_ctime =
+ d_inode(new_dentry)->i_ctime = current_time(old_dir);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(simple_rename_exchange);
+
int simple_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry, unsigned int flags)
@@ -455,9 +479,12 @@ int simple_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
struct inode *inode = d_inode(old_dentry);
int they_are_dirs = d_is_dir(old_dentry);
- if (flags & ~RENAME_NOREPLACE)
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
return -EINVAL;
+ if (flags & RENAME_EXCHANGE)
+ return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
+
if (!simple_empty(new_dentry))
return -ENOTEMPTY;
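The libfs hunks add simple_rename_exchange() and let simple_rename() accept RENAME_EXCHANGE. A minimal, hypothetical sketch of how a small in-memory filesystem would pick this up from its directory inode_operations (the examplefs_* name is invented for illustration):

	/* Hypothetical caller: the generic helpers now cover RENAME_EXCHANGE too. */
	static const struct inode_operations examplefs_dir_inode_operations = {
		.lookup	= simple_lookup,
		.unlink	= simple_unlink,
		.rmdir	= simple_rmdir,
		.rename	= simple_rename,	/* handles RENAME_NOREPLACE and RENAME_EXCHANGE */
	};

Any filesystem already pointing .rename at simple_rename gains RENAME_EXCHANGE handling from this change without further work.
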
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index b11f2afa84f1..99fffc9cb958 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -794,9 +794,6 @@ static void nlmclnt_cancel_callback(struct rpc_task *task, void *data)
goto retry_cancel;
}
- dprintk("lockd: cancel status %u (task %u)\n",
- status, task->tk_pid);
-
switch (status) {
case NLM_LCK_GRANTED:
case NLM_LCK_DENIED_GRACE_PERIOD:
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index b632be3ad57b..b220e1b91726 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -780,11 +780,9 @@ module_exit(exit_nlm);
static int nlmsvc_dispatch(struct svc_rqst *rqstp, __be32 *statp)
{
const struct svc_procedure *procp = rqstp->rq_procinfo;
- struct kvec *argv = rqstp->rq_arg.head;
- struct kvec *resv = rqstp->rq_res.head;
svcxdr_init_decode(rqstp);
- if (!procp->pc_decode(rqstp, argv->iov_base))
+ if (!procp->pc_decode(rqstp, &rqstp->rq_arg_stream))
goto out_decode_err;
*statp = procp->pc_func(rqstp);
@@ -794,7 +792,7 @@ static int nlmsvc_dispatch(struct svc_rqst *rqstp, __be32 *statp)
return 1;
svcxdr_init_encode(rqstp);
- if (!procp->pc_encode(rqstp, resv->iov_base + resv->iov_len))
+ if (!procp->pc_encode(rqstp, &rqstp->rq_res_stream))
goto out_encode_err;
return 1;
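The dispatcher above now hands rq_arg_stream and rq_res_stream straight to the per-procedure XDR hooks, and the lockd decoders and encoders further down are converted to match. A rough sketch of the corresponding sunrpc procedure hooks after this series (shape inferred from the hunks; the authoritative definition is in the sunrpc headers and carries more fields):

	/* Assumed shape: decode/encode consume an xdr_stream and report success as bool. */
	struct svc_procedure {
		__be32	(*pc_func)(struct svc_rqst *rqstp);
		bool	(*pc_decode)(struct svc_rqst *rqstp, struct xdr_stream *xdr);
		bool	(*pc_encode)(struct svc_rqst *rqstp, struct xdr_stream *xdr);
		/* release, argument/result sizing and cache fields omitted */
	};
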
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index e10ae2c41279..176b468a61c7 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -269,8 +269,6 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp)
*/
static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
{
- dprintk("lockd: %5u callback returned %d\n", task->tk_pid,
- -task->tk_status);
}
static void nlm4svc_callback_release(void *data)
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 99696d3f6dd6..4dc1b40a489a 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -301,8 +301,6 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp)
*/
static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
{
- dprintk("lockd: %5u callback returned %d\n", task->tk_pid,
- -task->tk_status);
}
void nlmsvc_release_call(struct nlm_rqst *call)
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 9235e60b1769..2fb5748dae0c 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -145,137 +145,131 @@ svcxdr_encode_testrply(struct xdr_stream *xdr, const struct nlm_res *resp)
* Decode Call arguments
*/
-int
-nlmsvc_decode_void(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- return 1;
+ return true;
}
-int
-nlmsvc_decode_testargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
u32 exclusive;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
- return 0;
+ return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
- return 0;
+ return false;
if (exclusive)
argp->lock.fl.fl_type = F_WRLCK;
- return 1;
+ return true;
}
-int
-nlmsvc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
u32 exclusive;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &argp->block) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
- return 0;
+ return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
- return 0;
+ return false;
if (exclusive)
argp->lock.fl.fl_type = F_WRLCK;
if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
- return 0;
+ return false;
argp->monitor = 1; /* monitor client by default */
- return 1;
+ return true;
}
-int
-nlmsvc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
u32 exclusive;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &argp->block) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
- return 0;
+ return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
- return 0;
+ return false;
if (exclusive)
argp->lock.fl.fl_type = F_WRLCK;
- return 1;
+ return true;
}
-int
-nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
- return 0;
+ return false;
argp->lock.fl.fl_type = F_UNLCK;
- return 1;
+ return true;
}
-int
-nlmsvc_decode_res(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_res *resp = rqstp->rq_argp;
if (!svcxdr_decode_cookie(xdr, &resp->cookie))
- return 0;
+ return false;
if (!svcxdr_decode_stats(xdr, &resp->status))
- return 0;
+ return false;
- return 1;
+ return true;
}
-int
-nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_reboot *argp = rqstp->rq_argp;
+ __be32 *p;
u32 len;
if (xdr_stream_decode_u32(xdr, &len) < 0)
- return 0;
+ return false;
if (len > SM_MAXSTRLEN)
- return 0;
+ return false;
p = xdr_inline_decode(xdr, len);
if (!p)
- return 0;
+ return false;
argp->len = len;
argp->mon = (char *)p;
if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
- return 0;
+ return false;
p = xdr_inline_decode(xdr, SM_PRIV_SIZE);
if (!p)
- return 0;
+ return false;
memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
- return 1;
+ return true;
}
-int
-nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
struct nlm_lock *lock = &argp->lock;
@@ -284,35 +278,34 @@ nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
lock->svid = ~(u32)0;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
- return 0;
+ return false;
if (!svcxdr_decode_fhandle(xdr, &lock->fh))
- return 0;
+ return false;
if (!svcxdr_decode_owner(xdr, &lock->oh))
- return 0;
+ return false;
/* XXX: Range checks are missing in the original code */
if (xdr_stream_decode_u32(xdr, &argp->fsm_mode) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &argp->fsm_access) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
-int
-nlmsvc_decode_notify(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
struct nlm_lock *lock = &argp->lock;
if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
@@ -320,45 +313,42 @@ nlmsvc_decode_notify(struct svc_rqst *rqstp, __be32 *p)
* Encode Reply results
*/
-int
-nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- return 1;
+ return true;
}
-int
-nlmsvc_encode_testres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nlm_res *resp = rqstp->rq_resp;
return svcxdr_encode_cookie(xdr, &resp->cookie) &&
svcxdr_encode_testrply(xdr, resp);
}
-int
-nlmsvc_encode_res(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nlm_res *resp = rqstp->rq_resp;
return svcxdr_encode_cookie(xdr, &resp->cookie) &&
svcxdr_encode_stats(xdr, resp->status);
}
-int
-nlmsvc_encode_shareres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlmsvc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nlm_res *resp = rqstp->rq_resp;
if (!svcxdr_encode_cookie(xdr, &resp->cookie))
- return 0;
+ return false;
if (!svcxdr_encode_stats(xdr, resp->status))
- return 0;
+ return false;
/* sequence */
if (xdr_stream_encode_u32(xdr, 0) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 98e957e4566c..856267c0864b 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -144,136 +144,131 @@ svcxdr_encode_testrply(struct xdr_stream *xdr, const struct nlm_res *resp)
* Decode Call arguments
*/
-int
-nlm4svc_decode_void(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- return 1;
+ return true;
}
-int
-nlm4svc_decode_testargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
u32 exclusive;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
- return 0;
+ return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
- return 0;
+ return false;
if (exclusive)
argp->lock.fl.fl_type = F_WRLCK;
- return 1;
+ return true;
}
-int
-nlm4svc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
u32 exclusive;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &argp->block) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
- return 0;
+ return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
- return 0;
+ return false;
if (exclusive)
argp->lock.fl.fl_type = F_WRLCK;
if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
- return 0;
+ return false;
argp->monitor = 1; /* monitor client by default */
- return 1;
+ return true;
}
-int
-nlm4svc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
u32 exclusive;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &argp->block) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
- return 0;
+ return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
- return 0;
+ return false;
if (exclusive)
argp->lock.fl.fl_type = F_WRLCK;
- return 1;
+
+ return true;
}
-int
-nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (!svcxdr_decode_lock(xdr, &argp->lock))
- return 0;
+ return false;
argp->lock.fl.fl_type = F_UNLCK;
- return 1;
+ return true;
}
-int
-nlm4svc_decode_res(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_res *resp = rqstp->rq_argp;
if (!svcxdr_decode_cookie(xdr, &resp->cookie))
- return 0;
+ return false;
if (!svcxdr_decode_stats(xdr, &resp->status))
- return 0;
+ return false;
- return 1;
+ return true;
}
-int
-nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_reboot *argp = rqstp->rq_argp;
+ __be32 *p;
u32 len;
if (xdr_stream_decode_u32(xdr, &len) < 0)
- return 0;
+ return false;
if (len > SM_MAXSTRLEN)
- return 0;
+ return false;
p = xdr_inline_decode(xdr, len);
if (!p)
- return 0;
+ return false;
argp->len = len;
argp->mon = (char *)p;
if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
- return 0;
+ return false;
p = xdr_inline_decode(xdr, SM_PRIV_SIZE);
if (!p)
- return 0;
+ return false;
memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
- return 1;
+ return true;
}
-int
-nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
struct nlm_lock *lock = &argp->lock;
@@ -282,35 +277,34 @@ nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
lock->svid = ~(u32)0;
if (!svcxdr_decode_cookie(xdr, &argp->cookie))
- return 0;
+ return false;
if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
- return 0;
+ return false;
if (!svcxdr_decode_fhandle(xdr, &lock->fh))
- return 0;
+ return false;
if (!svcxdr_decode_owner(xdr, &lock->oh))
- return 0;
+ return false;
/* XXX: Range checks are missing in the original code */
if (xdr_stream_decode_u32(xdr, &argp->fsm_mode) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &argp->fsm_access) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
-int
-nlm4svc_decode_notify(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
struct nlm_lock *lock = &argp->lock;
if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
@@ -318,45 +312,42 @@ nlm4svc_decode_notify(struct svc_rqst *rqstp, __be32 *p)
* Encode Reply results
*/
-int
-nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- return 1;
+ return true;
}
-int
-nlm4svc_encode_testres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nlm_res *resp = rqstp->rq_resp;
return svcxdr_encode_cookie(xdr, &resp->cookie) &&
svcxdr_encode_testrply(xdr, resp);
}
-int
-nlm4svc_encode_res(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nlm_res *resp = rqstp->rq_resp;
return svcxdr_encode_cookie(xdr, &resp->cookie) &&
svcxdr_encode_stats(xdr, resp->status);
}
-int
-nlm4svc_encode_shareres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nlm4svc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nlm_res *resp = rqstp->rq_resp;
if (!svcxdr_encode_cookie(xdr, &resp->cookie))
- return 0;
+ return false;
if (!svcxdr_encode_stats(xdr, resp->status))
- return 0;
+ return false;
/* sequence */
if (xdr_stream_encode_u32(xdr, 0) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
diff --git a/fs/locks.c b/fs/locks.c
index 3d6fb4ae847b..0fca9d680978 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2,117 +2,11 @@
/*
* linux/fs/locks.c
*
- * Provide support for fcntl()'s F_GETLK, F_SETLK, and F_SETLKW calls.
- * Doug Evans (dje@spiff.uucp), August 07, 1992
+ * We implement four types of file locks: BSD locks, posix locks, open
+ * file description locks, and leases. For details about BSD locks,
+ * see the flock(2) man page; for details about the other three, see
+ * fcntl(2).
*
- * Deadlock detection added.
- * FIXME: one thing isn't handled yet:
- * - mandatory locks (requires lots of changes elsewhere)
- * Kelly Carmichael (kelly@[142.24.8.65]), September 17, 1994.
- *
- * Miscellaneous edits, and a total rewrite of posix_lock_file() code.
- * Kai Petzke (wpp@marie.physik.tu-berlin.de), 1994
- *
- * Converted file_lock_table to a linked list from an array, which eliminates
- * the limits on how many active file locks are open.
- * Chad Page (pageone@netcom.com), November 27, 1994
- *
- * Removed dependency on file descriptors. dup()'ed file descriptors now
- * get the same locks as the original file descriptors, and a close() on
- * any file descriptor removes ALL the locks on the file for the current
- * process. Since locks still depend on the process id, locks are inherited
- * after an exec() but not after a fork(). This agrees with POSIX, and both
- * BSD and SVR4 practice.
- * Andy Walker (andy@lysaker.kvaerner.no), February 14, 1995
- *
- * Scrapped free list which is redundant now that we allocate locks
- * dynamically with kmalloc()/kfree().
- * Andy Walker (andy@lysaker.kvaerner.no), February 21, 1995
- *
- * Implemented two lock personalities - FL_FLOCK and FL_POSIX.
- *
- * FL_POSIX locks are created with calls to fcntl() and lockf() through the
- * fcntl() system call. They have the semantics described above.
- *
- * FL_FLOCK locks are created with calls to flock(), through the flock()
- * system call, which is new. Old C libraries implement flock() via fcntl()
- * and will continue to use the old, broken implementation.
- *
- * FL_FLOCK locks follow the 4.4 BSD flock() semantics. They are associated
- * with a file pointer (filp). As a result they can be shared by a parent
- * process and its children after a fork(). They are removed when the last
- * file descriptor referring to the file pointer is closed (unless explicitly
- * unlocked).
- *
- * FL_FLOCK locks never deadlock, an existing lock is always removed before
- * upgrading from shared to exclusive (or vice versa). When this happens
- * any processes blocked by the current lock are woken up and allowed to
- * run before the new lock is applied.
- * Andy Walker (andy@lysaker.kvaerner.no), June 09, 1995
- *
- * Removed some race conditions in flock_lock_file(), marked other possible
- * races. Just grep for FIXME to see them.
- * Dmitry Gorodchanin (pgmdsg@ibi.com), February 09, 1996.
- *
- * Addressed Dmitry's concerns. Deadlock checking no longer recursive.
- * Lock allocation changed to GFP_ATOMIC as we can't afford to sleep
- * once we've checked for blocking and deadlocking.
- * Andy Walker (andy@lysaker.kvaerner.no), April 03, 1996.
- *
- * Initial implementation of mandatory locks. SunOS turned out to be
- * a rotten model, so I implemented the "obvious" semantics.
- * See 'Documentation/filesystems/mandatory-locking.rst' for details.
- * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996.
- *
- * Don't allow mandatory locks on mmap()'ed files. Added simple functions to
- * check if a file has mandatory locks, used by mmap(), open() and creat() to
- * see if system call should be rejected. Ref. HP-UX/SunOS/Solaris Reference
- * Manual, Section 2.
- * Andy Walker (andy@lysaker.kvaerner.no), April 09, 1996.
- *
- * Tidied up block list handling. Added '/proc/locks' interface.
- * Andy Walker (andy@lysaker.kvaerner.no), April 24, 1996.
- *
- * Fixed deadlock condition for pathological code that mixes calls to
- * flock() and fcntl().
- * Andy Walker (andy@lysaker.kvaerner.no), April 29, 1996.
- *
- * Allow only one type of locking scheme (FL_POSIX or FL_FLOCK) to be in use
- * for a given file at a time. Changed the CONFIG_LOCK_MANDATORY scheme to
- * guarantee sensible behaviour in the case where file system modules might
- * be compiled with different options than the kernel itself.
- * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
- *
- * Added a couple of missing wake_up() calls. Thanks to Thomas Meckel
- * (Thomas.Meckel@mni.fh-giessen.de) for spotting this.
- * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
- *
- * Changed FL_POSIX locks to use the block list in the same way as FL_FLOCK
- * locks. Changed process synchronisation to avoid dereferencing locks that
- * have already been freed.
- * Andy Walker (andy@lysaker.kvaerner.no), Sep 21, 1996.
- *
- * Made the block list a circular list to minimise searching in the list.
- * Andy Walker (andy@lysaker.kvaerner.no), Sep 25, 1996.
- *
- * Made mandatory locking a mount option. Default is not to allow mandatory
- * locking.
- * Andy Walker (andy@lysaker.kvaerner.no), Oct 04, 1996.
- *
- * Some adaptations for NFS support.
- * Olaf Kirch (okir@monad.swb.de), Dec 1996,
- *
- * Fixed /proc/locks interface so that we can't overrun the buffer we are handed.
- * Andy Walker (andy@lysaker.kvaerner.no), May 12, 1997.
- *
- * Use slab allocator instead of kmalloc/kfree.
- * Use generic list implementation from <linux/list.h>.
- * Sped up posix_locks_deadlock by only considering blocked locks.
- * Matthew Wilcox <willy@debian.org>, March, 2000.
- *
- * Leases and LOCK_MAND
- * Matthew Wilcox <willy@debian.org>, June, 2000.
- * Stephen Rothwell <sfr@canb.auug.org.au>, June, 2000.
*
* Locking conflicts and dependencies:
* If multiple threads attempt to lock the same byte (or flock the same file)
@@ -461,8 +355,6 @@ static void locks_move_blocks(struct file_lock *new, struct file_lock *fl)
}
static inline int flock_translate_cmd(int cmd) {
- if (cmd & LOCK_MAND)
- return cmd & (LOCK_MAND | LOCK_RW);
switch (cmd) {
case LOCK_SH:
return F_RDLCK;
@@ -942,8 +834,6 @@ static bool flock_locks_conflict(struct file_lock *caller_fl,
*/
if (caller_fl->fl_file == sys_fl->fl_file)
return false;
- if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND))
- return false;
return locks_conflict(caller_fl, sys_fl);
}
@@ -2116,11 +2006,9 @@ EXPORT_SYMBOL(locks_lock_inode_wait);
* - %LOCK_SH -- a shared lock.
* - %LOCK_EX -- an exclusive lock.
* - %LOCK_UN -- remove an existing lock.
- * - %LOCK_MAND -- a 'mandatory' flock.
- * This exists to emulate Windows Share Modes.
+ * - %LOCK_MAND -- a 'mandatory' flock. (DEPRECATED)
*
- * %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other
- * processes read and write access respectively.
+ * %LOCK_MAND support has been removed from the kernel.
*/
SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
{
@@ -2137,9 +2025,22 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
cmd &= ~LOCK_NB;
unlock = (cmd == LOCK_UN);
- if (!unlock && !(cmd & LOCK_MAND) &&
- !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
+ if (!unlock && !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
+ goto out_putf;
+
+ /*
+ * LOCK_MAND locks were broken for a long time in that they never
+ * conflicted with one another and didn't prevent any sort of open,
+ * read or write activity.
+ *
+ * Just ignore these requests now, to preserve legacy behavior, but
+ * throw a warning to let people know that they don't actually work.
+ */
+ if (cmd & LOCK_MAND) {
+ pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n");
+ error = 0;
goto out_putf;
+ }
lock = flock_make_lock(f.file, cmd, NULL);
if (IS_ERR(lock)) {
@@ -2718,6 +2619,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
struct inode *inode = NULL;
unsigned int fl_pid;
struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
+ int type;
fl_pid = locks_translate_pid(fl, proc_pidns);
/*
@@ -2745,11 +2647,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
seq_printf(f, " %s ",
(inode == NULL) ? "*NOINODE*" : "ADVISORY ");
} else if (IS_FLOCK(fl)) {
- if (fl->fl_type & LOCK_MAND) {
- seq_puts(f, "FLOCK MSNFS ");
- } else {
- seq_puts(f, "FLOCK ADVISORY ");
- }
+ seq_puts(f, "FLOCK ADVISORY ");
} else if (IS_LEASE(fl)) {
if (fl->fl_flags & FL_DELEG)
seq_puts(f, "DELEG ");
@@ -2765,17 +2663,10 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
} else {
seq_puts(f, "UNKNOWN UNKNOWN ");
}
- if (fl->fl_type & LOCK_MAND) {
- seq_printf(f, "%s ",
- (fl->fl_type & LOCK_READ)
- ? (fl->fl_type & LOCK_WRITE) ? "RW " : "READ "
- : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE ");
- } else {
- int type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
+ type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
- seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
- (type == F_RDLCK) ? "READ" : "UNLCK");
- }
+ seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
+ (type == F_RDLCK) ? "READ" : "UNLCK");
if (inode) {
/* userspace relies on this representation of dev_t */
seq_printf(f, "%d %02x:%02x:%lu ", fl_pid,
diff --git a/fs/namei.c b/fs/namei.c
index 1946d9667790..1f9d2187c765 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3076,9 +3076,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
int error = get_write_access(inode);
if (error)
return error;
- /*
- * Refuse to truncate files with mandatory locks held on them.
- */
+
error = security_path_truncate(path);
if (!error) {
error = do_truncate(mnt_userns, path->dentry, 0,
diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index 994ec22d4040..9320a42dfaf9 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -230,7 +230,7 @@ static void netfs_rreq_completed(struct netfs_read_request *rreq, bool was_async
/*
* Deal with the completion of writing the data to the cache. We have to clear
- * the PG_fscache bits on the pages involved and release the caller's ref.
+ * the PG_fscache bits on the folios involved and release the caller's ref.
*
* May be called in softirq mode and we inherit a ref from the caller.
*/
@@ -238,7 +238,7 @@ static void netfs_rreq_unmark_after_write(struct netfs_read_request *rreq,
bool was_async)
{
struct netfs_read_subrequest *subreq;
- struct page *page;
+ struct folio *folio;
pgoff_t unlocked = 0;
bool have_unlocked = false;
@@ -247,14 +247,14 @@ static void netfs_rreq_unmark_after_write(struct netfs_read_request *rreq,
list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE);
- xas_for_each(&xas, page, (subreq->start + subreq->len - 1) / PAGE_SIZE) {
+ xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) {
/* We might have multiple writes from the same huge
- * page, but we mustn't unlock a page more than once.
+ * folio, but we mustn't unlock a folio more than once.
*/
- if (have_unlocked && page->index <= unlocked)
+ if (have_unlocked && folio_index(folio) <= unlocked)
continue;
- unlocked = page->index;
- end_page_fscache(page);
+ unlocked = folio_index(folio);
+ folio_end_fscache(folio);
have_unlocked = true;
}
}
@@ -367,18 +367,17 @@ static void netfs_rreq_write_to_cache(struct netfs_read_request *rreq,
}
/*
- * Unlock the pages in a read operation. We need to set PG_fscache on any
- * pages we're going to write back before we unlock them.
+ * Unlock the folios in a read operation. We need to set PG_fscache on any
+ * folios we're going to write back before we unlock them.
*/
static void netfs_rreq_unlock(struct netfs_read_request *rreq)
{
struct netfs_read_subrequest *subreq;
- struct page *page;
+ struct folio *folio;
unsigned int iopos, account = 0;
pgoff_t start_page = rreq->start / PAGE_SIZE;
pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
bool subreq_failed = false;
- int i;
XA_STATE(xas, &rreq->mapping->i_pages, start_page);
@@ -403,9 +402,9 @@ static void netfs_rreq_unlock(struct netfs_read_request *rreq)
trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
rcu_read_lock();
- xas_for_each(&xas, page, last_page) {
- unsigned int pgpos = (page->index - start_page) * PAGE_SIZE;
- unsigned int pgend = pgpos + thp_size(page);
+ xas_for_each(&xas, folio, last_page) {
+ unsigned int pgpos = (folio_index(folio) - start_page) * PAGE_SIZE;
+ unsigned int pgend = pgpos + folio_size(folio);
bool pg_failed = false;
for (;;) {
@@ -414,7 +413,7 @@ static void netfs_rreq_unlock(struct netfs_read_request *rreq)
break;
}
if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags))
- set_page_fscache(page);
+ folio_start_fscache(folio);
pg_failed |= subreq_failed;
if (pgend < iopos + subreq->len)
break;
@@ -433,17 +432,16 @@ static void netfs_rreq_unlock(struct netfs_read_request *rreq)
}
if (!pg_failed) {
- for (i = 0; i < thp_nr_pages(page); i++)
- flush_dcache_page(page);
- SetPageUptodate(page);
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
}
- if (!test_bit(NETFS_RREQ_DONT_UNLOCK_PAGES, &rreq->flags)) {
- if (page->index == rreq->no_unlock_page &&
- test_bit(NETFS_RREQ_NO_UNLOCK_PAGE, &rreq->flags))
+ if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
+ if (folio_index(folio) == rreq->no_unlock_folio &&
+ test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
_debug("no unlock");
else
- unlock_page(page);
+ folio_unlock(folio);
}
}
rcu_read_unlock();
@@ -876,7 +874,6 @@ void netfs_readahead(struct readahead_control *ractl,
void *netfs_priv)
{
struct netfs_read_request *rreq;
- struct page *page;
unsigned int debug_index = 0;
int ret;
@@ -911,11 +908,11 @@ void netfs_readahead(struct readahead_control *ractl,
} while (rreq->submitted < rreq->len);
- /* Drop the refs on the pages here rather than in the cache or
+ /* Drop the refs on the folios here rather than in the cache or
* filesystem. The locks will be dropped in netfs_rreq_unlock().
*/
- while ((page = readahead_page(ractl)))
- put_page(page);
+ while (readahead_folio(ractl))
+ ;
/* If we decrement nr_rd_ops to 0, the ref belongs to us. */
if (atomic_dec_and_test(&rreq->nr_rd_ops))
@@ -935,7 +932,7 @@ EXPORT_SYMBOL(netfs_readahead);
/**
* netfs_readpage - Helper to manage a readpage request
* @file: The file to read from
- * @page: The page to read
+ * @folio: The folio to read
* @ops: The network filesystem's operations for the helper to use
* @netfs_priv: Private netfs data to be retained in the request
*
@@ -950,7 +947,7 @@ EXPORT_SYMBOL(netfs_readahead);
* This is usable whether or not caching is enabled.
*/
int netfs_readpage(struct file *file,
- struct page *page,
+ struct folio *folio,
const struct netfs_read_request_ops *ops,
void *netfs_priv)
{
@@ -958,23 +955,23 @@ int netfs_readpage(struct file *file,
unsigned int debug_index = 0;
int ret;
- _enter("%lx", page_index(page));
+ _enter("%lx", folio_index(folio));
rreq = netfs_alloc_read_request(ops, netfs_priv, file);
if (!rreq) {
if (netfs_priv)
- ops->cleanup(netfs_priv, page_file_mapping(page));
- unlock_page(page);
+ ops->cleanup(netfs_priv, folio_file_mapping(folio));
+ folio_unlock(folio);
return -ENOMEM;
}
- rreq->mapping = page_file_mapping(page);
- rreq->start = page_file_offset(page);
- rreq->len = thp_size(page);
+ rreq->mapping = folio_file_mapping(folio);
+ rreq->start = folio_file_pos(folio);
+ rreq->len = folio_size(folio);
if (ops->begin_cache_operation) {
ret = ops->begin_cache_operation(rreq);
if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) {
- unlock_page(page);
+ folio_unlock(folio);
goto out;
}
}
@@ -1012,40 +1009,40 @@ out:
EXPORT_SYMBOL(netfs_readpage);
/**
- * netfs_skip_page_read - prep a page for writing without reading first
- * @page: page being prepared
+ * netfs_skip_folio_read - prep a folio for writing without reading first
+ * @folio: The folio being prepared
* @pos: starting position for the write
* @len: length of write
*
* In some cases, write_begin doesn't need to read at all:
- * - full page write
- * - write that lies in a page that is completely beyond EOF
- * - write that covers the the page from start to EOF or beyond it
+ * - full folio write
+ * - write that lies in a folio that is completely beyond EOF
+ * - write that covers the folio from start to EOF or beyond it
*
* If any of these criteria are met, then zero out the unwritten parts
- * of the page and return true. Otherwise, return false.
+ * of the folio and return true. Otherwise, return false.
*/
-static bool netfs_skip_page_read(struct page *page, loff_t pos, size_t len)
+static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio_inode(folio);
loff_t i_size = i_size_read(inode);
- size_t offset = offset_in_thp(page, pos);
+ size_t offset = offset_in_folio(folio, pos);
- /* Full page write */
- if (offset == 0 && len >= thp_size(page))
+ /* Full folio write */
+ if (offset == 0 && len >= folio_size(folio))
return true;
- /* pos beyond last page in the file */
+ /* pos beyond last folio in the file */
if (pos - offset >= i_size)
goto zero_out;
- /* Write that covers from the start of the page to EOF or beyond */
+ /* Write that covers from the start of the folio to EOF or beyond */
if (offset == 0 && (pos + len) >= i_size)
goto zero_out;
return false;
zero_out:
- zero_user_segments(page, 0, offset, offset + len, thp_size(page));
+ zero_user_segments(&folio->page, 0, offset, offset + len, folio_size(folio));
return true;
}
@@ -1054,9 +1051,9 @@ zero_out:
* @file: The file to read from
* @mapping: The mapping to read from
* @pos: File position at which the write will begin
- * @len: The length of the write (may extend beyond the end of the page chosen)
- * @flags: AOP_* flags
- * @_page: Where to put the resultant page
+ * @len: The length of the write (may extend beyond the end of the folio chosen)
+ * @aop_flags: AOP_* flags
+ * @_folio: Where to put the resultant folio
* @_fsdata: Place for the netfs to store a cookie
* @ops: The network filesystem's operations for the helper to use
* @netfs_priv: Private netfs data to be retained in the request
@@ -1072,37 +1069,41 @@ zero_out:
* issue_op, is mandatory.
*
* The check_write_begin() operation can be provided to check for and flush
- * conflicting writes once the page is grabbed and locked. It is passed a
+ * conflicting writes once the folio is grabbed and locked. It is passed a
* pointer to the fsdata cookie that gets returned to the VM to be passed to
* write_end. It is permitted to sleep. It should return 0 if the request
- * should go ahead; unlock the page and return -EAGAIN to cause the page to be
- * regot; or return an error.
+ * should go ahead; unlock the folio and return -EAGAIN to cause the folio to
+ * be regot; or return an error.
*
* This is usable whether or not caching is enabled.
*/
int netfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int flags,
- struct page **_page, void **_fsdata,
+ loff_t pos, unsigned int len, unsigned int aop_flags,
+ struct folio **_folio, void **_fsdata,
const struct netfs_read_request_ops *ops,
void *netfs_priv)
{
struct netfs_read_request *rreq;
- struct page *page, *xpage;
+ struct folio *folio;
struct inode *inode = file_inode(file);
- unsigned int debug_index = 0;
+ unsigned int debug_index = 0, fgp_flags;
pgoff_t index = pos >> PAGE_SHIFT;
int ret;
DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
retry:
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page)
+ fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
+ if (aop_flags & AOP_FLAG_NOFS)
+ fgp_flags |= FGP_NOFS;
+ folio = __filemap_get_folio(mapping, index, fgp_flags,
+ mapping_gfp_mask(mapping));
+ if (!folio)
return -ENOMEM;
if (ops->check_write_begin) {
/* Allow the netfs (eg. ceph) to flush conflicts. */
- ret = ops->check_write_begin(file, pos, len, page, _fsdata);
+ ret = ops->check_write_begin(file, pos, len, folio, _fsdata);
if (ret < 0) {
trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
if (ret == -EAGAIN)
@@ -1111,28 +1112,28 @@ retry:
}
}
- if (PageUptodate(page))
- goto have_page;
+ if (folio_test_uptodate(folio))
+ goto have_folio;
/* If the page is beyond the EOF, we want to clear it - unless it's
* within the cache granule containing the EOF, in which case we need
* to preload the granule.
*/
if (!ops->is_cache_enabled(inode) &&
- netfs_skip_page_read(page, pos, len)) {
+ netfs_skip_folio_read(folio, pos, len)) {
netfs_stat(&netfs_n_rh_write_zskip);
- goto have_page_no_wait;
+ goto have_folio_no_wait;
}
ret = -ENOMEM;
rreq = netfs_alloc_read_request(ops, netfs_priv, file);
if (!rreq)
goto error;
- rreq->mapping = page->mapping;
- rreq->start = page_offset(page);
- rreq->len = thp_size(page);
- rreq->no_unlock_page = page->index;
- __set_bit(NETFS_RREQ_NO_UNLOCK_PAGE, &rreq->flags);
+ rreq->mapping = folio_file_mapping(folio);
+ rreq->start = folio_file_pos(folio);
+ rreq->len = folio_size(folio);
+ rreq->no_unlock_folio = folio_index(folio);
+ __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
netfs_priv = NULL;
if (ops->begin_cache_operation) {
@@ -1147,14 +1148,14 @@ retry:
/* Expand the request to meet caching requirements and download
* preferences.
*/
- ractl._nr_pages = thp_nr_pages(page);
+ ractl._nr_pages = folio_nr_pages(folio);
netfs_rreq_expand(rreq, &ractl);
netfs_get_read_request(rreq);
- /* We hold the page locks, so we can drop the references */
- while ((xpage = readahead_page(&ractl)))
- if (xpage != page)
- put_page(xpage);
+ /* We hold the folio locks, so we can drop the references */
+ folio_get(folio);
+ while (readahead_folio(&ractl))
+ ;
atomic_set(&rreq->nr_rd_ops, 1);
do {
@@ -1184,22 +1185,22 @@ retry:
if (ret < 0)
goto error;
-have_page:
- ret = wait_on_page_fscache_killable(page);
+have_folio:
+ ret = folio_wait_fscache_killable(folio);
if (ret < 0)
goto error;
-have_page_no_wait:
+have_folio_no_wait:
if (netfs_priv)
ops->cleanup(netfs_priv, mapping);
- *_page = page;
+ *_folio = folio;
_leave(" = 0");
return 0;
error_put:
netfs_put_read_request(rreq, false);
error:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
if (netfs_priv)
ops->cleanup(netfs_priv, mapping);
_leave(" = %d", ret);
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index acb1d22907da..5e56da748b2a 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -252,7 +252,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
d->bdev = bdev;
- d->len = i_size_read(d->bdev->bd_inode);
+ d->len = bdev_nr_bytes(d->bdev);
d->map = bl_map_simple;
printk(KERN_INFO "pNFS: using block device %s\n",
@@ -367,7 +367,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
return PTR_ERR(bdev);
d->bdev = bdev;
- d->len = i_size_read(d->bdev->bd_inode);
+ d->len = bdev_nr_bytes(d->bdev);
d->map = bl_map_simple;
d->pr_key = v->scsi.pr_key;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index ed9d580826f5..09c5b1cb3e07 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -739,6 +739,9 @@ out:
kfree(copy);
spin_unlock(&cps->clp->cl_lock);
+ trace_nfs4_cb_offload(&args->coa_fh, &args->coa_stateid,
+ args->wr_count, args->error,
+ args->wr_writeverf.committed);
return 0;
}
#endif /* CONFIG_NFS_V4_2 */
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 4c48d85f6517..a67c41ec545f 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -67,9 +67,9 @@ static __be32 nfs4_callback_null(struct svc_rqst *rqstp)
* svc_process_common() looks for an XDR encoder to know when
* not to drop a Reply.
*/
-static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p)
+static bool nfs4_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- return xdr_ressize_check(rqstp, p);
+ return true;
}
static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 23e165d5ec9c..1e4dc1ab9312 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -828,7 +828,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
/*
* Probe filesystem information, including the FSID on v2/v3
*/
-int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr)
+static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr)
{
struct nfs_fsinfo fsinfo;
struct nfs_client *clp = server->nfs_client;
@@ -862,7 +862,30 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs
return 0;
}
-EXPORT_SYMBOL_GPL(nfs_probe_fsinfo);
+
+/*
+ * Grab the destination's particulars, including lease expiry time.
+ *
+ * Returns zero if probe succeeded and retrieved FSID matches the FSID
+ * we have cached.
+ */
+int nfs_probe_server(struct nfs_server *server, struct nfs_fh *mntfh)
+{
+ struct nfs_fattr *fattr;
+ int error;
+
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ return -ENOMEM;
+
+ /* Sanity: the probe won't work if the destination server
+ * does not recognize the migrated FH. */
+ error = nfs_probe_fsinfo(server, mntfh, fattr);
+
+ nfs_free_fattr(fattr);
+ return error;
+}
+EXPORT_SYMBOL_GPL(nfs_probe_server);
/*
* Copy useful information when duplicating a server record
@@ -1025,7 +1048,7 @@ struct nfs_server *nfs_create_server(struct fs_context *fc)
if (!(fattr->valid & NFS_ATTR_FATTR)) {
error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh,
- fattr, NULL, NULL);
+ fattr, NULL);
if (error < 0) {
dprintk("nfs_create_server: getattr error = %d\n", -error);
goto error;
@@ -1058,7 +1081,6 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
rpc_authflavor_t flavor)
{
struct nfs_server *server;
- struct nfs_fattr *fattr_fsinfo;
int error;
server = nfs_alloc_server();
@@ -1067,11 +1089,6 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
server->cred = get_cred(source->cred);
- error = -ENOMEM;
- fattr_fsinfo = nfs_alloc_fattr();
- if (fattr_fsinfo == NULL)
- goto out_free_server;
-
/* Copy data from the source */
server->nfs_client = source->nfs_client;
server->destroy = source->destroy;
@@ -1087,7 +1104,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
goto out_free_server;
/* probe the filesystem info for this server filesystem */
- error = nfs_probe_fsinfo(server, fh, fattr_fsinfo);
+ error = nfs_probe_server(server, fh);
if (error < 0)
goto out_free_server;
@@ -1101,11 +1118,9 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
nfs_server_insert_lists(server);
server->mount_time = jiffies;
- nfs_free_fattr(fattr_fsinfo);
return server;
out_free_server:
- nfs_free_fattr(fattr_fsinfo);
nfs_free_server(server);
return ERR_PTR(error);
}
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 11118398f495..7c9eb679dbdb 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -755,11 +755,13 @@ int nfs4_inode_return_delegation(struct inode *inode)
struct nfs_delegation *delegation;
delegation = nfs_start_delegation_return(nfsi);
- /* Synchronous recall of any application leases */
- break_lease(inode, O_WRONLY | O_RDWR);
- nfs_wb_all(inode);
- if (delegation != NULL)
+ if (delegation != NULL) {
+ /* Synchronous recall of any application leases */
+ break_lease(inode, O_WRONLY | O_RDWR);
+ if (S_ISREG(inode->i_mode))
+ nfs_wb_all(inode);
return nfs_end_delegation_return(inode, delegation, 1);
+ }
return 0;
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 1a6d2867fba4..731d31015b6a 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -78,6 +78,7 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
ctx->attr_gencount = nfsi->attr_gencount;
ctx->dir_cookie = 0;
ctx->dup_cookie = 0;
+ ctx->page_index = 0;
spin_lock(&dir->i_lock);
if (list_empty(&nfsi->open_files) &&
(nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
@@ -85,6 +86,7 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
NFS_INO_INVALID_DATA |
NFS_INO_REVAL_FORCED);
list_add(&ctx->list, &nfsi->open_files);
+ clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags);
spin_unlock(&dir->i_lock);
return ctx;
}
@@ -411,7 +413,8 @@ out_eof:
static bool
nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi)
{
- if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
+ if (nfsi->cache_validity & (NFS_INO_INVALID_CHANGE |
+ NFS_INO_INVALID_DATA))
return false;
smp_rmb();
return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags);
@@ -626,8 +629,7 @@ void nfs_force_use_readdirplus(struct inode *dir)
if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
!list_empty(&nfsi->open_files)) {
set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
- invalidate_mapping_pages(dir->i_mapping,
- nfsi->page_index + 1, -1);
+ set_bit(NFS_INO_FORCE_READDIR, &nfsi->flags);
}
}
@@ -680,7 +682,7 @@ again:
nfs_set_verifier(dentry, dir_verifier);
status = nfs_refresh_inode(d_inode(dentry), entry->fattr);
if (!status)
- nfs_setsecurity(d_inode(dentry), entry->fattr, entry->label);
+ nfs_setsecurity(d_inode(dentry), entry->fattr);
goto out;
} else {
d_invalidate(dentry);
@@ -694,7 +696,7 @@ again:
goto out;
}
- inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);
+ inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
alias = d_splice_alias(inode, dentry);
d_lookup_done(dentry);
if (alias) {
@@ -730,8 +732,8 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
xdr_set_scratch_page(&stream, scratch);
do {
- if (entry->label)
- entry->label->len = NFS4_MAXLABELLEN;
+ if (entry->fattr->label)
+ entry->fattr->label->len = NFS4_MAXLABELLEN;
status = xdr_decode(desc, entry, &stream);
if (status != 0)
@@ -836,21 +838,15 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
return -ENOMEM;
entry->cookie = nfs_readdir_page_last_cookie(page);
entry->fh = nfs_alloc_fhandle();
- entry->fattr = nfs_alloc_fattr();
+ entry->fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
entry->server = NFS_SERVER(inode);
if (entry->fh == NULL || entry->fattr == NULL)
goto out;
- entry->label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
- if (IS_ERR(entry->label)) {
- status = PTR_ERR(entry->label);
- goto out;
- }
-
array_size = (dtsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
pages = nfs_readdir_alloc_pages(array_size);
if (!pages)
- goto out_release_label;
+ goto out;
do {
unsigned int pglen;
@@ -873,8 +869,6 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
} while (!status && nfs_readdir_page_needs_filling(page));
nfs_readdir_free_pages(pages, array_size);
-out_release_label:
- nfs4_label_free(entry->label);
out:
nfs_free_fattr(entry->fattr);
nfs_free_fhandle(entry->fh);
@@ -937,10 +931,8 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
sizeof(nfsi->cookieverf));
}
res = nfs_readdir_search_array(desc);
- if (res == 0) {
- nfsi->page_index = desc->page_index;
+ if (res == 0)
return 0;
- }
nfs_readdir_page_unlock_and_put_cached(desc);
return res;
}
@@ -1079,6 +1071,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_open_dir_context *dir_ctx = file->private_data;
struct nfs_readdir_descriptor *desc;
+ pgoff_t page_index;
int res;
dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
@@ -1109,10 +1102,15 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
desc->dir_cookie = dir_ctx->dir_cookie;
desc->dup_cookie = dir_ctx->dup_cookie;
desc->duped = dir_ctx->duped;
+ page_index = dir_ctx->page_index;
desc->attr_gencount = dir_ctx->attr_gencount;
memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf));
spin_unlock(&file->f_lock);
+ if (test_and_clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags) &&
+ list_is_singular(&nfsi->open_files))
+ invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1);
+
do {
res = readdir_search_pagecache(desc);
@@ -1149,6 +1147,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
dir_ctx->dup_cookie = desc->dup_cookie;
dir_ctx->duped = desc->duped;
dir_ctx->attr_gencount = desc->attr_gencount;
+ dir_ctx->page_index = desc->page_index;
memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf));
spin_unlock(&file->f_lock);
@@ -1269,13 +1268,12 @@ static bool nfs_verifier_is_delegated(struct dentry *dentry)
static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf)
{
struct inode *inode = d_inode(dentry);
+ struct inode *dir = d_inode(dentry->d_parent);
- if (!nfs_verifier_is_delegated(dentry) &&
- !nfs_verify_change_attribute(d_inode(dentry->d_parent), verf))
- goto out;
+ if (!nfs_verify_change_attribute(dir, verf))
+ return;
if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
nfs_set_verifier_delegated(&verf);
-out:
dentry->d_time = verf;
}
@@ -1413,7 +1411,7 @@ out_force:
static void nfs_mark_dir_for_revalidate(struct inode *inode)
{
spin_lock(&inode->i_lock);
- nfs_set_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE);
spin_unlock(&inode->i_lock);
}
@@ -1495,19 +1493,17 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
{
struct nfs_fh *fhandle;
struct nfs_fattr *fattr;
- struct nfs4_label *label;
unsigned long dir_verifier;
int ret;
ret = -ENOMEM;
fhandle = nfs_alloc_fhandle();
- fattr = nfs_alloc_fattr();
- label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
- if (fhandle == NULL || fattr == NULL || IS_ERR(label))
+ fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
+ if (fhandle == NULL || fattr == NULL)
goto out;
dir_verifier = nfs_save_change_attribute(dir);
- ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr, label);
+ ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr);
if (ret < 0) {
switch (ret) {
case -ESTALE:
@@ -1526,7 +1522,7 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
if (nfs_refresh_inode(inode, fattr) < 0)
goto out;
- nfs_setsecurity(inode, fattr, label);
+ nfs_setsecurity(inode, fattr);
nfs_set_verifier(dentry, dir_verifier);
/* set a readdirplus hint that we had a cache miss */
@@ -1535,7 +1531,6 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
out:
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
- nfs4_label_free(label);
/*
* If the lookup failed despite the dentry change attribute being
@@ -1721,10 +1716,6 @@ static void nfs_drop_nlink(struct inode *inode)
*/
static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
{
- if (S_ISDIR(inode->i_mode))
- /* drop any readdir cache as it could easily be old */
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
-
if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
nfs_complete_unlink(dentry, inode);
nfs_drop_nlink(inode);
@@ -1759,7 +1750,6 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
struct inode *inode = NULL;
struct nfs_fh *fhandle = NULL;
struct nfs_fattr *fattr = NULL;
- struct nfs4_label *label = NULL;
unsigned long dir_verifier;
int error;
@@ -1778,27 +1768,23 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
res = ERR_PTR(-ENOMEM);
fhandle = nfs_alloc_fhandle();
- fattr = nfs_alloc_fattr();
+ fattr = nfs_alloc_fattr_with_label(NFS_SERVER(dir));
if (fhandle == NULL || fattr == NULL)
goto out;
- label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT);
- if (IS_ERR(label))
- goto out;
-
dir_verifier = nfs_save_change_attribute(dir);
trace_nfs_lookup_enter(dir, dentry, flags);
- error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr, label);
+ error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr);
if (error == -ENOENT)
goto no_entry;
if (error < 0) {
res = ERR_PTR(error);
- goto out_label;
+ goto out;
}
- inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
+ inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
res = ERR_CAST(inode);
if (IS_ERR(res))
- goto out_label;
+ goto out;
/* Notify readdir to use READDIRPLUS */
nfs_force_use_readdirplus(dir);
@@ -1807,14 +1793,12 @@ no_entry:
res = d_splice_alias(inode, dentry);
if (res != NULL) {
if (IS_ERR(res))
- goto out_label;
+ goto out;
dentry = res;
}
nfs_set_verifier(dentry, dir_verifier);
-out_label:
- trace_nfs_lookup_exit(dir, dentry, flags, error);
- nfs4_label_free(label);
out:
+ trace_nfs_lookup_exit(dir, dentry, flags, PTR_ERR_OR_ZERO(res));
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
return res;
@@ -2051,8 +2035,7 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
struct dentry *
nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr,
- struct nfs4_label *label)
+ struct nfs_fattr *fattr)
{
struct dentry *parent = dget_parent(dentry);
struct inode *dir = d_inode(parent);
@@ -2063,7 +2046,7 @@ nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
d_drop(dentry);
if (fhandle->size == 0) {
- error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr, NULL);
+ error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr);
if (error)
goto out_error;
}
@@ -2071,11 +2054,11 @@ nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
if (!(fattr->valid & NFS_ATTR_FATTR)) {
struct nfs_server *server = NFS_SB(dentry->d_sb);
error = server->nfs_client->rpc_ops->getattr(server, fhandle,
- fattr, NULL, NULL);
+ fattr, NULL);
if (error < 0)
goto out_error;
}
- inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
+ inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
d = d_splice_alias(inode, dentry);
out:
dput(parent);
@@ -2090,12 +2073,11 @@ EXPORT_SYMBOL_GPL(nfs_add_or_obtain);
* Code common to create, mkdir, and mknod.
*/
int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr,
- struct nfs4_label *label)
+ struct nfs_fattr *fattr)
{
struct dentry *d;
- d = nfs_add_or_obtain(dentry, fhandle, fattr, label);
+ d = nfs_add_or_obtain(dentry, fhandle, fattr);
if (IS_ERR(d))
return PTR_ERR(d);
@@ -2197,6 +2179,18 @@ static void nfs_dentry_handle_enoent(struct dentry *dentry)
d_delete(dentry);
}
+static void nfs_dentry_remove_handle_error(struct inode *dir,
+ struct dentry *dentry, int error)
+{
+ switch (error) {
+ case -ENOENT:
+ d_delete(dentry);
+ fallthrough;
+ case 0:
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ }
+}
+
int nfs_rmdir(struct inode *dir, struct dentry *dentry)
{
int error;
@@ -2219,6 +2213,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
} else
error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
+ nfs_dentry_remove_handle_error(dir, dentry, error);
trace_nfs_rmdir_exit(dir, dentry, error);
return error;
@@ -2288,9 +2283,8 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
}
spin_unlock(&dentry->d_lock);
error = nfs_safe_remove(dentry);
- if (!error || error == -ENOENT) {
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
- } else if (need_rehash)
+ nfs_dentry_remove_handle_error(dir, dentry, error);
+ if (need_rehash)
d_rehash(dentry);
out:
trace_nfs_unlink_exit(dir, dentry, error);
@@ -2352,6 +2346,8 @@ int nfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
return error;
}
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+
/*
* No big deal if we can't add this page to the page cache here.
* READLINK will get the missing page from the server if needed.
@@ -2385,6 +2381,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
d_drop(dentry);
error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
if (error == 0) {
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
ihold(inode);
d_add(dentry, inode);
}
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 2e894fec036b..9cff8709c80a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -275,7 +275,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
res = (long) dreq->count;
WARN_ON_ONCE(dreq->count < 0);
}
- dreq->iocb->ki_complete(dreq->iocb, res, 0);
+ dreq->iocb->ki_complete(dreq->iocb, res);
}
complete(&dreq->completion);
@@ -620,7 +620,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
nfs_unlock_and_release_request(req);
}
- if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
+ if (nfs_commit_end(cinfo.mds))
nfs_direct_write_complete(dreq);
}
diff --git a/fs/nfs/export.c b/fs/nfs/export.c
index d772c20bbfd1..171c424cb6d5 100644
--- a/fs/nfs/export.c
+++ b/fs/nfs/export.c
@@ -64,7 +64,6 @@ static struct dentry *
nfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type)
{
- struct nfs4_label *label = NULL;
struct nfs_fattr *fattr = NULL;
struct nfs_fh *server_fh = nfs_exp_embedfh(fid->raw);
size_t fh_size = offsetof(struct nfs_fh, data) + server_fh->size;
@@ -79,7 +78,7 @@ nfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
if (fh_len < len || fh_type != len)
return NULL;
- fattr = nfs_alloc_fattr();
+ fattr = nfs_alloc_fattr_with_label(NFS_SB(sb));
if (fattr == NULL) {
dentry = ERR_PTR(-ENOMEM);
goto out;
@@ -95,28 +94,19 @@ nfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
if (inode)
goto out_found;
- label = nfs4_label_alloc(NFS_SB(sb), GFP_KERNEL);
- if (IS_ERR(label)) {
- dentry = ERR_CAST(label);
- goto out_free_fattr;
- }
-
rpc_ops = NFS_SB(sb)->nfs_client->rpc_ops;
- ret = rpc_ops->getattr(NFS_SB(sb), server_fh, fattr, label, NULL);
+ ret = rpc_ops->getattr(NFS_SB(sb), server_fh, fattr, NULL);
if (ret) {
dprintk("%s: getattr failed %d\n", __func__, ret);
trace_nfs_fh_to_dentry(sb, server_fh, fattr->fileid, ret);
dentry = ERR_PTR(ret);
- goto out_free_label;
+ goto out_free_fattr;
}
- inode = nfs_fhget(sb, server_fh, fattr, label);
+ inode = nfs_fhget(sb, server_fh, fattr);
out_found:
dentry = d_obtain_alias(inode);
-
-out_free_label:
- nfs4_label_free(label);
out_free_fattr:
nfs_free_fattr(fattr);
out:
@@ -131,7 +121,6 @@ nfs_get_parent(struct dentry *dentry)
struct super_block *sb = inode->i_sb;
struct nfs_server *server = NFS_SB(sb);
struct nfs_fattr *fattr = NULL;
- struct nfs4_label *label = NULL;
struct dentry *parent;
struct nfs_rpc_ops const *ops = server->nfs_client->rpc_ops;
struct nfs_fh fh;
@@ -139,31 +128,20 @@ nfs_get_parent(struct dentry *dentry)
if (!ops->lookupp)
return ERR_PTR(-EACCES);
- fattr = nfs_alloc_fattr();
- if (fattr == NULL) {
- parent = ERR_PTR(-ENOMEM);
- goto out;
- }
+ fattr = nfs_alloc_fattr_with_label(server);
+ if (fattr == NULL)
+ return ERR_PTR(-ENOMEM);
- label = nfs4_label_alloc(server, GFP_KERNEL);
- if (IS_ERR(label)) {
- parent = ERR_CAST(label);
- goto out_free_fattr;
- }
-
- ret = ops->lookupp(inode, &fh, fattr, label);
+ ret = ops->lookupp(inode, &fh, fattr);
if (ret) {
parent = ERR_PTR(ret);
- goto out_free_label;
+ goto out;
}
- pinode = nfs_fhget(sb, &fh, fattr, label);
+ pinode = nfs_fhget(sb, &fh, fattr);
parent = d_obtain_alias(pinode);
-out_free_label:
- nfs4_label_free(label);
-out_free_fattr:
- nfs_free_fattr(fattr);
out:
+ nfs_free_fattr(fattr);
return parent;
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index aa353fd58240..24e7dccce355 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -843,15 +843,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
- /*
- * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of
- * any standard. In principle we might be able to support LOCK_MAND
- * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the
- * NFS code is not set up for it.
- */
- if (fl->fl_type & LOCK_MAND)
- return -EINVAL;
-
if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
is_local = 1;
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index d2103852475f..9c96e3e5ed35 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -293,8 +293,6 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
- dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
nfs41_sequence_done(task, &hdr->res.seq_res);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index d383de00d486..a553d59afa8b 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1414,8 +1414,6 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
- dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
nfs4_sequence_done(task, &hdr->res.seq_res);
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index c9b61b818ec1..bfa7202ca7be 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -378,10 +378,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
goto noconnect;
ds = mirror->mirror_ds->ds;
+ if (READ_ONCE(ds->ds_clp))
+ goto out;
/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
smp_rmb();
- if (ds->ds_clp)
- goto out;
/* FIXME: For now we assume the server sent only one version of NFS
* to use for the DS.
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 59355c106ece..11ff2b2e060f 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -80,31 +80,28 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc)
goto out;
/* get the actual root for this mount */
- fsinfo.fattr = nfs_alloc_fattr();
+ fsinfo.fattr = nfs_alloc_fattr_with_label(server);
if (fsinfo.fattr == NULL)
goto out_name;
- fsinfo.fattr->label = nfs4_label_alloc(server, GFP_KERNEL);
- if (IS_ERR(fsinfo.fattr->label))
- goto out_fattr;
error = server->nfs_client->rpc_ops->getroot(server, ctx->mntfh, &fsinfo);
if (error < 0) {
dprintk("nfs_get_root: getattr error = %d\n", -error);
nfs_errorf(fc, "NFS: Couldn't getattr on root");
- goto out_label;
+ goto out_fattr;
}
- inode = nfs_fhget(s, ctx->mntfh, fsinfo.fattr, NULL);
+ inode = nfs_fhget(s, ctx->mntfh, fsinfo.fattr);
if (IS_ERR(inode)) {
dprintk("nfs_get_root: get root inode failed\n");
error = PTR_ERR(inode);
nfs_errorf(fc, "NFS: Couldn't get root inode");
- goto out_label;
+ goto out_fattr;
}
error = nfs_superblock_set_dummy_root(s, inode);
if (error != 0)
- goto out_label;
+ goto out_fattr;
/* root dentries normally start off anonymous and get spliced in later
* if the dentry tree reaches them; however if the dentry already
@@ -115,7 +112,7 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc)
dprintk("nfs_get_root: get root dentry failed\n");
error = PTR_ERR(root);
nfs_errorf(fc, "NFS: Couldn't get root dentry");
- goto out_label;
+ goto out_fattr;
}
security_d_instantiate(root, inode);
@@ -151,11 +148,9 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc)
!(kflags_out & SECURITY_LSM_NATIVE_LABELS))
server->caps &= ~NFS_CAP_SECURITY_LABEL;
- nfs_setsecurity(inode, fsinfo.fattr, fsinfo.fattr->label);
+ nfs_setsecurity(inode, fsinfo.fattr);
error = 0;
-out_label:
- nfs4_label_free(fsinfo.fattr->label);
out_fattr:
nfs_free_fattr(fsinfo.fattr);
out_name:
@@ -165,5 +160,5 @@ out:
error_splat_root:
dput(fc->root);
fc->root = NULL;
- goto out_label;
+ goto out_fattr;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 853213b3a209..dd53704c3f40 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -210,10 +210,15 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
flags &= ~NFS_INO_INVALID_XATTR;
if (flags & NFS_INO_INVALID_DATA)
nfs_fscache_invalidate(inode);
- if (inode->i_mapping->nrpages == 0)
- flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER);
flags &= ~(NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED);
+
nfsi->cache_validity |= flags;
+
+ if (inode->i_mapping->nrpages == 0)
+ nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA |
+ NFS_INO_DATA_INVAL_DEFER);
+ else if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ nfsi->cache_validity &= ~NFS_INO_DATA_INVAL_DEFER;
}
EXPORT_SYMBOL_GPL(nfs_set_cache_invalid);
@@ -350,37 +355,32 @@ static void nfs_clear_label_invalid(struct inode *inode)
spin_unlock(&inode->i_lock);
}
-void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
- struct nfs4_label *label)
+void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr)
{
int error;
- if (label == NULL)
+ if (fattr->label == NULL)
return;
if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) {
- error = security_inode_notifysecctx(inode, label->label,
- label->len);
+ error = security_inode_notifysecctx(inode, fattr->label->label,
+ fattr->label->len);
if (error)
printk(KERN_ERR "%s() %s %d "
"security_inode_notifysecctx() %d\n",
__func__,
- (char *)label->label,
- label->len, error);
+ (char *)fattr->label->label,
+ fattr->label->len, error);
nfs_clear_label_invalid(inode);
}
}
struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
{
- struct nfs4_label *label = NULL;
- int minor_version = server->nfs_client->cl_minorversion;
-
- if (minor_version < 2)
- return label;
+ struct nfs4_label *label;
if (!(server->caps & NFS_CAP_SECURITY_LABEL))
- return label;
+ return NULL;
label = kzalloc(sizeof(struct nfs4_label), flags);
if (label == NULL)
@@ -397,8 +397,7 @@ struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
}
EXPORT_SYMBOL_GPL(nfs4_label_alloc);
#else
-void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
- struct nfs4_label *label)
+void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr)
{
}
#endif
@@ -426,12 +425,28 @@ nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh)
return inode;
}
+static void nfs_inode_init_regular(struct nfs_inode *nfsi)
+{
+ atomic_long_set(&nfsi->nrequests, 0);
+ INIT_LIST_HEAD(&nfsi->commit_info.list);
+ atomic_long_set(&nfsi->commit_info.ncommit, 0);
+ atomic_set(&nfsi->commit_info.rpcs_out, 0);
+ mutex_init(&nfsi->commit_mutex);
+}
+
+static void nfs_inode_init_dir(struct nfs_inode *nfsi)
+{
+ nfsi->cache_change_attribute = 0;
+ memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+ init_rwsem(&nfsi->rmdir_sem);
+}
+
/*
* This is our front-end to iget that looks up inodes by file handle
* instead of inode number.
*/
struct inode *
-nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label)
+nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
{
struct nfs_find_desc desc = {
.fh = fh,
@@ -480,10 +495,12 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
if (S_ISREG(inode->i_mode)) {
inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
inode->i_data.a_ops = &nfs_file_aops;
+ nfs_inode_init_regular(nfsi);
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
inode->i_fop = &nfs_dir_operations;
inode->i_data.a_ops = &nfs_dir_aops;
+ nfs_inode_init_dir(nfsi);
/* Deal with crossing mountpoints */
if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT ||
fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
@@ -509,7 +526,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
inode->i_uid = make_kuid(&init_user_ns, -2);
inode->i_gid = make_kgid(&init_user_ns, -2);
inode->i_blocks = 0;
- memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
nfsi->write_io = 0;
nfsi->read_io = 0;
@@ -563,7 +579,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
fattr->size != 0)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS);
- nfs_setsecurity(inode, fattr, label);
+ nfs_setsecurity(inode, fattr);
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
@@ -632,7 +648,7 @@ nfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (S_ISREG(inode->i_mode))
nfs_sync_inode(inode);
- fattr = nfs_alloc_fattr();
+ fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
if (fattr == NULL) {
error = -ENOMEM;
goto out;
@@ -666,6 +682,7 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
if (err)
goto out;
+ trace_nfs_size_truncate(inode, offset);
i_size_write(inode, offset);
/* Optimisation */
if (offset == 0)
@@ -1024,7 +1041,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
ctx->cred = get_cred(filp->f_cred);
else
ctx->cred = get_current_cred();
- ctx->ll_cred = NULL;
+ rcu_assign_pointer(ctx->ll_cred, NULL);
ctx->state = NULL;
ctx->mode = f_mode;
ctx->flags = 0;
@@ -1063,7 +1080,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
put_cred(ctx->cred);
dput(ctx->dentry);
nfs_sb_deactive(sb);
- put_rpccred(ctx->ll_cred);
+ put_rpccred(rcu_dereference_protected(ctx->ll_cred, 1));
kfree(ctx->mdsthreshold);
kfree_rcu(ctx, rcu_head);
}
@@ -1175,7 +1192,6 @@ int
__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
{
int status = -ESTALE;
- struct nfs4_label *label = NULL;
struct nfs_fattr *fattr = NULL;
struct nfs_inode *nfsi = NFS_I(inode);
@@ -1197,20 +1213,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
}
status = -ENOMEM;
- fattr = nfs_alloc_fattr();
+ fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
if (fattr == NULL)
goto out;
nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
- label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
- if (IS_ERR(label)) {
- status = PTR_ERR(label);
- goto out;
- }
-
- status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr,
- label, inode);
+ status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, inode);
if (status != 0) {
dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n",
inode->i_sb->s_id,
@@ -1227,7 +1236,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
else
nfs_zap_caches(inode);
}
- goto err_out;
+ goto out;
}
status = nfs_refresh_inode(inode, fattr);
@@ -1235,20 +1244,18 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n",
inode->i_sb->s_id,
(unsigned long long)NFS_FILEID(inode), status);
- goto err_out;
+ goto out;
}
if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
nfs_zap_acl_cache(inode);
- nfs_setsecurity(inode, fattr, label);
+ nfs_setsecurity(inode, fattr);
dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n",
inode->i_sb->s_id,
(unsigned long long)NFS_FILEID(inode));
-err_out:
- nfs4_label_free(label);
out:
nfs_free_fattr(fattr);
trace_nfs_revalidate_inode_exit(inode, status);
@@ -1446,13 +1453,12 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
&& (fattr->valid & NFS_ATTR_FATTR_MTIME)
&& timespec64_equal(&ts, &fattr->pre_mtime)) {
inode->i_mtime = fattr->mtime;
- if (S_ISDIR(inode->i_mode))
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
}
if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
&& (fattr->valid & NFS_ATTR_FATTR_SIZE)
&& i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
&& !nfs_have_writebacks(inode)) {
+ trace_nfs_size_wcc(inode, fattr->size);
i_size_write(inode, nfs_size_to_loff_t(fattr->size));
}
}
@@ -1580,12 +1586,31 @@ struct nfs_fattr *nfs_alloc_fattr(void)
struct nfs_fattr *fattr;
fattr = kmalloc(sizeof(*fattr), GFP_NOFS);
- if (fattr != NULL)
+ if (fattr != NULL) {
nfs_fattr_init(fattr);
+ fattr->label = NULL;
+ }
return fattr;
}
EXPORT_SYMBOL_GPL(nfs_alloc_fattr);
+struct nfs_fattr *nfs_alloc_fattr_with_label(struct nfs_server *server)
+{
+ struct nfs_fattr *fattr = nfs_alloc_fattr();
+
+ if (!fattr)
+ return NULL;
+
+ fattr->label = nfs4_label_alloc(server, GFP_NOFS);
+ if (IS_ERR(fattr->label)) {
+ kfree(fattr);
+ return NULL;
+ }
+
+ return fattr;
+}
+EXPORT_SYMBOL_GPL(nfs_alloc_fattr_with_label);
+
struct nfs_fh *nfs_alloc_fhandle(void)
{
struct nfs_fh *fh;
@@ -1777,8 +1802,10 @@ static int nfs_inode_finish_partial_attr_update(const struct nfs_fattr *fattr,
NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_OTHER |
NFS_INO_INVALID_NLINK;
unsigned long cache_validity = NFS_I(inode)->cache_validity;
+ enum nfs4_change_attr_type ctype = NFS_SERVER(inode)->change_attr_type;
- if (!(cache_validity & NFS_INO_INVALID_CHANGE) &&
+ if (ctype != NFS4_CHANGE_TYPE_IS_UNDEFINED &&
+ !(cache_validity & NFS_INO_INVALID_CHANGE) &&
(cache_validity & check_valid) != 0 &&
(fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
nfs_inode_attrs_cmp_monotonic(fattr, inode) == 0)
@@ -2095,16 +2122,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* Do we perhaps have any outstanding writes, or has
* the file grown beyond our last write? */
if (!nfs_have_writebacks(inode) || new_isize > cur_isize) {
+ trace_nfs_size_update(inode, new_isize);
i_size_write(inode, new_isize);
if (!have_writers)
invalid |= NFS_INO_INVALID_DATA;
}
- dprintk("NFS: isize change on server for file %s/%ld "
- "(%Ld to %Ld)\n",
- inode->i_sb->s_id,
- inode->i_ino,
- (long long)cur_isize,
- (long long)new_isize);
}
if (new_isize == 0 &&
!(fattr->valid & (NFS_ATTR_FATTR_SPACE_USED |
@@ -2155,11 +2177,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
save_cache_validity & NFS_INO_INVALID_OTHER;
if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
- if (inode->i_nlink != fattr->nlink) {
- if (S_ISDIR(inode->i_mode))
- invalid |= NFS_INO_INVALID_DATA;
+ if (inode->i_nlink != fattr->nlink)
set_nlink(inode, fattr->nlink);
- }
} else if (fattr_supported & NFS_ATTR_FATTR_NLINK)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_NLINK;
@@ -2260,14 +2279,7 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&nfsi->open_files);
INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
- INIT_LIST_HEAD(&nfsi->commit_info.list);
- atomic_long_set(&nfsi->nrequests, 0);
- atomic_long_set(&nfsi->commit_info.ncommit, 0);
- atomic_set(&nfsi->commit_info.rpcs_out, 0);
- init_rwsem(&nfsi->rmdir_sem);
- mutex_init(&nfsi->commit_mutex);
nfs4_init_once(nfsi);
- nfsi->cache_change_attribute = 0;
}
static int __init nfs_init_inodecache(void)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 66fc936834f2..12f6acb483bb 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -193,7 +193,7 @@ extern void nfs_clients_exit(struct net *net);
extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *);
int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t);
struct nfs_client *nfs_get_client(const struct nfs_client_initdata *);
-int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
+int nfs_probe_server(struct nfs_server *, struct nfs_fh *);
void nfs_server_insert_lists(struct nfs_server *);
void nfs_server_remove_lists(struct nfs_server *);
void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans);
@@ -209,6 +209,7 @@ extern struct nfs_client *
nfs4_find_client_sessionid(struct net *, const struct sockaddr *,
struct nfs4_sessionid *, u32);
extern struct nfs_server *nfs_create_server(struct fs_context *);
+extern void nfs4_server_set_init_caps(struct nfs_server *);
extern struct nfs_server *nfs4_create_server(struct fs_context *);
extern struct nfs_server *nfs4_create_referral_server(struct fs_context *);
extern int nfs4_update_server(struct nfs_server *server, const char *hostname,
@@ -341,14 +342,6 @@ nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src)
return dst;
}
-static inline void nfs4_label_free(struct nfs4_label *label)
-{
- if (label) {
- kfree(label->label);
- kfree(label);
- }
- return;
-}
static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
{
@@ -357,7 +350,6 @@ static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
}
#else
static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; }
-static inline void nfs4_label_free(void *label) {}
static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
{
}
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index bc0c698f3350..3295af4110f1 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -308,8 +308,7 @@ int nfs_submount(struct fs_context *fc, struct nfs_server *server)
/* Look it up again to get its attributes */
err = server->nfs_client->rpc_ops->lookup(d_inode(parent), dentry,
- ctx->mntfh, ctx->clone_data.fattr,
- NULL);
+ ctx->mntfh, ctx->clone_data.fattr);
dput(parent);
if (err != 0)
return err;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index f7524310ddf4..7100514d306b 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -100,8 +100,7 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
*/
static int
nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr, struct nfs4_label *label,
- struct inode *inode)
+ struct nfs_fattr *fattr, struct inode *inode)
{
struct rpc_message msg = {
.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR],
@@ -193,8 +192,7 @@ __nfs3_proc_lookup(struct inode *dir, const char *name, size_t len,
static int
nfs3_proc_lookup(struct inode *dir, struct dentry *dentry,
- struct nfs_fh *fhandle, struct nfs_fattr *fattr,
- struct nfs4_label *label)
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr)
{
unsigned short task_flags = 0;
@@ -209,7 +207,7 @@ nfs3_proc_lookup(struct inode *dir, struct dentry *dentry,
}
static int nfs3_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr, struct nfs4_label *label)
+ struct nfs_fattr *fattr)
{
const char dotdot[] = "..";
const size_t len = strlen(dotdot);
@@ -323,7 +321,7 @@ nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata
if (status != 0)
return ERR_PTR(status);
- return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr, NULL);
+ return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr);
}
static void nfs3_free_createdata(struct nfs3_createdata *data)
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index e6eca1d7481b..9274c9c5efea 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -2227,7 +2227,7 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr,
/* ignore properties */
result->lease_time = 0;
- result->change_attr_type = NFS4_CHANGE_TYPE_IS_TIME_METADATA;
+ result->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED;
return 0;
}
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index a24349512ffe..08355b66e7cb 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -83,6 +83,10 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
status = nfs_post_op_update_inode_force_wcc(inode,
res.falloc_fattr);
+ if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE])
+ trace_nfs4_fallocate(inode, &args, status);
+ else
+ trace_nfs4_deallocate(inode, &args, status);
kfree(res.falloc_fattr);
return status;
}
@@ -363,6 +367,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
status = nfs4_call_sync(dst_server->client, dst_server, &msg,
&args->seq_args, &res->seq_res, 0);
+ trace_nfs4_copy(src_inode, dst_inode, args, res, nss, status);
if (status == -ENOTSUPP)
dst_server->caps &= ~NFS_CAP_COPY;
if (status)
@@ -504,6 +509,7 @@ static void nfs42_offload_cancel_done(struct rpc_task *task, void *calldata)
{
struct nfs42_offloadcancel_data *data = calldata;
+ trace_nfs4_offload_cancel(&data->args, task->tk_status);
nfs41_sequence_done(task, &data->res.osr_seq_res);
if (task->tk_status &&
nfs4_async_handle_error(task, data->seq_server, NULL,
@@ -598,6 +604,7 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst,
status = nfs4_call_sync(src_server->client, src_server, &msg,
&args->cna_seq_args, &res->cnr_seq_res, 0);
+ trace_nfs4_copy_notify(file_inode(src), args, res, status);
if (status == -ENOTSUPP)
src_server->caps &= ~NFS_CAP_COPY_NOTIFY;
@@ -678,6 +685,7 @@ static loff_t _nfs42_proc_llseek(struct file *filep,
status = nfs4_call_sync(server->client, server, &msg,
&args.seq_args, &res.seq_res, 0);
+ trace_nfs4_llseek(inode, &args, &res, status);
if (status == -ENOTSUPP)
server->caps &= ~NFS_CAP_SEEK;
if (status)
@@ -1071,6 +1079,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
status = nfs4_call_sync(server->client, server, msg,
&args.seq_args, &res.seq_res, 0);
+ trace_nfs4_clone(src_inode, dst_inode, &args, status);
if (status == 0) {
nfs42_copy_dest_done(dst_inode, dst_offset, count);
status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ba78df4b13d9..ed5eaca6801e 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -234,7 +234,6 @@ struct nfs4_opendata {
struct nfs4_string group_name;
struct nfs4_label *a_label;
struct nfs_fattr f_attr;
- struct nfs4_label *f_label;
struct dentry *dir;
struct dentry *dentry;
struct nfs4_state_owner *owner;
@@ -317,8 +316,7 @@ extern int nfs4_set_rw_stateid(nfs4_stateid *stateid,
const struct nfs_lock_context *l_ctx,
fmode_t fmode);
extern int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr, struct nfs4_label *label,
- struct inode *inode);
+ struct nfs_fattr *fattr, struct inode *inode);
extern int update_open_stateid(struct nfs4_state *state,
const nfs4_stateid *open_stateid,
const nfs4_stateid *deleg_stateid,
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index af57332503be..d8b5a250ca05 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -1059,31 +1059,15 @@ static void nfs4_session_limit_xasize(struct nfs_server *server)
#endif
}
-static int nfs4_server_common_setup(struct nfs_server *server,
- struct nfs_fh *mntfh, bool auth_probe)
+void nfs4_server_set_init_caps(struct nfs_server *server)
{
- struct nfs_fattr *fattr;
- int error;
-
- /* data servers support only a subset of NFSv4.1 */
- if (is_ds_only_client(server->nfs_client))
- return -EPROTONOSUPPORT;
-
- fattr = nfs_alloc_fattr();
- if (fattr == NULL)
- return -ENOMEM;
-
- /* We must ensure the session is initialised first */
- error = nfs4_init_session(server->nfs_client);
- if (error < 0)
- goto out;
-
/* Set the basic capabilities */
server->caps |= server->nfs_client->cl_mvops->init_caps;
if (server->flags & NFS_MOUNT_NORDIRPLUS)
server->caps &= ~NFS_CAP_READDIRPLUS;
if (server->nfs_client->cl_proto == XPRT_TRANSPORT_RDMA)
server->caps &= ~NFS_CAP_READ_PLUS;
+
/*
* Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
* authentication.
@@ -1091,7 +1075,23 @@ static int nfs4_server_common_setup(struct nfs_server *server,
if (nfs4_disable_idmapping &&
server->client->cl_auth->au_flavor == RPC_AUTH_UNIX)
server->caps |= NFS_CAP_UIDGID_NOMAP;
+}
+static int nfs4_server_common_setup(struct nfs_server *server,
+ struct nfs_fh *mntfh, bool auth_probe)
+{
+ int error;
+
+ /* data servers support only a subset of NFSv4.1 */
+ if (is_ds_only_client(server->nfs_client))
+ return -EPROTONOSUPPORT;
+
+ /* We must ensure the session is initialised first */
+ error = nfs4_init_session(server->nfs_client);
+ if (error < 0)
+ goto out;
+
+ nfs4_server_set_init_caps(server);
/* Probe the root fh to retrieve its FSID and filehandle */
error = nfs4_get_rootfh(server, mntfh, auth_probe);
@@ -1103,7 +1103,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
(unsigned long long) server->fsid.minor);
nfs_display_fhandle(mntfh, "Pseudo-fs root FH");
- error = nfs_probe_fsinfo(server, mntfh, fattr);
+ error = nfs_probe_server(server, mntfh);
if (error < 0)
goto out;
@@ -1117,7 +1117,6 @@ static int nfs4_server_common_setup(struct nfs_server *server,
server->mount_time = jiffies;
server->destroy = nfs4_destroy_server;
out:
- nfs_free_fattr(fattr);
return error;
}
@@ -1288,30 +1287,6 @@ error:
return ERR_PTR(error);
}
-/*
- * Grab the destination's particulars, including lease expiry time.
- *
- * Returns zero if probe succeeded and retrieved FSID matches the FSID
- * we have cached.
- */
-static int nfs_probe_destination(struct nfs_server *server)
-{
- struct inode *inode = d_inode(server->super->s_root);
- struct nfs_fattr *fattr;
- int error;
-
- fattr = nfs_alloc_fattr();
- if (fattr == NULL)
- return -ENOMEM;
-
- /* Sanity: the probe won't work if the destination server
- * does not recognize the migrated FH. */
- error = nfs_probe_fsinfo(server, NFS_FH(inode), fattr);
-
- nfs_free_fattr(fattr);
- return error;
-}
-
/**
* nfs4_update_server - Move an nfs_server to a different nfs_client
*
@@ -1372,5 +1347,5 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL);
nfs_server_insert_lists(server);
- return nfs_probe_destination(server);
+ return nfs_probe_server(server, NFS_FH(d_inode(server->super->s_root)));
}
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index c91565227ea2..e79ae4cbc395 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -317,7 +317,7 @@ static int read_name_gen = 1;
static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
struct nfs_fh *src_fh, nfs4_stateid *stateid)
{
- struct nfs_fattr fattr;
+ struct nfs_fattr *fattr = nfs_alloc_fattr();
struct file *filep, *res;
struct nfs_server *server;
struct inode *r_ino = NULL;
@@ -328,9 +328,10 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
server = NFS_SERVER(ss_mnt->mnt_root->d_inode);
- nfs_fattr_init(&fattr);
+ if (!fattr)
+ return ERR_PTR(-ENOMEM);
- status = nfs4_proc_getattr(server, src_fh, &fattr, NULL, NULL);
+ status = nfs4_proc_getattr(server, src_fh, fattr, NULL);
if (status < 0) {
res = ERR_PTR(status);
goto out;
@@ -343,20 +344,18 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
goto out;
snprintf(read_name, len, SSC_READ_NAME_BODY, read_name_gen++);
- r_ino = nfs_fhget(ss_mnt->mnt_root->d_inode->i_sb, src_fh, &fattr,
- NULL);
+ r_ino = nfs_fhget(ss_mnt->mnt_root->d_inode->i_sb, src_fh, fattr);
if (IS_ERR(r_ino)) {
res = ERR_CAST(r_ino);
goto out_free_name;
}
- filep = alloc_file_pseudo(r_ino, ss_mnt, read_name, FMODE_READ,
+ filep = alloc_file_pseudo(r_ino, ss_mnt, read_name, O_RDONLY,
r_ino->i_fop);
if (IS_ERR(filep)) {
res = ERR_CAST(filep);
goto out_free_name;
}
- filep->f_mode |= FMODE_READ;
ctx = alloc_nfs_open_context(filep->f_path.dentry, filep->f_mode,
filep);
@@ -388,6 +387,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
out_free_name:
kfree(read_name);
out:
+ nfs_free_fattr(fattr);
return res;
out_stateowner:
nfs4_put_state_owner(sp);
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 8d8aba305ecc..f331866dd418 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -487,7 +487,7 @@ nfs_idmap_new(struct nfs_client *clp)
err_destroy_pipe:
rpc_destroy_pipe_data(idmap->idmap_pipe);
err:
- get_user_ns(idmap->user_ns);
+ put_user_ns(idmap->user_ns);
kfree(idmap);
return error;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e1214bb6b7ee..ee3bc79f6ca3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -93,11 +93,11 @@ struct nfs4_opendata;
static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
-static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label, struct inode *inode);
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr, struct inode *inode);
static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs_open_context *ctx, struct nfs4_label *ilabel,
- struct nfs4_label *olabel);
+ struct nfs_open_context *ctx, struct nfs4_label *ilabel);
#ifdef CONFIG_NFS_V4_1
static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
const struct cred *cred,
@@ -127,7 +127,8 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
return NULL;
err = security_dentry_init_security(dentry, sattr->ia_mode,
- &dentry->d_name, (void **)&label->label, &label->len);
+ &dentry->d_name, NULL,
+ (void **)&label->label, &label->len);
if (err == 0)
return label;
@@ -1329,7 +1330,6 @@ nfs4_map_atomic_open_claim(struct nfs_server *server,
static void nfs4_init_opendata_res(struct nfs4_opendata *p)
{
p->o_res.f_attr = &p->f_attr;
- p->o_res.f_label = p->f_label;
p->o_res.seqid = p->o_arg.seqid;
p->c_res.seqid = p->c_arg.seqid;
p->o_res.server = p->o_arg.server;
@@ -1355,8 +1355,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
if (p == NULL)
goto err;
- p->f_label = nfs4_label_alloc(server, gfp_mask);
- if (IS_ERR(p->f_label))
+ p->f_attr.label = nfs4_label_alloc(server, gfp_mask);
+ if (IS_ERR(p->f_attr.label))
goto err_free_p;
p->a_label = nfs4_label_alloc(server, gfp_mask);
@@ -1388,27 +1388,22 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
sizeof(p->o_arg.u.verifier.data));
}
}
- /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS
- * will return permission denied for all bits until close */
- if (!(flags & O_EXCL)) {
- /* ask server to check for all possible rights as results
- * are cached */
- switch (p->o_arg.claim) {
- default:
- break;
- case NFS4_OPEN_CLAIM_NULL:
- case NFS4_OPEN_CLAIM_FH:
- p->o_arg.access = NFS4_ACCESS_READ |
- NFS4_ACCESS_MODIFY |
- NFS4_ACCESS_EXTEND |
- NFS4_ACCESS_EXECUTE;
+ /* ask server to check for all possible rights as results
+ * are cached */
+ switch (p->o_arg.claim) {
+ default:
+ break;
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_FH:
+ p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY |
+ NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE |
+ NFS4_ACCESS_EXECUTE;
#ifdef CONFIG_NFS_V4_2
- if (server->caps & NFS_CAP_XATTR)
- p->o_arg.access |= NFS4_ACCESS_XAREAD |
- NFS4_ACCESS_XAWRITE |
- NFS4_ACCESS_XALIST;
+ if (!(server->caps & NFS_CAP_XATTR))
+ break;
+ p->o_arg.access |= NFS4_ACCESS_XAREAD | NFS4_ACCESS_XAWRITE |
+ NFS4_ACCESS_XALIST;
#endif
- }
}
p->o_arg.clientid = server->nfs_client->cl_clientid;
p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time);
@@ -1439,7 +1434,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
err_free_label:
nfs4_label_free(p->a_label);
err_free_f:
- nfs4_label_free(p->f_label);
+ nfs4_label_free(p->f_attr.label);
err_free_p:
kfree(p);
err:
@@ -1461,7 +1456,7 @@ static void nfs4_opendata_free(struct kref *kref)
nfs4_put_state_owner(p->owner);
nfs4_label_free(p->a_label);
- nfs4_label_free(p->f_label);
+ nfs4_label_free(p->f_attr.label);
dput(p->dir);
dput(p->dentry);
@@ -1609,15 +1604,16 @@ static bool nfs_stateid_is_sequential(struct nfs4_state *state,
{
if (test_bit(NFS_OPEN_STATE, &state->flags)) {
/* The common case - we're updating to a new sequence number */
- if (nfs4_stateid_match_other(stateid, &state->open_stateid) &&
- nfs4_stateid_is_next(&state->open_stateid, stateid)) {
- return true;
+ if (nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+ if (nfs4_stateid_is_next(&state->open_stateid, stateid))
+ return true;
+ return false;
}
- } else {
- /* This is the first OPEN in this generation */
- if (stateid->seqid == cpu_to_be32(1))
- return true;
+ /* The server returned a new stateid */
}
+ /* This is the first OPEN in this generation */
+ if (stateid->seqid == cpu_to_be32(1))
+ return true;
return false;
}
@@ -2013,7 +2009,7 @@ nfs4_opendata_get_inode(struct nfs4_opendata *data)
if (!(data->f_attr.valid & NFS_ATTR_FATTR))
return ERR_PTR(-EAGAIN);
inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh,
- &data->f_attr, data->f_label);
+ &data->f_attr);
break;
default:
inode = d_inode(data->dentry);
@@ -2472,11 +2468,15 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
/* Set the create mode (note dependency on the session type) */
data->o_arg.createmode = NFS4_CREATE_UNCHECKED;
if (data->o_arg.open_flags & O_EXCL) {
- data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE;
- if (nfs4_has_persistent_session(clp))
+ data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE4_1;
+ if (clp->cl_mvops->minor_version == 0) {
+ data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE;
+ /* don't put an ACCESS op in OPEN compound if O_EXCL,
+ * because ACCESS will return permission denied for
+ * all bits until close */
+ data->o_res.access_request = data->o_arg.access = 0;
+ } else if (nfs4_has_persistent_session(clp))
data->o_arg.createmode = NFS4_CREATE_GUARDED;
- else if (clp->cl_mvops->minor_version > 0)
- data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE4_1;
}
return;
unlock_no_action:
@@ -2708,8 +2708,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data,
}
if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) {
nfs4_sequence_free_slot(&o_res->seq_res);
- nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr,
- o_res->f_label, NULL);
+ nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, NULL);
}
return 0;
}
@@ -3125,7 +3124,6 @@ static int _nfs4_do_open(struct inode *dir,
enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL;
struct iattr *sattr = c->sattr;
struct nfs4_label *label = c->label;
- struct nfs4_label *olabel = NULL;
int status;
/* Protect against reboot recovery conflicts */
@@ -3148,19 +3146,11 @@ static int _nfs4_do_open(struct inode *dir,
if (opendata == NULL)
goto err_put_state_owner;
- if (label) {
- olabel = nfs4_label_alloc(server, GFP_KERNEL);
- if (IS_ERR(olabel)) {
- status = PTR_ERR(olabel);
- goto err_opendata_put;
- }
- }
-
if (server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
if (!opendata->f_attr.mdsthreshold) {
opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
if (!opendata->f_attr.mdsthreshold)
- goto err_free_label;
+ goto err_opendata_put;
}
opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
}
@@ -3169,7 +3159,7 @@ static int _nfs4_do_open(struct inode *dir,
status = _nfs4_open_and_get_state(opendata, flags, ctx);
if (status != 0)
- goto err_free_label;
+ goto err_opendata_put;
state = ctx->state;
if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
@@ -3186,11 +3176,11 @@ static int _nfs4_do_open(struct inode *dir,
nfs_fattr_init(opendata->o_res.f_attr);
status = nfs4_do_setattr(state->inode, cred,
opendata->o_res.f_attr, sattr,
- ctx, label, olabel);
+ ctx, label);
if (status == 0) {
nfs_setattr_update_inode(state->inode, sattr,
opendata->o_res.f_attr);
- nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
+ nfs_setsecurity(state->inode, opendata->o_res.f_attr);
}
sattr->ia_valid = ia_old;
}
@@ -3203,13 +3193,9 @@ static int _nfs4_do_open(struct inode *dir,
opendata->f_attr.mdsthreshold = NULL;
}
- nfs4_label_free(olabel);
-
nfs4_opendata_put(opendata);
nfs4_put_state_owner(sp);
return 0;
-err_free_label:
- nfs4_label_free(olabel);
err_opendata_put:
nfs4_opendata_put(opendata);
err_put_state_owner:
@@ -3354,8 +3340,7 @@ zero_stateid:
static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs_open_context *ctx, struct nfs4_label *ilabel,
- struct nfs4_label *olabel)
+ struct nfs_open_context *ctx, struct nfs4_label *ilabel)
{
struct nfs_server *server = NFS_SERVER(inode);
__u32 bitmask[NFS4_BITMASK_SZ];
@@ -3369,7 +3354,6 @@ static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
};
struct nfs_setattrres res = {
.fattr = fattr,
- .label = olabel,
.server = server,
};
struct nfs4_exception exception = {
@@ -3386,7 +3370,7 @@ static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
adjust_flags |= NFS_INO_INVALID_OTHER;
do {
- nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, olabel),
+ nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, fattr->label),
inode, adjust_flags);
err = _nfs4_do_setattr(inode, &arg, &res, cred, ctx);
@@ -3561,7 +3545,6 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
.stateid = &calldata->arg.stateid,
};
- dprintk("%s: begin!\n", __func__);
if (!nfs4_sequence_done(task, &calldata->res.seq_res))
return;
trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status);
@@ -3616,7 +3599,7 @@ out_release:
task->tk_status = 0;
nfs_release_seqid(calldata->arg.seqid);
nfs_refresh_inode(calldata->inode, &calldata->fattr);
- dprintk("%s: done, ret = %d!\n", __func__, task->tk_status);
+ dprintk("%s: ret = %d\n", __func__, task->tk_status);
return;
out_restart:
task->tk_status = 0;
@@ -3634,7 +3617,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
bool is_rdonly, is_wronly, is_rdwr;
int call_close = 0;
- dprintk("%s: begin!\n", __func__);
if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
goto out_wait;
@@ -3708,7 +3690,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
&calldata->res.seq_res,
task) != 0)
nfs_release_seqid(calldata->arg.seqid);
- dprintk("%s: done!\n", __func__);
return;
out_no_action:
task->tk_action = NULL;
@@ -3941,6 +3922,8 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
.interruptible = true,
};
int err;
+
+ nfs4_server_set_init_caps(server);
do {
err = nfs4_handle_exception(server,
_nfs4_server_capabilities(server, fhandle),
@@ -4104,7 +4087,6 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
{
int error;
struct nfs_fattr *fattr = info->fattr;
- struct nfs4_label *label = fattr->label;
error = nfs4_server_capabilities(server, mntfh);
if (error < 0) {
@@ -4112,7 +4094,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
return error;
}
- error = nfs4_proc_getattr(server, mntfh, fattr, label, NULL);
+ error = nfs4_proc_getattr(server, mntfh, fattr, NULL);
if (error < 0) {
dprintk("nfs4_get_root: getattr error = %d\n", -error);
goto out;
@@ -4175,8 +4157,7 @@ out:
}
static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr, struct nfs4_label *label,
- struct inode *inode)
+ struct nfs_fattr *fattr, struct inode *inode)
{
__u32 bitmask[NFS4_BITMASK_SZ];
struct nfs4_getattr_arg args = {
@@ -4185,7 +4166,6 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
};
struct nfs4_getattr_res res = {
.fattr = fattr,
- .label = label,
.server = server,
};
struct rpc_message msg = {
@@ -4202,7 +4182,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
if (inode && (server->flags & NFS_MOUNT_SOFTREVAL))
task_flags |= RPC_TASK_TIMEOUT;
- nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, label), inode, 0);
+ nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, fattr->label), inode, 0);
nfs_fattr_init(fattr);
nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
return nfs4_do_call_sync(server->client, server, &msg,
@@ -4210,15 +4190,14 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
}
int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr, struct nfs4_label *label,
- struct inode *inode)
+ struct nfs_fattr *fattr, struct inode *inode)
{
struct nfs4_exception exception = {
.interruptible = true,
};
int err;
do {
- err = _nfs4_proc_getattr(server, fhandle, fattr, label, inode);
+ err = _nfs4_proc_getattr(server, fhandle, fattr, inode);
trace_nfs4_getattr(server, fhandle, fattr, err);
err = nfs4_handle_exception(server, err,
&exception);
@@ -4250,7 +4229,6 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct inode *inode = d_inode(dentry);
const struct cred *cred = NULL;
struct nfs_open_context *ctx = NULL;
- struct nfs4_label *label = NULL;
int status;
if (pnfs_ld_layoutret_on_setattr(inode) &&
@@ -4276,26 +4254,21 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
cred = ctx->cred;
}
- label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
- if (IS_ERR(label))
- return PTR_ERR(label);
-
/* Return any delegations if we're going to change ACLs */
if ((sattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
nfs4_inode_make_writeable(inode);
- status = nfs4_do_setattr(inode, cred, fattr, sattr, ctx, NULL, label);
+ status = nfs4_do_setattr(inode, cred, fattr, sattr, ctx, NULL);
if (status == 0) {
nfs_setattr_update_inode(inode, sattr, fattr);
- nfs_setsecurity(inode, fattr, label);
+ nfs_setsecurity(inode, fattr);
}
- nfs4_label_free(label);
return status;
}
static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
struct dentry *dentry, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr, struct nfs4_label *label)
+ struct nfs_fattr *fattr)
{
struct nfs_server *server = NFS_SERVER(dir);
int status;
@@ -4307,7 +4280,6 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
struct nfs4_lookup_res res = {
.server = server,
.fattr = fattr,
- .label = label,
.fh = fhandle,
};
struct rpc_message msg = {
@@ -4324,7 +4296,7 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
if (nfs_lookup_is_soft_revalidate(dentry))
task_flags |= RPC_TASK_TIMEOUT;
- args.bitmask = nfs4_bitmask(server, label);
+ args.bitmask = nfs4_bitmask(server, fattr->label);
nfs_fattr_init(fattr);
@@ -4346,7 +4318,7 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr)
static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
struct dentry *dentry, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr, struct nfs4_label *label)
+ struct nfs_fattr *fattr)
{
struct nfs4_exception exception = {
.interruptible = true,
@@ -4355,7 +4327,7 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
const struct qstr *name = &dentry->d_name;
int err;
do {
- err = _nfs4_proc_lookup(client, dir, dentry, fhandle, fattr, label);
+ err = _nfs4_proc_lookup(client, dir, dentry, fhandle, fattr);
trace_nfs4_lookup(dir, name, err);
switch (err) {
case -NFS4ERR_BADNAME:
@@ -4391,13 +4363,12 @@ out:
}
static int nfs4_proc_lookup(struct inode *dir, struct dentry *dentry,
- struct nfs_fh *fhandle, struct nfs_fattr *fattr,
- struct nfs4_label *label)
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr)
{
int status;
struct rpc_clnt *client = NFS_CLIENT(dir);
- status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr, label);
+ status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr);
if (client != NFS_CLIENT(dir)) {
rpc_shutdown_client(client);
nfs_fixup_secinfo_attributes(fattr);
@@ -4412,15 +4383,14 @@ nfs4_proc_lookup_mountpoint(struct inode *dir, struct dentry *dentry,
struct rpc_clnt *client = NFS_CLIENT(dir);
int status;
- status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr, NULL);
+ status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr);
if (status < 0)
return ERR_PTR(status);
return (client == NFS_CLIENT(dir)) ? rpc_clone_client(client) : client;
}
static int _nfs4_proc_lookupp(struct inode *inode,
- struct nfs_fh *fhandle, struct nfs_fattr *fattr,
- struct nfs4_label *label)
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr)
{
struct rpc_clnt *clnt = NFS_CLIENT(inode);
struct nfs_server *server = NFS_SERVER(inode);
@@ -4432,7 +4402,6 @@ static int _nfs4_proc_lookupp(struct inode *inode,
struct nfs4_lookupp_res res = {
.server = server,
.fattr = fattr,
- .label = label,
.fh = fhandle,
};
struct rpc_message msg = {
@@ -4445,7 +4414,7 @@ static int _nfs4_proc_lookupp(struct inode *inode,
if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)
task_flags |= RPC_TASK_TIMEOUT;
- args.bitmask = nfs4_bitmask(server, label);
+ args.bitmask = nfs4_bitmask(server, fattr->label);
nfs_fattr_init(fattr);
@@ -4457,14 +4426,14 @@ static int _nfs4_proc_lookupp(struct inode *inode,
}
static int nfs4_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr, struct nfs4_label *label)
+ struct nfs_fattr *fattr)
{
struct nfs4_exception exception = {
.interruptible = true,
};
int err;
do {
- err = _nfs4_proc_lookupp(inode, fhandle, fattr, label);
+ err = _nfs4_proc_lookupp(inode, fhandle, fattr);
trace_nfs4_lookupp(inode, err);
err = nfs4_handle_exception(NFS_SERVER(inode), err,
&exception);
@@ -4791,7 +4760,6 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct
};
struct nfs4_link_res res = {
.server = server,
- .label = NULL,
};
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
@@ -4800,18 +4768,12 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct
};
int status = -ENOMEM;
- res.fattr = nfs_alloc_fattr();
+ res.fattr = nfs_alloc_fattr_with_label(server);
if (res.fattr == NULL)
goto out;
- res.label = nfs4_label_alloc(server, GFP_KERNEL);
- if (IS_ERR(res.label)) {
- status = PTR_ERR(res.label);
- goto out;
- }
-
nfs4_inode_make_writeable(inode);
- nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, res.label), inode,
+ nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, res.fattr->label), inode,
NFS_INO_INVALID_CHANGE);
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
if (!status) {
@@ -4820,12 +4782,9 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct
nfs4_inc_nlink(inode);
status = nfs_post_op_update_inode(inode, res.fattr);
if (!status)
- nfs_setsecurity(inode, res.fattr, res.label);
+ nfs_setsecurity(inode, res.fattr);
}
-
- nfs4_label_free(res.label);
-
out:
nfs_free_fattr(res.fattr);
return status;
@@ -4851,7 +4810,6 @@ struct nfs4_createdata {
struct nfs4_create_res res;
struct nfs_fh fh;
struct nfs_fattr fattr;
- struct nfs4_label *label;
};
static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
@@ -4863,8 +4821,8 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
if (data != NULL) {
struct nfs_server *server = NFS_SERVER(dir);
- data->label = nfs4_label_alloc(server, GFP_KERNEL);
- if (IS_ERR(data->label))
+ data->fattr.label = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(data->fattr.label))
goto out_free;
data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
@@ -4875,12 +4833,11 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
data->arg.name = name;
data->arg.attrs = sattr;
data->arg.ftype = ftype;
- data->arg.bitmask = nfs4_bitmask(server, data->label);
+ data->arg.bitmask = nfs4_bitmask(server, data->fattr.label);
data->arg.umask = current_umask();
data->res.server = server;
data->res.fh = &data->fh;
data->res.fattr = &data->fattr;
- data->res.label = data->label;
nfs_fattr_init(data->res.fattr);
}
return data;
@@ -4902,14 +4859,14 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
data->res.fattr->time_start,
NFS_INO_INVALID_DATA);
spin_unlock(&dir->i_lock);
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
}
return status;
}
static void nfs4_free_createdata(struct nfs4_createdata *data)
{
- nfs4_label_free(data->label);
+ nfs4_label_free(data->fattr.label);
kfree(data);
}
@@ -5347,8 +5304,6 @@ static bool nfs4_read_plus_not_supported(struct rpc_task *task,
static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
- dprintk("--> %s\n", __func__);
-
if (!nfs4_sequence_done(task, &hdr->res.seq_res))
return -EAGAIN;
if (nfs4_read_stateid_changed(task, &hdr->args))
@@ -6004,17 +5959,18 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf,
size_t buflen)
{
struct nfs_server *server = NFS_SERVER(inode);
- struct nfs_fattr fattr;
struct nfs4_label label = {0, 0, buflen, buf};
u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+ struct nfs_fattr fattr = {
+ .label = &label,
+ };
struct nfs4_getattr_arg arg = {
.fh = NFS_FH(inode),
.bitmask = bitmask,
};
struct nfs4_getattr_res res = {
.fattr = &fattr,
- .label = &label,
.server = server,
};
struct rpc_message msg = {
@@ -6056,8 +6012,7 @@ static int nfs4_get_security_label(struct inode *inode, void *buf,
static int _nfs4_do_set_security_label(struct inode *inode,
struct nfs4_label *ilabel,
- struct nfs_fattr *fattr,
- struct nfs4_label *olabel)
+ struct nfs_fattr *fattr)
{
struct iattr sattr = {0};
@@ -6072,7 +6027,6 @@ static int _nfs4_do_set_security_label(struct inode *inode,
};
struct nfs_setattrres res = {
.fattr = fattr,
- .label = olabel,
.server = server,
};
struct rpc_message msg = {
@@ -6093,15 +6047,13 @@ static int _nfs4_do_set_security_label(struct inode *inode,
static int nfs4_do_set_security_label(struct inode *inode,
struct nfs4_label *ilabel,
- struct nfs_fattr *fattr,
- struct nfs4_label *olabel)
+ struct nfs_fattr *fattr)
{
struct nfs4_exception exception = { };
int err;
do {
- err = _nfs4_do_set_security_label(inode, ilabel,
- fattr, olabel);
+ err = _nfs4_do_set_security_label(inode, ilabel, fattr);
trace_nfs4_set_security_label(inode, err);
err = nfs4_handle_exception(NFS_SERVER(inode), err,
&exception);
@@ -6112,32 +6064,21 @@ static int nfs4_do_set_security_label(struct inode *inode,
static int
nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen)
{
- struct nfs4_label ilabel, *olabel = NULL;
- struct nfs_fattr fattr;
+ struct nfs4_label ilabel = {0, 0, buflen, (char *)buf };
+ struct nfs_fattr *fattr;
int status;
if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
return -EOPNOTSUPP;
- nfs_fattr_init(&fattr);
-
- ilabel.pi = 0;
- ilabel.lfs = 0;
- ilabel.label = (char *)buf;
- ilabel.len = buflen;
-
- olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
- if (IS_ERR(olabel)) {
- status = -PTR_ERR(olabel);
- goto out;
- }
+ fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
+ if (fattr == NULL)
+ return -ENOMEM;
- status = nfs4_do_set_security_label(inode, &ilabel, &fattr, olabel);
+ status = nfs4_do_set_security_label(inode, &ilabel, fattr);
if (status == 0)
- nfs_setsecurity(inode, &fattr, olabel);
+ nfs_setsecurity(inode, fattr);
- nfs4_label_free(olabel);
-out:
return status;
}
#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
@@ -7003,7 +6944,6 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
struct nfs4_lockdata *data = calldata;
struct nfs4_state *state = data->lsp->ls_state;
- dprintk("%s: begin!\n", __func__);
if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
goto out_wait;
/* Do we need to do an open_to_lock_owner? */
@@ -7037,7 +6977,7 @@ out_release_lock_seqid:
nfs_release_seqid(data->arg.lock_seqid);
out_wait:
nfs4_sequence_done(task, &data->res.seq_res);
- dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
+ dprintk("%s: ret = %d\n", __func__, data->rpc_status);
}
static void nfs4_lock_done(struct rpc_task *task, void *calldata)
@@ -7045,8 +6985,6 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
struct nfs4_lockdata *data = calldata;
struct nfs4_lock_state *lsp = data->lsp;
- dprintk("%s: begin!\n", __func__);
-
if (!nfs4_sequence_done(task, &data->res.seq_res))
return;
@@ -7080,7 +7018,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
goto out_restart;
}
out_done:
- dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status);
+ dprintk("%s: ret = %d!\n", __func__, data->rpc_status);
return;
out_restart:
if (!data->cancelled)
@@ -7092,7 +7030,6 @@ static void nfs4_lock_release(void *calldata)
{
struct nfs4_lockdata *data = calldata;
- dprintk("%s: begin!\n", __func__);
nfs_free_seqid(data->arg.open_seqid);
if (data->cancelled && data->rpc_status == 0) {
struct rpc_task *task;
@@ -7106,7 +7043,6 @@ static void nfs4_lock_release(void *calldata)
nfs4_put_lock_state(data->lsp);
put_nfs_open_context(data->ctx);
kfree(data);
- dprintk("%s: done!\n", __func__);
}
static const struct rpc_call_ops nfs4_lock_ops = {
@@ -7153,7 +7089,6 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
if (client->cl_minorversion)
task_setup_data.flags |= RPC_TASK_MOVEABLE;
- dprintk("%s: begin!\n", __func__);
data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
fl->fl_u.nfs4_fl.owner,
recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS);
@@ -7184,7 +7119,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
data->cancelled = true;
trace_nfs4_set_lock(fl, state, &data->res.stateid, cmd, ret);
rpc_put_task(task);
- dprintk("%s: done, ret = %d!\n", __func__, ret);
+ dprintk("%s: ret = %d\n", __func__, ret);
return ret;
}
@@ -8855,14 +8790,12 @@ static void nfs4_get_lease_time_prepare(struct rpc_task *task,
struct nfs4_get_lease_time_data *data =
(struct nfs4_get_lease_time_data *)calldata;
- dprintk("--> %s\n", __func__);
/* just setup sequence, do not trigger session recovery
since we're invoked within one */
nfs4_setup_sequence(data->clp,
&data->args->la_seq_args,
&data->res->lr_seq_res,
task);
- dprintk("<-- %s\n", __func__);
}
/*
@@ -8874,13 +8807,11 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
struct nfs4_get_lease_time_data *data =
(struct nfs4_get_lease_time_data *)calldata;
- dprintk("--> %s\n", __func__);
if (!nfs4_sequence_done(task, &data->res->lr_seq_res))
return;
switch (task->tk_status) {
case -NFS4ERR_DELAY:
case -NFS4ERR_GRACE:
- dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
rpc_delay(task, NFS4_POLL_RETRY_MIN);
task->tk_status = 0;
fallthrough;
@@ -8888,7 +8819,6 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
rpc_restart_call_prepare(task);
return;
}
- dprintk("<-- %s\n", __func__);
}
static const struct rpc_call_ops nfs4_get_lease_time_ops = {
@@ -9120,7 +9050,6 @@ int nfs4_proc_create_session(struct nfs_client *clp, const struct cred *cred)
dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__,
clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]);
out:
- dprintk("<-- %s\n", __func__);
return status;
}
@@ -9138,8 +9067,6 @@ int nfs4_proc_destroy_session(struct nfs4_session *session,
};
int status = 0;
- dprintk("--> nfs4_proc_destroy_session\n");
-
/* session is still being setup */
if (!test_and_clear_bit(NFS4_SESSION_ESTABLISHED, &session->session_state))
return 0;
@@ -9151,8 +9078,6 @@ int nfs4_proc_destroy_session(struct nfs4_session *session,
if (status)
dprintk("NFS: Got error %d from the server on DESTROY_SESSION. "
"Session has been destroyed regardless...\n", status);
-
- dprintk("<-- nfs4_proc_destroy_session\n");
return status;
}
@@ -9200,7 +9125,7 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
if (task->tk_status < 0) {
dprintk("%s ERROR %d\n", __func__, task->tk_status);
if (refcount_read(&clp->cl_count) == 1)
- goto out;
+ return;
if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) {
rpc_restart_call_prepare(task);
@@ -9208,8 +9133,6 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
}
}
dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
-out:
- dprintk("<-- %s\n", __func__);
}
static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
@@ -9356,7 +9279,6 @@ static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
struct nfs_client *clp = calldata->clp;
struct nfs4_sequence_res *res = &calldata->res.seq_res;
- dprintk("--> %s\n", __func__);
if (!nfs41_sequence_done(task, res))
return;
@@ -9365,7 +9287,6 @@ static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
rpc_restart_call_prepare(task);
return;
}
- dprintk("<-- %s\n", __func__);
}
static void nfs4_free_reclaim_complete_data(void *data)
@@ -9400,7 +9321,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
};
int status = -ENOMEM;
- dprintk("--> %s\n", __func__);
calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
if (calldata == NULL)
goto out;
@@ -9423,19 +9343,15 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
struct nfs4_layoutget *lgp = calldata;
struct nfs_server *server = NFS_SERVER(lgp->args.inode);
- dprintk("--> %s\n", __func__);
nfs4_setup_sequence(server->nfs_client, &lgp->args.seq_args,
&lgp->res.seq_res, task);
- dprintk("<-- %s\n", __func__);
}
static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
{
struct nfs4_layoutget *lgp = calldata;
- dprintk("--> %s\n", __func__);
nfs41_sequence_process(task, &lgp->res.seq_res);
- dprintk("<-- %s\n", __func__);
}
static int
@@ -9524,7 +9440,6 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
status = err;
}
out:
- dprintk("<-- %s\n", __func__);
return status;
}
@@ -9538,10 +9453,8 @@ static void nfs4_layoutget_release(void *calldata)
{
struct nfs4_layoutget *lgp = calldata;
- dprintk("--> %s\n", __func__);
nfs4_sequence_free_slot(&lgp->res.seq_res);
pnfs_layoutget_free(lgp);
- dprintk("<-- %s\n", __func__);
}
static const struct rpc_call_ops nfs4_layoutget_call_ops = {
@@ -9577,8 +9490,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
};
int status = 0;
- dprintk("--> %s\n", __func__);
-
nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0);
task = rpc_run_task(&task_setup_data);
@@ -9614,7 +9525,6 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
{
struct nfs4_layoutreturn *lrp = calldata;
- dprintk("--> %s\n", __func__);
nfs4_setup_sequence(lrp->clp,
&lrp->args.seq_args,
&lrp->res.seq_res,
@@ -9628,8 +9538,6 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
struct nfs4_layoutreturn *lrp = calldata;
struct nfs_server *server;
- dprintk("--> %s\n", __func__);
-
if (!nfs41_sequence_process(task, &lrp->res.seq_res))
return;
@@ -9660,7 +9568,6 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
break;
goto out_restart;
}
- dprintk("<-- %s\n", __func__);
return;
out_restart:
task->tk_status = 0;
@@ -9673,7 +9580,6 @@ static void nfs4_layoutreturn_release(void *calldata)
struct nfs4_layoutreturn *lrp = calldata;
struct pnfs_layout_hdr *lo = lrp->args.layout;
- dprintk("--> %s\n", __func__);
pnfs_layoutreturn_free_lsegs(lo, &lrp->args.stateid, &lrp->args.range,
lrp->res.lrs_present ? &lrp->res.stateid : NULL);
nfs4_sequence_free_slot(&lrp->res.seq_res);
@@ -9683,7 +9589,6 @@ static void nfs4_layoutreturn_release(void *calldata)
nfs_iput_and_deactive(lrp->inode);
put_cred(lrp->cred);
kfree(calldata);
- dprintk("<-- %s\n", __func__);
}
static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
@@ -9714,7 +9619,6 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
NFS_SP4_MACH_CRED_PNFS_CLEANUP,
&task_setup_data.rpc_client, &msg);
- dprintk("--> %s\n", __func__);
lrp->inode = nfs_igrab_and_active(lrp->args.inode);
if (!sync) {
if (!lrp->inode) {
@@ -9761,7 +9665,6 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server,
};
int status;
- dprintk("--> %s\n", __func__);
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
if (res.notification & ~args.notify_types)
dprintk("%s: unsupported notification\n", __func__);
@@ -9933,7 +9836,6 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
msg.rpc_cred = cred;
}
- dprintk("--> %s\n", __func__);
nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
status = nfs4_call_sync_custom(&task_setup);
dprintk("<-- %s status=%d\n", __func__, status);
@@ -10157,6 +10059,10 @@ static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata)
static void nfs41_free_stateid_release(void *calldata)
{
+ struct nfs_free_stateid_data *data = calldata;
+ struct nfs_client *clp = data->server->nfs_client;
+
+ nfs_put_client(clp);
kfree(calldata);
}
@@ -10193,6 +10099,10 @@ static int nfs41_free_stateid(struct nfs_server *server,
};
struct nfs_free_stateid_data *data;
struct rpc_task *task;
+ struct nfs_client *clp = server->nfs_client;
+
+ if (!refcount_inc_not_zero(&clp->cl_count))
+ return -EIO;
nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID,
&task_setup.rpc_client, &msg);
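The two hunks above pin the nfs_client for the lifetime of the asynchronous FREE_STATEID task: a reference is taken with refcount_inc_not_zero() before the task is set up and dropped from the ->rpc_release callback. A stripped-down sketch of that pattern (the structure and function names are illustrative):

struct example_calldata {
	struct nfs_server *server;
};

static void example_rpc_release(void *calldata)
{
	struct example_calldata *data = calldata;

	/* Drop the pin only once the RPC can no longer touch the client. */
	nfs_put_client(data->server->nfs_client);
	kfree(data);
}

static int example_start_async(struct nfs_server *server)
{
	struct nfs_client *clp = server->nfs_client;

	/* Refuse to start if the client is already being torn down. */
	if (!refcount_inc_not_zero(&clp->cl_count))
		return -EIO;

	/* ... allocate example_calldata, set up and run the rpc_task ... */
	return 0;
}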
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index 4145a0138907..5db460476bf2 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -511,12 +511,16 @@ void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
struct nfs4_slot *slot,
struct nfs4_sequence_res *res)
{
+ u32 target_highest_slotid = min(res->sr_target_highest_slotid,
+ NFS4_MAX_SLOTID);
+ u32 highest_slotid = min(res->sr_highest_slotid, NFS4_MAX_SLOTID);
+
spin_lock(&tbl->slot_tbl_lock);
- if (!nfs41_is_outlier_target_slotid(tbl, res->sr_target_highest_slotid))
- nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid);
+ if (!nfs41_is_outlier_target_slotid(tbl, target_highest_slotid))
+ nfs41_set_target_slotid_locked(tbl, target_highest_slotid);
if (tbl->generation == slot->generation)
- nfs41_set_server_slotid_locked(tbl, res->sr_highest_slotid);
- nfs41_set_max_slotid_locked(tbl, res->sr_target_highest_slotid);
+ nfs41_set_server_slotid_locked(tbl, highest_slotid);
+ nfs41_set_max_slotid_locked(tbl, target_highest_slotid);
spin_unlock(&tbl->slot_tbl_lock);
}
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 3de425f59b3a..351616c61df5 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -12,6 +12,7 @@
#define NFS4_DEF_SLOT_TABLE_SIZE (64U)
#define NFS4_DEF_CB_SLOT_TABLE_SIZE (16U)
#define NFS4_MAX_SLOT_TABLE (1024U)
+#define NFS4_MAX_SLOTID (NFS4_MAX_SLOT_TABLE - 1U)
#define NFS4_NO_SLOT ((u32)-1)
#if IS_ENABLED(CONFIG_NFS_V4)
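The two hunks above belong together: slot IDs reported by the server are clamped to NFS4_MAX_SLOTID (one less than the maximum slot-table size) before they are allowed to grow the client's slot table. The clamp itself is just a min(), illustrated here in isolation:

/* Illustrative only: bound a server-supplied slot ID to the table limit. */
static inline u32 example_clamp_slotid(u32 slotid)
{
	return min(slotid, NFS4_MAX_SLOTID);
}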
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f22818a80c2c..ecc4594299d6 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1194,10 +1194,7 @@ static int nfs4_run_state_manager(void *);
static void nfs4_clear_state_manager_bit(struct nfs_client *clp)
{
- smp_mb__before_atomic();
- clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
- smp_mb__after_atomic();
- wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING);
+ clear_and_wake_up_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
rpc_wake_up(&clp->cl_rpcwaitq);
}
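clear_and_wake_up_bit() already provides the memory ordering that the removed open-coded sequence supplied by hand; its definition in <linux/wait_bit.h> is roughly:

static inline void clear_and_wake_up_bit(int bit, void *word)
{
	clear_bit_unlock(bit, word);
	/* Pairs the unlock with the waiter's test of the bit. */
	smp_mb__after_atomic();
	wake_up_bit(word, bit);
}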
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 7a2567aa2b86..6ee6ad3674a2 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -9,322 +9,10 @@
#define _TRACE_NFS4_H
#include <linux/tracepoint.h>
+#include <trace/events/sunrpc_base.h>
-TRACE_DEFINE_ENUM(EPERM);
-TRACE_DEFINE_ENUM(ENOENT);
-TRACE_DEFINE_ENUM(EIO);
-TRACE_DEFINE_ENUM(ENXIO);
-TRACE_DEFINE_ENUM(EACCES);
-TRACE_DEFINE_ENUM(EEXIST);
-TRACE_DEFINE_ENUM(EXDEV);
-TRACE_DEFINE_ENUM(ENOTDIR);
-TRACE_DEFINE_ENUM(EISDIR);
-TRACE_DEFINE_ENUM(EFBIG);
-TRACE_DEFINE_ENUM(ENOSPC);
-TRACE_DEFINE_ENUM(EROFS);
-TRACE_DEFINE_ENUM(EMLINK);
-TRACE_DEFINE_ENUM(ENAMETOOLONG);
-TRACE_DEFINE_ENUM(ENOTEMPTY);
-TRACE_DEFINE_ENUM(EDQUOT);
-TRACE_DEFINE_ENUM(ESTALE);
-TRACE_DEFINE_ENUM(EBADHANDLE);
-TRACE_DEFINE_ENUM(EBADCOOKIE);
-TRACE_DEFINE_ENUM(ENOTSUPP);
-TRACE_DEFINE_ENUM(ETOOSMALL);
-TRACE_DEFINE_ENUM(EREMOTEIO);
-TRACE_DEFINE_ENUM(EBADTYPE);
-TRACE_DEFINE_ENUM(EAGAIN);
-TRACE_DEFINE_ENUM(ELOOP);
-TRACE_DEFINE_ENUM(EOPNOTSUPP);
-TRACE_DEFINE_ENUM(EDEADLK);
-TRACE_DEFINE_ENUM(ENOMEM);
-TRACE_DEFINE_ENUM(EKEYEXPIRED);
-TRACE_DEFINE_ENUM(ETIMEDOUT);
-TRACE_DEFINE_ENUM(ERESTARTSYS);
-TRACE_DEFINE_ENUM(ECONNREFUSED);
-TRACE_DEFINE_ENUM(ECONNRESET);
-TRACE_DEFINE_ENUM(ENETUNREACH);
-TRACE_DEFINE_ENUM(EHOSTUNREACH);
-TRACE_DEFINE_ENUM(EHOSTDOWN);
-TRACE_DEFINE_ENUM(EPIPE);
-TRACE_DEFINE_ENUM(EPFNOSUPPORT);
-TRACE_DEFINE_ENUM(EPROTONOSUPPORT);
-
-TRACE_DEFINE_ENUM(NFS4_OK);
-TRACE_DEFINE_ENUM(NFS4ERR_ACCESS);
-TRACE_DEFINE_ENUM(NFS4ERR_ATTRNOTSUPP);
-TRACE_DEFINE_ENUM(NFS4ERR_ADMIN_REVOKED);
-TRACE_DEFINE_ENUM(NFS4ERR_BACK_CHAN_BUSY);
-TRACE_DEFINE_ENUM(NFS4ERR_BADCHAR);
-TRACE_DEFINE_ENUM(NFS4ERR_BADHANDLE);
-TRACE_DEFINE_ENUM(NFS4ERR_BADIOMODE);
-TRACE_DEFINE_ENUM(NFS4ERR_BADLAYOUT);
-TRACE_DEFINE_ENUM(NFS4ERR_BADLABEL);
-TRACE_DEFINE_ENUM(NFS4ERR_BADNAME);
-TRACE_DEFINE_ENUM(NFS4ERR_BADOWNER);
-TRACE_DEFINE_ENUM(NFS4ERR_BADSESSION);
-TRACE_DEFINE_ENUM(NFS4ERR_BADSLOT);
-TRACE_DEFINE_ENUM(NFS4ERR_BADTYPE);
-TRACE_DEFINE_ENUM(NFS4ERR_BADXDR);
-TRACE_DEFINE_ENUM(NFS4ERR_BAD_COOKIE);
-TRACE_DEFINE_ENUM(NFS4ERR_BAD_HIGH_SLOT);
-TRACE_DEFINE_ENUM(NFS4ERR_BAD_RANGE);
-TRACE_DEFINE_ENUM(NFS4ERR_BAD_SEQID);
-TRACE_DEFINE_ENUM(NFS4ERR_BAD_SESSION_DIGEST);
-TRACE_DEFINE_ENUM(NFS4ERR_BAD_STATEID);
-TRACE_DEFINE_ENUM(NFS4ERR_CB_PATH_DOWN);
-TRACE_DEFINE_ENUM(NFS4ERR_CLID_INUSE);
-TRACE_DEFINE_ENUM(NFS4ERR_CLIENTID_BUSY);
-TRACE_DEFINE_ENUM(NFS4ERR_COMPLETE_ALREADY);
-TRACE_DEFINE_ENUM(NFS4ERR_CONN_NOT_BOUND_TO_SESSION);
-TRACE_DEFINE_ENUM(NFS4ERR_DEADLOCK);
-TRACE_DEFINE_ENUM(NFS4ERR_DEADSESSION);
-TRACE_DEFINE_ENUM(NFS4ERR_DELAY);
-TRACE_DEFINE_ENUM(NFS4ERR_DELEG_ALREADY_WANTED);
-TRACE_DEFINE_ENUM(NFS4ERR_DELEG_REVOKED);
-TRACE_DEFINE_ENUM(NFS4ERR_DENIED);
-TRACE_DEFINE_ENUM(NFS4ERR_DIRDELEG_UNAVAIL);
-TRACE_DEFINE_ENUM(NFS4ERR_DQUOT);
-TRACE_DEFINE_ENUM(NFS4ERR_ENCR_ALG_UNSUPP);
-TRACE_DEFINE_ENUM(NFS4ERR_EXIST);
-TRACE_DEFINE_ENUM(NFS4ERR_EXPIRED);
-TRACE_DEFINE_ENUM(NFS4ERR_FBIG);
-TRACE_DEFINE_ENUM(NFS4ERR_FHEXPIRED);
-TRACE_DEFINE_ENUM(NFS4ERR_FILE_OPEN);
-TRACE_DEFINE_ENUM(NFS4ERR_GRACE);
-TRACE_DEFINE_ENUM(NFS4ERR_HASH_ALG_UNSUPP);
-TRACE_DEFINE_ENUM(NFS4ERR_INVAL);
-TRACE_DEFINE_ENUM(NFS4ERR_IO);
-TRACE_DEFINE_ENUM(NFS4ERR_ISDIR);
-TRACE_DEFINE_ENUM(NFS4ERR_LAYOUTTRYLATER);
-TRACE_DEFINE_ENUM(NFS4ERR_LAYOUTUNAVAILABLE);
-TRACE_DEFINE_ENUM(NFS4ERR_LEASE_MOVED);
-TRACE_DEFINE_ENUM(NFS4ERR_LOCKED);
-TRACE_DEFINE_ENUM(NFS4ERR_LOCKS_HELD);
-TRACE_DEFINE_ENUM(NFS4ERR_LOCK_RANGE);
-TRACE_DEFINE_ENUM(NFS4ERR_MINOR_VERS_MISMATCH);
-TRACE_DEFINE_ENUM(NFS4ERR_MLINK);
-TRACE_DEFINE_ENUM(NFS4ERR_MOVED);
-TRACE_DEFINE_ENUM(NFS4ERR_NAMETOOLONG);
-TRACE_DEFINE_ENUM(NFS4ERR_NOENT);
-TRACE_DEFINE_ENUM(NFS4ERR_NOFILEHANDLE);
-TRACE_DEFINE_ENUM(NFS4ERR_NOMATCHING_LAYOUT);
-TRACE_DEFINE_ENUM(NFS4ERR_NOSPC);
-TRACE_DEFINE_ENUM(NFS4ERR_NOTDIR);
-TRACE_DEFINE_ENUM(NFS4ERR_NOTEMPTY);
-TRACE_DEFINE_ENUM(NFS4ERR_NOTSUPP);
-TRACE_DEFINE_ENUM(NFS4ERR_NOT_ONLY_OP);
-TRACE_DEFINE_ENUM(NFS4ERR_NOT_SAME);
-TRACE_DEFINE_ENUM(NFS4ERR_NO_GRACE);
-TRACE_DEFINE_ENUM(NFS4ERR_NXIO);
-TRACE_DEFINE_ENUM(NFS4ERR_OLD_STATEID);
-TRACE_DEFINE_ENUM(NFS4ERR_OPENMODE);
-TRACE_DEFINE_ENUM(NFS4ERR_OP_ILLEGAL);
-TRACE_DEFINE_ENUM(NFS4ERR_OP_NOT_IN_SESSION);
-TRACE_DEFINE_ENUM(NFS4ERR_PERM);
-TRACE_DEFINE_ENUM(NFS4ERR_PNFS_IO_HOLE);
-TRACE_DEFINE_ENUM(NFS4ERR_PNFS_NO_LAYOUT);
-TRACE_DEFINE_ENUM(NFS4ERR_RECALLCONFLICT);
-TRACE_DEFINE_ENUM(NFS4ERR_RECLAIM_BAD);
-TRACE_DEFINE_ENUM(NFS4ERR_RECLAIM_CONFLICT);
-TRACE_DEFINE_ENUM(NFS4ERR_REJECT_DELEG);
-TRACE_DEFINE_ENUM(NFS4ERR_REP_TOO_BIG);
-TRACE_DEFINE_ENUM(NFS4ERR_REP_TOO_BIG_TO_CACHE);
-TRACE_DEFINE_ENUM(NFS4ERR_REQ_TOO_BIG);
-TRACE_DEFINE_ENUM(NFS4ERR_RESOURCE);
-TRACE_DEFINE_ENUM(NFS4ERR_RESTOREFH);
-TRACE_DEFINE_ENUM(NFS4ERR_RETRY_UNCACHED_REP);
-TRACE_DEFINE_ENUM(NFS4ERR_RETURNCONFLICT);
-TRACE_DEFINE_ENUM(NFS4ERR_ROFS);
-TRACE_DEFINE_ENUM(NFS4ERR_SAME);
-TRACE_DEFINE_ENUM(NFS4ERR_SHARE_DENIED);
-TRACE_DEFINE_ENUM(NFS4ERR_SEQUENCE_POS);
-TRACE_DEFINE_ENUM(NFS4ERR_SEQ_FALSE_RETRY);
-TRACE_DEFINE_ENUM(NFS4ERR_SEQ_MISORDERED);
-TRACE_DEFINE_ENUM(NFS4ERR_SERVERFAULT);
-TRACE_DEFINE_ENUM(NFS4ERR_STALE);
-TRACE_DEFINE_ENUM(NFS4ERR_STALE_CLIENTID);
-TRACE_DEFINE_ENUM(NFS4ERR_STALE_STATEID);
-TRACE_DEFINE_ENUM(NFS4ERR_SYMLINK);
-TRACE_DEFINE_ENUM(NFS4ERR_TOOSMALL);
-TRACE_DEFINE_ENUM(NFS4ERR_TOO_MANY_OPS);
-TRACE_DEFINE_ENUM(NFS4ERR_UNKNOWN_LAYOUTTYPE);
-TRACE_DEFINE_ENUM(NFS4ERR_UNSAFE_COMPOUND);
-TRACE_DEFINE_ENUM(NFS4ERR_WRONGSEC);
-TRACE_DEFINE_ENUM(NFS4ERR_WRONG_CRED);
-TRACE_DEFINE_ENUM(NFS4ERR_WRONG_TYPE);
-TRACE_DEFINE_ENUM(NFS4ERR_XDEV);
-
-TRACE_DEFINE_ENUM(NFS4ERR_RESET_TO_MDS);
-TRACE_DEFINE_ENUM(NFS4ERR_RESET_TO_PNFS);
-
-#define show_nfsv4_errors(error) \
- __print_symbolic(error, \
- { NFS4_OK, "OK" }, \
- /* Mapped by nfs4_stat_to_errno() */ \
- { EPERM, "EPERM" }, \
- { ENOENT, "ENOENT" }, \
- { EIO, "EIO" }, \
- { ENXIO, "ENXIO" }, \
- { EACCES, "EACCES" }, \
- { EEXIST, "EEXIST" }, \
- { EXDEV, "EXDEV" }, \
- { ENOTDIR, "ENOTDIR" }, \
- { EISDIR, "EISDIR" }, \
- { EFBIG, "EFBIG" }, \
- { ENOSPC, "ENOSPC" }, \
- { EROFS, "EROFS" }, \
- { EMLINK, "EMLINK" }, \
- { ENAMETOOLONG, "ENAMETOOLONG" }, \
- { ENOTEMPTY, "ENOTEMPTY" }, \
- { EDQUOT, "EDQUOT" }, \
- { ESTALE, "ESTALE" }, \
- { EBADHANDLE, "EBADHANDLE" }, \
- { EBADCOOKIE, "EBADCOOKIE" }, \
- { ENOTSUPP, "ENOTSUPP" }, \
- { ETOOSMALL, "ETOOSMALL" }, \
- { EREMOTEIO, "EREMOTEIO" }, \
- { EBADTYPE, "EBADTYPE" }, \
- { EAGAIN, "EAGAIN" }, \
- { ELOOP, "ELOOP" }, \
- { EOPNOTSUPP, "EOPNOTSUPP" }, \
- { EDEADLK, "EDEADLK" }, \
- /* RPC errors */ \
- { ENOMEM, "ENOMEM" }, \
- { EKEYEXPIRED, "EKEYEXPIRED" }, \
- { ETIMEDOUT, "ETIMEDOUT" }, \
- { ERESTARTSYS, "ERESTARTSYS" }, \
- { ECONNREFUSED, "ECONNREFUSED" }, \
- { ECONNRESET, "ECONNRESET" }, \
- { ENETUNREACH, "ENETUNREACH" }, \
- { EHOSTUNREACH, "EHOSTUNREACH" }, \
- { EHOSTDOWN, "EHOSTDOWN" }, \
- { EPIPE, "EPIPE" }, \
- { EPFNOSUPPORT, "EPFNOSUPPORT" }, \
- { EPROTONOSUPPORT, "EPROTONOSUPPORT" }, \
- /* NFSv4 native errors */ \
- { NFS4ERR_ACCESS, "ACCESS" }, \
- { NFS4ERR_ATTRNOTSUPP, "ATTRNOTSUPP" }, \
- { NFS4ERR_ADMIN_REVOKED, "ADMIN_REVOKED" }, \
- { NFS4ERR_BACK_CHAN_BUSY, "BACK_CHAN_BUSY" }, \
- { NFS4ERR_BADCHAR, "BADCHAR" }, \
- { NFS4ERR_BADHANDLE, "BADHANDLE" }, \
- { NFS4ERR_BADIOMODE, "BADIOMODE" }, \
- { NFS4ERR_BADLAYOUT, "BADLAYOUT" }, \
- { NFS4ERR_BADLABEL, "BADLABEL" }, \
- { NFS4ERR_BADNAME, "BADNAME" }, \
- { NFS4ERR_BADOWNER, "BADOWNER" }, \
- { NFS4ERR_BADSESSION, "BADSESSION" }, \
- { NFS4ERR_BADSLOT, "BADSLOT" }, \
- { NFS4ERR_BADTYPE, "BADTYPE" }, \
- { NFS4ERR_BADXDR, "BADXDR" }, \
- { NFS4ERR_BAD_COOKIE, "BAD_COOKIE" }, \
- { NFS4ERR_BAD_HIGH_SLOT, "BAD_HIGH_SLOT" }, \
- { NFS4ERR_BAD_RANGE, "BAD_RANGE" }, \
- { NFS4ERR_BAD_SEQID, "BAD_SEQID" }, \
- { NFS4ERR_BAD_SESSION_DIGEST, "BAD_SESSION_DIGEST" }, \
- { NFS4ERR_BAD_STATEID, "BAD_STATEID" }, \
- { NFS4ERR_CB_PATH_DOWN, "CB_PATH_DOWN" }, \
- { NFS4ERR_CLID_INUSE, "CLID_INUSE" }, \
- { NFS4ERR_CLIENTID_BUSY, "CLIENTID_BUSY" }, \
- { NFS4ERR_COMPLETE_ALREADY, "COMPLETE_ALREADY" }, \
- { NFS4ERR_CONN_NOT_BOUND_TO_SESSION, \
- "CONN_NOT_BOUND_TO_SESSION" }, \
- { NFS4ERR_DEADLOCK, "DEADLOCK" }, \
- { NFS4ERR_DEADSESSION, "DEAD_SESSION" }, \
- { NFS4ERR_DELAY, "DELAY" }, \
- { NFS4ERR_DELEG_ALREADY_WANTED, \
- "DELEG_ALREADY_WANTED" }, \
- { NFS4ERR_DELEG_REVOKED, "DELEG_REVOKED" }, \
- { NFS4ERR_DENIED, "DENIED" }, \
- { NFS4ERR_DIRDELEG_UNAVAIL, "DIRDELEG_UNAVAIL" }, \
- { NFS4ERR_DQUOT, "DQUOT" }, \
- { NFS4ERR_ENCR_ALG_UNSUPP, "ENCR_ALG_UNSUPP" }, \
- { NFS4ERR_EXIST, "EXIST" }, \
- { NFS4ERR_EXPIRED, "EXPIRED" }, \
- { NFS4ERR_FBIG, "FBIG" }, \
- { NFS4ERR_FHEXPIRED, "FHEXPIRED" }, \
- { NFS4ERR_FILE_OPEN, "FILE_OPEN" }, \
- { NFS4ERR_GRACE, "GRACE" }, \
- { NFS4ERR_HASH_ALG_UNSUPP, "HASH_ALG_UNSUPP" }, \
- { NFS4ERR_INVAL, "INVAL" }, \
- { NFS4ERR_IO, "IO" }, \
- { NFS4ERR_ISDIR, "ISDIR" }, \
- { NFS4ERR_LAYOUTTRYLATER, "LAYOUTTRYLATER" }, \
- { NFS4ERR_LAYOUTUNAVAILABLE, "LAYOUTUNAVAILABLE" }, \
- { NFS4ERR_LEASE_MOVED, "LEASE_MOVED" }, \
- { NFS4ERR_LOCKED, "LOCKED" }, \
- { NFS4ERR_LOCKS_HELD, "LOCKS_HELD" }, \
- { NFS4ERR_LOCK_RANGE, "LOCK_RANGE" }, \
- { NFS4ERR_MINOR_VERS_MISMATCH, "MINOR_VERS_MISMATCH" }, \
- { NFS4ERR_MLINK, "MLINK" }, \
- { NFS4ERR_MOVED, "MOVED" }, \
- { NFS4ERR_NAMETOOLONG, "NAMETOOLONG" }, \
- { NFS4ERR_NOENT, "NOENT" }, \
- { NFS4ERR_NOFILEHANDLE, "NOFILEHANDLE" }, \
- { NFS4ERR_NOMATCHING_LAYOUT, "NOMATCHING_LAYOUT" }, \
- { NFS4ERR_NOSPC, "NOSPC" }, \
- { NFS4ERR_NOTDIR, "NOTDIR" }, \
- { NFS4ERR_NOTEMPTY, "NOTEMPTY" }, \
- { NFS4ERR_NOTSUPP, "NOTSUPP" }, \
- { NFS4ERR_NOT_ONLY_OP, "NOT_ONLY_OP" }, \
- { NFS4ERR_NOT_SAME, "NOT_SAME" }, \
- { NFS4ERR_NO_GRACE, "NO_GRACE" }, \
- { NFS4ERR_NXIO, "NXIO" }, \
- { NFS4ERR_OLD_STATEID, "OLD_STATEID" }, \
- { NFS4ERR_OPENMODE, "OPENMODE" }, \
- { NFS4ERR_OP_ILLEGAL, "OP_ILLEGAL" }, \
- { NFS4ERR_OP_NOT_IN_SESSION, "OP_NOT_IN_SESSION" }, \
- { NFS4ERR_PERM, "PERM" }, \
- { NFS4ERR_PNFS_IO_HOLE, "PNFS_IO_HOLE" }, \
- { NFS4ERR_PNFS_NO_LAYOUT, "PNFS_NO_LAYOUT" }, \
- { NFS4ERR_RECALLCONFLICT, "RECALLCONFLICT" }, \
- { NFS4ERR_RECLAIM_BAD, "RECLAIM_BAD" }, \
- { NFS4ERR_RECLAIM_CONFLICT, "RECLAIM_CONFLICT" }, \
- { NFS4ERR_REJECT_DELEG, "REJECT_DELEG" }, \
- { NFS4ERR_REP_TOO_BIG, "REP_TOO_BIG" }, \
- { NFS4ERR_REP_TOO_BIG_TO_CACHE, \
- "REP_TOO_BIG_TO_CACHE" }, \
- { NFS4ERR_REQ_TOO_BIG, "REQ_TOO_BIG" }, \
- { NFS4ERR_RESOURCE, "RESOURCE" }, \
- { NFS4ERR_RESTOREFH, "RESTOREFH" }, \
- { NFS4ERR_RETRY_UNCACHED_REP, "RETRY_UNCACHED_REP" }, \
- { NFS4ERR_RETURNCONFLICT, "RETURNCONFLICT" }, \
- { NFS4ERR_ROFS, "ROFS" }, \
- { NFS4ERR_SAME, "SAME" }, \
- { NFS4ERR_SHARE_DENIED, "SHARE_DENIED" }, \
- { NFS4ERR_SEQUENCE_POS, "SEQUENCE_POS" }, \
- { NFS4ERR_SEQ_FALSE_RETRY, "SEQ_FALSE_RETRY" }, \
- { NFS4ERR_SEQ_MISORDERED, "SEQ_MISORDERED" }, \
- { NFS4ERR_SERVERFAULT, "SERVERFAULT" }, \
- { NFS4ERR_STALE, "STALE" }, \
- { NFS4ERR_STALE_CLIENTID, "STALE_CLIENTID" }, \
- { NFS4ERR_STALE_STATEID, "STALE_STATEID" }, \
- { NFS4ERR_SYMLINK, "SYMLINK" }, \
- { NFS4ERR_TOOSMALL, "TOOSMALL" }, \
- { NFS4ERR_TOO_MANY_OPS, "TOO_MANY_OPS" }, \
- { NFS4ERR_UNKNOWN_LAYOUTTYPE, "UNKNOWN_LAYOUTTYPE" }, \
- { NFS4ERR_UNSAFE_COMPOUND, "UNSAFE_COMPOUND" }, \
- { NFS4ERR_WRONGSEC, "WRONGSEC" }, \
- { NFS4ERR_WRONG_CRED, "WRONG_CRED" }, \
- { NFS4ERR_WRONG_TYPE, "WRONG_TYPE" }, \
- { NFS4ERR_XDEV, "XDEV" }, \
- /* ***** Internal to Linux NFS client ***** */ \
- { NFS4ERR_RESET_TO_MDS, "RESET_TO_MDS" }, \
- { NFS4ERR_RESET_TO_PNFS, "RESET_TO_PNFS" })
-
-#define show_open_flags(flags) \
- __print_flags(flags, "|", \
- { O_CREAT, "O_CREAT" }, \
- { O_EXCL, "O_EXCL" }, \
- { O_TRUNC, "O_TRUNC" }, \
- { O_DIRECT, "O_DIRECT" })
-
-#define show_fmode_flags(mode) \
- __print_flags(mode, "|", \
- { ((__force unsigned long)FMODE_READ), "READ" }, \
- { ((__force unsigned long)FMODE_WRITE), "WRITE" }, \
- { ((__force unsigned long)FMODE_EXEC), "EXEC" })
+#include <trace/events/fs.h>
+#include <trace/events/nfs.h>
#define show_nfs_fattr_flags(valid) \
__print_flags((unsigned long)valid, "|", \
@@ -365,7 +53,7 @@ DECLARE_EVENT_CLASS(nfs4_clientid_event,
TP_printk(
"error=%ld (%s) dstaddr=%s",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
__get_str(dstaddr)
)
);
@@ -389,29 +77,6 @@ DEFINE_NFS4_CLIENTID_EVENT(nfs4_bind_conn_to_session);
DEFINE_NFS4_CLIENTID_EVENT(nfs4_sequence);
DEFINE_NFS4_CLIENTID_EVENT(nfs4_reclaim_complete);
-#define show_nfs4_sequence_status_flags(status) \
- __print_flags((unsigned long)status, "|", \
- { SEQ4_STATUS_CB_PATH_DOWN, "CB_PATH_DOWN" }, \
- { SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING, \
- "CB_GSS_CONTEXTS_EXPIRING" }, \
- { SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED, \
- "CB_GSS_CONTEXTS_EXPIRED" }, \
- { SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED, \
- "EXPIRED_ALL_STATE_REVOKED" }, \
- { SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED, \
- "EXPIRED_SOME_STATE_REVOKED" }, \
- { SEQ4_STATUS_ADMIN_STATE_REVOKED, \
- "ADMIN_STATE_REVOKED" }, \
- { SEQ4_STATUS_RECALLABLE_STATE_REVOKED, \
- "RECALLABLE_STATE_REVOKED" }, \
- { SEQ4_STATUS_LEASE_MOVED, "LEASE_MOVED" }, \
- { SEQ4_STATUS_RESTART_RECLAIM_NEEDED, \
- "RESTART_RECLAIM_NEEDED" }, \
- { SEQ4_STATUS_CB_PATH_DOWN_SESSION, \
- "CB_PATH_DOWN_SESSION" }, \
- { SEQ4_STATUS_BACKCHANNEL_FAULT, \
- "BACKCHANNEL_FAULT" })
-
TRACE_EVENT(nfs4_sequence_done,
TP_PROTO(
const struct nfs4_session *session,
@@ -425,7 +90,7 @@ TRACE_EVENT(nfs4_sequence_done,
__field(unsigned int, seq_nr)
__field(unsigned int, highest_slotid)
__field(unsigned int, target_highest_slotid)
- __field(unsigned int, status_flags)
+ __field(unsigned long, status_flags)
__field(unsigned long, error)
),
@@ -444,16 +109,16 @@ TRACE_EVENT(nfs4_sequence_done,
TP_printk(
"error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u "
"highest_slotid=%u target_highest_slotid=%u "
- "status_flags=%u (%s)",
+ "status_flags=0x%lx (%s)",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
__entry->session,
__entry->slot_nr,
__entry->seq_nr,
__entry->highest_slotid,
__entry->target_highest_slotid,
__entry->status_flags,
- show_nfs4_sequence_status_flags(__entry->status_flags)
+ show_nfs4_seq4_status(__entry->status_flags)
)
);
@@ -490,7 +155,7 @@ TRACE_EVENT(nfs4_cb_sequence,
"error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u "
"highest_slotid=%u",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
__entry->session,
__entry->slot_nr,
__entry->seq_nr,
@@ -527,7 +192,7 @@ TRACE_EVENT(nfs4_cb_seqid_err,
"error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u "
"highest_slotid=%u",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
__entry->session,
__entry->slot_nr,
__entry->seq_nr,
@@ -535,6 +200,49 @@ TRACE_EVENT(nfs4_cb_seqid_err,
)
);
+TRACE_EVENT(nfs4_cb_offload,
+ TP_PROTO(
+ const struct nfs_fh *cb_fh,
+ const nfs4_stateid *cb_stateid,
+ uint64_t cb_count,
+ int cb_error,
+ int cb_how_stable
+ ),
+
+ TP_ARGS(cb_fh, cb_stateid, cb_count, cb_error,
+ cb_how_stable),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, fhandle)
+ __field(loff_t, cb_count)
+ __field(int, cb_how)
+ __field(int, cb_stateid_seq)
+ __field(u32, cb_stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->error = cb_error < 0 ? -cb_error : 0;
+ __entry->fhandle = nfs_fhandle_hash(cb_fh);
+ __entry->cb_stateid_seq =
+ be32_to_cpu(cb_stateid->seqid);
+ __entry->cb_stateid_hash =
+ nfs_stateid_hash(cb_stateid);
+ __entry->cb_count = cb_count;
+ __entry->cb_how = cb_how_stable;
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fhandle=0x%08x cb_stateid=%d:0x%08x "
+ "cb_count=%llu cb_how=%s",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ __entry->fhandle,
+ __entry->cb_stateid_seq, __entry->cb_stateid_hash,
+ __entry->cb_count,
+ show_nfs_stable_how(__entry->cb_how)
+ )
+);
#endif /* CONFIG_NFS_V4_1 */
TRACE_EVENT(nfs4_setup_sequence,
@@ -661,7 +369,7 @@ TRACE_EVENT(nfs4_state_mgr_failed,
"hostname=%s clp state=%s error=%ld (%s) section=%s",
__get_str(hostname),
show_nfs4_clp_state(__entry->state), -__entry->error,
- show_nfsv4_errors(__entry->error), __get_str(section)
+ show_nfs4_status(__entry->error), __get_str(section)
)
)
@@ -694,8 +402,8 @@ TRACE_EVENT(nfs4_xdr_bad_operation,
__entry->expected = expected;
),
- TP_printk(
- "task:%u@%d xid=0x%08x operation=%u, expected=%u",
+ TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
+ " xid=0x%08x operation=%u, expected=%u",
__entry->task_id, __entry->client_id, __entry->xid,
__entry->op, __entry->expected
)
@@ -729,10 +437,10 @@ DECLARE_EVENT_CLASS(nfs4_xdr_event,
__entry->error = error;
),
- TP_printk(
- "task:%u@%d xid=0x%08x error=%ld (%s) operation=%u",
+ TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
+ " xid=0x%08x error=%ld (%s) operation=%u",
__entry->task_id, __entry->client_id, __entry->xid,
- -__entry->error, show_nfsv4_errors(__entry->error),
+ -__entry->error, show_nfs4_status(__entry->error),
__entry->op
)
);
@@ -793,8 +501,8 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
TP_STRUCT__entry(
__field(unsigned long, error)
- __field(unsigned int, flags)
- __field(unsigned int, fmode)
+ __field(unsigned long, flags)
+ __field(unsigned long, fmode)
__field(dev_t, dev)
__field(u32, fhandle)
__field(u64, fileid)
@@ -812,7 +520,7 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
__entry->error = -error;
__entry->flags = flags;
- __entry->fmode = (__force unsigned int)ctx->mode;
+ __entry->fmode = (__force unsigned long)ctx->mode;
__entry->dev = ctx->dentry->d_sb->s_dev;
if (!IS_ERR_OR_NULL(state)) {
inode = state->inode;
@@ -842,15 +550,15 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
),
TP_printk(
- "error=%ld (%s) flags=%d (%s) fmode=%s "
+ "error=%ld (%s) flags=%lu (%s) fmode=%s "
"fileid=%02x:%02x:%llu fhandle=0x%08x "
"name=%02x:%02x:%llu/%s stateid=%d:0x%08x "
"openstateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
__entry->flags,
- show_open_flags(__entry->flags),
- show_fmode_flags(__entry->fmode),
+ show_fs_fcntl_open_flags(__entry->flags),
+ show_fs_fmode_flags(__entry->fmode),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -904,7 +612,7 @@ TRACE_EVENT(nfs4_cached_open,
TP_printk(
"fmode=%s fileid=%02x:%02x:%llu "
"fhandle=0x%08x stateid=%d:0x%08x",
- __entry->fmode ? show_fmode_flags(__entry->fmode) :
+ __entry->fmode ? show_fs_fmode_flags(__entry->fmode) :
"closed",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
@@ -951,8 +659,8 @@ TRACE_EVENT(nfs4_close,
"error=%ld (%s) fmode=%s fileid=%02x:%02x:%llu "
"fhandle=0x%08x openstateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
- __entry->fmode ? show_fmode_flags(__entry->fmode) :
+ show_nfs4_status(__entry->error),
+ __entry->fmode ? show_fs_fmode_flags(__entry->fmode) :
"closed",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
@@ -961,24 +669,6 @@ TRACE_EVENT(nfs4_close,
)
);
-TRACE_DEFINE_ENUM(F_GETLK);
-TRACE_DEFINE_ENUM(F_SETLK);
-TRACE_DEFINE_ENUM(F_SETLKW);
-TRACE_DEFINE_ENUM(F_RDLCK);
-TRACE_DEFINE_ENUM(F_WRLCK);
-TRACE_DEFINE_ENUM(F_UNLCK);
-
-#define show_lock_cmd(type) \
- __print_symbolic((int)type, \
- { F_GETLK, "GETLK" }, \
- { F_SETLK, "SETLK" }, \
- { F_SETLKW, "SETLKW" })
-#define show_lock_type(type) \
- __print_symbolic((int)type, \
- { F_RDLCK, "RDLCK" }, \
- { F_WRLCK, "WRLCK" }, \
- { F_UNLCK, "UNLCK" })
-
DECLARE_EVENT_CLASS(nfs4_lock_event,
TP_PROTO(
const struct file_lock *request,
@@ -991,8 +681,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
TP_STRUCT__entry(
__field(unsigned long, error)
- __field(int, cmd)
- __field(char, type)
+ __field(unsigned long, cmd)
+ __field(unsigned long, type)
__field(loff_t, start)
__field(loff_t, end)
__field(dev_t, dev)
@@ -1024,9 +714,9 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
"fileid=%02x:%02x:%llu fhandle=0x%08x "
"stateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
- show_lock_cmd(__entry->cmd),
- show_lock_type(__entry->type),
+ show_nfs4_status(__entry->error),
+ show_fs_fcntl_cmd(__entry->cmd),
+ show_fs_fcntl_lock_type(__entry->type),
(long long)__entry->start,
(long long)__entry->end,
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1061,8 +751,8 @@ TRACE_EVENT(nfs4_set_lock,
TP_STRUCT__entry(
__field(unsigned long, error)
- __field(int, cmd)
- __field(char, type)
+ __field(unsigned long, cmd)
+ __field(unsigned long, type)
__field(loff_t, start)
__field(loff_t, end)
__field(dev_t, dev)
@@ -1100,9 +790,9 @@ TRACE_EVENT(nfs4_set_lock,
"fileid=%02x:%02x:%llu fhandle=0x%08x "
"stateid=%d:0x%08x lockstateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
- show_lock_cmd(__entry->cmd),
- show_lock_type(__entry->type),
+ show_nfs4_status(__entry->error),
+ show_fs_fcntl_cmd(__entry->cmd),
+ show_fs_fcntl_lock_type(__entry->type),
(long long)__entry->start,
(long long)__entry->end,
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1219,7 +909,7 @@ DECLARE_EVENT_CLASS(nfs4_set_delegation_event,
TP_printk(
"fmode=%s fileid=%02x:%02x:%llu fhandle=0x%08x",
- show_fmode_flags(__entry->fmode),
+ show_fs_fmode_flags(__entry->fmode),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle
@@ -1266,7 +956,7 @@ TRACE_EVENT(nfs4_delegreturn_exit,
"error=%ld (%s) dev=%02x:%02x fhandle=0x%08x "
"stateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->fhandle,
__entry->stateid_seq, __entry->stateid_hash
@@ -1309,7 +999,7 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event,
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
"stateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -1356,7 +1046,7 @@ DECLARE_EVENT_CLASS(nfs4_lookup_event,
TP_printk(
"error=%ld (%s) name=%02x:%02x:%llu/%s",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
__get_str(name)
@@ -1403,7 +1093,7 @@ TRACE_EVENT(nfs4_lookupp,
TP_printk(
"error=%ld (%s) inode=%02x:%02x:%llu",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->ino
)
@@ -1442,7 +1132,7 @@ TRACE_EVENT(nfs4_rename,
"error=%ld (%s) oldname=%02x:%02x:%llu/%s "
"newname=%02x:%02x:%llu/%s",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->olddir,
__get_str(oldname),
@@ -1477,7 +1167,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_event,
TP_printk(
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle
@@ -1535,7 +1225,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_event,
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
"stateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -1588,7 +1278,7 @@ DECLARE_EVENT_CLASS(nfs4_getattr_event,
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
"valid=%s",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -1644,7 +1334,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
"dstaddr=%s",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -1705,7 +1395,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event,
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
"stateid=%d:0x%08x dstaddr=%s",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -1754,7 +1444,7 @@ DECLARE_EVENT_CLASS(nfs4_idmap_event,
TP_printk(
"error=%ld (%s) id=%u name=%s",
- -__entry->error, show_nfsv4_errors(__entry->error),
+ -__entry->error, show_nfs4_status(__entry->error),
__entry->id,
__get_str(name)
)
@@ -1832,7 +1522,7 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
"offset=%lld count=%u res=%u stateid=%d:0x%08x "
"layoutstateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -1906,7 +1596,7 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
"offset=%lld count=%u res=%u stateid=%d:0x%08x "
"layoutstateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -1970,7 +1660,7 @@ DECLARE_EVENT_CLASS(nfs4_commit_event,
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
"offset=%lld count=%u layoutstateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -1990,16 +1680,6 @@ DEFINE_NFS4_COMMIT_EVENT(nfs4_commit);
#ifdef CONFIG_NFS_V4_1
DEFINE_NFS4_COMMIT_EVENT(nfs4_pnfs_commit_ds);
-TRACE_DEFINE_ENUM(IOMODE_READ);
-TRACE_DEFINE_ENUM(IOMODE_RW);
-TRACE_DEFINE_ENUM(IOMODE_ANY);
-
-#define show_pnfs_iomode(iomode) \
- __print_symbolic(iomode, \
- { IOMODE_READ, "READ" }, \
- { IOMODE_RW, "RW" }, \
- { IOMODE_ANY, "ANY" })
-
TRACE_EVENT(nfs4_layoutget,
TP_PROTO(
const struct nfs_open_context *ctx,
@@ -2055,11 +1735,11 @@ TRACE_EVENT(nfs4_layoutget,
"iomode=%s offset=%llu count=%llu stateid=%d:0x%08x "
"layoutstateid=%d:0x%08x",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
- show_pnfs_iomode(__entry->iomode),
+ show_pnfs_layout_iomode(__entry->iomode),
(unsigned long long)__entry->offset,
(unsigned long long)__entry->count,
__entry->stateid_seq, __entry->stateid_hash,
@@ -2153,7 +1833,7 @@ TRACE_EVENT(pnfs_update_layout,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
- show_pnfs_iomode(__entry->iomode),
+ show_pnfs_layout_iomode(__entry->iomode),
(unsigned long long)__entry->pos,
(unsigned long long)__entry->count,
__entry->layoutstateid_seq, __entry->layoutstateid_hash,
@@ -2207,7 +1887,7 @@ DECLARE_EVENT_CLASS(pnfs_layout_event,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
- show_pnfs_iomode(__entry->iomode),
+ show_pnfs_layout_iomode(__entry->iomode),
(unsigned long long)__entry->pos,
(unsigned long long)__entry->count,
__entry->layoutstateid_seq, __entry->layoutstateid_hash,
@@ -2352,7 +2032,7 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
"offset=%llu count=%u stateid=%d:0x%08x dstaddr=%s",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -2408,7 +2088,7 @@ TRACE_EVENT(ff_layout_commit_error,
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
"offset=%llu count=%u dstaddr=%s",
-__entry->error,
- show_nfsv4_errors(__entry->error),
+ show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
@@ -2417,6 +2097,406 @@ TRACE_EVENT(ff_layout_commit_error,
)
);
+TRACE_DEFINE_ENUM(NFS4_CONTENT_DATA);
+TRACE_DEFINE_ENUM(NFS4_CONTENT_HOLE);
+
+#define show_llseek_mode(what) \
+ __print_symbolic(what, \
+ { NFS4_CONTENT_DATA, "DATA" }, \
+ { NFS4_CONTENT_HOLE, "HOLE" })
+
+#ifdef CONFIG_NFS_V4_2
+TRACE_EVENT(nfs4_llseek,
+ TP_PROTO(
+ const struct inode *inode,
+ const struct nfs42_seek_args *args,
+ const struct nfs42_seek_res *res,
+ int error
+ ),
+
+ TP_ARGS(inode, args, res, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, fhandle)
+ __field(u32, fileid)
+ __field(dev_t, dev)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(loff_t, offset_s)
+ __field(u32, what)
+ __field(loff_t, offset_r)
+ __field(u32, eof)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = args->sa_fh;
+
+ __entry->fileid = nfsi->fileid;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ __entry->offset_s = args->sa_offset;
+ __entry->stateid_seq =
+ be32_to_cpu(args->sa_stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&args->sa_stateid);
+ __entry->what = args->sa_what;
+ if (error) {
+ __entry->error = -error;
+ __entry->offset_r = 0;
+ __entry->eof = 0;
+ } else {
+ __entry->error = 0;
+ __entry->offset_r = res->sr_offset;
+ __entry->eof = res->sr_eof;
+ }
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x offset_s=%llu what=%s "
+ "offset_r=%llu eof=%u",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->offset_s,
+ show_llseek_mode(__entry->what),
+ __entry->offset_r,
+ __entry->eof
+ )
+);
+
+DECLARE_EVENT_CLASS(nfs4_sparse_event,
+ TP_PROTO(
+ const struct inode *inode,
+ const struct nfs42_falloc_args *args,
+ int error
+ ),
+
+ TP_ARGS(inode, args, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(loff_t, offset)
+ __field(loff_t, len)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error < 0 ? -error : 0;
+ __entry->offset = args->falloc_offset;
+ __entry->len = args->falloc_length;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(args->falloc_stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&args->falloc_stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x offset=%llu len=%llu",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ (long long)__entry->offset,
+ (long long)__entry->len
+ )
+);
+#define DEFINE_NFS4_SPARSE_EVENT(name) \
+ DEFINE_EVENT(nfs4_sparse_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ const struct nfs42_falloc_args *args, \
+ int error \
+ ), \
+ TP_ARGS(inode, args, error))
+DEFINE_NFS4_SPARSE_EVENT(nfs4_fallocate);
+DEFINE_NFS4_SPARSE_EVENT(nfs4_deallocate);
+
+TRACE_EVENT(nfs4_copy,
+ TP_PROTO(
+ const struct inode *src_inode,
+ const struct inode *dst_inode,
+ const struct nfs42_copy_args *args,
+ const struct nfs42_copy_res *res,
+ const struct nl4_server *nss,
+ int error
+ ),
+
+ TP_ARGS(src_inode, dst_inode, args, res, nss, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, src_fhandle)
+ __field(u32, src_fileid)
+ __field(u32, dst_fhandle)
+ __field(u32, dst_fileid)
+ __field(dev_t, src_dev)
+ __field(dev_t, dst_dev)
+ __field(int, src_stateid_seq)
+ __field(u32, src_stateid_hash)
+ __field(int, dst_stateid_seq)
+ __field(u32, dst_stateid_hash)
+ __field(loff_t, src_offset)
+ __field(loff_t, dst_offset)
+ __field(bool, sync)
+ __field(loff_t, len)
+ __field(int, res_stateid_seq)
+ __field(u32, res_stateid_hash)
+ __field(loff_t, res_count)
+ __field(bool, res_sync)
+ __field(bool, res_cons)
+ __field(bool, intra)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *src_nfsi = NFS_I(src_inode);
+ const struct nfs_inode *dst_nfsi = NFS_I(dst_inode);
+
+ __entry->src_fileid = src_nfsi->fileid;
+ __entry->src_dev = src_inode->i_sb->s_dev;
+ __entry->src_fhandle = nfs_fhandle_hash(args->src_fh);
+ __entry->src_offset = args->src_pos;
+ __entry->dst_fileid = dst_nfsi->fileid;
+ __entry->dst_dev = dst_inode->i_sb->s_dev;
+ __entry->dst_fhandle = nfs_fhandle_hash(args->dst_fh);
+ __entry->dst_offset = args->dst_pos;
+ __entry->len = args->count;
+ __entry->sync = args->sync;
+ __entry->src_stateid_seq =
+ be32_to_cpu(args->src_stateid.seqid);
+ __entry->src_stateid_hash =
+ nfs_stateid_hash(&args->src_stateid);
+ __entry->dst_stateid_seq =
+ be32_to_cpu(args->dst_stateid.seqid);
+ __entry->dst_stateid_hash =
+ nfs_stateid_hash(&args->dst_stateid);
+ __entry->intra = nss ? 0 : 1;
+ if (error) {
+ __entry->error = -error;
+ __entry->res_stateid_seq = 0;
+ __entry->res_stateid_hash = 0;
+ __entry->res_count = 0;
+ __entry->res_sync = 0;
+ __entry->res_cons = 0;
+ } else {
+ __entry->error = 0;
+ __entry->res_stateid_seq =
+ be32_to_cpu(res->write_res.stateid.seqid);
+ __entry->res_stateid_hash =
+ nfs_stateid_hash(&res->write_res.stateid);
+ __entry->res_count = res->write_res.count;
+ __entry->res_sync = res->synchronous;
+ __entry->res_cons = res->consecutive;
+ }
+ ),
+
+ TP_printk(
+ "error=%ld (%s) intra=%d src_fileid=%02x:%02x:%llu "
+ "src_fhandle=0x%08x dst_fileid=%02x:%02x:%llu "
+ "dst_fhandle=0x%08x src_stateid=%d:0x%08x "
+ "dst_stateid=%d:0x%08x src_offset=%llu dst_offset=%llu "
+ "len=%llu sync=%d cb_stateid=%d:0x%08x res_sync=%d "
+ "res_cons=%d res_count=%llu",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ __entry->intra,
+ MAJOR(__entry->src_dev), MINOR(__entry->src_dev),
+ (unsigned long long)__entry->src_fileid,
+ __entry->src_fhandle,
+ MAJOR(__entry->dst_dev), MINOR(__entry->dst_dev),
+ (unsigned long long)__entry->dst_fileid,
+ __entry->dst_fhandle,
+ __entry->src_stateid_seq, __entry->src_stateid_hash,
+ __entry->dst_stateid_seq, __entry->dst_stateid_hash,
+ __entry->src_offset,
+ __entry->dst_offset,
+ __entry->len,
+ __entry->sync,
+ __entry->res_stateid_seq, __entry->res_stateid_hash,
+ __entry->res_sync,
+ __entry->res_cons,
+ __entry->res_count
+ )
+);
+
+TRACE_EVENT(nfs4_clone,
+ TP_PROTO(
+ const struct inode *src_inode,
+ const struct inode *dst_inode,
+ const struct nfs42_clone_args *args,
+ int error
+ ),
+
+ TP_ARGS(src_inode, dst_inode, args, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, src_fhandle)
+ __field(u32, src_fileid)
+ __field(u32, dst_fhandle)
+ __field(u32, dst_fileid)
+ __field(dev_t, src_dev)
+ __field(dev_t, dst_dev)
+ __field(loff_t, src_offset)
+ __field(loff_t, dst_offset)
+ __field(int, src_stateid_seq)
+ __field(u32, src_stateid_hash)
+ __field(int, dst_stateid_seq)
+ __field(u32, dst_stateid_hash)
+ __field(loff_t, len)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *src_nfsi = NFS_I(src_inode);
+ const struct nfs_inode *dst_nfsi = NFS_I(dst_inode);
+
+ __entry->src_fileid = src_nfsi->fileid;
+ __entry->src_dev = src_inode->i_sb->s_dev;
+ __entry->src_fhandle = nfs_fhandle_hash(args->src_fh);
+ __entry->src_offset = args->src_offset;
+ __entry->dst_fileid = dst_nfsi->fileid;
+ __entry->dst_dev = dst_inode->i_sb->s_dev;
+ __entry->dst_fhandle = nfs_fhandle_hash(args->dst_fh);
+ __entry->dst_offset = args->dst_offset;
+ __entry->len = args->count;
+ __entry->error = error < 0 ? -error : 0;
+ __entry->src_stateid_seq =
+ be32_to_cpu(args->src_stateid.seqid);
+ __entry->src_stateid_hash =
+ nfs_stateid_hash(&args->src_stateid);
+ __entry->dst_stateid_seq =
+ be32_to_cpu(args->dst_stateid.seqid);
+ __entry->dst_stateid_hash =
+ nfs_stateid_hash(&args->dst_stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) src_fileid=%02x:%02x:%llu "
+ "src_fhandle=0x%08x dst_fileid=%02x:%02x:%llu "
+ "dst_fhandle=0x%08x src_stateid=%d:0x%08x "
+ "dst_stateid=%d:0x%08x src_offset=%llu "
+ "dst_offset=%llu len=%llu",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->src_dev), MINOR(__entry->src_dev),
+ (unsigned long long)__entry->src_fileid,
+ __entry->src_fhandle,
+ MAJOR(__entry->dst_dev), MINOR(__entry->dst_dev),
+ (unsigned long long)__entry->dst_fileid,
+ __entry->dst_fhandle,
+ __entry->src_stateid_seq, __entry->src_stateid_hash,
+ __entry->dst_stateid_seq, __entry->dst_stateid_hash,
+ __entry->src_offset,
+ __entry->dst_offset,
+ __entry->len
+ )
+);
+
+TRACE_EVENT(nfs4_copy_notify,
+ TP_PROTO(
+ const struct inode *inode,
+ const struct nfs42_copy_notify_args *args,
+ const struct nfs42_copy_notify_res *res,
+ int error
+ ),
+
+ TP_ARGS(inode, args, res, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, fhandle)
+ __field(u32, fileid)
+ __field(dev_t, dev)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, res_stateid_seq)
+ __field(u32, res_stateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->fileid = nfsi->fileid;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fhandle = nfs_fhandle_hash(args->cna_src_fh);
+ __entry->stateid_seq =
+ be32_to_cpu(args->cna_src_stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&args->cna_src_stateid);
+ if (error) {
+ __entry->error = -error;
+ __entry->res_stateid_seq = 0;
+ __entry->res_stateid_hash = 0;
+ } else {
+ __entry->error = 0;
+ __entry->res_stateid_seq =
+ be32_to_cpu(res->cnr_stateid.seqid);
+ __entry->res_stateid_hash =
+ nfs_stateid_hash(&res->cnr_stateid);
+ }
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x res_stateid=%d:0x%08x",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->res_stateid_seq, __entry->res_stateid_hash
+ )
+);
+
+TRACE_EVENT(nfs4_offload_cancel,
+ TP_PROTO(
+ const struct nfs42_offload_status_args *args,
+ int error
+ ),
+
+ TP_ARGS(args, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, fhandle)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->fhandle = nfs_fhandle_hash(args->osa_src_fh);
+ __entry->error = error < 0 ? -error : 0;
+ __entry->stateid_seq =
+ be32_to_cpu(args->osa_stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&args->osa_stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fhandle=0x%08x stateid=%d:0x%08x",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+#endif /* CONFIG_NFS_V4_2 */
#endif /* CONFIG_NFS_V4_1 */
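The per-file show_nfsv4_errors(), show_open_flags(), show_fmode_flags() and lock-type tables removed above are replaced by shared display helpers (show_nfs4_status(), show_fs_fcntl_open_flags(), show_fs_fmode_flags(), ...) pulled in from <trace/events/nfs.h> and <trace/events/fs.h>. Such a helper is only a __print_symbolic() wrapper; an abbreviated sketch of the shape (not the full shared table):

#define example_show_nfs4_status(x) \
	__print_symbolic(x, \
		{ NFS4_OK, "OK" }, \
		{ NFS4ERR_ACCESS, "ACCESS" }, \
		{ NFS4ERR_DELAY, "DELAY" })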
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index a8cff19c6f00..69862bf6db00 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3168,20 +3168,23 @@ static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char
static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
- __be32 *p;
+ ssize_t ret;
+ void *ptr;
+ u32 tmp;
- p = xdr_inline_decode(xdr, 8);
- if (unlikely(!p))
+ if (xdr_stream_decode_u32(xdr, &tmp) < 0)
return -EIO;
- hdr->status = be32_to_cpup(p++);
- hdr->taglen = be32_to_cpup(p);
+ hdr->status = tmp;
- p = xdr_inline_decode(xdr, hdr->taglen + 4);
- if (unlikely(!p))
+ ret = xdr_stream_decode_opaque_inline(xdr, &ptr, NFS4_OPAQUE_LIMIT);
+ if (ret < 0)
+ return -EIO;
+ hdr->taglen = ret;
+ hdr->tag = ptr;
+
+ if (xdr_stream_decode_u32(xdr, &tmp) < 0)
return -EIO;
- hdr->tag = (char *)p;
- p += XDR_QUADLEN(hdr->taglen);
- hdr->nops = be32_to_cpup(p);
+ hdr->nops = tmp;
if (unlikely(hdr->nops < 1))
return nfs4_stat_to_errno(hdr->status);
return 0;
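decode_compound_hdr() now leans on the bounds-checked xdr_stream helpers instead of raw xdr_inline_decode() pointer arithmetic; each call consumes from the stream and returns a negative value on a short reply. A sketch of the same decode pattern in isolation (the function name is illustrative):

static int example_decode_status_and_tag(struct xdr_stream *xdr,
					 u32 *status, char **tag, u32 *taglen)
{
	void *ptr;
	ssize_t len;

	if (xdr_stream_decode_u32(xdr, status) < 0)
		return -EIO;

	len = xdr_stream_decode_opaque_inline(xdr, &ptr, NFS4_OPAQUE_LIMIT);
	if (len < 0)
		return -EIO;

	*tag = ptr;
	*taglen = len;
	return 0;
}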
@@ -4582,8 +4585,7 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr,
static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
struct nfs_fattr *fattr, struct nfs_fh *fh,
- struct nfs4_fs_locations *fs_loc, struct nfs4_label *label,
- const struct nfs_server *server)
+ struct nfs4_fs_locations *fs_loc, const struct nfs_server *server)
{
int status;
umode_t fmode = 0;
@@ -4698,8 +4700,8 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
if (status < 0)
goto xdr_error;
- if (label) {
- status = decode_attr_security_label(xdr, bitmap, label);
+ if (fattr->label) {
+ status = decode_attr_security_label(xdr, bitmap, fattr->label);
if (status < 0)
goto xdr_error;
fattr->valid |= status;
@@ -4712,7 +4714,7 @@ xdr_error:
static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc,
- struct nfs4_label *label, const struct nfs_server *server)
+ const struct nfs_server *server)
{
unsigned int savep;
uint32_t attrlen,
@@ -4731,8 +4733,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
if (status < 0)
goto xdr_error;
- status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc,
- label, server);
+ status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server);
if (status < 0)
goto xdr_error;
@@ -4742,16 +4743,10 @@ xdr_error:
return status;
}
-static int decode_getfattr_label(struct xdr_stream *xdr, struct nfs_fattr *fattr,
- struct nfs4_label *label, const struct nfs_server *server)
-{
- return decode_getfattr_generic(xdr, fattr, NULL, NULL, label, server);
-}
-
static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
const struct nfs_server *server)
{
- return decode_getfattr_generic(xdr, fattr, NULL, NULL, NULL, server);
+ return decode_getfattr_generic(xdr, fattr, NULL, NULL, server);
}
/*
@@ -5572,20 +5567,9 @@ static int decode_secinfo_no_name(struct xdr_stream *xdr, struct nfs4_secinfo_re
static int decode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map)
{
- __be32 *p;
- uint32_t bitmap_words;
- unsigned int i;
-
- p = xdr_inline_decode(xdr, 4);
- if (!p)
- return -EIO;
- bitmap_words = be32_to_cpup(p++);
- if (bitmap_words > NFS4_OP_MAP_NUM_WORDS)
+ if (xdr_stream_decode_uint32_array(xdr, op_map->u.words,
+ ARRAY_SIZE(op_map->u.words)) < 0)
return -EIO;
- p = xdr_inline_decode(xdr, 4 * bitmap_words);
- for (i = 0; i < bitmap_words; i++)
- op_map->u.words[i] = be32_to_cpup(p++);
-
return 0;
}
@@ -6179,7 +6163,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_getfh(xdr, res->fh);
if (status)
goto out;
- status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
+ status = decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6209,7 +6193,7 @@ static int nfs4_xdr_dec_lookupp(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_getfh(xdr, res->fh);
if (status)
goto out;
- status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
+ status = decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6236,8 +6220,7 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
goto out;
status = decode_getfh(xdr, res->fh);
if (status == 0)
- status = decode_getfattr_label(xdr, res->fattr,
- res->label, res->server);
+ status = decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6331,7 +6314,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_restorefh(xdr);
if (status)
goto out;
- decode_getfattr_label(xdr, res->fattr, res->label, res->server);
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6361,7 +6344,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_getfh(xdr, res->fh);
if (status)
goto out;
- decode_getfattr_label(xdr, res->fattr, res->label, res->server);
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6394,7 +6377,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_putfh(xdr);
if (status)
goto out;
- status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
+ status = decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6532,7 +6515,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
goto out;
if (res->access_request)
decode_access(xdr, &res->access_supported, &res->access_result);
- decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server);
+ decode_getfattr(xdr, res->f_attr, res->server);
if (res->lg_res)
decode_layoutget(xdr, rqstp, res->lg_res);
out:
@@ -6616,7 +6599,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
status = decode_setattr(xdr);
if (status)
goto out;
- decode_getfattr_label(xdr, res->fattr, res->label, res->server);
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -7031,7 +7014,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
status = decode_getfattr_generic(xdr,
&res->fs_locations->fattr,
NULL, res->fs_locations,
- NULL, res->fs_locations->server);
+ res->fs_locations->server);
if (status)
goto out;
if (res->renew)
@@ -7044,7 +7027,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
status = decode_getfattr_generic(xdr,
&res->fs_locations->fattr,
NULL, res->fs_locations,
- NULL, res->fs_locations->server);
+ res->fs_locations->server);
}
out:
return status;
@@ -7475,7 +7458,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
return -EAGAIN;
if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
- NULL, entry->label, entry->server) < 0)
+ NULL, entry->server) < 0)
return -EAGAIN;
if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
entry->ino = entry->fattr->mounted_on_fileid;
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 8a224871be74..21dac847f1e4 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -11,45 +11,9 @@
#include <linux/tracepoint.h>
#include <linux/iversion.h>
-TRACE_DEFINE_ENUM(DT_UNKNOWN);
-TRACE_DEFINE_ENUM(DT_FIFO);
-TRACE_DEFINE_ENUM(DT_CHR);
-TRACE_DEFINE_ENUM(DT_DIR);
-TRACE_DEFINE_ENUM(DT_BLK);
-TRACE_DEFINE_ENUM(DT_REG);
-TRACE_DEFINE_ENUM(DT_LNK);
-TRACE_DEFINE_ENUM(DT_SOCK);
-TRACE_DEFINE_ENUM(DT_WHT);
-
-#define nfs_show_file_type(ftype) \
- __print_symbolic(ftype, \
- { DT_UNKNOWN, "UNKNOWN" }, \
- { DT_FIFO, "FIFO" }, \
- { DT_CHR, "CHR" }, \
- { DT_DIR, "DIR" }, \
- { DT_BLK, "BLK" }, \
- { DT_REG, "REG" }, \
- { DT_LNK, "LNK" }, \
- { DT_SOCK, "SOCK" }, \
- { DT_WHT, "WHT" })
-
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_DATA);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_ATIME);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_ACCESS);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_ACL);
-TRACE_DEFINE_ENUM(NFS_INO_REVAL_PAGECACHE);
-TRACE_DEFINE_ENUM(NFS_INO_REVAL_FORCED);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_LABEL);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_CHANGE);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_CTIME);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_MTIME);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_SIZE);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_OTHER);
-TRACE_DEFINE_ENUM(NFS_INO_DATA_INVAL_DEFER);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_BLOCKS);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_XATTR);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_NLINK);
-TRACE_DEFINE_ENUM(NFS_INO_INVALID_MODE);
+#include <trace/events/fs.h>
+#include <trace/events/nfs.h>
+#include <trace/events/sunrpc_base.h>
#define nfs_show_cache_validity(v) \
__print_flags(v, "|", \
@@ -71,17 +35,6 @@ TRACE_DEFINE_ENUM(NFS_INO_INVALID_MODE);
{ NFS_INO_INVALID_NLINK, "INVALID_NLINK" }, \
{ NFS_INO_INVALID_MODE, "INVALID_MODE" })
-TRACE_DEFINE_ENUM(NFS_INO_ADVISE_RDPLUS);
-TRACE_DEFINE_ENUM(NFS_INO_STALE);
-TRACE_DEFINE_ENUM(NFS_INO_ACL_LRU_SET);
-TRACE_DEFINE_ENUM(NFS_INO_INVALIDATING);
-TRACE_DEFINE_ENUM(NFS_INO_FSCACHE);
-TRACE_DEFINE_ENUM(NFS_INO_FSCACHE_LOCK);
-TRACE_DEFINE_ENUM(NFS_INO_LAYOUTCOMMIT);
-TRACE_DEFINE_ENUM(NFS_INO_LAYOUTCOMMITTING);
-TRACE_DEFINE_ENUM(NFS_INO_LAYOUTSTATS);
-TRACE_DEFINE_ENUM(NFS_INO_ODIRECT);
-
#define nfs_show_nfsi_flags(v) \
__print_flags(v, "|", \
{ BIT(NFS_INO_ADVISE_RDPLUS), "ADVISE_RDPLUS" }, \
@@ -163,12 +116,12 @@ DECLARE_EVENT_CLASS(nfs_inode_event_done,
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
"type=%u (%s) version=%llu size=%lld "
"cache_validity=0x%lx (%s) nfs_flags=0x%lx (%s)",
- -__entry->error, nfs_show_status(__entry->error),
+ -__entry->error, show_nfs_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
__entry->type,
- nfs_show_file_type(__entry->type),
+ show_fs_dirent_type(__entry->type),
(unsigned long long)__entry->version,
(long long)__entry->size,
__entry->cache_validity,
@@ -254,12 +207,12 @@ TRACE_EVENT(nfs_access_exit,
"type=%u (%s) version=%llu size=%lld "
"cache_validity=0x%lx (%s) nfs_flags=0x%lx (%s) "
"mask=0x%x permitted=0x%x",
- -__entry->error, nfs_show_status(__entry->error),
+ -__entry->error, show_nfs_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
__entry->type,
- nfs_show_file_type(__entry->type),
+ show_fs_dirent_type(__entry->type),
(unsigned long long)__entry->version,
(long long)__entry->size,
__entry->cache_validity,
@@ -270,33 +223,55 @@ TRACE_EVENT(nfs_access_exit,
)
);
-TRACE_DEFINE_ENUM(LOOKUP_FOLLOW);
-TRACE_DEFINE_ENUM(LOOKUP_DIRECTORY);
-TRACE_DEFINE_ENUM(LOOKUP_AUTOMOUNT);
-TRACE_DEFINE_ENUM(LOOKUP_PARENT);
-TRACE_DEFINE_ENUM(LOOKUP_REVAL);
-TRACE_DEFINE_ENUM(LOOKUP_RCU);
-TRACE_DEFINE_ENUM(LOOKUP_OPEN);
-TRACE_DEFINE_ENUM(LOOKUP_CREATE);
-TRACE_DEFINE_ENUM(LOOKUP_EXCL);
-TRACE_DEFINE_ENUM(LOOKUP_RENAME_TARGET);
-TRACE_DEFINE_ENUM(LOOKUP_EMPTY);
-TRACE_DEFINE_ENUM(LOOKUP_DOWN);
-
-#define show_lookup_flags(flags) \
- __print_flags(flags, "|", \
- { LOOKUP_FOLLOW, "FOLLOW" }, \
- { LOOKUP_DIRECTORY, "DIRECTORY" }, \
- { LOOKUP_AUTOMOUNT, "AUTOMOUNT" }, \
- { LOOKUP_PARENT, "PARENT" }, \
- { LOOKUP_REVAL, "REVAL" }, \
- { LOOKUP_RCU, "RCU" }, \
- { LOOKUP_OPEN, "OPEN" }, \
- { LOOKUP_CREATE, "CREATE" }, \
- { LOOKUP_EXCL, "EXCL" }, \
- { LOOKUP_RENAME_TARGET, "RENAME_TARGET" }, \
- { LOOKUP_EMPTY, "EMPTY" }, \
- { LOOKUP_DOWN, "DOWN" })
+DECLARE_EVENT_CLASS(nfs_update_size_class,
+ TP_PROTO(
+ const struct inode *inode,
+ loff_t new_size
+ ),
+
+ TP_ARGS(inode, new_size),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, cur_size)
+ __field(loff_t, new_size)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->fileid = nfsi->fileid;
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->cur_size = i_size_read(inode);
+ __entry->new_size = new_size;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu cursize=%lld newsize=%lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->cur_size, __entry->new_size
+ )
+);
+
+#define DEFINE_NFS_UPDATE_SIZE_EVENT(name) \
+ DEFINE_EVENT(nfs_update_size_class, nfs_size_##name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ loff_t new_size \
+ ), \
+ TP_ARGS(inode, new_size))
+
+DEFINE_NFS_UPDATE_SIZE_EVENT(truncate);
+DEFINE_NFS_UPDATE_SIZE_EVENT(wcc);
+DEFINE_NFS_UPDATE_SIZE_EVENT(update);
+DEFINE_NFS_UPDATE_SIZE_EVENT(grow);
DECLARE_EVENT_CLASS(nfs_lookup_event,
TP_PROTO(
@@ -324,7 +299,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event,
TP_printk(
"flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
__entry->flags,
- show_lookup_flags(__entry->flags),
+ show_fs_lookup_flags(__entry->flags),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
__get_str(name)
@@ -368,9 +343,9 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done,
TP_printk(
"error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
- -__entry->error, nfs_show_status(__entry->error),
+ -__entry->error, show_nfs_status(__entry->error),
__entry->flags,
- show_lookup_flags(__entry->flags),
+ show_fs_lookup_flags(__entry->flags),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
__get_str(name)
@@ -392,46 +367,6 @@ DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_exit);
DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_revalidate_enter);
DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_revalidate_exit);
-TRACE_DEFINE_ENUM(O_WRONLY);
-TRACE_DEFINE_ENUM(O_RDWR);
-TRACE_DEFINE_ENUM(O_CREAT);
-TRACE_DEFINE_ENUM(O_EXCL);
-TRACE_DEFINE_ENUM(O_NOCTTY);
-TRACE_DEFINE_ENUM(O_TRUNC);
-TRACE_DEFINE_ENUM(O_APPEND);
-TRACE_DEFINE_ENUM(O_NONBLOCK);
-TRACE_DEFINE_ENUM(O_DSYNC);
-TRACE_DEFINE_ENUM(O_DIRECT);
-TRACE_DEFINE_ENUM(O_LARGEFILE);
-TRACE_DEFINE_ENUM(O_DIRECTORY);
-TRACE_DEFINE_ENUM(O_NOFOLLOW);
-TRACE_DEFINE_ENUM(O_NOATIME);
-TRACE_DEFINE_ENUM(O_CLOEXEC);
-
-#define show_open_flags(flags) \
- __print_flags(flags, "|", \
- { O_WRONLY, "O_WRONLY" }, \
- { O_RDWR, "O_RDWR" }, \
- { O_CREAT, "O_CREAT" }, \
- { O_EXCL, "O_EXCL" }, \
- { O_NOCTTY, "O_NOCTTY" }, \
- { O_TRUNC, "O_TRUNC" }, \
- { O_APPEND, "O_APPEND" }, \
- { O_NONBLOCK, "O_NONBLOCK" }, \
- { O_DSYNC, "O_DSYNC" }, \
- { O_DIRECT, "O_DIRECT" }, \
- { O_LARGEFILE, "O_LARGEFILE" }, \
- { O_DIRECTORY, "O_DIRECTORY" }, \
- { O_NOFOLLOW, "O_NOFOLLOW" }, \
- { O_NOATIME, "O_NOATIME" }, \
- { O_CLOEXEC, "O_CLOEXEC" })
-
-#define show_fmode_flags(mode) \
- __print_flags(mode, "|", \
- { ((__force unsigned long)FMODE_READ), "READ" }, \
- { ((__force unsigned long)FMODE_WRITE), "WRITE" }, \
- { ((__force unsigned long)FMODE_EXEC), "EXEC" })
-
TRACE_EVENT(nfs_atomic_open_enter,
TP_PROTO(
const struct inode *dir,
@@ -443,7 +378,7 @@ TRACE_EVENT(nfs_atomic_open_enter,
TP_STRUCT__entry(
__field(unsigned long, flags)
- __field(unsigned int, fmode)
+ __field(unsigned long, fmode)
__field(dev_t, dev)
__field(u64, dir)
__string(name, ctx->dentry->d_name.name)
@@ -453,15 +388,15 @@ TRACE_EVENT(nfs_atomic_open_enter,
__entry->dev = dir->i_sb->s_dev;
__entry->dir = NFS_FILEID(dir);
__entry->flags = flags;
- __entry->fmode = (__force unsigned int)ctx->mode;
+ __entry->fmode = (__force unsigned long)ctx->mode;
__assign_str(name, ctx->dentry->d_name.name);
),
TP_printk(
"flags=0x%lx (%s) fmode=%s name=%02x:%02x:%llu/%s",
__entry->flags,
- show_open_flags(__entry->flags),
- show_fmode_flags(__entry->fmode),
+ show_fs_fcntl_open_flags(__entry->flags),
+ show_fs_fmode_flags(__entry->fmode),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
__get_str(name)
@@ -481,7 +416,7 @@ TRACE_EVENT(nfs_atomic_open_exit,
TP_STRUCT__entry(
__field(unsigned long, error)
__field(unsigned long, flags)
- __field(unsigned int, fmode)
+ __field(unsigned long, fmode)
__field(dev_t, dev)
__field(u64, dir)
__string(name, ctx->dentry->d_name.name)
@@ -492,17 +427,17 @@ TRACE_EVENT(nfs_atomic_open_exit,
__entry->dev = dir->i_sb->s_dev;
__entry->dir = NFS_FILEID(dir);
__entry->flags = flags;
- __entry->fmode = (__force unsigned int)ctx->mode;
+ __entry->fmode = (__force unsigned long)ctx->mode;
__assign_str(name, ctx->dentry->d_name.name);
),
TP_printk(
"error=%ld (%s) flags=0x%lx (%s) fmode=%s "
"name=%02x:%02x:%llu/%s",
- -__entry->error, nfs_show_status(__entry->error),
+ -__entry->error, show_nfs_status(__entry->error),
__entry->flags,
- show_open_flags(__entry->flags),
- show_fmode_flags(__entry->fmode),
+ show_fs_fcntl_open_flags(__entry->flags),
+ show_fs_fmode_flags(__entry->fmode),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
__get_str(name)
@@ -535,7 +470,7 @@ TRACE_EVENT(nfs_create_enter,
TP_printk(
"flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
__entry->flags,
- show_open_flags(__entry->flags),
+ show_fs_fcntl_open_flags(__entry->flags),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
__get_str(name)
@@ -570,9 +505,9 @@ TRACE_EVENT(nfs_create_exit,
TP_printk(
"error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
- -__entry->error, nfs_show_status(__entry->error),
+ -__entry->error, show_nfs_status(__entry->error),
__entry->flags,
- show_open_flags(__entry->flags),
+ show_fs_fcntl_open_flags(__entry->flags),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
__get_str(name)
@@ -640,7 +575,7 @@ DECLARE_EVENT_CLASS(nfs_directory_event_done,
TP_printk(
"error=%ld (%s) name=%02x:%02x:%llu/%s",
- -__entry->error, nfs_show_status(__entry->error),
+ -__entry->error, show_nfs_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
__get_str(name)
@@ -730,7 +665,7 @@ TRACE_EVENT(nfs_link_exit,
TP_printk(
"error=%ld (%s) fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s",
- -__entry->error, nfs_show_status(__entry->error),
+ -__entry->error, show_nfs_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->fileid,
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -817,7 +752,7 @@ DECLARE_EVENT_CLASS(nfs_rename_event_done,
TP_printk(
"error=%ld (%s) old_name=%02x:%02x:%llu/%s "
"new_name=%02x:%02x:%llu/%s",
- -__entry->error, nfs_show_status(__entry->error),
+ -__entry->error, show_nfs_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->old_dir,
__get_str(old_name),
@@ -871,13 +806,163 @@ TRACE_EVENT(nfs_sillyrename_unlink,
TP_printk(
"error=%ld (%s) name=%02x:%02x:%llu/%s",
- -__entry->error, nfs_show_status(__entry->error),
+ -__entry->error, show_nfs_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
__get_str(name)
)
);
+TRACE_EVENT(nfs_aop_readpage,
+ TP_PROTO(
+ const struct inode *inode,
+ struct page *page
+ ),
+
+ TP_ARGS(inode, page),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, offset)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->offset = page_index(page) << PAGE_SHIFT;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->offset
+ )
+);
+
+TRACE_EVENT(nfs_aop_readpage_done,
+ TP_PROTO(
+ const struct inode *inode,
+ struct page *page,
+ int ret
+ ),
+
+ TP_ARGS(inode, page, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(int, ret)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, offset)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->offset = page_index(page) << PAGE_SHIFT;
+ __entry->ret = ret;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld ret=%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->offset, __entry->ret
+ )
+);
+
+TRACE_EVENT(nfs_aop_readahead,
+ TP_PROTO(
+ const struct inode *inode,
+ struct page *page,
+ unsigned int nr_pages
+ ),
+
+ TP_ARGS(inode, page, nr_pages),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, offset)
+ __field(unsigned int, nr_pages)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->offset = page_index(page) << PAGE_SHIFT;
+ __entry->nr_pages = nr_pages;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld nr_pages=%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->offset, __entry->nr_pages
+ )
+);
+
+TRACE_EVENT(nfs_aop_readahead_done,
+ TP_PROTO(
+ const struct inode *inode,
+ unsigned int nr_pages,
+ int ret
+ ),
+
+ TP_ARGS(inode, nr_pages, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(int, ret)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, offset)
+ __field(unsigned int, nr_pages)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->nr_pages = nr_pages;
+ __entry->ret = ret;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu nr_pages=%u ret=%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->nr_pages, __entry->ret
+ )
+);
+
TRACE_EVENT(nfs_initiate_read,
TP_PROTO(
const struct nfs_pgio_header *hdr
@@ -1054,16 +1139,6 @@ TRACE_EVENT(nfs_pgio_error,
)
);
-TRACE_DEFINE_ENUM(NFS_UNSTABLE);
-TRACE_DEFINE_ENUM(NFS_DATA_SYNC);
-TRACE_DEFINE_ENUM(NFS_FILE_SYNC);
-
-#define nfs_show_stable(stable) \
- __print_symbolic(stable, \
- { NFS_UNSTABLE, "UNSTABLE" }, \
- { NFS_DATA_SYNC, "DATA_SYNC" }, \
- { NFS_FILE_SYNC, "FILE_SYNC" })
-
TRACE_EVENT(nfs_initiate_write,
TP_PROTO(
const struct nfs_pgio_header *hdr
@@ -1077,7 +1152,7 @@ TRACE_EVENT(nfs_initiate_write,
__field(u64, fileid)
__field(loff_t, offset)
__field(u32, count)
- __field(enum nfs3_stable_how, stable)
+ __field(unsigned long, stable)
),
TP_fast_assign(
@@ -1101,7 +1176,7 @@ TRACE_EVENT(nfs_initiate_write,
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset, __entry->count,
- nfs_show_stable(__entry->stable)
+ show_nfs_stable_how(__entry->stable)
)
);
@@ -1121,7 +1196,7 @@ TRACE_EVENT(nfs_writeback_done,
__field(u32, arg_count)
__field(u32, res_count)
__field(int, status)
- __field(enum nfs3_stable_how, stable)
+ __field(unsigned long, stable)
__array(char, verifier, NFS4_VERIFIER_SIZE)
),
@@ -1154,8 +1229,8 @@ TRACE_EVENT(nfs_writeback_done,
__entry->fhandle,
(long long)__entry->offset, __entry->arg_count,
__entry->res_count, __entry->status,
- nfs_show_stable(__entry->stable),
- __print_hex_str(__entry->verifier, NFS4_VERIFIER_SIZE)
+ show_nfs_stable_how(__entry->stable),
+ show_nfs4_verifier(__entry->verifier)
)
);
@@ -1256,7 +1331,7 @@ TRACE_EVENT(nfs_commit_done,
__field(u64, fileid)
__field(loff_t, offset)
__field(int, status)
- __field(enum nfs3_stable_how, stable)
+ __field(unsigned long, stable)
__array(char, verifier, NFS4_VERIFIER_SIZE)
),
@@ -1285,8 +1360,8 @@ TRACE_EVENT(nfs_commit_done,
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset, __entry->status,
- nfs_show_stable(__entry->stable),
- __print_hex_str(__entry->verifier, NFS4_VERIFIER_SIZE)
+ show_nfs_stable_how(__entry->stable),
+ show_nfs4_verifier(__entry->verifier)
)
);
@@ -1323,76 +1398,6 @@ TRACE_EVENT(nfs_fh_to_dentry,
)
);
-TRACE_DEFINE_ENUM(NFS_OK);
-TRACE_DEFINE_ENUM(NFSERR_PERM);
-TRACE_DEFINE_ENUM(NFSERR_NOENT);
-TRACE_DEFINE_ENUM(NFSERR_IO);
-TRACE_DEFINE_ENUM(NFSERR_NXIO);
-TRACE_DEFINE_ENUM(ECHILD);
-TRACE_DEFINE_ENUM(NFSERR_EAGAIN);
-TRACE_DEFINE_ENUM(NFSERR_ACCES);
-TRACE_DEFINE_ENUM(NFSERR_EXIST);
-TRACE_DEFINE_ENUM(NFSERR_XDEV);
-TRACE_DEFINE_ENUM(NFSERR_NODEV);
-TRACE_DEFINE_ENUM(NFSERR_NOTDIR);
-TRACE_DEFINE_ENUM(NFSERR_ISDIR);
-TRACE_DEFINE_ENUM(NFSERR_INVAL);
-TRACE_DEFINE_ENUM(NFSERR_FBIG);
-TRACE_DEFINE_ENUM(NFSERR_NOSPC);
-TRACE_DEFINE_ENUM(NFSERR_ROFS);
-TRACE_DEFINE_ENUM(NFSERR_MLINK);
-TRACE_DEFINE_ENUM(NFSERR_OPNOTSUPP);
-TRACE_DEFINE_ENUM(NFSERR_NAMETOOLONG);
-TRACE_DEFINE_ENUM(NFSERR_NOTEMPTY);
-TRACE_DEFINE_ENUM(NFSERR_DQUOT);
-TRACE_DEFINE_ENUM(NFSERR_STALE);
-TRACE_DEFINE_ENUM(NFSERR_REMOTE);
-TRACE_DEFINE_ENUM(NFSERR_WFLUSH);
-TRACE_DEFINE_ENUM(NFSERR_BADHANDLE);
-TRACE_DEFINE_ENUM(NFSERR_NOT_SYNC);
-TRACE_DEFINE_ENUM(NFSERR_BAD_COOKIE);
-TRACE_DEFINE_ENUM(NFSERR_NOTSUPP);
-TRACE_DEFINE_ENUM(NFSERR_TOOSMALL);
-TRACE_DEFINE_ENUM(NFSERR_SERVERFAULT);
-TRACE_DEFINE_ENUM(NFSERR_BADTYPE);
-TRACE_DEFINE_ENUM(NFSERR_JUKEBOX);
-
-#define nfs_show_status(x) \
- __print_symbolic(x, \
- { NFS_OK, "OK" }, \
- { NFSERR_PERM, "PERM" }, \
- { NFSERR_NOENT, "NOENT" }, \
- { NFSERR_IO, "IO" }, \
- { NFSERR_NXIO, "NXIO" }, \
- { ECHILD, "CHILD" }, \
- { NFSERR_EAGAIN, "AGAIN" }, \
- { NFSERR_ACCES, "ACCES" }, \
- { NFSERR_EXIST, "EXIST" }, \
- { NFSERR_XDEV, "XDEV" }, \
- { NFSERR_NODEV, "NODEV" }, \
- { NFSERR_NOTDIR, "NOTDIR" }, \
- { NFSERR_ISDIR, "ISDIR" }, \
- { NFSERR_INVAL, "INVAL" }, \
- { NFSERR_FBIG, "FBIG" }, \
- { NFSERR_NOSPC, "NOSPC" }, \
- { NFSERR_ROFS, "ROFS" }, \
- { NFSERR_MLINK, "MLINK" }, \
- { NFSERR_OPNOTSUPP, "OPNOTSUPP" }, \
- { NFSERR_NAMETOOLONG, "NAMETOOLONG" }, \
- { NFSERR_NOTEMPTY, "NOTEMPTY" }, \
- { NFSERR_DQUOT, "DQUOT" }, \
- { NFSERR_STALE, "STALE" }, \
- { NFSERR_REMOTE, "REMOTE" }, \
- { NFSERR_WFLUSH, "WFLUSH" }, \
- { NFSERR_BADHANDLE, "BADHANDLE" }, \
- { NFSERR_NOT_SYNC, "NOTSYNC" }, \
- { NFSERR_BAD_COOKIE, "BADCOOKIE" }, \
- { NFSERR_NOTSUPP, "NOTSUPP" }, \
- { NFSERR_TOOSMALL, "TOOSMALL" }, \
- { NFSERR_SERVERFAULT, "REMOTEIO" }, \
- { NFSERR_BADTYPE, "BADTYPE" }, \
- { NFSERR_JUKEBOX, "JUKEBOX" })
-
DECLARE_EVENT_CLASS(nfs_xdr_event,
TP_PROTO(
const struct xdr_stream *xdr,
@@ -1427,12 +1432,12 @@ DECLARE_EVENT_CLASS(nfs_xdr_event,
__assign_str(procedure, task->tk_msg.rpc_proc->p_name);
),
- TP_printk(
- "task:%u@%d xid=0x%08x %sv%d %s error=%ld (%s)",
+ TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
+ " xid=0x%08x %sv%d %s error=%ld (%s)",
__entry->task_id, __entry->client_id, __entry->xid,
__get_str(program), __entry->version,
__get_str(procedure), -__entry->error,
- nfs_show_status(__entry->error)
+ show_nfs_status(__entry->error)
)
);
#define DEFINE_NFS_XDR_EVENT(name) \
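The nfstrace.h hunks replace locally defined TRACE_DEFINE_ENUM tables and __print_symbolic/__print_flags helpers with shared ones pulled in from <trace/events/fs.h>, <trace/events/nfs.h> and <trace/events/sunrpc_base.h>. For reference, a helper such as show_nfs_stable_how is conventionally built like the nfs_show_stable macro deleted above; this sketch mirrors that deleted definition and assumes the shared header's version is equivalent:

	TRACE_DEFINE_ENUM(NFS_UNSTABLE);
	TRACE_DEFINE_ENUM(NFS_DATA_SYNC);
	TRACE_DEFINE_ENUM(NFS_FILE_SYNC);

	#define show_nfs_stable_how(stable) \
		__print_symbolic(stable, \
			{ NFS_UNSTABLE,  "UNSTABLE" }, \
			{ NFS_DATA_SYNC, "DATA_SYNC" }, \
			{ NFS_FILE_SYNC, "FILE_SYNC" })

TRACE_DEFINE_ENUM() is what lets user-space tooling resolve the enum names when it parses the event format string; centralizing these tables means each NFS trace header no longer has to re-declare them.
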
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index cc232d1f16f2..ad7f83dc9a2d 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -271,8 +271,7 @@ nfs_page_set_headlock(struct nfs_page *req)
void
nfs_page_clear_headlock(struct nfs_page *req)
{
- smp_mb__before_atomic();
- clear_bit(PG_HEADLOCK, &req->wb_flags);
+ clear_bit_unlock(PG_HEADLOCK, &req->wb_flags);
smp_mb__after_atomic();
if (!test_bit(PG_CONTENDED1, &req->wb_flags))
return;
@@ -525,12 +524,7 @@ nfs_create_subreq(struct nfs_page *req,
*/
void nfs_unlock_request(struct nfs_page *req)
{
- if (!NFS_WBACK_BUSY(req)) {
- printk(KERN_ERR "NFS: Invalid unlock attempted\n");
- BUG();
- }
- smp_mb__before_atomic();
- clear_bit(PG_BUSY, &req->wb_flags);
+ clear_bit_unlock(PG_BUSY, &req->wb_flags);
smp_mb__after_atomic();
if (!test_bit(PG_CONTENDED2, &req->wb_flags))
return;
@@ -870,9 +864,6 @@ static void nfs_pgio_result(struct rpc_task *task, void *calldata)
struct nfs_pgio_header *hdr = calldata;
struct inode *inode = hdr->inode;
- dprintk("NFS: %s: %5u, (status %d)\n", __func__,
- task->tk_pid, task->tk_status);
-
if (hdr->rw_ops->rw_done(task, hdr, inode) != 0)
return;
if (task->tk_status < 0)
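In nfs_page_clear_headlock() and nfs_unlock_request() the smp_mb__before_atomic() + clear_bit() pair becomes clear_bit_unlock(), which gives the store release semantics appropriate for a flag used as a lock bit; the smp_mb__after_atomic() that remains orders the clear against the following PG_CONTENDED* test. A minimal sketch of the lock-bit idiom, with illustrative names only (not NFS code):

	static void bitlock_acquire(unsigned long *word)
	{
		/* test_and_set_bit_lock() returns 0 and acts as an acquire
		 * barrier when it wins the bit. */
		while (test_and_set_bit_lock(0, word))
			cpu_relax();
	}

	static void bitlock_release(unsigned long *word)
	{
		/* clear_bit_unlock() is a release: stores made while the
		 * bit was held are visible before the bit reads as clear. */
		clear_bit_unlock(0, word);
	}
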
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index d810ae674f4e..f4d7548d67b2 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -82,10 +82,6 @@ enum pnfs_try_status {
PNFS_TRY_AGAIN = 2,
};
-/* error codes for internal use */
-#define NFS4ERR_RESET_TO_MDS 12001
-#define NFS4ERR_RESET_TO_PNFS 12002
-
#ifdef CONFIG_NFS_V4_1
#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
@@ -517,7 +513,7 @@ pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
{
struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
- if (!lseg || !fl_cinfo->ops->mark_request_commit)
+ if (!lseg || !fl_cinfo->ops || !fl_cinfo->ops->mark_request_commit)
return false;
fl_cinfo->ops->mark_request_commit(req, lseg, cinfo, ds_commit_idx);
return true;
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index cf19914fec81..316f68f96e57 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -468,7 +468,6 @@ pnfs_bucket_alloc_ds_commits(struct list_head *list,
goto out_error;
data->ds_commit_index = i;
list_add_tail(&data->list, list);
- atomic_inc(&cinfo->mds->rpcs_out);
nreq++;
}
mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
@@ -520,7 +519,6 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
data->ds_commit_index = -1;
list_splice_init(mds_pages, &data->pages);
list_add_tail(&data->list, &list);
- atomic_inc(&cinfo->mds->rpcs_out);
nreq++;
}
@@ -895,7 +893,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
}
smp_wmb();
- ds->ds_clp = clp;
+ WRITE_ONCE(ds->ds_clp, clp);
dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
out:
return status;
@@ -973,7 +971,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
}
smp_wmb();
- ds->ds_clp = clp;
+ WRITE_ONCE(ds->ds_clp, clp);
dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
out:
return status;
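Converting the ds->ds_clp assignment to WRITE_ONCE() pairs the existing smp_wmb() publisher with lock-free readers that are expected to use READ_ONCE(); the reader side is not part of this hunk, so the sketch below only illustrates the assumed publish/consume shape with hypothetical types:

	/* Hypothetical types; sketch of the publish/consume pairing. */
	struct ds_slot { struct nfs_client *clp; };

	static void ds_publish(struct ds_slot *ds, struct nfs_client *clp)
	{
		smp_wmb();			/* order clp's setup before the publish */
		WRITE_ONCE(ds->clp, clp);
	}

	static struct nfs_client *ds_peek(struct ds_slot *ds)
	{
		struct nfs_client *clp = READ_ONCE(ds->clp);

		if (clp)
			smp_rmb();		/* pairs with smp_wmb() in ds_publish() */
		return clp;
	}
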
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ea19dbf12301..73dcaa99fa9b 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -91,7 +91,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
info->dtpref = fsinfo.tsize;
info->maxfilesize = 0x7FFFFFFF;
info->lease_time = 0;
- info->change_attr_type = NFS4_CHANGE_TYPE_IS_TIME_METADATA;
+ info->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED;
return 0;
}
@@ -100,8 +100,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
*/
static int
nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr, struct nfs4_label *label,
- struct inode *inode)
+ struct nfs_fattr *fattr, struct inode *inode)
{
struct rpc_message msg = {
.rpc_proc = &nfs_procedures[NFSPROC_GETATTR],
@@ -154,8 +153,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
static int
nfs_proc_lookup(struct inode *dir, struct dentry *dentry,
- struct nfs_fh *fhandle, struct nfs_fattr *fattr,
- struct nfs4_label *label)
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr)
{
struct nfs_diropargs arg = {
.fh = NFS_FH(dir),
@@ -257,7 +255,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_mark_for_revalidate(dir);
if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
nfs_free_createdata(data);
out:
dprintk("NFS reply create: %d\n", status);
@@ -304,7 +302,7 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
}
if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
nfs_free_createdata(data);
out:
dprintk("NFS reply mknod: %d\n", status);
@@ -436,7 +434,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
* should fill in the data with a LOOKUP call on the wire.
*/
if (status == 0)
- status = nfs_instantiate(dentry, fh, fattr, NULL);
+ status = nfs_instantiate(dentry, fh, fattr);
out_free:
nfs_free_fattr(fattr);
@@ -465,7 +463,7 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_mark_for_revalidate(dir);
if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
nfs_free_createdata(data);
out:
dprintk("NFS reply mkdir: %d\n", status);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 08d6cc57cbc3..d11af2a9299c 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -337,8 +337,7 @@ int nfs_readpage(struct file *file, struct page *page)
struct inode *inode = page_file_mapping(page)->host;
int ret;
- dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
- page, PAGE_SIZE, page_index(page));
+ trace_nfs_aop_readpage(inode, page);
nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
/*
@@ -390,9 +389,11 @@ out_wait:
}
out:
put_nfs_open_context(desc.ctx);
+ trace_nfs_aop_readpage_done(inode, page, ret);
return ret;
out_unlock:
unlock_page(page);
+ trace_nfs_aop_readpage_done(inode, page, ret);
return ret;
}
@@ -403,10 +404,7 @@ int nfs_readpages(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
int ret;
- dprintk("NFS: nfs_readpages (%s/%Lu %d)\n",
- inode->i_sb->s_id,
- (unsigned long long)NFS_FILEID(inode),
- nr_pages);
+ trace_nfs_aop_readahead(inode, lru_to_page(pages), nr_pages);
nfs_inc_stats(inode, NFSIOS_VFSREADPAGES);
ret = -ESTALE;
@@ -439,6 +437,7 @@ int nfs_readpages(struct file *file, struct address_space *mapping,
read_complete:
put_nfs_open_context(desc.ctx);
out:
+ trace_nfs_aop_readahead_done(inode, nr_pages, ret);
return ret;
}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e65c83494c05..3aced401735c 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1004,6 +1004,7 @@ int nfs_reconfigure(struct fs_context *fc)
struct nfs_fs_context *ctx = nfs_fc2context(fc);
struct super_block *sb = fc->root->d_sb;
struct nfs_server *nfss = sb->s_fs_info;
+ int ret;
sync_filesystem(sb);
@@ -1028,7 +1029,11 @@ int nfs_reconfigure(struct fs_context *fc)
}
/* compare new mount options with old ones */
- return nfs_compare_remount_data(nfss, ctx);
+ ret = nfs_compare_remount_data(nfss, ctx);
+ if (ret)
+ return ret;
+
+ return nfs_probe_server(nfss, NFS_FH(d_inode(fc->root)));
}
EXPORT_SYMBOL_GPL(nfs_reconfigure);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index eae9bf114041..9b7619ce17a7 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -288,6 +288,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c
end = page_file_offset(page) + ((loff_t)offset+count);
if (i_size >= end)
goto out;
+ trace_nfs_size_grow(inode, end);
i_size_write(inode, end);
NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE;
nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
@@ -1038,25 +1039,11 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
struct nfs_page *req, *tmp;
int ret = 0;
-restart:
list_for_each_entry_safe(req, tmp, src, wb_list) {
kref_get(&req->wb_kref);
if (!nfs_lock_request(req)) {
- int status;
-
- /* Prevent deadlock with nfs_lock_and_join_requests */
- if (!list_empty(dst)) {
- nfs_release_request(req);
- continue;
- }
- /* Ensure we make progress to prevent livelock */
- mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
- status = nfs_wait_on_request(req);
nfs_release_request(req);
- mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
- if (status < 0)
- break;
- goto restart;
+ continue;
}
nfs_request_remove_commit_list(req, cinfo);
clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
@@ -1246,7 +1233,7 @@ nfs_key_timeout_notify(struct file *filp, struct inode *inode)
struct nfs_open_context *ctx = nfs_file_open_context(filp);
if (nfs_ctx_key_to_expire(ctx, inode) &&
- !ctx->ll_cred)
+ !rcu_access_pointer(ctx->ll_cred))
/* Already expired! */
return -EACCES;
return 0;
@@ -1258,23 +1245,38 @@ nfs_key_timeout_notify(struct file *filp, struct inode *inode)
bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode)
{
struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth;
- struct rpc_cred *cred = ctx->ll_cred;
+ struct rpc_cred *cred, *new, *old = NULL;
struct auth_cred acred = {
.cred = ctx->cred,
};
+ bool ret = false;
- if (cred && !cred->cr_ops->crmatch(&acred, cred, 0)) {
- put_rpccred(cred);
- ctx->ll_cred = NULL;
- cred = NULL;
- }
- if (!cred)
- cred = auth->au_ops->lookup_cred(auth, &acred, 0);
- if (!cred || IS_ERR(cred))
+ rcu_read_lock();
+ cred = rcu_dereference(ctx->ll_cred);
+ if (cred && !(cred->cr_ops->crkey_timeout &&
+ cred->cr_ops->crkey_timeout(cred)))
+ goto out;
+ rcu_read_unlock();
+
+ new = auth->au_ops->lookup_cred(auth, &acred, 0);
+ if (new == cred) {
+ put_rpccred(new);
return true;
- ctx->ll_cred = cred;
- return !!(cred->cr_ops->crkey_timeout &&
- cred->cr_ops->crkey_timeout(cred));
+ }
+ if (IS_ERR_OR_NULL(new)) {
+ new = NULL;
+ ret = true;
+ } else if (new->cr_ops->crkey_timeout &&
+ new->cr_ops->crkey_timeout(new))
+ ret = true;
+
+ rcu_read_lock();
+ old = rcu_dereference_protected(xchg(&ctx->ll_cred,
+ RCU_INITIALIZER(new)), 1);
+out:
+ rcu_read_unlock();
+ put_rpccred(old);
+ return ret;
}
/*
@@ -1382,8 +1384,6 @@ int nfs_updatepage(struct file *file, struct page *page,
status = nfs_writepage_setup(ctx, page, offset, count);
if (status < 0)
nfs_set_pageerror(mapping);
- else
- __set_page_dirty_nobuffers(page);
out:
dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
status, (long long)i_size_read(inode));
@@ -1671,10 +1671,13 @@ static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo)
atomic_inc(&cinfo->rpcs_out);
}
-static void nfs_commit_end(struct nfs_mds_commit_info *cinfo)
+bool nfs_commit_end(struct nfs_mds_commit_info *cinfo)
{
- if (atomic_dec_and_test(&cinfo->rpcs_out))
+ if (atomic_dec_and_test(&cinfo->rpcs_out)) {
wake_up_var(&cinfo->rpcs_out);
+ return true;
+ }
+ return false;
}
void nfs_commitdata_release(struct nfs_commit_data *data)
@@ -1774,6 +1777,7 @@ void nfs_init_commit(struct nfs_commit_data *data,
data->res.fattr = &data->fattr;
data->res.verf = &data->verf;
nfs_fattr_init(&data->fattr);
+ nfs_commit_begin(cinfo->mds);
}
EXPORT_SYMBOL_GPL(nfs_init_commit);
@@ -1820,7 +1824,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
/* Set up the argument struct */
nfs_init_commit(data, head, NULL, cinfo);
- atomic_inc(&cinfo->mds->rpcs_out);
if (NFS_SERVER(inode)->nfs_client->cl_minorversion)
task_flags = RPC_TASK_MOVEABLE;
return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
@@ -1835,9 +1838,6 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
{
struct nfs_commit_data *data = calldata;
- dprintk("NFS: %5u nfs_commit_done (status %d)\n",
- task->tk_pid, task->tk_status);
-
/* Call the NFS version-specific code */
NFS_PROTO(data->inode)->commit_done(task, data);
trace_nfs_commit_done(task, data);
@@ -1936,6 +1936,7 @@ static int __nfs_commit_inode(struct inode *inode, int how,
int may_wait = how & FLUSH_SYNC;
int ret, nscan;
+ how &= ~FLUSH_SYNC;
nfs_init_cinfo_from_inode(&cinfo, inode);
nfs_commit_begin(cinfo.mds);
for (;;) {
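The write.c hunks move the rpcs_out accounting into nfs_init_commit() via nfs_commit_begin(), so call sites no longer bump the counter by hand, and nfs_commit_end() now reports whether it dropped the final reference. Reduced to a hypothetical self-contained form, the counting pattern looks like this:

	/* Hypothetical, simplified form of the begin/end accounting. */
	struct commit_counter { atomic_t rpcs_out; };

	static void commit_begin(struct commit_counter *c)
	{
		atomic_inc(&c->rpcs_out);		/* one more commit in flight */
	}

	static bool commit_end(struct commit_counter *c)
	{
		if (atomic_dec_and_test(&c->rpcs_out)) {
			wake_up_var(&c->rpcs_out);	/* waiters see the last commit retire */
			return true;
		}
		return false;
	}
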
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 6e9ea4ee0f73..3d1d17256a91 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -109,7 +109,6 @@ config NFSD_SCSILAYOUT
depends on NFSD_V4 && BLOCK
select NFSD_PNFS
select EXPORTFS_BLOCK_OPS
- select SCSI_COMMON
help
This option enables support for the exporting pNFS SCSI layouts
in the kernel's NFS server. The pNFS SCSI layout enables NFS
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index c99dee99a3c1..e5c0982a381d 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -9,9 +9,6 @@
#include <linux/pr.h>
#include <linux/nfsd/debug.h>
-#include <scsi/scsi_proto.h>
-#include <scsi/scsi_common.h>
-#include <scsi/scsi_request.h>
#include "blocklayoutxdr.h"
#include "pnfs.h"
@@ -211,109 +208,6 @@ const struct nfsd4_layout_ops bl_layout_ops = {
#endif /* CONFIG_NFSD_BLOCKLAYOUT */
#ifdef CONFIG_NFSD_SCSILAYOUT
-static int nfsd4_scsi_identify_device(struct block_device *bdev,
- struct pnfs_block_volume *b)
-{
- struct request_queue *q = bdev->bd_disk->queue;
- struct request *rq;
- struct scsi_request *req;
- /*
- * The allocation length (passed in bytes 3 and 4 of the INQUIRY
- * command descriptor block) specifies the number of bytes that have
- * been allocated for the data-in buffer.
- * 252 is the highest one-byte value that is a multiple of 4.
- * 65532 is the highest two-byte value that is a multiple of 4.
- */
- size_t bufflen = 252, maxlen = 65532, len, id_len;
- u8 *buf, *d, type, assoc;
- int retries = 1, error;
-
- if (WARN_ON_ONCE(!blk_queue_scsi_passthrough(q)))
- return -EINVAL;
-
-again:
- buf = kzalloc(bufflen, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
- if (IS_ERR(rq)) {
- error = -ENOMEM;
- goto out_free_buf;
- }
- req = scsi_req(rq);
-
- error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
- if (error)
- goto out_put_request;
-
- req->cmd[0] = INQUIRY;
- req->cmd[1] = 1;
- req->cmd[2] = 0x83;
- req->cmd[3] = bufflen >> 8;
- req->cmd[4] = bufflen & 0xff;
- req->cmd_len = COMMAND_SIZE(INQUIRY);
-
- blk_execute_rq(NULL, rq, 1);
- if (req->result) {
- pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
- req->result);
- error = -EIO;
- goto out_put_request;
- }
-
- len = (buf[2] << 8) + buf[3] + 4;
- if (len > bufflen) {
- if (len <= maxlen && retries--) {
- blk_put_request(rq);
- kfree(buf);
- bufflen = len;
- goto again;
- }
- pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n",
- len);
- goto out_put_request;
- }
-
- d = buf + 4;
- for (d = buf + 4; d < buf + len; d += id_len + 4) {
- id_len = d[3];
- type = d[1] & 0xf;
- assoc = (d[1] >> 4) & 0x3;
-
- /*
- * We only care about a EUI-64 and NAA designator types
- * with LU association.
- */
- if (assoc != 0x00)
- continue;
- if (type != 0x02 && type != 0x03)
- continue;
- if (id_len != 8 && id_len != 12 && id_len != 16)
- continue;
-
- b->scsi.code_set = PS_CODE_SET_BINARY;
- b->scsi.designator_type = type == 0x02 ?
- PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA;
- b->scsi.designator_len = id_len;
- memcpy(b->scsi.designator, d + 4, id_len);
-
- /*
- * If we found a 8 or 12 byte descriptor continue on to
- * see if a 16 byte one is available. If we find a
- * 16 byte descriptor we're done.
- */
- if (id_len == 16)
- break;
- }
-
-out_put_request:
- blk_put_request(rq);
-out_free_buf:
- kfree(buf);
- return error;
-}
-
#define NFSD_MDS_PR_KEY 0x0100000000000000ULL
/*
@@ -325,6 +219,31 @@ static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp)
return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id;
}
+static const u8 designator_types[] = {
+ PS_DESIGNATOR_EUI64,
+ PS_DESIGNATOR_NAA,
+};
+
+static int
+nfsd4_block_get_unique_id(struct gendisk *disk, struct pnfs_block_volume *b)
+{
+ int ret, i;
+
+ for (i = 0; i < ARRAY_SIZE(designator_types); i++) {
+ u8 type = designator_types[i];
+
+ ret = disk->fops->get_unique_id(disk, b->scsi.designator, type);
+ if (ret > 0) {
+ b->scsi.code_set = PS_CODE_SET_BINARY;
+ b->scsi.designator_type = type;
+ b->scsi.designator_len = ret;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
static int
nfsd4_block_get_device_info_scsi(struct super_block *sb,
struct nfs4_client *clp,
@@ -333,7 +252,7 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb,
struct pnfs_block_deviceaddr *dev;
struct pnfs_block_volume *b;
const struct pr_ops *ops;
- int error;
+ int ret;
dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
sizeof(struct pnfs_block_volume), GFP_KERNEL);
@@ -347,33 +266,38 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb,
b->type = PNFS_BLOCK_VOLUME_SCSI;
b->scsi.pr_key = nfsd4_scsi_pr_key(clp);
- error = nfsd4_scsi_identify_device(sb->s_bdev, b);
- if (error)
- return error;
+ ret = nfsd4_block_get_unique_id(sb->s_bdev->bd_disk, b);
+ if (ret < 0)
+ goto out_free_dev;
+ ret = -EINVAL;
ops = sb->s_bdev->bd_disk->fops->pr_ops;
if (!ops) {
pr_err("pNFS: device %s does not support PRs.\n",
sb->s_id);
- return -EINVAL;
+ goto out_free_dev;
}
- error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
- if (error) {
+ ret = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
+ if (ret) {
pr_err("pNFS: failed to register key for device %s.\n",
sb->s_id);
- return -EINVAL;
+ goto out_free_dev;
}
- error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
+ ret = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
PR_EXCLUSIVE_ACCESS_REG_ONLY, 0);
- if (error) {
+ if (ret) {
pr_err("pNFS: failed to reserve device %s.\n",
sb->s_id);
- return -EINVAL;
+ goto out_free_dev;
}
return 0;
+
+out_free_dev:
+ kfree(dev);
+ return ret;
}
static __be32
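The blocklayout.c change drops the hand-rolled INQUIRY VPD page 0x83 parsing and instead asks the block driver for a designator through its ->get_unique_id() method, trying EUI-64 and then NAA, and it also frees 'dev' on the error paths. Assuming the generic blk_unique_id identifiers line up with the pNFS PS_DESIGNATOR_* values passed in the loop above, a caller-side sketch would be:

	/* Sketch under assumptions: BLK_UID_EUI64/BLK_UID_NAA are the generic
	 * designator identifiers and a positive return is the designator length. */
	static int fetch_designator(struct gendisk *disk, u8 *buf)
	{
		int ret;

		if (!disk->fops->get_unique_id)
			return -EOPNOTSUPP;

		ret = disk->fops->get_unique_id(disk, buf, BLK_UID_EUI64);
		if (ret <= 0)
			ret = disk->fops->get_unique_id(disk, buf, BLK_UID_NAA);
		return ret > 0 ? ret : -EINVAL;
	}
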
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index be3c1aad50ea..fdf89fcf1a0c 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -602,6 +602,9 @@ nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask,
struct inode *inode, struct inode *dir,
const struct qstr *name, u32 cookie)
{
+ if (WARN_ON_ONCE(!inode))
+ return 0;
+
trace_nfsd_file_fsnotify_handle_event(inode, mask);
/* Should be no marks on non-regular files */
diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c
index db7ef07ae50c..2e2f1d5e9f62 100644
--- a/fs/nfsd/flexfilelayout.c
+++ b/fs/nfsd/flexfilelayout.c
@@ -61,7 +61,7 @@ nfsd4_ff_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
goto out_error;
fl->fh.size = fhp->fh_handle.fh_size;
- memcpy(fl->fh.data, &fhp->fh_handle.fh_base, fl->fh.size);
+ memcpy(fl->fh.data, &fhp->fh_handle.fh_raw, fl->fh.size);
/* Give whole file layout segments */
seg->offset = 0;
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 606fa155c28a..46a7f9b813e5 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -35,7 +35,7 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp,
/* must initialize before using! but maxsize doesn't matter */
fh_init(&fh,0);
fh.fh_handle.fh_size = f->size;
- memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size);
+ memcpy(&fh.fh_handle.fh_raw, f->data, f->size);
fh.fh_export = NULL;
access = (mode == O_WRONLY) ? NFSD_MAY_WRITE : NFSD_MAY_READ;
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 4b43929c1f25..367551bddfc6 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -188,51 +188,51 @@ out:
* XDR decode functions
*/
-static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p)
+static bool
+nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_getaclargs *argp = rqstp->rq_argp;
if (!svcxdr_decode_fhandle(xdr, &argp->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &argp->mask) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
-static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p)
+static bool
+nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_setaclargs *argp = rqstp->rq_argp;
if (!svcxdr_decode_fhandle(xdr, &argp->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &argp->mask) < 0)
- return 0;
+ return false;
if (argp->mask & ~NFS_ACL_MASK)
- return 0;
+ return false;
if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ?
&argp->acl_access : NULL))
- return 0;
+ return false;
if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ?
&argp->acl_default : NULL))
- return 0;
+ return false;
- return 1;
+ return true;
}
-static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p)
+static bool
+nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_accessargs *args = rqstp->rq_argp;
if (!svcxdr_decode_fhandle(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->access) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
/*
@@ -240,9 +240,9 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p)
*/
/* GETACL */
-static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
+static bool
+nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_getaclres *resp = rqstp->rq_resp;
struct dentry *dentry = resp->fh.fh_dentry;
struct inode *inode;
@@ -280,9 +280,9 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
}
/* ACCESS */
-static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p)
+static bool
+nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_accessres *resp = rqstp->rq_resp;
if (!svcxdr_encode_stat(xdr, resp->status))
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 5dfe7644a517..35b2ebda14da 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -127,38 +127,38 @@ out:
* XDR decode functions
*/
-static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p)
+static bool
+nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_getaclargs *args = rqstp->rq_argp;
if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->mask) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
-static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p)
+static bool
+nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_setaclargs *argp = rqstp->rq_argp;
if (!svcxdr_decode_nfs_fh3(xdr, &argp->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &argp->mask) < 0)
- return 0;
+ return false;
if (argp->mask & ~NFS_ACL_MASK)
- return 0;
+ return false;
if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ?
&argp->acl_access : NULL))
- return 0;
+ return false;
if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ?
&argp->acl_default : NULL))
- return 0;
+ return false;
- return 1;
+ return true;
}
/*
@@ -166,9 +166,9 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p)
*/
/* GETACL */
-static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
+static bool
+nfs3svc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_getaclres *resp = rqstp->rq_resp;
struct dentry *dentry = resp->fh.fh_dentry;
struct kvec *head = rqstp->rq_res.head;
@@ -178,14 +178,14 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
int w;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
inode = d_inode(dentry);
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->mask) < 0)
- return 0;
+ return false;
base = (char *)xdr->p - (char *)head->iov_base;
@@ -194,7 +194,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
(resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
while (w > 0) {
if (!*(rqstp->rq_next_page++))
- return 0;
+ return false;
w -= PAGE_SIZE;
}
@@ -207,20 +207,20 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
resp->mask & NFS_DFACL,
NFS_ACL_DEFAULT);
if (n <= 0)
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* SETACL */
-static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, __be32 *p)
+static bool
+nfs3svc_encode_setaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
return svcxdr_encode_nfsstat3(xdr, resp->status) &&
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 17715a6c7a40..4418517f6f12 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -201,8 +201,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
fh_copy(&resp->fh, &argp->fh);
resp->committed = argp->stable;
- nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages,
- &argp->first, cnt);
+ nvecs = svc_fill_write_vector(rqstp, &argp->payload);
if (!nvecs) {
resp->status = nfserr_io;
goto out;
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 0a5ebc52e6a9..c3ac1b6aa3aa 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -92,7 +92,7 @@ svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp)
return false;
fh_init(fhp, NFS3_FHSIZE);
fhp->fh_handle.fh_size = size;
- memcpy(&fhp->fh_handle.fh_base, p, size);
+ memcpy(&fhp->fh_handle.fh_raw, p, size);
return true;
}
@@ -131,7 +131,7 @@ svcxdr_encode_nfs_fh3(struct xdr_stream *xdr, const struct svc_fh *fhp)
*p++ = cpu_to_be32(size);
if (size)
p[XDR_QUADLEN(size) - 1] = 0;
- memcpy(p, &fhp->fh_handle.fh_base, size);
+ memcpy(p, &fhp->fh_handle.fh_raw, size);
return true;
}
@@ -556,19 +556,17 @@ void fill_post_wcc(struct svc_fh *fhp)
* XDR decode functions
*/
-int
-nfs3svc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_fhandleargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_fhandle *args = rqstp->rq_argp;
return svcxdr_decode_nfs_fh3(xdr, &args->fh);
}
-int
-nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_sattrargs *args = rqstp->rq_argp;
return svcxdr_decode_nfs_fh3(xdr, &args->fh) &&
@@ -576,96 +574,83 @@ nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p)
svcxdr_decode_sattrguard3(xdr, args);
}
-int
-nfs3svc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_diropargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_diropargs *args = rqstp->rq_argp;
return svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len);
}
-int
-nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_accessargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_accessargs *args = rqstp->rq_argp;
if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->access) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
-int
-nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_readargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_readargs *args = rqstp->rq_argp;
if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u64(xdr, &args->offset) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->count) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
-int
-nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_writeargs *args = rqstp->rq_argp;
u32 max_blocksize = svc_max_payload(rqstp);
- struct kvec *head = rqstp->rq_arg.head;
- struct kvec *tail = rqstp->rq_arg.tail;
- size_t remaining;
if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u64(xdr, &args->offset) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->count) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->stable) < 0)
- return 0;
+ return false;
/* opaque data */
if (xdr_stream_decode_u32(xdr, &args->len) < 0)
- return 0;
+ return false;
/* request sanity */
if (args->count != args->len)
- return 0;
- remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len;
- remaining -= xdr_stream_pos(xdr);
- if (remaining < xdr_align_size(args->len))
- return 0;
+ return false;
if (args->count > max_blocksize) {
args->count = max_blocksize;
args->len = max_blocksize;
}
+ if (!xdr_stream_subsegment(xdr, &args->payload, args->count))
+ return false;
- args->first.iov_base = xdr->p;
- args->first.iov_len = head->iov_len - xdr_stream_pos(xdr);
-
- return 1;
+ return true;
}
-int
-nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_createargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_createargs *args = rqstp->rq_argp;
if (!svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->createmode) < 0)
- return 0;
+ return false;
switch (args->createmode) {
case NFS3_CREATE_UNCHECKED:
case NFS3_CREATE_GUARDED:
@@ -673,18 +658,17 @@ nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p)
case NFS3_CREATE_EXCLUSIVE:
args->verf = xdr_inline_decode(xdr, NFS3_CREATEVERFSIZE);
if (!args->verf)
- return 0;
+ return false;
break;
default:
- return 0;
+ return false;
}
- return 1;
+ return true;
}
-int
-nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_createargs *args = rqstp->rq_argp;
return svcxdr_decode_diropargs3(xdr, &args->fh,
@@ -692,44 +676,42 @@ nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p)
svcxdr_decode_sattr3(rqstp, xdr, &args->attrs);
}
-int
-nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_symlinkargs *args = rqstp->rq_argp;
struct kvec *head = rqstp->rq_arg.head;
struct kvec *tail = rqstp->rq_arg.tail;
size_t remaining;
if (!svcxdr_decode_diropargs3(xdr, &args->ffh, &args->fname, &args->flen))
- return 0;
+ return false;
if (!svcxdr_decode_sattr3(rqstp, xdr, &args->attrs))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->tlen) < 0)
- return 0;
+ return false;
/* request sanity */
remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len;
remaining -= xdr_stream_pos(xdr);
if (remaining < xdr_align_size(args->tlen))
- return 0;
+ return false;
args->first.iov_base = xdr->p;
args->first.iov_len = head->iov_len - xdr_stream_pos(xdr);
- return 1;
+ return true;
}
-int
-nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_mknodargs *args = rqstp->rq_argp;
if (!svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->ftype) < 0)
- return 0;
+ return false;
switch (args->ftype) {
case NF3CHR:
case NF3BLK:
@@ -743,16 +725,15 @@ nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p)
/* Valid XDR but illegal file types */
break;
default:
- return 0;
+ return false;
}
- return 1;
+ return true;
}
-int
-nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_renameargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_renameargs *args = rqstp->rq_argp;
return svcxdr_decode_diropargs3(xdr, &args->ffh,
@@ -761,10 +742,9 @@ nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p)
&args->tname, &args->tlen);
}
-int
-nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_linkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_linkargs *args = rqstp->rq_argp;
return svcxdr_decode_nfs_fh3(xdr, &args->ffh) &&
@@ -772,62 +752,59 @@ nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p)
&args->tname, &args->tlen);
}
-int
-nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_readdirargs *args = rqstp->rq_argp;
if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u64(xdr, &args->cookie) < 0)
- return 0;
+ return false;
args->verf = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
if (!args->verf)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->count) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
-int
-nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_readdirargs *args = rqstp->rq_argp;
u32 dircount;
if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u64(xdr, &args->cookie) < 0)
- return 0;
+ return false;
args->verf = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
if (!args->verf)
- return 0;
+ return false;
/* dircount is ignored */
if (xdr_stream_decode_u32(xdr, &dircount) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->count) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
-int
-nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_decode_commitargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_commitargs *args = rqstp->rq_argp;
if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u64(xdr, &args->offset) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->count) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
/*
@@ -835,30 +812,28 @@ nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p)
*/
/* GETATTR */
-int
-nfs3svc_encode_getattrres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_getattrres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
lease_get_mtime(d_inode(resp->fh.fh_dentry), &resp->stat.mtime);
if (!svcxdr_encode_fattr3(rqstp, xdr, &resp->fh, &resp->stat))
- return 0;
+ return false;
break;
}
- return 1;
+ return true;
}
/* SETATTR, REMOVE, RMDIR */
-int
-nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_wccstat(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
return svcxdr_encode_nfsstat3(xdr, resp->status) &&
@@ -866,174 +841,168 @@ nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p)
}
/* LOOKUP */
-int nfs3svc_encode_lookupres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_lookupres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_diropres *resp = rqstp->rq_resp;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_nfs_fh3(xdr, &resp->fh))
- return 0;
+ return false;
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->dirfh))
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->dirfh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* ACCESS */
-int
-nfs3svc_encode_accessres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_accessres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_accessres *resp = rqstp->rq_resp;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->access) < 0)
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* READLINK */
-int
-nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_readlinkres *resp = rqstp->rq_resp;
struct kvec *head = rqstp->rq_res.head;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->len) < 0)
- return 0;
+ return false;
xdr_write_pages(xdr, resp->pages, 0, resp->len);
if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0)
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* READ */
-int
-nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_readres *resp = rqstp->rq_resp;
struct kvec *head = rqstp->rq_res.head;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->count) < 0)
- return 0;
+ return false;
if (xdr_stream_encode_bool(xdr, resp->eof) < 0)
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->count) < 0)
- return 0;
+ return false;
xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base,
resp->count);
if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0)
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* WRITE */
-int
-nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_writeres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_writeres *resp = rqstp->rq_resp;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->count) < 0)
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->committed) < 0)
- return 0;
+ return false;
if (!svcxdr_encode_writeverf3(xdr, resp->verf))
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* CREATE, MKDIR, SYMLINK, MKNOD */
-int
-nfs3svc_encode_createres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_createres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_diropres *resp = rqstp->rq_resp;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_post_op_fh3(xdr, &resp->fh))
- return 0;
+ return false;
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->dirfh))
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->dirfh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* RENAME */
-int
-nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_renameres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_renameres *resp = rqstp->rq_resp;
return svcxdr_encode_nfsstat3(xdr, resp->status) &&
@@ -1042,10 +1011,9 @@ nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p)
}
/* LINK */
-int
-nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_linkres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_linkres *resp = rqstp->rq_resp;
return svcxdr_encode_nfsstat3(xdr, resp->status) &&
@@ -1054,34 +1022,33 @@ nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p)
}
/* READDIR */
-int
-nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_readdirres *resp = rqstp->rq_resp;
struct xdr_buf *dirlist = &resp->dirlist;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
if (!svcxdr_encode_cookieverf3(xdr, resp->verf))
- return 0;
+ return false;
xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len);
/* no more entries */
if (xdr_stream_encode_item_absent(xdr) < 0)
- return 0;
+ return false;
if (xdr_stream_encode_bool(xdr, resp->common.err == nfserr_eof) < 0)
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
static __be32
@@ -1308,27 +1275,26 @@ svcxdr_encode_fsstat3resok(struct xdr_stream *xdr,
}
/* FSSTAT */
-int
-nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_fsstatres *resp = rqstp->rq_resp;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
- return 0;
+ return false;
if (!svcxdr_encode_fsstat3resok(xdr, resp))
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
static bool
@@ -1355,27 +1321,26 @@ svcxdr_encode_fsinfo3resok(struct xdr_stream *xdr,
}
/* FSINFO */
-int
-nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_fsinfores *resp = rqstp->rq_resp;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
- return 0;
+ return false;
if (!svcxdr_encode_fsinfo3resok(xdr, resp))
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
static bool
@@ -1398,51 +1363,49 @@ svcxdr_encode_pathconf3resok(struct xdr_stream *xdr,
}
/* PATHCONF */
-int
-nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_pathconfres *resp = rqstp->rq_resp;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
- return 0;
+ return false;
if (!svcxdr_encode_pathconf3resok(xdr, resp))
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* COMMIT */
-int
-nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs3svc_encode_commitres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_commitres *resp = rqstp->rq_resp;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
if (!svcxdr_encode_writeverf3(xdr, resp->verf))
- return 0;
+ return false;
break;
default:
if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh))
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/*
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 0f8b10f363e7..11f8715d92d6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -121,7 +121,7 @@ static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh)
BUG_ON(length > NFS4_FHSIZE);
p = xdr_reserve_space(xdr, 4 + length);
- xdr_encode_opaque(p, &fh->fh_base, length);
+ xdr_encode_opaque(p, &fh->fh_raw, length);
}
/*
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index a97873f2d22b..6d1b5bb051c5 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -145,8 +145,9 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
#ifdef CONFIG_NFSD_SCSILAYOUT
if (sb->s_export_op->map_blocks &&
sb->s_export_op->commit_blocks &&
- sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops &&
- blk_queue_scsi_passthrough(sb->s_bdev->bd_disk->queue))
+ sb->s_bdev &&
+ sb->s_bdev->bd_disk->fops->pr_ops &&
+ sb->s_bdev->bd_disk->fops->get_unique_id)
exp->ex_layout_types |= 1 << LAYOUT_SCSI;
#endif
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 486c5dba4b65..a36261f89bdf 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -519,7 +519,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fh_put(&cstate->current_fh);
cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen;
- memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval,
+ memcpy(&cstate->current_fh.fh_handle.fh_raw, putfh->pf_fhval,
putfh->pf_fhlen);
ret = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS);
#ifdef CONFIG_NFSD_V4_2_INTER_SSC
@@ -1033,8 +1033,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
write->wr_how_written = write->wr_stable_how;
- nvecs = svc_fill_write_vector(rqstp, write->wr_payload.pages,
- write->wr_payload.head, write->wr_buflen);
+ nvecs = svc_fill_write_vector(rqstp, &write->wr_payload);
WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf,
@@ -1178,7 +1177,7 @@ extern void nfs_sb_deactive(struct super_block *sb);
static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr,
struct nfsd4_ssc_umount_item **retwork, struct vfsmount **ss_mnt)
{
- struct nfsd4_ssc_umount_item *ni = 0;
+ struct nfsd4_ssc_umount_item *ni = NULL;
struct nfsd4_ssc_umount_item *work = NULL;
struct nfsd4_ssc_umount_item *tmp;
DEFINE_WAIT(wait);
@@ -1383,7 +1382,7 @@ nfsd4_setup_inter_ssc(struct svc_rqst *rqstp,
s_fh = &cstate->save_fh;
copy->c_fh.size = s_fh->fh_handle.fh_size;
- memcpy(copy->c_fh.data, &s_fh->fh_handle.fh_base, copy->c_fh.size);
+ memcpy(copy->c_fh.data, &s_fh->fh_handle.fh_raw, copy->c_fh.size);
copy->stateid.seqid = cpu_to_be32(s_stid->si_generation);
memcpy(copy->stateid.other, (void *)&s_stid->si_opaque,
sizeof(stateid_opaque_t));
@@ -2462,11 +2461,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
__be32 status;
resp->xdr = &rqstp->rq_res_stream;
+ resp->statusp = resp->xdr->p;
/* reserve space for: NFS status code */
xdr_reserve_space(resp->xdr, XDR_UNIT);
- resp->tagp = resp->xdr->p;
/* reserve space for: taglen, tag, and opcnt */
xdr_reserve_space(resp->xdr, XDR_UNIT * 2 + args->taglen);
resp->taglen = args->taglen;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3f4027a5de88..bfad94c70b84 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1010,7 +1010,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
}
spin_unlock(&blocked_delegations_lock);
}
- hash = jhash(&fh->fh_base, fh->fh_size, 0);
+ hash = jhash(&fh->fh_raw, fh->fh_size, 0);
if (test_bit(hash&255, bd->set[0]) &&
test_bit((hash>>8)&255, bd->set[0]) &&
test_bit((hash>>16)&255, bd->set[0]))
@@ -1029,7 +1029,7 @@ static void block_delegations(struct knfsd_fh *fh)
u32 hash;
struct bloom_pair *bd = &blocked_delegations;
- hash = jhash(&fh->fh_base, fh->fh_size, 0);
+ hash = jhash(&fh->fh_raw, fh->fh_size, 0);
spin_lock(&blocked_delegations_lock);
__set_bit(hash&255, bd->set[bd->new]);
@@ -5541,7 +5541,7 @@ static void nfsd4_ssc_shutdown_umount(struct nfsd_net *nn)
static void nfsd4_ssc_expire_umount(struct nfsd_net *nn)
{
bool do_wakeup = false;
- struct nfsd4_ssc_umount_item *ni = 0;
+ struct nfsd4_ssc_umount_item *ni = NULL;
struct nfsd4_ssc_umount_item *tmp;
spin_lock(&nn->nfsd_ssc_lock);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index cf030ebe2827..b2a1d969a172 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2322,7 +2322,7 @@ nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op)
return true;
}
-static int
+static bool
nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
{
struct nfsd4_op *op;
@@ -2335,25 +2335,25 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
int i;
if (xdr_stream_decode_u32(argp->xdr, &argp->taglen) < 0)
- return 0;
+ return false;
max_reply += XDR_UNIT;
argp->tag = NULL;
if (unlikely(argp->taglen)) {
if (argp->taglen > NFSD4_MAX_TAGLEN)
- return 0;
+ return false;
p = xdr_inline_decode(argp->xdr, argp->taglen);
if (!p)
- return 0;
+ return false;
argp->tag = svcxdr_savemem(argp, p, argp->taglen);
if (!argp->tag)
- return 0;
+ return false;
max_reply += xdr_align_size(argp->taglen);
}
if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(argp->xdr, &argp->opcnt) < 0)
- return 0;
+ return false;
/*
* NFS4ERR_RESOURCE is a more helpful error than GARBAGE_ARGS
@@ -2361,14 +2361,14 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
* nfsd4_proc can handle this is an NFS-level error.
*/
if (argp->opcnt > NFSD_MAX_OPS_PER_COMPOUND)
- return 1;
+ return true;
if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL);
if (!argp->ops) {
argp->ops = argp->iops;
dprintk("nfsd: couldn't allocate room for COMPOUND\n");
- return 0;
+ return false;
}
}
@@ -2380,7 +2380,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
op->replay = NULL;
if (xdr_stream_decode_u32(argp->xdr, &op->opnum) < 0)
- return 0;
+ return false;
if (nfsd4_opnum_in_range(argp, op)) {
op->status = nfsd4_dec_ops[op->opnum](argp, &op->u);
if (op->status != nfs_ok)
@@ -2427,7 +2427,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack)
clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags);
- return 1;
+ return true;
}
static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
@@ -3110,7 +3110,7 @@ out_acl:
p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4);
if (!p)
goto out_resource;
- p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base,
+ p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw,
fhp->fh_handle.fh_size);
}
if (bmval0 & FATTR4_WORD0_FILEID) {
@@ -3670,7 +3670,7 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
p = xdr_reserve_space(xdr, len + 4);
if (!p)
return nfserr_resource;
- p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, len);
+ p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw, len);
return 0;
}
@@ -5414,40 +5414,46 @@ void nfsd4_release_compoundargs(struct svc_rqst *rqstp)
}
}
-int
-nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
struct nfsd4_compoundargs *args = rqstp->rq_argp;
/* svcxdr_tmp_alloc */
args->to_free = NULL;
- args->xdr = &rqstp->rq_arg_stream;
+ args->xdr = xdr;
args->ops = args->iops;
args->rqstp = rqstp;
return nfsd4_decode_compound(args);
}
-int
-nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
struct nfsd4_compoundres *resp = rqstp->rq_resp;
- struct xdr_buf *buf = resp->xdr->buf;
+ struct xdr_buf *buf = xdr->buf;
+ __be32 *p;
WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len +
buf->tail[0].iov_len);
- *p = resp->cstate.status;
+ /*
+ * Send buffer space for the following items is reserved
+ * at the top of nfsd4_proc_compound().
+ */
+ p = resp->statusp;
+
+ *p++ = resp->cstate.status;
- rqstp->rq_next_page = resp->xdr->page_ptr + 1;
+ rqstp->rq_next_page = xdr->page_ptr + 1;
- p = resp->tagp;
*p++ = htonl(resp->taglen);
memcpy(p, resp->tag, resp->taglen);
p += XDR_QUADLEN(resp->taglen);
*p++ = htonl(resp->opcnt);
nfsd4_sequence_done(resp);
- return 1;
+ return true;
}
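The hunk above pairs with the nfs4proc.c change earlier in this series: space for the COMPOUND status and for the tag length, tag and op count is reserved before any op runs, and the reply encoder backfills those words at the end. A condensed sketch of that reserve-then-backfill flow, assembled only from lines visible in these hunks (no new helpers introduced):
	/* at the top of nfsd4_proc_compound(): remember where the status goes */
	resp->xdr = &rqstp->rq_res_stream;
	resp->statusp = resp->xdr->p;
	xdr_reserve_space(resp->xdr, XDR_UNIT);                     /* NFS status */
	xdr_reserve_space(resp->xdr, XDR_UNIT * 2 + args->taglen);  /* taglen, tag, opcnt */
	/* ... the ops are decoded and executed ... */
	/* in nfs4svc_encode_compoundres(): backfill the reserved words */
	p = resp->statusp;
	*p++ = resp->cstate.status;
	*p++ = htonl(resp->taglen);
	memcpy(p, resp->tag, resp->taglen);
	p += XDR_QUADLEN(resp->taglen);
	*p++ = htonl(resp->opcnt);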
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 96cdf77925f3..6e0b6f3148dc 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -241,8 +241,8 @@ lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
list_move_tail(&rp->c_lru, &b->lru_head);
}
-static long
-prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn)
+static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn,
+ unsigned int max)
{
struct svc_cacherep *rp, *tmp;
long freed = 0;
@@ -258,11 +258,17 @@ prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn)
time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
break;
nfsd_reply_cache_free_locked(b, rp, nn);
- freed++;
+ if (max && freed++ > max)
+ break;
}
return freed;
}
+static long nfsd_prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn)
+{
+ return prune_bucket(b, nn, 3);
+}
+
/*
* Walk the LRU list and prune off entries that are older than RC_EXPIRE.
* Also prune the oldest ones when the total exceeds the max number of entries.
@@ -279,7 +285,7 @@ prune_cache_entries(struct nfsd_net *nn)
if (list_empty(&b->lru_head))
continue;
spin_lock(&b->cache_lock);
- freed += prune_bucket(b, nn);
+ freed += prune_bucket(b, nn, 0);
spin_unlock(&b->cache_lock);
}
return freed;
@@ -453,8 +459,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
atomic_inc(&nn->num_drc_entries);
nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp));
- /* go ahead and prune the cache */
- prune_bucket(b, nn);
+ nfsd_prune_bucket(b, nn);
out_unlock:
spin_unlock(&b->cache_lock);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 070e5dd03e26..af8531c3854a 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -395,12 +395,12 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
auth_domain_put(dom);
if (len)
return len;
-
+
mesg = buf;
len = SIMPLE_TRANSACTION_LIMIT;
- qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size);
+ qword_addhex(&mesg, &len, fh.fh_raw, fh.fh_size);
mesg[-1] = '\n';
- return mesg - buf;
+ return mesg - buf;
}
/*
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 9664303afdaf..498e5a489826 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -78,8 +78,10 @@ extern const struct seq_operations nfs_exports_op;
*/
struct nfsd_voidargs { };
struct nfsd_voidres { };
-int nfssvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p);
-int nfssvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p);
+bool nfssvc_decode_voidarg(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr);
+bool nfssvc_encode_voidres(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr);
/*
* Function prototypes.
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index c475d2271f9c..f3779fa72c89 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -154,11 +154,12 @@ static inline __be32 check_pseudo_root(struct svc_rqst *rqstp,
static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
{
struct knfsd_fh *fh = &fhp->fh_handle;
- struct fid *fid = NULL, sfid;
+ struct fid *fid = NULL;
struct svc_export *exp;
struct dentry *dentry;
int fileid_type;
int data_left = fh->fh_size/4;
+ int len;
__be32 error;
error = nfserr_stale;
@@ -167,48 +168,35 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
if (rqstp->rq_vers == 4 && fh->fh_size == 0)
return nfserr_nofilehandle;
- if (fh->fh_version == 1) {
- int len;
-
- if (--data_left < 0)
- return error;
- if (fh->fh_auth_type != 0)
- return error;
- len = key_len(fh->fh_fsid_type) / 4;
- if (len == 0)
- return error;
- if (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
- /* deprecated, convert to type 3 */
- len = key_len(FSID_ENCODE_DEV)/4;
- fh->fh_fsid_type = FSID_ENCODE_DEV;
- /*
- * struct knfsd_fh uses host-endian fields, which are
- * sometimes used to hold net-endian values. This
- * confuses sparse, so we must use __force here to
- * keep it from complaining.
- */
- fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]),
- ntohl((__force __be32)fh->fh_fsid[1])));
- fh->fh_fsid[1] = fh->fh_fsid[2];
- }
- data_left -= len;
- if (data_left < 0)
- return error;
- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid);
- fid = (struct fid *)(fh->fh_fsid + len);
- } else {
- __u32 tfh[2];
- dev_t xdev;
- ino_t xino;
-
- if (fh->fh_size != NFS_FHSIZE)
- return error;
- /* assume old filehandle format */
- xdev = old_decode_dev(fh->ofh_xdev);
- xino = u32_to_ino_t(fh->ofh_xino);
- mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL);
- exp = rqst_exp_find(rqstp, FSID_DEV, tfh);
+ if (fh->fh_version != 1)
+ return error;
+
+ if (--data_left < 0)
+ return error;
+ if (fh->fh_auth_type != 0)
+ return error;
+ len = key_len(fh->fh_fsid_type) / 4;
+ if (len == 0)
+ return error;
+ if (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
+ /* deprecated, convert to type 3 */
+ len = key_len(FSID_ENCODE_DEV)/4;
+ fh->fh_fsid_type = FSID_ENCODE_DEV;
+ /*
+ * struct knfsd_fh uses host-endian fields, which are
+ * sometimes used to hold net-endian values. This
+ * confuses sparse, so we must use __force here to
+ * keep it from complaining.
+ */
+ fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]),
+ ntohl((__force __be32)fh->fh_fsid[1])));
+ fh->fh_fsid[1] = fh->fh_fsid[2];
}
+ data_left -= len;
+ if (data_left < 0)
+ return error;
+ exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid);
+ fid = (struct fid *)(fh->fh_fsid + len);
error = nfserr_stale;
if (IS_ERR(exp)) {
@@ -253,18 +241,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
if (rqstp->rq_vers > 2)
error = nfserr_badhandle;
- if (fh->fh_version != 1) {
- sfid.i32.ino = fh->ofh_ino;
- sfid.i32.gen = fh->ofh_generation;
- sfid.i32.parent_ino = fh->ofh_dirino;
- fid = &sfid;
- data_left = 3;
- if (fh->ofh_dirino == 0)
- fileid_type = FILEID_INO32_GEN;
- else
- fileid_type = FILEID_INO32_GEN_PARENT;
- } else
- fileid_type = fh->fh_fileid_type;
+ fileid_type = fh->fh_fileid_type;
if (fileid_type == FILEID_ROOT)
dentry = dget(exp->ex_path.dentry);
@@ -452,20 +429,6 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp,
}
}
-/*
- * for composing old style file handles
- */
-static inline void _fh_update_old(struct dentry *dentry,
- struct svc_export *exp,
- struct knfsd_fh *fh)
-{
- fh->ofh_ino = ino_t_to_u32(d_inode(dentry)->i_ino);
- fh->ofh_generation = d_inode(dentry)->i_generation;
- if (d_is_dir(dentry) ||
- (exp->ex_flags & NFSEXP_NOSUBTREECHECK))
- fh->ofh_dirino = 0;
-}
-
static bool is_root_export(struct svc_export *exp)
{
return exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root;
@@ -562,9 +525,6 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
/* ref_fh is a reference file handle.
* if it is non-null and for the same filesystem, then we should compose
* a filehandle which is of the same version, where possible.
- * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca
- * Then create a 32byte filehandle using nfs_fhbase_old
- *
*/
struct inode * inode = d_inode(dentry);
@@ -600,35 +560,21 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
fhp->fh_dentry = dget(dentry); /* our internal copy */
fhp->fh_export = exp_get(exp);
- if (fhp->fh_handle.fh_version == 0xca) {
- /* old style filehandle please */
- memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE);
- fhp->fh_handle.fh_size = NFS_FHSIZE;
- fhp->fh_handle.ofh_dcookie = 0xfeebbaca;
- fhp->fh_handle.ofh_dev = old_encode_dev(ex_dev);
- fhp->fh_handle.ofh_xdev = fhp->fh_handle.ofh_dev;
- fhp->fh_handle.ofh_xino =
- ino_t_to_u32(d_inode(exp->ex_path.dentry)->i_ino);
- fhp->fh_handle.ofh_dirino = ino_t_to_u32(parent_ino(dentry));
- if (inode)
- _fh_update_old(dentry, exp, &fhp->fh_handle);
- } else {
- fhp->fh_handle.fh_size =
- key_len(fhp->fh_handle.fh_fsid_type) + 4;
- fhp->fh_handle.fh_auth_type = 0;
-
- mk_fsid(fhp->fh_handle.fh_fsid_type,
- fhp->fh_handle.fh_fsid,
- ex_dev,
- d_inode(exp->ex_path.dentry)->i_ino,
- exp->ex_fsid, exp->ex_uuid);
-
- if (inode)
- _fh_update(fhp, exp, dentry);
- if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
- fh_put(fhp);
- return nfserr_opnotsupp;
- }
+ fhp->fh_handle.fh_size =
+ key_len(fhp->fh_handle.fh_fsid_type) + 4;
+ fhp->fh_handle.fh_auth_type = 0;
+
+ mk_fsid(fhp->fh_handle.fh_fsid_type,
+ fhp->fh_handle.fh_fsid,
+ ex_dev,
+ d_inode(exp->ex_path.dentry)->i_ino,
+ exp->ex_fsid, exp->ex_uuid);
+
+ if (inode)
+ _fh_update(fhp, exp, dentry);
+ if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
+ fh_put(fhp);
+ return nfserr_opnotsupp;
}
return 0;
@@ -649,16 +595,12 @@ fh_update(struct svc_fh *fhp)
dentry = fhp->fh_dentry;
if (d_really_is_negative(dentry))
goto out_negative;
- if (fhp->fh_handle.fh_version != 1) {
- _fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle);
- } else {
- if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT)
- return 0;
+ if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT)
+ return 0;
- _fh_update(fhp, fhp->fh_export, dentry);
- if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
- return nfserr_opnotsupp;
- }
+ _fh_update(fhp, fhp->fh_export, dentry);
+ if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
+ return nfserr_opnotsupp;
return 0;
out_bad:
printk(KERN_ERR "fh_update: fh not verified!\n");
@@ -698,16 +640,11 @@ fh_put(struct svc_fh *fhp)
char * SVCFH_fmt(struct svc_fh *fhp)
{
struct knfsd_fh *fh = &fhp->fh_handle;
+ static char buf[2+1+1+64*3+1];
- static char buf[80];
- sprintf(buf, "%d: %08x %08x %08x %08x %08x %08x",
- fh->fh_size,
- fh->fh_base.fh_pad[0],
- fh->fh_base.fh_pad[1],
- fh->fh_base.fh_pad[2],
- fh->fh_base.fh_pad[3],
- fh->fh_base.fh_pad[4],
- fh->fh_base.fh_pad[5]);
+ if (fh->fh_size < 0 || fh->fh_size > 64)
+ return "bad-fh";
+ sprintf(buf, "%d: %*ph", fh->fh_size, fh->fh_size, fh->fh_raw);
return buf;
}
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 6106697adc04..d11e4b6870d6 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -10,9 +10,56 @@
#include <linux/crc32.h>
#include <linux/sunrpc/svc.h>
-#include <uapi/linux/nfsd/nfsfh.h>
#include <linux/iversion.h>
#include <linux/exportfs.h>
+#include <linux/nfs4.h>
+
+/*
+ * The file handle starts with a sequence of four-byte words.
+ * The first word contains a version number (1) and three descriptor bytes
+ * that tell how the remaining 3 variable length fields should be handled.
+ * These three bytes are auth_type, fsid_type and fileid_type.
+ *
+ * All four-byte values are in host-byte-order.
+ *
+ * The auth_type field is deprecated and must be set to 0.
+ *
+ * The fsid_type identifies how the filesystem (or export point) is
+ * encoded.
+ * Current values:
+ * 0 - 4 byte device id (ms-2-bytes major, ls-2-bytes minor), 4 byte inode number
+ * NOTE: we cannot use the kdev_t device id value, because kdev_t.h
+ * says we mustn't. We must break it up and reassemble.
+ * 1 - 4 byte user specified identifier
+ * 2 - 4 byte major, 4 byte minor, 4 byte inode number - DEPRECATED
+ * 3 - 4 byte device id, encoded for user-space, 4 byte inode number
+ * 4 - 4 byte inode number and 4 byte uuid
+ * 5 - 8 byte uuid
+ * 6 - 16 byte uuid
+ * 7 - 8 byte inode number and 16 byte uuid
+ *
+ * The fileid_type identifies how the file within the filesystem is encoded.
+ * The values for this field are filesystem specific, except that
+ * filesystems must not use the values '0' or '0xff'. See enum fid_type
+ * in include/linux/exportfs.h for currently registered values.
+ */
+
+struct knfsd_fh {
+ unsigned int fh_size; /*
+ * Holds the current size while
+ * building a new file handle.
+ */
+ union {
+ char fh_raw[NFS4_FHSIZE];
+ struct {
+ u8 fh_version; /* == 1 */
+ u8 fh_auth_type; /* deprecated */
+ u8 fh_fsid_type;
+ u8 fh_fileid_type;
+ u32 fh_fsid[]; /* flexible-array member */
+ };
+ };
+};
static inline __u32 ino_t_to_u32(ino_t ino)
{
@@ -188,7 +235,7 @@ static inline void
fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src)
{
dst->fh_size = src->fh_size;
- memcpy(&dst->fh_base, &src->fh_base, src->fh_size);
+ memcpy(&dst->fh_raw, &src->fh_raw, src->fh_size);
}
static __inline__ struct svc_fh *
@@ -203,7 +250,7 @@ static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
{
if (fh1->fh_size != fh2->fh_size)
return false;
- if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0)
+ if (memcmp(fh1->fh_raw, fh2->fh_raw, fh1->fh_size) != 0)
return false;
return true;
}
@@ -227,7 +274,7 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
*/
static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
{
- return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size);
+ return ~crc32_le(0xFFFFFFFF, fh->fh_raw, fh->fh_size);
}
#else
static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
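For orientation, the new struct knfsd_fh above overlays a raw byte view (fh_raw) with the version-1 header: one word carrying fh_version, fh_auth_type, fh_fsid_type and fh_fileid_type, then the variable-length fsid and the filesystem's fileid. A rough sketch of the minimum size implied by that layout, assuming only key_len() (already used in the nfsfh.c hunks above); the helper name is illustrative and not part of the patch:
	static inline unsigned int example_fh_min_size(const struct knfsd_fh *fh)
	{
		/*
		 * One word for version/auth_type/fsid_type/fileid_type plus
		 * the fsid body; fh_compose() sets fh_size to exactly this and
		 * _fh_update() then appends the filesystem's fileid bytes.
		 */
		return 4 + key_len(fh->fh_fsid_type);
	}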
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 90fcd6178823..eea5b59b6a6c 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -234,8 +234,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
SVCFH_fmt(&argp->fh),
argp->len, argp->offset);
- nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages,
- &argp->first, cnt);
+ nvecs = svc_fill_write_vector(rqstp, &argp->payload);
if (!nvecs) {
resp->status = nfserr_io;
goto out;
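Related to the hunk above: the NFSv2/v3 WRITE decoders (see the nfsxdr.c hunk and the xdr.h/xdr3.h struct changes below) no longer do kvec arithmetic on rq_arg; they capture the opaque data as an xdr_buf with xdr_stream_subsegment(), and the write handlers pass that buffer straight to svc_fill_write_vector(). A condensed sketch built only from lines in this series:
	/* decode side (nfssvc_decode_writeargs): carve the payload out of the stream */
	if (!xdr_stream_subsegment(xdr, &args->payload, args->len))
		return false;
	/* write handler (nfsd_proc_write): build the iovec array from that xdr_buf */
	nvecs = svc_fill_write_vector(rqstp, &argp->payload);
	if (!nvecs) {
		resp->status = nfserr_io;
		goto out;
	}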
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index ccb59e91011b..80431921e5d7 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -1004,9 +1004,6 @@ out:
int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
{
const struct svc_procedure *proc = rqstp->rq_procinfo;
- struct kvec *argv = &rqstp->rq_arg.head[0];
- struct kvec *resv = &rqstp->rq_res.head[0];
- __be32 *p;
/*
* Give the xdr decoder a chance to change this if it wants
@@ -1015,7 +1012,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
rqstp->rq_cachetype = proc->pc_cachetype;
svcxdr_init_decode(rqstp);
- if (!proc->pc_decode(rqstp, argv->iov_base))
+ if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
goto out_decode_err;
switch (nfsd_cache_lookup(rqstp)) {
@@ -1031,14 +1028,13 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
* Need to grab the location to store the status, as
* NFSv4 does some encoding while processing
*/
- p = resv->iov_base + resv->iov_len;
svcxdr_init_encode(rqstp);
*statp = proc->pc_func(rqstp);
if (*statp == rpc_drop_reply || test_bit(RQ_DROPME, &rqstp->rq_flags))
goto out_update_drop;
- if (!proc->pc_encode(rqstp, p))
+ if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream))
goto out_encode_err;
nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1);
@@ -1065,29 +1061,29 @@ out_encode_err:
/**
* nfssvc_decode_voidarg - Decode void arguments
* @rqstp: Server RPC transaction context
- * @p: buffer containing arguments to decode
+ * @xdr: XDR stream positioned at arguments to decode
*
* Return values:
- * %0: Arguments were not valid
- * %1: Decoding was successful
+ * %false: Arguments were not valid
+ * %true: Decoding was successful
*/
-int nfssvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
+bool nfssvc_decode_voidarg(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- return 1;
+ return true;
}
/**
* nfssvc_encode_voidres - Encode void results
* @rqstp: Server RPC transaction context
- * @p: buffer in which to encode results
+ * @xdr: XDR stream into which to encode results
*
* Return values:
- * %0: Local error while encoding
- * %1: Encoding was successful
+ * %false: Local error while encoding
+ * %true: Encoding was successful
*/
-int nfssvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p)
+bool nfssvc_encode_voidres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- return 1;
+ return true;
}
int nfsd_pool_stats_open(struct inode *inode, struct file *file)
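Taken together, the nfsd_dispatch() hunks above switch the per-procedure XDR hooks from raw __be32 pointers to the request's xdr_streams, with success reported as bool. A condensed sketch of the resulting dispatch flow (lines taken from the hunks above, error handling elided):
	svcxdr_init_decode(rqstp);
	if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
		goto out_decode_err;
	svcxdr_init_encode(rqstp);
	*statp = proc->pc_func(rqstp);
	if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream))
		goto out_encode_err;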
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index a06c05fe3b42..aba8520b4b8b 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -64,7 +64,7 @@ svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp)
if (!p)
return false;
fh_init(fhp, NFS_FHSIZE);
- memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE);
+ memcpy(&fhp->fh_handle.fh_raw, p, NFS_FHSIZE);
fhp->fh_handle.fh_size = NFS_FHSIZE;
return true;
@@ -78,7 +78,7 @@ svcxdr_encode_fhandle(struct xdr_stream *xdr, const struct svc_fh *fhp)
p = xdr_reserve_space(xdr, NFS_FHSIZE);
if (!p)
return false;
- memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE);
+ memcpy(p, &fhp->fh_handle.fh_raw, NFS_FHSIZE);
return true;
}
@@ -272,94 +272,81 @@ svcxdr_encode_fattr(struct svc_rqst *rqstp, struct xdr_stream *xdr,
* XDR decode functions
*/
-int
-nfssvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_fhandleargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_fhandle *args = rqstp->rq_argp;
return svcxdr_decode_fhandle(xdr, &args->fh);
}
-int
-nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_sattrargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_sattrargs *args = rqstp->rq_argp;
return svcxdr_decode_fhandle(xdr, &args->fh) &&
svcxdr_decode_sattr(rqstp, xdr, &args->attrs);
}
-int
-nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_diropargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_diropargs *args = rqstp->rq_argp;
return svcxdr_decode_diropargs(xdr, &args->fh, &args->name, &args->len);
}
-int
-nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_readargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_readargs *args = rqstp->rq_argp;
u32 totalcount;
if (!svcxdr_decode_fhandle(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->offset) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->count) < 0)
- return 0;
+ return false;
/* totalcount is ignored */
if (xdr_stream_decode_u32(xdr, &totalcount) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
-int
-nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_writeargs *args = rqstp->rq_argp;
- struct kvec *head = rqstp->rq_arg.head;
- struct kvec *tail = rqstp->rq_arg.tail;
u32 beginoffset, totalcount;
- size_t remaining;
if (!svcxdr_decode_fhandle(xdr, &args->fh))
- return 0;
+ return false;
/* beginoffset is ignored */
if (xdr_stream_decode_u32(xdr, &beginoffset) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->offset) < 0)
- return 0;
+ return false;
/* totalcount is ignored */
if (xdr_stream_decode_u32(xdr, &totalcount) < 0)
- return 0;
+ return false;
/* opaque data */
if (xdr_stream_decode_u32(xdr, &args->len) < 0)
- return 0;
+ return false;
if (args->len > NFSSVC_MAXBLKSIZE_V2)
- return 0;
- remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len;
- remaining -= xdr_stream_pos(xdr);
- if (remaining < xdr_align_size(args->len))
- return 0;
- args->first.iov_base = xdr->p;
- args->first.iov_len = head->iov_len - xdr_stream_pos(xdr);
+ return false;
+ if (!xdr_stream_subsegment(xdr, &args->payload, args->len))
+ return false;
- return 1;
+ return true;
}
-int
-nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_createargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_createargs *args = rqstp->rq_argp;
return svcxdr_decode_diropargs(xdr, &args->fh,
@@ -367,10 +354,9 @@ nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p)
svcxdr_decode_sattr(rqstp, xdr, &args->attrs);
}
-int
-nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_renameargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_renameargs *args = rqstp->rq_argp;
return svcxdr_decode_diropargs(xdr, &args->ffh,
@@ -379,10 +365,9 @@ nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p)
&args->tname, &args->tlen);
}
-int
-nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_linkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_linkargs *args = rqstp->rq_argp;
return svcxdr_decode_fhandle(xdr, &args->ffh) &&
@@ -390,178 +375,170 @@ nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p)
&args->tname, &args->tlen);
}
-int
-nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_symlinkargs *args = rqstp->rq_argp;
struct kvec *head = rqstp->rq_arg.head;
if (!svcxdr_decode_diropargs(xdr, &args->ffh, &args->fname, &args->flen))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->tlen) < 0)
- return 0;
+ return false;
if (args->tlen == 0)
- return 0;
+ return false;
args->first.iov_len = head->iov_len - xdr_stream_pos(xdr);
args->first.iov_base = xdr_inline_decode(xdr, args->tlen);
if (!args->first.iov_base)
- return 0;
+ return false;
return svcxdr_decode_sattr(rqstp, xdr, &args->attrs);
}
-int
-nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_decode_readdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_readdirargs *args = rqstp->rq_argp;
if (!svcxdr_decode_fhandle(xdr, &args->fh))
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->cookie) < 0)
- return 0;
+ return false;
if (xdr_stream_decode_u32(xdr, &args->count) < 0)
- return 0;
+ return false;
- return 1;
+ return true;
}
/*
* XDR encode functions
*/
-int
-nfssvc_encode_statres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_encode_statres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_stat *resp = rqstp->rq_resp;
return svcxdr_encode_stat(xdr, resp->status);
}
-int
-nfssvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_encode_attrstatres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_attrstat *resp = rqstp->rq_resp;
if (!svcxdr_encode_stat(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
- return 0;
+ return false;
break;
}
- return 1;
+ return true;
}
-int
-nfssvc_encode_diropres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_encode_diropres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_diropres *resp = rqstp->rq_resp;
if (!svcxdr_encode_stat(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_fhandle(xdr, &resp->fh))
- return 0;
+ return false;
if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
- return 0;
+ return false;
break;
}
- return 1;
+ return true;
}
-int
-nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_readlinkres *resp = rqstp->rq_resp;
struct kvec *head = rqstp->rq_res.head;
if (!svcxdr_encode_stat(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (xdr_stream_encode_u32(xdr, resp->len) < 0)
- return 0;
+ return false;
xdr_write_pages(xdr, &resp->page, 0, resp->len);
if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0)
- return 0;
+ return false;
break;
}
- return 1;
+ return true;
}
-int
-nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_readres *resp = rqstp->rq_resp;
struct kvec *head = rqstp->rq_res.head;
if (!svcxdr_encode_stat(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->count) < 0)
- return 0;
+ return false;
xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base,
resp->count);
if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0)
- return 0;
+ return false;
break;
}
- return 1;
+ return true;
}
-int
-nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_readdirres *resp = rqstp->rq_resp;
struct xdr_buf *dirlist = &resp->dirlist;
if (!svcxdr_encode_stat(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len);
/* no more entries */
if (xdr_stream_encode_item_absent(xdr) < 0)
- return 0;
+ return false;
if (xdr_stream_encode_bool(xdr, resp->common.err == nfserr_eof) < 0)
- return 0;
+ return false;
break;
}
- return 1;
+ return true;
}
-int
-nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p)
+bool
+nfssvc_encode_statfsres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
- struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_statfsres *resp = rqstp->rq_resp;
struct kstatfs *stat = &resp->stats;
+ __be32 *p;
if (!svcxdr_encode_stat(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
p = xdr_reserve_space(xdr, XDR_UNIT * 5);
if (!p)
- return 0;
+ return false;
*p++ = cpu_to_be32(NFSSVC_MAXBLKSIZE_V2);
*p++ = cpu_to_be32(stat->f_bsize);
*p++ = cpu_to_be32(stat->f_blocks);
@@ -570,7 +547,7 @@ nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p)
break;
}
- return 1;
+ return true;
}
/**
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 538520957a81..f1e0d3c51bc2 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -9,6 +9,7 @@
#define _NFSD_TRACE_H
#include <linux/tracepoint.h>
+
#include "export.h"
#include "nfsfh.h"
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 738d564ca4ce..c99857689e2c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -244,6 +244,7 @@ out_nfserr:
* returned. Otherwise the covered directory is returned.
* NOTE: this mountpoint crossing is not supported properly by all
* clients and is explicitly disallowed for NFSv3
+ * NeilBrown <neilb@cse.unsw.edu.au>
*/
__be32
nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
@@ -729,9 +730,6 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
path.dentry = fhp->fh_dentry;
inode = d_inode(path.dentry);
- /* Disallow write access to files with the append-only bit set
- * or any access when mandatory locking enabled
- */
err = nfserr_perm;
if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE))
goto out;
@@ -1410,7 +1408,8 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (nfsd_create_is_exclusive(createmode)) {
/* solaris7 gets confused (bugid 4218508) if these have
- * the high bit set, so just clear the high bits. If this is
+ * the high bit set, as do xfs filesystems without the
+ * "bigtime" feature. So just clear the high bits. If this is
* ever changed to use different attrs for storing the
* verifier, then do_open_lookup() will also need to be fixed
* accordingly.
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
index f45b4bc93f52..528fb299430e 100644
--- a/fs/nfsd/xdr.h
+++ b/fs/nfsd/xdr.h
@@ -33,7 +33,7 @@ struct nfsd_writeargs {
svc_fh fh;
__u32 offset;
int len;
- struct kvec first;
+ struct xdr_buf payload;
};
struct nfsd_createargs {
@@ -141,23 +141,24 @@ union nfsd_xdrstore {
#define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore)
-int nfssvc_decode_fhandleargs(struct svc_rqst *, __be32 *);
-int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *);
-int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *);
-int nfssvc_decode_readargs(struct svc_rqst *, __be32 *);
-int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *);
-int nfssvc_decode_createargs(struct svc_rqst *, __be32 *);
-int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *);
-int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *);
-int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *);
-int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *);
-int nfssvc_encode_statres(struct svc_rqst *, __be32 *);
-int nfssvc_encode_attrstatres(struct svc_rqst *, __be32 *);
-int nfssvc_encode_diropres(struct svc_rqst *, __be32 *);
-int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *);
-int nfssvc_encode_readres(struct svc_rqst *, __be32 *);
-int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *);
-int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *);
+bool nfssvc_decode_fhandleargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_decode_sattrargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_decode_diropargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_decode_readargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_decode_createargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_decode_renameargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_decode_linkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_decode_readdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+
+bool nfssvc_encode_statres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_encode_attrstatres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_encode_diropres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_encode_statfsres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfssvc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
void nfssvc_encode_nfscookie(struct nfsd_readdirres *resp, u32 offset);
int nfssvc_encode_entry(void *data, const char *name, int namlen,
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index 933008382bbe..03fe4e21306c 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -40,7 +40,7 @@ struct nfsd3_writeargs {
__u32 count;
int stable;
__u32 len;
- struct kvec first;
+ struct xdr_buf payload;
};
struct nfsd3_createargs {
@@ -265,36 +265,37 @@ union nfsd3_xdrstore {
#define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore)
-int nfs3svc_decode_fhandleargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_readargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_writeargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_getattrres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_lookupres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_readres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_writeres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_createres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_renameres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_linkres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_readdirres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_fsstatres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_fsinfores(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_pathconfres(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_commitres(struct svc_rqst *, __be32 *);
+bool nfs3svc_decode_fhandleargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_diropargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_accessargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_readargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_createargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_renameargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_linkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_decode_commitargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+
+bool nfs3svc_encode_getattrres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_wccstat(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_lookupres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_accessres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_writeres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_createres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_renameres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_linkres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs3svc_encode_commitres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
void nfs3svc_release_fhandle(struct svc_rqst *);
void nfs3svc_release_fhandle2(struct svc_rqst *);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 3e4052e3bd50..846ab6df9d48 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -702,10 +702,11 @@ struct nfsd4_compoundres {
struct xdr_stream *xdr;
struct svc_rqst * rqstp;
+ __be32 *statusp;
u32 taglen;
char * tag;
u32 opcnt;
- __be32 * tagp; /* tag, opcount encode location */
+
struct nfsd4_compound_state cstate;
};
@@ -756,8 +757,8 @@ set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp);
-int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *);
-int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *);
+bool nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+bool nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op);
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index adf3bb0a8048..6ce8617b562d 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * alloc.c - NILFS dat/inode allocator
+ * NILFS dat/inode allocator
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 0303c3968cee..b667e869ac07 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * alloc.h - persistent object (dat entry/disk inode) allocator/deallocator
+ * Persistent object (dat entry/disk inode) allocator/deallocator
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 5900879d5693..798a2c1b38c6 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * bmap.c - NILFS block mapping.
+ * NILFS block mapping.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index 2c63858e81c9..608168a5cb88 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * bmap.h - NILFS block mapping.
+ * NILFS block mapping.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 4391fd3abd8f..66bdaa2cf496 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * btnode.c - NILFS B-tree node cache
+ * NILFS B-tree node cache
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 0f88dbc9bcb3..11663650add7 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * btnode.h - NILFS B-tree node cache
+ * NILFS B-tree node cache
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index ab9ec073330f..3594eabe1419 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * btree.c - NILFS B-tree.
+ * NILFS B-tree.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index d1421b646ce4..92868e1a48ca 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * btree.h - NILFS B-tree.
+ * NILFS B-tree.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index ce144776b4ef..9ebefb3acb0e 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * cpfile.c - NILFS checkpoint file.
+ * NILFS checkpoint file.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 6336222df24a..edabb2dc5756 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * cpfile.h - NILFS checkpoint file.
+ * NILFS checkpoint file.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 8bccdf1158fc..dc51d3b7a7bf 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * dat.c - NILFS disk address translation.
+ * NILFS disk address translation.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index b17ee34580ae..468c82d26183 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * dat.h - NILFS disk address translation.
+ * NILFS disk address translation.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 81394e22d0a0..f8f4c2ff52f4 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * dir.c - NILFS directory entry operations
+ * NILFS directory entry operations
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index f353101955e3..a35f2795b242 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * direct.c - NILFS direct block pointer.
+ * NILFS direct block pointer.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index ec9a23c77994..b7ca896269af 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * direct.h - NILFS direct block pointer.
+ * NILFS direct block pointer.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 7cf765258fda..a265d391ffe9 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * file.c - NILFS regular file handling primitives including fsync().
+ * NILFS regular file handling primitives including fsync().
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 448320496856..a8f5315f01e3 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * gcinode.c - dummy inodes to buffer blocks for garbage collection
+ * Dummy inodes to buffer blocks for garbage collection
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 02727ed3a7c6..a8a4bc8490b4 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * ifile.c - NILFS inode file
+ * NILFS inode file
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index a1e1e5711a05..35c5273f4821 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * ifile.h - NILFS inode file
+ * NILFS inode file
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2e8eb263cf0f..e3d807d5b83a 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * inode.c - NILFS inode operations.
+ * NILFS inode operations.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 640ac8fe891e..fec194a666f4 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * ioctl.c - NILFS ioctl operations.
+ * NILFS ioctl operations.
*
* Copyright (C) 2007, 2008 Nippon Telegraph and Telephone Corporation.
*
@@ -1107,7 +1107,7 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
goto out;
ret = -ERANGE;
- if (range[1] > i_size_read(inode->i_sb->s_bdev->bd_inode))
+ if (range[1] > bdev_nr_bytes(inode->i_sb->s_bdev))
goto out;
segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize;
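The hunk above is one of several in this series that replace open-coded reads of bd_inode->i_size with the bdev_nr_bytes() helper. As a hedged illustration (not part of the patch; the function name and range check are hypothetical), the conversion pattern for a filesystem that needs the backing device size in bytes looks like this:

/*
 * Illustrative sketch only -- not from this patch.  bdev_nr_bytes()
 * (declared in <linux/blkdev.h>) returns the size of a block device
 * in bytes and replaces open-coded i_size_read(bdev->bd_inode).
 */
#include <linux/blkdev.h>
#include <linux/fs.h>

/* Hypothetical range check, mirroring the nilfs_ioctl_set_alloc_range() hunk. */
static int example_check_range(struct super_block *sb, __u64 range_end)
{
	/* Before: if (range_end > i_size_read(sb->s_bdev->bd_inode)) ... */
	if (range_end > bdev_nr_bytes(sb->s_bdev))
		return -ERANGE;
	return 0;
}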
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 97769fe4d588..4b3d33cf0041 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * mdt.c - meta data file for NILFS
+ * Meta data file for NILFS
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index e77aea4bb921..8f86080a436d 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * mdt.h - NILFS meta data file prototype and definitions
+ * NILFS meta data file prototype and definitions
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 91eebeb0c48b..23899e0ae850 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * namei.c - NILFS pathname lookup operations.
+ * NILFS pathname lookup operations.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 60b21b6eeac0..a7b81755c350 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * nilfs.h - NILFS local header file.
+ * NILFS local header file.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 171fb5cd427f..bc3e2cd4117f 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * page.c - buffer/page management specific to NILFS
+ * Buffer/page management specific to NILFS
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 62b9bb469e92..569263b23c0c 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * page.h - buffer/page management specific to NILFS
+ * Buffer/page management specific to NILFS
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 2217f904a7cf..9e2ed76c0f25 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * recovery.c - NILFS recovery logic
+ * NILFS recovery logic
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 56872e93823d..43287b0d3e9b 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * segbuf.c - NILFS segment buffer
+ * NILFS segment buffer
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 9bea1bd59041..e20091ededba 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * segbuf.h - NILFS Segment buffer prototypes and definitions
+ * NILFS Segment buffer prototypes and definitions
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 686c8ee7b29c..85a853334771 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * segment.c - NILFS segment constructor.
+ * NILFS segment constructor.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index f5cf5308f3fc..1060f72ebf5a 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * segment.h - NILFS Segment constructor prototypes and definitions
+ * NILFS Segment constructor prototypes and definitions
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 63722475e17e..e385cca2004a 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * sufile.c - NILFS segment usage file.
+ * NILFS segment usage file.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index c4e2c7a7add1..8e8a1a5a0402 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * sufile.h - NILFS segment usage file.
+ * NILFS segment usage file.
*
* Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f6b2d280aab5..63e5fa74016c 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * super.c - NILFS module and super block management.
+ * NILFS module and super block management.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
@@ -403,7 +403,7 @@ int nilfs_resize_fs(struct super_block *sb, __u64 newsize)
int ret;
ret = -ERANGE;
- devsize = i_size_read(sb->s_bdev->bd_inode);
+ devsize = bdev_nr_bytes(sb->s_bdev);
if (newsize > devsize)
goto out;
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index 62f8a7ac19c8..81f35c5b5a40 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * sysfs.c - sysfs support implementation.
+ * Sysfs support implementation.
*
* Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation.
* Copyright (C) 2014 HGST, Inc., a Western Digital Company.
@@ -95,7 +95,7 @@ static ssize_t
nilfs_snapshot_inodes_count_show(struct nilfs_snapshot_attr *attr,
struct nilfs_root *root, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)atomic64_read(&root->inodes_count));
}
@@ -103,7 +103,7 @@ static ssize_t
nilfs_snapshot_blocks_count_show(struct nilfs_snapshot_attr *attr,
struct nilfs_root *root, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)atomic64_read(&root->blocks_count));
}
@@ -116,7 +116,7 @@ static ssize_t
nilfs_snapshot_README_show(struct nilfs_snapshot_attr *attr,
struct nilfs_root *root, char *buf)
{
- return snprintf(buf, PAGE_SIZE, snapshot_readme_str);
+ return sysfs_emit(buf, snapshot_readme_str);
}
NILFS_SNAPSHOT_RO_ATTR(inodes_count);
@@ -217,7 +217,7 @@ static ssize_t
nilfs_mounted_snapshots_README_show(struct nilfs_mounted_snapshots_attr *attr,
struct the_nilfs *nilfs, char *buf)
{
- return snprintf(buf, PAGE_SIZE, mounted_snapshots_readme_str);
+ return sysfs_emit(buf, mounted_snapshots_readme_str);
}
NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(README);
@@ -255,7 +255,7 @@ nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr,
ncheckpoints = cpstat.cs_ncps;
- return snprintf(buf, PAGE_SIZE, "%llu\n", ncheckpoints);
+ return sysfs_emit(buf, "%llu\n", ncheckpoints);
}
static ssize_t
@@ -278,7 +278,7 @@ nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr,
nsnapshots = cpstat.cs_nsss;
- return snprintf(buf, PAGE_SIZE, "%llu\n", nsnapshots);
+ return sysfs_emit(buf, "%llu\n", nsnapshots);
}
static ssize_t
@@ -292,7 +292,7 @@ nilfs_checkpoints_last_seg_checkpoint_show(struct nilfs_checkpoints_attr *attr,
last_cno = nilfs->ns_last_cno;
spin_unlock(&nilfs->ns_last_segment_lock);
- return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno);
+ return sysfs_emit(buf, "%llu\n", last_cno);
}
static ssize_t
@@ -306,7 +306,7 @@ nilfs_checkpoints_next_checkpoint_show(struct nilfs_checkpoints_attr *attr,
cno = nilfs->ns_cno;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
+ return sysfs_emit(buf, "%llu\n", cno);
}
static const char checkpoints_readme_str[] =
@@ -322,7 +322,7 @@ static ssize_t
nilfs_checkpoints_README_show(struct nilfs_checkpoints_attr *attr,
struct the_nilfs *nilfs, char *buf)
{
- return snprintf(buf, PAGE_SIZE, checkpoints_readme_str);
+ return sysfs_emit(buf, checkpoints_readme_str);
}
NILFS_CHECKPOINTS_RO_ATTR(checkpoints_number);
@@ -353,7 +353,7 @@ nilfs_segments_segments_number_show(struct nilfs_segments_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_nsegments);
+ return sysfs_emit(buf, "%lu\n", nilfs->ns_nsegments);
}
static ssize_t
@@ -361,7 +361,7 @@ nilfs_segments_blocks_per_segment_show(struct nilfs_segments_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_blocks_per_segment);
+ return sysfs_emit(buf, "%lu\n", nilfs->ns_blocks_per_segment);
}
static ssize_t
@@ -375,7 +375,7 @@ nilfs_segments_clean_segments_show(struct nilfs_segments_attr *attr,
ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
- return snprintf(buf, PAGE_SIZE, "%lu\n", ncleansegs);
+ return sysfs_emit(buf, "%lu\n", ncleansegs);
}
static ssize_t
@@ -395,7 +395,7 @@ nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr,
return err;
}
- return snprintf(buf, PAGE_SIZE, "%llu\n", sustat.ss_ndirtysegs);
+ return sysfs_emit(buf, "%llu\n", sustat.ss_ndirtysegs);
}
static const char segments_readme_str[] =
@@ -411,7 +411,7 @@ nilfs_segments_README_show(struct nilfs_segments_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, segments_readme_str);
+ return sysfs_emit(buf, segments_readme_str);
}
NILFS_SEGMENTS_RO_ATTR(segments_number);
@@ -448,7 +448,7 @@ nilfs_segctor_last_pseg_block_show(struct nilfs_segctor_attr *attr,
last_pseg = nilfs->ns_last_pseg;
spin_unlock(&nilfs->ns_last_segment_lock);
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)last_pseg);
}
@@ -463,7 +463,7 @@ nilfs_segctor_last_seg_sequence_show(struct nilfs_segctor_attr *attr,
last_seq = nilfs->ns_last_seq;
spin_unlock(&nilfs->ns_last_segment_lock);
- return snprintf(buf, PAGE_SIZE, "%llu\n", last_seq);
+ return sysfs_emit(buf, "%llu\n", last_seq);
}
static ssize_t
@@ -477,7 +477,7 @@ nilfs_segctor_last_seg_checkpoint_show(struct nilfs_segctor_attr *attr,
last_cno = nilfs->ns_last_cno;
spin_unlock(&nilfs->ns_last_segment_lock);
- return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno);
+ return sysfs_emit(buf, "%llu\n", last_cno);
}
static ssize_t
@@ -491,7 +491,7 @@ nilfs_segctor_current_seg_sequence_show(struct nilfs_segctor_attr *attr,
seg_seq = nilfs->ns_seg_seq;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", seg_seq);
+ return sysfs_emit(buf, "%llu\n", seg_seq);
}
static ssize_t
@@ -505,7 +505,7 @@ nilfs_segctor_current_last_full_seg_show(struct nilfs_segctor_attr *attr,
segnum = nilfs->ns_segnum;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", segnum);
+ return sysfs_emit(buf, "%llu\n", segnum);
}
static ssize_t
@@ -519,7 +519,7 @@ nilfs_segctor_next_full_seg_show(struct nilfs_segctor_attr *attr,
nextnum = nilfs->ns_nextnum;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", nextnum);
+ return sysfs_emit(buf, "%llu\n", nextnum);
}
static ssize_t
@@ -533,7 +533,7 @@ nilfs_segctor_next_pseg_offset_show(struct nilfs_segctor_attr *attr,
pseg_offset = nilfs->ns_pseg_offset;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%lu\n", pseg_offset);
+ return sysfs_emit(buf, "%lu\n", pseg_offset);
}
static ssize_t
@@ -547,7 +547,7 @@ nilfs_segctor_next_checkpoint_show(struct nilfs_segctor_attr *attr,
cno = nilfs->ns_cno;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
+ return sysfs_emit(buf, "%llu\n", cno);
}
static ssize_t
@@ -575,7 +575,7 @@ nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr,
ctime = nilfs->ns_ctime;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", ctime);
+ return sysfs_emit(buf, "%llu\n", ctime);
}
static ssize_t
@@ -603,7 +603,7 @@ nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr,
nongc_ctime = nilfs->ns_nongc_ctime;
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", nongc_ctime);
+ return sysfs_emit(buf, "%llu\n", nongc_ctime);
}
static ssize_t
@@ -617,7 +617,7 @@ nilfs_segctor_dirty_data_blocks_count_show(struct nilfs_segctor_attr *attr,
ndirtyblks = atomic_read(&nilfs->ns_ndirtyblks);
up_read(&nilfs->ns_segctor_sem);
- return snprintf(buf, PAGE_SIZE, "%u\n", ndirtyblks);
+ return sysfs_emit(buf, "%u\n", ndirtyblks);
}
static const char segctor_readme_str[] =
@@ -654,7 +654,7 @@ static ssize_t
nilfs_segctor_README_show(struct nilfs_segctor_attr *attr,
struct the_nilfs *nilfs, char *buf)
{
- return snprintf(buf, PAGE_SIZE, segctor_readme_str);
+ return sysfs_emit(buf, segctor_readme_str);
}
NILFS_SEGCTOR_RO_ATTR(last_pseg_block);
@@ -723,7 +723,7 @@ nilfs_superblock_sb_write_time_secs_show(struct nilfs_superblock_attr *attr,
sbwtime = nilfs->ns_sbwtime;
up_read(&nilfs->ns_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", sbwtime);
+ return sysfs_emit(buf, "%llu\n", sbwtime);
}
static ssize_t
@@ -737,7 +737,7 @@ nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr,
sbwcount = nilfs->ns_sbwcount;
up_read(&nilfs->ns_sem);
- return snprintf(buf, PAGE_SIZE, "%u\n", sbwcount);
+ return sysfs_emit(buf, "%u\n", sbwcount);
}
static ssize_t
@@ -751,7 +751,7 @@ nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr,
sb_update_freq = nilfs->ns_sb_update_freq;
up_read(&nilfs->ns_sem);
- return snprintf(buf, PAGE_SIZE, "%u\n", sb_update_freq);
+ return sysfs_emit(buf, "%u\n", sb_update_freq);
}
static ssize_t
@@ -799,7 +799,7 @@ static ssize_t
nilfs_superblock_README_show(struct nilfs_superblock_attr *attr,
struct the_nilfs *nilfs, char *buf)
{
- return snprintf(buf, PAGE_SIZE, sb_readme_str);
+ return sysfs_emit(buf, sb_readme_str);
}
NILFS_SUPERBLOCK_RO_ATTR(sb_write_time);
@@ -834,7 +834,7 @@ ssize_t nilfs_dev_revision_show(struct nilfs_dev_attr *attr,
u32 major = le32_to_cpu(sbp[0]->s_rev_level);
u16 minor = le16_to_cpu(sbp[0]->s_minor_rev_level);
- return snprintf(buf, PAGE_SIZE, "%d.%d\n", major, minor);
+ return sysfs_emit(buf, "%d.%d\n", major, minor);
}
static
@@ -842,7 +842,7 @@ ssize_t nilfs_dev_blocksize_show(struct nilfs_dev_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%u\n", nilfs->ns_blocksize);
+ return sysfs_emit(buf, "%u\n", nilfs->ns_blocksize);
}
static
@@ -853,7 +853,7 @@ ssize_t nilfs_dev_device_size_show(struct nilfs_dev_attr *attr,
struct nilfs_super_block **sbp = nilfs->ns_sbp;
u64 dev_size = le64_to_cpu(sbp[0]->s_dev_size);
- return snprintf(buf, PAGE_SIZE, "%llu\n", dev_size);
+ return sysfs_emit(buf, "%llu\n", dev_size);
}
static
@@ -864,7 +864,7 @@ ssize_t nilfs_dev_free_blocks_show(struct nilfs_dev_attr *attr,
sector_t free_blocks = 0;
nilfs_count_free_blocks(nilfs, &free_blocks);
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)free_blocks);
}
@@ -875,7 +875,7 @@ ssize_t nilfs_dev_uuid_show(struct nilfs_dev_attr *attr,
{
struct nilfs_super_block **sbp = nilfs->ns_sbp;
- return snprintf(buf, PAGE_SIZE, "%pUb\n", sbp[0]->s_uuid);
+ return sysfs_emit(buf, "%pUb\n", sbp[0]->s_uuid);
}
static
@@ -903,7 +903,7 @@ static ssize_t nilfs_dev_README_show(struct nilfs_dev_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, dev_readme_str);
+ return sysfs_emit(buf, dev_readme_str);
}
NILFS_DEV_RO_ATTR(revision);
@@ -1047,7 +1047,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs)
static ssize_t nilfs_feature_revision_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d.%d\n",
+ return sysfs_emit(buf, "%d.%d\n",
NILFS_CURRENT_REV, NILFS_MINOR_REV);
}
@@ -1060,7 +1060,7 @@ static ssize_t nilfs_feature_README_show(struct kobject *kobj,
struct attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, features_readme_str);
+ return sysfs_emit(buf, features_readme_str);
}
NILFS_FEATURE_RO_ATTR(revision);
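The snprintf-to-sysfs_emit conversions above all follow the standard sysfs show-callback pattern. A generic, hedged sketch (the attribute name and backing value are hypothetical, not taken from NILFS):

/*
 * Generic sketch, not from this patch: a sysfs "show" callback.
 * sysfs_emit() knows the output buffer is a full PAGE_SIZE page and
 * warns on misuse, so it replaces snprintf(buf, PAGE_SIZE, ...).
 */
#include <linux/kobject.h>
#include <linux/sysfs.h>

static unsigned long example_value;	/* hypothetical backing value */

static ssize_t example_show(struct kobject *kobj, struct kobj_attribute *attr,
			    char *buf)
{
	/* Before: return snprintf(buf, PAGE_SIZE, "%lu\n", example_value); */
	return sysfs_emit(buf, "%lu\n", example_value);
}

static struct kobj_attribute example_attr = __ATTR_RO(example);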
diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h
index d001eb862dae..78a87a016928 100644
--- a/fs/nilfs2/sysfs.h
+++ b/fs/nilfs2/sysfs.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * sysfs.h - sysfs support declarations.
+ * Sysfs support declarations.
*
* Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation.
* Copyright (C) 2014 HGST, Inc., a Western Digital Company.
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index c8bfc01da5d7..dd48a8f74d57 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * the_nilfs.c - the_nilfs shared structure.
+ * the_nilfs shared structure.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
@@ -489,7 +489,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
{
struct nilfs_super_block **sbp = nilfs->ns_sbp;
struct buffer_head **sbh = nilfs->ns_sbh;
- u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size);
+ u64 sb2off = NILFS_SB2_OFFSET_BYTES(bdev_nr_bytes(nilfs->ns_bdev));
int valid[2], swp = 0;
sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize,
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 987c8ab02aee..47c7dfbb7ea5 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * the_nilfs.h - the_nilfs shared structure.
+ * the_nilfs shared structure.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 057abd2cf887..b6091775aa6e 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -111,6 +111,16 @@ static bool fanotify_name_event_equal(struct fanotify_name_event *fne1,
return fanotify_info_equal(info1, info2);
}
+static bool fanotify_error_event_equal(struct fanotify_error_event *fee1,
+ struct fanotify_error_event *fee2)
+{
+ /* Error events against the same file system are always merged. */
+ if (!fanotify_fsid_equal(&fee1->fsid, &fee2->fsid))
+ return false;
+
+ return true;
+}
+
static bool fanotify_should_merge(struct fanotify_event *old,
struct fanotify_event *new)
{
@@ -141,6 +151,9 @@ static bool fanotify_should_merge(struct fanotify_event *old,
case FANOTIFY_EVENT_TYPE_FID_NAME:
return fanotify_name_event_equal(FANOTIFY_NE(old),
FANOTIFY_NE(new));
+ case FANOTIFY_EVENT_TYPE_FS_ERROR:
+ return fanotify_error_event_equal(FANOTIFY_EE(old),
+ FANOTIFY_EE(new));
default:
WARN_ON_ONCE(1);
}
@@ -176,6 +189,10 @@ static int fanotify_merge(struct fsnotify_group *group,
break;
if (fanotify_should_merge(old, new)) {
old->mask |= new->mask;
+
+ if (fanotify_is_error_event(old->mask))
+ FANOTIFY_EE(old)->err_count++;
+
return 1;
}
}
@@ -343,13 +360,23 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
static int fanotify_encode_fh_len(struct inode *inode)
{
int dwords = 0;
+ int fh_len;
if (!inode)
return 0;
exportfs_encode_inode_fh(inode, NULL, &dwords, NULL);
+ fh_len = dwords << 2;
+
+ /*
+ * struct fanotify_error_event might be preallocated and is
+ * limited to MAX_HANDLE_SZ. This should never happen, but
+ * safeguard by forcing an invalid file handle.
+ */
+ if (WARN_ON_ONCE(fh_len > MAX_HANDLE_SZ))
+ return 0;
- return dwords << 2;
+ return fh_len;
}
/*
@@ -370,8 +397,14 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
fh->type = FILEID_ROOT;
fh->len = 0;
fh->flags = 0;
+
+ /*
+ * Invalid FHs are used by FAN_FS_ERROR for errors not
+ * linked to any inode. The f_handle won't be reported
+ * back to userspace.
+ */
if (!inode)
- return 0;
+ goto out;
/*
* !gpf means preallocated variable size fh, but fh_len could
@@ -403,8 +436,13 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
fh->type = type;
fh->len = fh_len;
- /* Mix fh into event merge key */
- *hash ^= fanotify_hash_fh(fh);
+out:
+ /*
+ * Mix fh into event merge key. Hash might be NULL in case of
+ * unhashed FID events (i.e. FAN_FS_ERROR).
+ */
+ if (hash)
+ *hash ^= fanotify_hash_fh(fh);
return FANOTIFY_FH_HDR_LEN + fh_len;
@@ -452,7 +490,7 @@ static struct inode *fanotify_dfid_inode(u32 event_mask, const void *data,
if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS)
return dir;
- if (S_ISDIR(inode->i_mode))
+ if (inode && S_ISDIR(inode->i_mode))
return inode;
return dir;
@@ -563,6 +601,44 @@ static struct fanotify_event *fanotify_alloc_name_event(struct inode *id,
return &fne->fae;
}
+static struct fanotify_event *fanotify_alloc_error_event(
+ struct fsnotify_group *group,
+ __kernel_fsid_t *fsid,
+ const void *data, int data_type,
+ unsigned int *hash)
+{
+ struct fs_error_report *report =
+ fsnotify_data_error_report(data, data_type);
+ struct inode *inode;
+ struct fanotify_error_event *fee;
+ int fh_len;
+
+ if (WARN_ON_ONCE(!report))
+ return NULL;
+
+ fee = mempool_alloc(&group->fanotify_data.error_events_pool, GFP_NOFS);
+ if (!fee)
+ return NULL;
+
+ fee->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR;
+ fee->error = report->error;
+ fee->err_count = 1;
+ fee->fsid = *fsid;
+
+ inode = report->inode;
+ fh_len = fanotify_encode_fh_len(inode);
+
+ /* Bad fh_len. Fallback to using an invalid fh. Should never happen. */
+ if (!fh_len && inode)
+ inode = NULL;
+
+ fanotify_encode_fh(&fee->object_fh, inode, fh_len, NULL, 0);
+
+ *hash ^= fanotify_hash_fsid(fsid);
+
+ return &fee->fae;
+}
+
static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
u32 mask, const void *data,
int data_type, struct inode *dir,
@@ -630,6 +706,9 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
if (fanotify_is_perm_event(mask)) {
event = fanotify_alloc_perm_event(path, gfp);
+ } else if (fanotify_is_error_event(mask)) {
+ event = fanotify_alloc_error_event(group, fsid, data,
+ data_type, &hash);
} else if (name_event && (file_name || child)) {
event = fanotify_alloc_name_event(id, fsid, file_name, child,
&hash, gfp);
@@ -702,6 +781,9 @@ static void fanotify_insert_event(struct fsnotify_group *group,
assert_spin_locked(&group->notification_lock);
+ if (!fanotify_is_hashed_event(event->mask))
+ return;
+
pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
group, event, bucket);
@@ -738,8 +820,9 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC);
BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
+ BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR);
- BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 20);
mask = fanotify_group_event_mask(group, iter_info, mask, data,
data_type, dir);
@@ -778,9 +861,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
}
fsn_event = &event->fse;
- ret = fsnotify_add_event(group, fsn_event, fanotify_merge,
- fanotify_is_hashed_event(mask) ?
- fanotify_insert_event : NULL);
+ ret = fsnotify_insert_event(group, fsn_event, fanotify_merge,
+ fanotify_insert_event);
if (ret) {
/* Permission events shouldn't be merged */
BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
@@ -805,6 +887,9 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
if (group->fanotify_data.ucounts)
dec_ucount(group->fanotify_data.ucounts,
UCOUNT_FANOTIFY_GROUPS);
+
+ if (mempool_initialized(&group->fanotify_data.error_events_pool))
+ mempool_exit(&group->fanotify_data.error_events_pool);
}
static void fanotify_free_path_event(struct fanotify_event *event)
@@ -833,7 +918,16 @@ static void fanotify_free_name_event(struct fanotify_event *event)
kfree(FANOTIFY_NE(event));
}
-static void fanotify_free_event(struct fsnotify_event *fsn_event)
+static void fanotify_free_error_event(struct fsnotify_group *group,
+ struct fanotify_event *event)
+{
+ struct fanotify_error_event *fee = FANOTIFY_EE(event);
+
+ mempool_free(fee, &group->fanotify_data.error_events_pool);
+}
+
+static void fanotify_free_event(struct fsnotify_group *group,
+ struct fsnotify_event *fsn_event)
{
struct fanotify_event *event;
@@ -855,6 +949,9 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event)
case FANOTIFY_EVENT_TYPE_OVERFLOW:
kfree(event);
break;
+ case FANOTIFY_EVENT_TYPE_FS_ERROR:
+ fanotify_free_error_event(group, event);
+ break;
default:
WARN_ON_ONCE(1);
}
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 4a5e555dc3d2..d25f500bf7e7 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -141,6 +141,7 @@ enum fanotify_event_type {
FANOTIFY_EVENT_TYPE_PATH,
FANOTIFY_EVENT_TYPE_PATH_PERM,
FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */
+ FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */
__FANOTIFY_EVENT_TYPE_NUM
};
@@ -170,12 +171,18 @@ static inline void fanotify_init_event(struct fanotify_event *event,
event->pid = NULL;
}
+#define FANOTIFY_INLINE_FH(name, size) \
+struct { \
+ struct fanotify_fh (name); \
+ /* Space for object_fh.buf[] - access with fanotify_fh_buf() */ \
+ unsigned char _inline_fh_buf[(size)]; \
+}
+
struct fanotify_fid_event {
struct fanotify_event fae;
__kernel_fsid_t fsid;
- struct fanotify_fh object_fh;
- /* Reserve space in object_fh.buf[] - access with fanotify_fh_buf() */
- unsigned char _inline_fh_buf[FANOTIFY_INLINE_FH_LEN];
+
+ FANOTIFY_INLINE_FH(object_fh, FANOTIFY_INLINE_FH_LEN);
};
static inline struct fanotify_fid_event *
@@ -196,12 +203,30 @@ FANOTIFY_NE(struct fanotify_event *event)
return container_of(event, struct fanotify_name_event, fae);
}
+struct fanotify_error_event {
+ struct fanotify_event fae;
+ s32 error; /* Error reported by the Filesystem. */
+ u32 err_count; /* Suppressed errors count */
+
+ __kernel_fsid_t fsid; /* FSID this error refers to. */
+
+ FANOTIFY_INLINE_FH(object_fh, MAX_HANDLE_SZ);
+};
+
+static inline struct fanotify_error_event *
+FANOTIFY_EE(struct fanotify_event *event)
+{
+ return container_of(event, struct fanotify_error_event, fae);
+}
+
static inline __kernel_fsid_t *fanotify_event_fsid(struct fanotify_event *event)
{
if (event->type == FANOTIFY_EVENT_TYPE_FID)
return &FANOTIFY_FE(event)->fsid;
else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME)
return &FANOTIFY_NE(event)->fsid;
+ else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR)
+ return &FANOTIFY_EE(event)->fsid;
else
return NULL;
}
@@ -213,6 +238,8 @@ static inline struct fanotify_fh *fanotify_event_object_fh(
return &FANOTIFY_FE(event)->object_fh;
else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME)
return fanotify_info_file_fh(&FANOTIFY_NE(event)->info);
+ else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR)
+ return &FANOTIFY_EE(event)->object_fh;
else
return NULL;
}
@@ -244,6 +271,19 @@ static inline int fanotify_event_dir_fh_len(struct fanotify_event *event)
return info ? fanotify_info_dir_fh_len(info) : 0;
}
+static inline bool fanotify_event_has_object_fh(struct fanotify_event *event)
+{
+ /* For error events, even zeroed fh are reported. */
+ if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR)
+ return true;
+ return fanotify_event_object_fh_len(event) > 0;
+}
+
+static inline bool fanotify_event_has_dir_fh(struct fanotify_event *event)
+{
+ return fanotify_event_dir_fh_len(event) > 0;
+}
+
struct fanotify_path_event {
struct fanotify_event fae;
struct path path;
@@ -287,6 +327,11 @@ static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse)
return container_of(fse, struct fanotify_event, fse);
}
+static inline bool fanotify_is_error_event(u32 mask)
+{
+ return mask & FAN_FS_ERROR;
+}
+
static inline bool fanotify_event_has_path(struct fanotify_event *event)
{
return event->type == FANOTIFY_EVENT_TYPE_PATH ||
@@ -315,7 +360,8 @@ static inline struct path *fanotify_event_path(struct fanotify_event *event)
*/
static inline bool fanotify_is_hashed_event(u32 mask)
{
- return !fanotify_is_perm_event(mask) && !(mask & FS_Q_OVERFLOW);
+ return !(fanotify_is_perm_event(mask) ||
+ fsnotify_is_overflow_event(mask));
}
static inline unsigned int fanotify_event_hash_bucket(
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 6facdf476255..559bc1e9926d 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -30,6 +30,7 @@
#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
#define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192
#define FANOTIFY_DEFAULT_MAX_GROUPS 128
+#define FANOTIFY_DEFAULT_FEE_POOL_SIZE 32
/*
* Legacy fanotify marks limits (8192) is per group and we introduced a tunable
@@ -114,6 +115,8 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
#define FANOTIFY_PIDFD_INFO_HDR_LEN \
sizeof(struct fanotify_event_info_pidfd)
+#define FANOTIFY_ERROR_INFO_LEN \
+ (sizeof(struct fanotify_event_info_error))
static int fanotify_fid_info_len(int fh_len, int name_len)
{
@@ -126,17 +129,26 @@ static int fanotify_fid_info_len(int fh_len, int name_len)
FANOTIFY_EVENT_ALIGN);
}
-static int fanotify_event_info_len(unsigned int info_mode,
- struct fanotify_event *event)
+static size_t fanotify_event_len(unsigned int info_mode,
+ struct fanotify_event *event)
{
- struct fanotify_info *info = fanotify_event_info(event);
- int dir_fh_len = fanotify_event_dir_fh_len(event);
- int fh_len = fanotify_event_object_fh_len(event);
- int info_len = 0;
+ size_t event_len = FAN_EVENT_METADATA_LEN;
+ struct fanotify_info *info;
+ int dir_fh_len;
+ int fh_len;
int dot_len = 0;
- if (dir_fh_len) {
- info_len += fanotify_fid_info_len(dir_fh_len, info->name_len);
+ if (!info_mode)
+ return event_len;
+
+ if (fanotify_is_error_event(event->mask))
+ event_len += FANOTIFY_ERROR_INFO_LEN;
+
+ info = fanotify_event_info(event);
+
+ if (fanotify_event_has_dir_fh(event)) {
+ dir_fh_len = fanotify_event_dir_fh_len(event);
+ event_len += fanotify_fid_info_len(dir_fh_len, info->name_len);
} else if ((info_mode & FAN_REPORT_NAME) &&
(event->mask & FAN_ONDIR)) {
/*
@@ -147,12 +159,14 @@ static int fanotify_event_info_len(unsigned int info_mode,
}
if (info_mode & FAN_REPORT_PIDFD)
- info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
+ event_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
- if (fh_len)
- info_len += fanotify_fid_info_len(fh_len, dot_len);
+ if (fanotify_event_has_object_fh(event)) {
+ fh_len = fanotify_event_object_fh_len(event);
+ event_len += fanotify_fid_info_len(fh_len, dot_len);
+ }
- return info_len;
+ return event_len;
}
/*
@@ -181,7 +195,7 @@ static void fanotify_unhash_event(struct fsnotify_group *group,
static struct fanotify_event *get_one_event(struct fsnotify_group *group,
size_t count)
{
- size_t event_size = FAN_EVENT_METADATA_LEN;
+ size_t event_size;
struct fanotify_event *event = NULL;
struct fsnotify_event *fsn_event;
unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
@@ -194,8 +208,7 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group,
goto out;
event = FANOTIFY_E(fsn_event);
- if (info_mode)
- event_size += fanotify_event_info_len(info_mode, event);
+ event_size = fanotify_event_len(info_mode, event);
if (event_size > count) {
event = ERR_PTR(-EINVAL);
@@ -316,6 +329,28 @@ static int process_access_response(struct fsnotify_group *group,
return -ENOENT;
}
+static size_t copy_error_info_to_user(struct fanotify_event *event,
+ char __user *buf, int count)
+{
+ struct fanotify_event_info_error info;
+ struct fanotify_error_event *fee = FANOTIFY_EE(event);
+
+ info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR;
+ info.hdr.pad = 0;
+ info.hdr.len = FANOTIFY_ERROR_INFO_LEN;
+
+ if (WARN_ON(count < info.hdr.len))
+ return -EFAULT;
+
+ info.error = fee->error;
+ info.error_count = fee->err_count;
+
+ if (copy_to_user(buf, &info, sizeof(info)))
+ return -EFAULT;
+
+ return info.hdr.len;
+}
+
static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
int info_type, const char *name,
size_t name_len,
@@ -331,9 +366,6 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
__func__, fh_len, name_len, info_len, count);
- if (!fh_len)
- return 0;
-
if (WARN_ON_ONCE(len < sizeof(info) || len > count))
return -EFAULT;
@@ -368,6 +400,11 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
handle.handle_type = fh->type;
handle.handle_bytes = fh_len;
+
+ /* Mangle handle_type for bad file_handle */
+ if (!fh_len)
+ handle.handle_type = FILEID_INVALID;
+
if (copy_to_user(buf, &handle, sizeof(handle)))
return -EFAULT;
@@ -444,7 +481,7 @@ static int copy_info_records_to_user(struct fanotify_event *event,
/*
* Event info records order is as follows: dir fid + name, child fid.
*/
- if (fanotify_event_dir_fh_len(event)) {
+ if (fanotify_event_has_dir_fh(event)) {
info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
FAN_EVENT_INFO_TYPE_DFID;
ret = copy_fid_info_to_user(fanotify_event_fsid(event),
@@ -460,7 +497,7 @@ static int copy_info_records_to_user(struct fanotify_event *event,
total_bytes += ret;
}
- if (fanotify_event_object_fh_len(event)) {
+ if (fanotify_event_has_object_fh(event)) {
const char *dot = NULL;
int dot_len = 0;
@@ -520,6 +557,15 @@ static int copy_info_records_to_user(struct fanotify_event *event,
total_bytes += ret;
}
+ if (fanotify_is_error_event(event->mask)) {
+ ret = copy_error_info_to_user(event, buf, count);
+ if (ret < 0)
+ return ret;
+ buf += ret;
+ count -= ret;
+ total_bytes += ret;
+ }
+
return total_bytes;
}
@@ -537,8 +583,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
- metadata.event_len = FAN_EVENT_METADATA_LEN +
- fanotify_event_info_len(info_mode, event);
+ metadata.event_len = fanotify_event_len(info_mode, event);
metadata.metadata_len = FAN_EVENT_METADATA_LEN;
metadata.vers = FANOTIFY_METADATA_VERSION;
metadata.reserved = 0;
@@ -1049,6 +1094,15 @@ out_dec_ucounts:
return ERR_PTR(ret);
}
+static int fanotify_group_init_error_pool(struct fsnotify_group *group)
+{
+ if (mempool_initialized(&group->fanotify_data.error_events_pool))
+ return 0;
+
+ return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool,
+ FANOTIFY_DEFAULT_FEE_POOL_SIZE,
+ sizeof(struct fanotify_error_event));
+}
static int fanotify_add_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp, unsigned int type,
@@ -1057,6 +1111,7 @@ static int fanotify_add_mark(struct fsnotify_group *group,
{
struct fsnotify_mark *fsn_mark;
__u32 added;
+ int ret = 0;
mutex_lock(&group->mark_mutex);
fsn_mark = fsnotify_find_mark(connp, group);
@@ -1067,13 +1122,26 @@ static int fanotify_add_mark(struct fsnotify_group *group,
return PTR_ERR(fsn_mark);
}
}
+
+ /*
+ * Error events are pre-allocated per group, only if strictly
+ * needed (i.e. FAN_FS_ERROR was requested).
+ */
+ if (!(flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) {
+ ret = fanotify_group_init_error_pool(group);
+ if (ret)
+ goto out;
+ }
+
added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
if (added & ~fsnotify_conn_mask(fsn_mark->connector))
fsnotify_recalc_mask(fsn_mark->connector);
+
+out:
mutex_unlock(&group->mark_mutex);
fsnotify_put_mark(fsn_mark);
- return 0;
+ return ret;
}
static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
@@ -1295,16 +1363,15 @@ out_destroy_group:
return fd;
}
-/* Check if filesystem can encode a unique fid */
-static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
+static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
{
__kernel_fsid_t root_fsid;
int err;
/*
- * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
+ * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse).
*/
- err = vfs_get_fsid(path->dentry, fsid);
+ err = vfs_get_fsid(dentry, fsid);
if (err)
return err;
@@ -1312,10 +1379,10 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
return -ENODEV;
/*
- * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
+ * Make sure dentry is not of a filesystem subvolume (e.g. btrfs)
* which uses a different fsid than sb root.
*/
- err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid);
+ err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid);
if (err)
return err;
@@ -1323,6 +1390,12 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
root_fsid.val[1] != fsid->val[1])
return -EXDEV;
+ return 0;
+}
+
+/* Check if filesystem can encode a unique fid */
+static int fanotify_test_fid(struct dentry *dentry)
+{
/*
* We need to make sure that the file system supports at least
* encoding a file handle so user can use name_to_handle_at() to
@@ -1330,8 +1403,8 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
* objects. However, name_to_handle_at() requires that the
* filesystem also supports decoding file handles.
*/
- if (!path->dentry->d_sb->s_export_op ||
- !path->dentry->d_sb->s_export_op->fh_to_dentry)
+ if (!dentry->d_sb->s_export_op ||
+ !dentry->d_sb->s_export_op->fh_to_dentry)
return -EOPNOTSUPP;
return 0;
@@ -1447,15 +1520,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
group->priority == FS_PRIO_0)
goto fput_and_out;
+ if (mask & FAN_FS_ERROR &&
+ mark_type != FAN_MARK_FILESYSTEM)
+ goto fput_and_out;
+
/*
- * Events with data type inode do not carry enough information to report
- * event->fd, so we do not allow setting a mask for inode events unless
- * group supports reporting fid.
- * inode events are not supported on a mount mark, because they do not
- * carry enough information (i.e. path) to be filtered by mount point.
+ * Events that do not carry enough information to report
+ * event->fd require a group that supports reporting fid. Those
+ * events are not supported on a mount mark, because they do not
+ * carry enough information (i.e. path) to be filtered by mount
+ * point.
*/
fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
- if (mask & FANOTIFY_INODE_EVENTS &&
+ if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) &&
(!fid_mode || mark_type == FAN_MARK_MOUNT))
goto fput_and_out;
@@ -1482,7 +1559,11 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
}
if (fid_mode) {
- ret = fanotify_test_fid(&path, &__fsid);
+ ret = fanotify_test_fsid(path.dentry, &__fsid);
+ if (ret)
+ goto path_put_and_out;
+
+ ret = fanotify_test_fid(path.dentry);
if (ret)
goto path_put_and_out;
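Taken together, the fanotify changes above introduce the FAN_FS_ERROR event type: error events are preallocated from a per-group mempool, merged per fsid with a suppressed-error count, reported only for filesystem-wide marks, and delivered to userspace as a FAN_EVENT_INFO_TYPE_ERROR info record. A hedged userspace sketch of a listener (not part of the patch; assumes headers carrying the 5.16 fanotify uapi, and "/mnt" is a placeholder mount point):

/*
 * Userspace sketch, not from this patch.  Requires <sys/fanotify.h>
 * with the 5.16 uapi (FAN_FS_ERROR, FAN_EVENT_INFO_TYPE_ERROR,
 * struct fanotify_event_info_error); "/mnt" is a placeholder mount.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/fanotify.h>

int main(void)
{
	static char buf[4096] __attribute__((aligned(8)));
	int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, 0);

	/* FAN_FS_ERROR is only valid with a filesystem-wide mark. */
	if (fd < 0 || fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
				    FAN_FS_ERROR, AT_FDCWD, "/mnt") < 0) {
		perror("fanotify");
		return 1;
	}

	for (;;) {
		ssize_t len = read(fd, buf, sizeof(buf));
		char *p = buf;

		if (len <= 0)
			break;
		while (p < buf + len) {
			struct fanotify_event_metadata *md = (void *)p;
			char *info = p + md->metadata_len;

			/* Info records sit between metadata_len and event_len. */
			while (info < p + md->event_len) {
				struct fanotify_event_info_header *hdr =
					(void *)info;

				if (hdr->info_type == FAN_EVENT_INFO_TYPE_ERROR) {
					struct fanotify_event_info_error *err =
						(void *)hdr;

					printf("fs error %d (%u merged)\n",
					       err->error, err->error_count);
				}
				info += hdr->len;
			}
			p += md->event_len;
		}
	}
	close(fd);
	return 0;
}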
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 963e6ce75b96..4034ca566f95 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -252,6 +252,9 @@ static int fsnotify_handle_inode_event(struct fsnotify_group *group,
if (WARN_ON_ONCE(!ops->handle_inode_event))
return 0;
+ if (WARN_ON_ONCE(!inode && !dir))
+ return 0;
+
if ((inode_mark->mask & FS_EXCL_UNLINK) &&
path && d_unlinked(path->dentry))
return 0;
@@ -455,16 +458,16 @@ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info)
* @file_name is relative to
* @file_name: optional file name associated with event
* @inode: optional inode associated with event -
- * either @dir or @inode must be non-NULL.
- * if both are non-NULL event may be reported to both.
+ * If @dir and @inode are both non-NULL, event may be
+ * reported to both.
* @cookie: inotify rename cookie
*/
int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
const struct qstr *file_name, struct inode *inode, u32 cookie)
{
const struct path *path = fsnotify_data_path(data, data_type);
+ struct super_block *sb = fsnotify_data_sb(data, data_type);
struct fsnotify_iter_info iter_info = {};
- struct super_block *sb;
struct mount *mnt = NULL;
struct inode *parent = NULL;
int ret = 0;
@@ -483,7 +486,6 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
*/
parent = dir;
}
- sb = inode->i_sb;
/*
* Optimization: srcu_read_lock() has a memory barrier which can
diff --git a/fs/notify/group.c b/fs/notify/group.c
index fb89c351295d..6a297efc4788 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -88,7 +88,7 @@ void fsnotify_destroy_group(struct fsnotify_group *group)
* that deliberately ignores overflow events.
*/
if (group->overflow_event)
- group->ops->free_event(group->overflow_event);
+ group->ops->free_event(group, group->overflow_event);
fsnotify_put_group(group);
}
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index d1a64daa0171..d92d7b0adc9a 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -116,7 +116,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
if (len)
strcpy(event->name, name->name);
- ret = fsnotify_add_event(group, fsn_event, inotify_merge, NULL);
+ ret = fsnotify_add_event(group, fsn_event, inotify_merge);
if (ret) {
/* Our event wasn't used in the end. Free it. */
fsnotify_destroy_event(group, fsn_event);
@@ -177,7 +177,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
dec_inotify_instances(group->inotify_data.ucounts);
}
-static void inotify_free_event(struct fsnotify_event *fsn_event)
+static void inotify_free_event(struct fsnotify_group *group,
+ struct fsnotify_event *fsn_event)
{
kfree(INOTIFY_E(fsn_event));
}
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 62051247f6d2..29fca3284bb5 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -94,10 +94,10 @@ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg)
__u32 mask;
/*
- * Everything should accept their own ignored and should receive events
- * when the inode is unmounted. All directories care about children.
+ * Everything should receive events when the inode is unmounted.
+ * All directories care about children.
*/
- mask = (FS_IN_IGNORED | FS_UNMOUNT);
+ mask = (FS_UNMOUNT);
if (S_ISDIR(inode->i_mode))
mask |= FS_EVENT_ON_CHILD;
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 32f45543b9c6..9022ae650cf8 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -64,7 +64,7 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
WARN_ON(!list_empty(&event->list));
spin_unlock(&group->notification_lock);
}
- group->ops->free_event(event);
+ group->ops->free_event(group, event);
}
/*
@@ -78,12 +78,12 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
* 2 if the event was not queued - either the queue of events has overflown
* or the group is shutting down.
*/
-int fsnotify_add_event(struct fsnotify_group *group,
- struct fsnotify_event *event,
- int (*merge)(struct fsnotify_group *,
- struct fsnotify_event *),
- void (*insert)(struct fsnotify_group *,
- struct fsnotify_event *))
+int fsnotify_insert_event(struct fsnotify_group *group,
+ struct fsnotify_event *event,
+ int (*merge)(struct fsnotify_group *,
+ struct fsnotify_event *),
+ void (*insert)(struct fsnotify_group *,
+ struct fsnotify_event *))
{
int ret = 0;
struct list_head *list = &group->notification_list;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index ab4f3362466d..2ae25e48a41a 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -5,6 +5,7 @@
* Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
*/
+#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/gfp.h>
@@ -1829,7 +1830,7 @@ again:
* pages being swapped out between us bringing them into memory
* and doing the actual copying.
*/
- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
status = -EFAULT;
break;
}
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 0d7e948cb29c..5ae8de09b271 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2772,13 +2772,12 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
ntfs_debug("Set device block size to %i bytes (block size bits %i).",
blocksize, sb->s_blocksize_bits);
/* Determine the size of the device in units of block_size bytes. */
- if (!i_size_read(sb->s_bdev->bd_inode)) {
+ vol->nr_blocks = sb_bdev_nr_blocks(sb);
+ if (!vol->nr_blocks) {
if (!silent)
ntfs_error(sb, "Unable to determine device size.");
goto err_out_now;
}
- vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits;
/* Read the boot sector and return unlocked buffer head to it. */
if (!(bh = read_ntfs_boot_sector(sb, silent))) {
if (!silent)
@@ -2816,8 +2815,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
goto err_out_now;
}
BUG_ON(blocksize != sb->s_blocksize);
- vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits;
+ vol->nr_blocks = sb_bdev_nr_blocks(sb);
ntfs_debug("Changed device block size to %i bytes (block size "
"bits %i) to match volume sector size.",
blocksize, sb->s_blocksize_bits);
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 43b1451bff53..787b53b984ee 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -8,6 +8,7 @@
*/
#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/compat.h>
#include <linux/falloc.h>
@@ -989,7 +990,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
frame_vbo = pos & ~(frame_size - 1);
index = frame_vbo >> PAGE_SHIFT;
- if (unlikely(iov_iter_fault_in_readable(from, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(from, bytes))) {
err = -EFAULT;
goto out;
}
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 859951d785cb..a87ab3ad3cd3 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -1046,7 +1046,7 @@ int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
if (!ret && i2)
ret = writeback_inode(i2);
if (!ret)
- ret = filemap_flush(sb->s_bdev->bd_inode->i_mapping);
+ ret = sync_blockdev_nowait(sb->s_bdev);
return ret;
}
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index d41d76979e12..29813200c7af 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -921,7 +921,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
/* Parse boot. */
err = ntfs_init_from_boot(sb, rq ? queue_logical_block_size(rq) : 512,
- bdev->bd_inode->i_size);
+ bdev_nr_bytes(bdev));
if (err)
goto out;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 5d9ae17bd443..bb247bc349e4 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5940,6 +5940,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
+ ocfs2_commit_trans(osb, handle);
mlog_errno(status);
goto bail;
}
@@ -5964,6 +5965,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
data_alloc_bh, start_blk,
num_clusters);
if (status < 0) {
+ ocfs2_commit_trans(osb, handle);
mlog_errno(status);
goto bail;
}
@@ -6921,13 +6923,12 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
}
/*
- * Zero the area past i_size but still within an allocated
- * cluster. This avoids exposing nonzero data on subsequent file
- * extends.
+ * Zero partial cluster for a hole punch or truncate. This avoids exposing
+ * nonzero data on subsequent file extends.
*
* We need to call this before i_size is updated on the inode because
* otherwise block_write_full_page() will skip writeout of pages past
- * i_size. The new_i_size parameter is passed for this reason.
+ * i_size.
*/
int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
u64 range_start, u64 range_end)
@@ -6945,6 +6946,15 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
return 0;
+ /*
+ * Avoid zeroing pages fully beyond current i_size. It is pointless as
+ * underlying blocks of those pages should be already zeroed out and
+ * page writeback will skip them anyway.
+ */
+ range_end = min_t(u64, range_end, i_size_read(inode));
+ if (range_start >= range_end)
+ return 0;
+
pages = kcalloc(ocfs2_pages_per_cluster(sb),
sizeof(struct page *), GFP_NOFS);
if (pages == NULL) {
@@ -6953,9 +6963,6 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
goto out;
}
- if (range_start == range_end)
- goto out;
-
ret = ocfs2_extent_map_get_blocks(inode,
range_start >> sb->s_blocksize_bits,
&phys, NULL, &ext_flags);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 0e7aad1b11cc..5cd5f7511dac 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2698,7 +2698,6 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
continue;
}
retry:
- ret = -EINVAL;
mlog(0, "attempting to send begin reco msg to %d\n",
nodenum);
ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 54d7843c0211..fc5f780fa235 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -476,10 +476,11 @@ int ocfs2_truncate_file(struct inode *inode,
* greater than page size, so we have to truncate them
* anyway.
*/
- unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(inode->i_mapping, new_i_size);
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ unmap_mapping_range(inode->i_mapping,
+ new_i_size + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(inode->i_mapping, new_i_size);
status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
i_size_read(inode), 1);
if (status)
@@ -498,6 +499,9 @@ int ocfs2_truncate_file(struct inode *inode,
goto bail_unlock_sem;
}
+ unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(inode->i_mapping, new_i_size);
+
status = ocfs2_commit_truncate(osb, inode, di_bh);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index bc8f32fab964..6c2411c2afcf 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -125,7 +125,6 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
struct inode *inode = NULL;
struct super_block *sb = osb->sb;
struct ocfs2_find_inode_args args;
- journal_t *journal = OCFS2_SB(sb)->journal->j_journal;
trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
sysfile_type);
@@ -172,10 +171,11 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
* part of the transaction - the inode could have been reclaimed and
* now it is reread from disk.
*/
- if (journal) {
+ if (osb->journal) {
transaction_t *transaction;
tid_t tid;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ journal_t *journal = osb->journal->j_journal;
read_lock(&journal->j_state_lock);
if (journal->j_running_transaction)
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 4f15750aac5d..dbf9b9e97d74 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -810,19 +810,34 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
write_unlock(&journal->j_state_lock);
}
-int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
+int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
{
int status = -1;
struct inode *inode = NULL; /* the journal inode */
journal_t *j_journal = NULL;
+ struct ocfs2_journal *journal = NULL;
struct ocfs2_dinode *di = NULL;
struct buffer_head *bh = NULL;
- struct ocfs2_super *osb;
int inode_lock = 0;
- BUG_ON(!journal);
+ /* initialize our journal structure */
+ journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
+ if (!journal) {
+ mlog(ML_ERROR, "unable to alloc journal\n");
+ status = -ENOMEM;
+ goto done;
+ }
+ osb->journal = journal;
+ journal->j_osb = osb;
- osb = journal->j_osb;
+ atomic_set(&journal->j_num_trans, 0);
+ init_rwsem(&journal->j_trans_barrier);
+ init_waitqueue_head(&journal->j_checkpointed);
+ spin_lock_init(&journal->j_lock);
+ journal->j_trans_id = 1UL;
+ INIT_LIST_HEAD(&journal->j_la_cleanups);
+ INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
+ journal->j_state = OCFS2_JOURNAL_FREE;
/* already have the inode for our journal */
inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
@@ -1028,9 +1043,10 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
journal->j_state = OCFS2_JOURNAL_FREE;
-// up_write(&journal->j_trans_barrier);
done:
iput(inode);
+ kfree(journal);
+ osb->journal = NULL;
}
static void ocfs2_clear_journal_error(struct super_block *sb,
@@ -1497,10 +1513,7 @@ bail:
if (quota_enabled)
kfree(rm_quota);
- /* no one is callint kthread_stop() for us so the kthread() api
- * requires that we call do_exit(). And it isn't exported, but
- * complete_and_exit() seems to be a minimal wrapper around it. */
- complete_and_exit(NULL, status);
+ return status;
}
void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d158acb8b38a..8dcb2f2cadbc 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -167,8 +167,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
* ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
*/
void ocfs2_set_journal_params(struct ocfs2_super *osb);
-int ocfs2_journal_init(struct ocfs2_journal *journal,
- int *dirty);
+int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty);
void ocfs2_journal_shutdown(struct ocfs2_super *osb);
int ocfs2_journal_wipe(struct ocfs2_journal *journal,
int full);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 5c914ce9b3ac..1286b88b6fa1 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1894,8 +1894,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
/* This will disable recovery and flush any recovery work. */
ocfs2_recovery_exit(osb);
- ocfs2_journal_shutdown(osb);
-
ocfs2_sync_blockdev(sb);
ocfs2_purge_refcount_trees(osb);
@@ -1918,6 +1916,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
ocfs2_release_system_inodes(osb);
+ ocfs2_journal_shutdown(osb);
+
/*
* If we're dismounting due to mount error, mount.ocfs2 will clean
* up heartbeat. If we're a local mount, there is no heartbeat.
@@ -2016,7 +2016,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
int i, cbits, bbits;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
struct inode *inode = NULL;
- struct ocfs2_journal *journal;
struct ocfs2_super *osb;
u64 total_blocks;
@@ -2197,33 +2196,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
get_random_bytes(&osb->s_next_generation, sizeof(u32));
- /* FIXME
- * This should be done in ocfs2_journal_init(), but unknown
- * ordering issues will cause the filesystem to crash.
- * If anyone wants to figure out what part of the code
- * refers to osb->journal before ocfs2_journal_init() is run,
- * be my guest.
- */
- /* initialize our journal structure */
-
- journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
- if (!journal) {
- mlog(ML_ERROR, "unable to alloc journal\n");
- status = -ENOMEM;
- goto bail;
- }
- osb->journal = journal;
- journal->j_osb = osb;
-
- atomic_set(&journal->j_num_trans, 0);
- init_rwsem(&journal->j_trans_barrier);
- init_waitqueue_head(&journal->j_checkpointed);
- spin_lock_init(&journal->j_lock);
- journal->j_trans_id = (unsigned long) 1;
- INIT_LIST_HEAD(&journal->j_la_cleanups);
- INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
- journal->j_state = OCFS2_JOURNAL_FREE;
-
INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
init_llist_head(&osb->dquot_drop_list);
@@ -2404,7 +2376,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
* ourselves. */
/* Init our journal object. */
- status = ocfs2_journal_init(osb->journal, &dirty);
+ status = ocfs2_journal_init(osb, &dirty);
if (status < 0) {
mlog(ML_ERROR, "Could not initialize journal!\n");
goto finally;
@@ -2513,12 +2485,6 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
kfree(osb->osb_orphan_wipes);
kfree(osb->slot_recovery_generations);
- /* FIXME
- * This belongs in journal shutdown, but because we have to
- * allocate osb->journal at the start of ocfs2_initialize_osb(),
- * we free it here.
- */
- kfree(osb->journal);
kfree(osb->local_alloc_copy);
kfree(osb->uuid_str);
kfree(osb->vol_label);
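
The ocfs2 hunks above move the journal allocation out of ocfs2_initialize_super() and into ocfs2_journal_init(), and pair the kfree() with ocfs2_journal_shutdown() instead of ocfs2_delete_osb(). Below is a minimal userspace sketch of that ownership pattern, where the init routine allocates and publishes its own context and the matching shutdown routine frees it; the names are illustrative, not the ocfs2 symbols.

#include <stdio.h>
#include <stdlib.h>

struct journal { unsigned long trans_id; int state; };
struct super   { struct journal *journal; };

static int journal_init(struct super *sb)
{
	struct journal *j = calloc(1, sizeof(*j));

	if (!j)
		return -1;		/* -ENOMEM in the kernel version */
	j->trans_id = 1;
	j->state = 0;			/* "journal free" analogue */
	sb->journal = j;		/* publish only after full init */
	return 0;
}

static void journal_shutdown(struct super *sb)
{
	free(sb->journal);		/* the owner frees what it allocated */
	sb->journal = NULL;
}

int main(void)
{
	struct super sb = { 0 };

	if (journal_init(&sb))
		return 1;
	journal_shutdown(&sb);
	puts("journal lifetime paired: init allocates, shutdown frees");
	return 0;
}
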
diff --git a/fs/open.c b/fs/open.c
index daa324606a41..f732fb94600c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -856,8 +856,20 @@ static int do_dentry_open(struct file *f,
* of THPs into the page cache will fail.
*/
smp_mb();
- if (filemap_nr_thps(inode->i_mapping))
- truncate_pagecache(inode, 0);
+ if (filemap_nr_thps(inode->i_mapping)) {
+ struct address_space *mapping = inode->i_mapping;
+
+ filemap_invalidate_lock(inode->i_mapping);
+ /*
+ * unmap_mapping_range() only needs to be called once
+ * here, because private pages do not need to be
+ * unmapped (e.g. the data segment of dynamically
+ * linked shared libraries).
+ */
+ unmap_mapping_range(mapping, 0, 0, 0);
+ truncate_inode_pages(mapping, 0);
+ filemap_invalidate_unlock(inode->i_mapping);
+ }
}
return 0;
@@ -1248,6 +1260,8 @@ SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename,
if (err)
return err;
+ audit_openat2_how(&tmp);
+
/* O_LARGEFILE is only allowed for non-O_PATH. */
if (!(tmp.flags & O_PATH) && force_o_largefile())
tmp.flags |= O_LARGEFILE;
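
The do_dentry_open() hunk replaces the bare truncate_pagecache() with a sequence that takes the mapping's invalidate lock, unmaps shared mappings once, truncates the cached pages, and unlocks. The following is a rough userspace sketch of that lock-then-invalidate ordering, with a plain mutex standing in for filemap_invalidate_lock() and stub functions in place of the VFS calls; none of these names are the real API.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t invalidate_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-ins for unmap_mapping_range() and truncate_inode_pages(). */
static void unmap_all_mappings(void) { puts("unmap shared mappings once"); }
static void truncate_pages(void)     { puts("drop cached pages"); }

static void invalidate_mapping(void)
{
	/* Hold the lock across both steps so nothing can fault pages back
	 * in between the unmap and the truncate. */
	pthread_mutex_lock(&invalidate_lock);
	unmap_all_mappings();
	truncate_pages();
	pthread_mutex_unlock(&invalidate_lock);
}

int main(void)
{
	invalidate_mapping();
	return 0;
}
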
diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c
index fe484cf93e5c..8bbe9486e3a6 100644
--- a/fs/orangefs/dcache.c
+++ b/fs/orangefs/dcache.c
@@ -26,8 +26,10 @@ static int orangefs_revalidate_lookup(struct dentry *dentry)
gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: attempting lookup.\n", __func__);
new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP);
- if (!new_op)
+ if (!new_op) {
+ ret = -ENOMEM;
goto out_put_parent;
+ }
new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW;
new_op->upcall.req.lookup.parent_refn = parent->refn;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index c1bb4c4b5d67..e5e3e500ed46 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -10,7 +10,7 @@
* Linux VFS inode operations.
*/
-#include <linux/bvec.h>
+#include <linux/blkdev.h>
#include <linux/fileattr.h>
#include "protocol.h"
#include "orangefs-kernel.h"
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 2f2e430461b2..d90d8addbfc2 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -11,6 +11,7 @@
#include <linux/parser.h>
#include <linux/hashtable.h>
+#include <linux/seq_file.h>
/* a cache for orangefs-inode objects (i.e. orangefs inode private data) */
static struct kmem_cache *orangefs_inode_cache;
@@ -475,7 +476,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
const char *devname,
void *data)
{
- int ret = -EINVAL;
+ int ret;
struct super_block *sb = ERR_PTR(-EINVAL);
struct orangefs_kernel_op_s *new_op;
struct dentry *d = ERR_PTR(-EINVAL);
@@ -526,7 +527,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL);
if (!ORANGEFS_SB(sb)) {
d = ERR_PTR(-ENOMEM);
- goto free_op;
+ goto free_sb_and_op;
}
ret = orangefs_fill_sb(sb,
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 4e7d5bfa2949..b193d08a3dc3 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -140,12 +140,14 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old,
int err;
err = ovl_real_fileattr_get(old, &oldfa);
- if (err)
- return err;
-
- err = ovl_real_fileattr_get(new, &newfa);
- if (err)
+ if (err) {
+ /* Ntfs-3g returns -EINVAL for "no fileattr support" */
+ if (err == -ENOTTY || err == -EINVAL)
+ return 0;
+ pr_warn("failed to retrieve lower fileattr (%pd2, err=%i)\n",
+ old, err);
return err;
+ }
/*
* We cannot set immutable and append-only flags on upper inode,
@@ -159,6 +161,17 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old,
return err;
}
+ /* Don't bother copying flags if none are set */
+ if (!(oldfa.flags & OVL_COPY_FS_FLAGS_MASK))
+ return 0;
+
+ err = ovl_real_fileattr_get(new, &newfa);
+ if (err) {
+ pr_warn("failed to retrieve upper fileattr (%pd2, err=%i)\n",
+ new, err);
+ return err;
+ }
+
BUILD_BUG_ON(OVL_COPY_FS_FLAGS_MASK & ~FS_COMMON_FL);
newfa.flags &= ~OVL_COPY_FS_FLAGS_MASK;
newfa.flags |= (oldfa.flags & OVL_COPY_FS_FLAGS_MASK);
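
The copy-up hunk above stops failing the whole operation when the lower filesystem simply has no fileattr support (-ENOTTY, or -EINVAL from ntfs-3g) and only fetches the upper attributes once there is actually something to copy. A standalone sketch of that "treat unsupported as empty" pattern follows; the getter is a stub and the mask is illustrative, not vfs_fileattr_get() or OVL_COPY_FS_FLAGS_MASK.

#include <errno.h>
#include <stdio.h>

#define COPY_FLAGS_MASK 0x7u	/* illustrative mask */

/* Stub getter: pretend the lower filesystem has no fileattr support. */
static int get_lower_flags(unsigned int *flags)
{
	(void)flags;
	return -ENOTTY;
}

static int copy_flags(unsigned int *upper_flags)
{
	unsigned int lower = 0;
	int err = get_lower_flags(&lower);

	if (err) {
		/* No fileattr support on the source: nothing to copy. */
		if (err == -ENOTTY || err == -EINVAL)
			return 0;
		fprintf(stderr, "failed to read lower flags: %d\n", err);
		return err;
	}
	if (!(lower & COPY_FLAGS_MASK))
		return 0;	/* nothing set, skip touching the upper file */
	*upper_flags = (*upper_flags & ~COPY_FLAGS_MASK) |
		       (lower & COPY_FLAGS_MASK);
	return 0;
}

int main(void)
{
	unsigned int upper = 0;

	printf("copy_flags() = %d\n", copy_flags(&upper));
	return 0;
}
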
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 93c7c267de93..f18490813170 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -137,8 +137,7 @@ kill_whiteout:
goto out;
}
-static int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry,
- umode_t mode)
+int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode)
{
int err;
struct dentry *d, *dentry = *newdentry;
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index c88ac571593d..fa125feed0ff 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -17,6 +17,7 @@
struct ovl_aio_req {
struct kiocb iocb;
+ refcount_t ref;
struct kiocb *orig_iocb;
struct fd fd;
};
@@ -252,6 +253,14 @@ static rwf_t ovl_iocb_to_rwf(int ifl)
return flags;
}
+static inline void ovl_aio_put(struct ovl_aio_req *aio_req)
+{
+ if (refcount_dec_and_test(&aio_req->ref)) {
+ fdput(aio_req->fd);
+ kmem_cache_free(ovl_aio_request_cachep, aio_req);
+ }
+}
+
static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
{
struct kiocb *iocb = &aio_req->iocb;
@@ -268,18 +277,17 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
}
orig_iocb->ki_pos = iocb->ki_pos;
- fdput(aio_req->fd);
- kmem_cache_free(ovl_aio_request_cachep, aio_req);
+ ovl_aio_put(aio_req);
}
-static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2)
+static void ovl_aio_rw_complete(struct kiocb *iocb, long res)
{
struct ovl_aio_req *aio_req = container_of(iocb,
struct ovl_aio_req, iocb);
struct kiocb *orig_iocb = aio_req->orig_iocb;
ovl_aio_cleanup_handler(aio_req);
- orig_iocb->ki_complete(orig_iocb, res, res2);
+ orig_iocb->ki_complete(orig_iocb, res);
}
static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
@@ -319,7 +327,9 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
aio_req->orig_iocb = iocb;
kiocb_clone(&aio_req->iocb, iocb, real.file);
aio_req->iocb.ki_complete = ovl_aio_rw_complete;
+ refcount_set(&aio_req->ref, 2);
ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
+ ovl_aio_put(aio_req);
if (ret != -EIOCBQUEUED)
ovl_aio_cleanup_handler(aio_req);
}
@@ -390,7 +400,9 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
kiocb_clone(&aio_req->iocb, iocb, real.file);
aio_req->iocb.ki_flags = ifl;
aio_req->iocb.ki_complete = ovl_aio_rw_complete;
+ refcount_set(&aio_req->ref, 2);
ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
+ ovl_aio_put(aio_req);
if (ret != -EIOCBQUEUED)
ovl_aio_cleanup_handler(aio_req);
}
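
Both ovl_read_iter() and ovl_write_iter() now take two references on the aio request before submission: one is dropped by the submitter right after vfs_iocb_iter_read()/vfs_iocb_iter_write() returns, the other by the completion handler, so whichever side runs last frees the request. Here is a compilable userspace sketch of that pattern using C11 atomics and a thread playing the role of the completion; everything in it is illustrative, not the overlayfs code.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct aio_req {
	atomic_int ref;
	long result;
};

static void aio_put(struct aio_req *req)
{
	/* The last reference frees the request, whoever drops it. */
	if (atomic_fetch_sub(&req->ref, 1) == 1) {
		printf("freeing request, result=%ld\n", req->result);
		free(req);
	}
}

/* Simulated asynchronous completion (plays the role of ki_complete). */
static void *completion(void *arg)
{
	struct aio_req *req = arg;

	req->result = 42;
	aio_put(req);
	return NULL;
}

int main(void)
{
	struct aio_req *req = malloc(sizeof(*req));
	pthread_t tid;

	if (!req)
		return 1;
	atomic_init(&req->ref, 2);	/* one for submitter, one for completion */
	pthread_create(&tid, NULL, completion, req);
	aio_put(req);			/* submitter's reference */
	pthread_join(tid, NULL);
	return 0;
}
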
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 832b17589733..1f36158c7dbe 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -610,7 +610,10 @@ int ovl_real_fileattr_get(struct path *realpath, struct fileattr *fa)
if (err)
return err;
- return vfs_fileattr_get(realpath->dentry, fa);
+ err = vfs_fileattr_get(realpath->dentry, fa);
+ if (err == -ENOIOCTLCMD)
+ err = -ENOTTY;
+ return err;
}
int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa)
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 3894f3347955..2cd5741c873b 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -570,6 +570,7 @@ struct ovl_cattr {
#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) })
+int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode);
struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct ovl_cattr *attr);
int ovl_cleanup(struct inode *dir, struct dentry *dentry);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 178daa5e82c9..265181c110ae 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -787,10 +787,14 @@ retry:
goto retry;
}
- work = ovl_create_real(dir, work, OVL_CATTR(attr.ia_mode));
- err = PTR_ERR(work);
- if (IS_ERR(work))
- goto out_err;
+ err = ovl_mkdir_real(dir, &work, attr.ia_mode);
+ if (err)
+ goto out_dput;
+
+ /* Weird filesystem returning with hashed negative (kernfs)? */
+ err = -EINVAL;
+ if (d_really_is_negative(work))
+ goto out_dput;
/*
* Try to remove POSIX ACL xattrs from workdir. We are good if:
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index f5c25f580dd9..9323a854a60a 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -134,8 +134,7 @@ struct posix_acl *get_acl(struct inode *inode, int type)
* to just call ->get_acl to fetch the ACL ourself. (This is going to
* be an unlikely race.)
*/
- if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED)
- /* fall through */ ;
+ cmpxchg(p, ACL_NOT_CACHED, sentinel);
/*
* Normally, the ACL returned by ->get_acl will be cached.
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 49be8c8ef555..ff869a66b34e 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -408,9 +408,9 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
cpumask_pr_args(&task->cpus_mask));
}
-static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
+static inline void task_core_dumping(struct seq_file *m, struct task_struct *task)
{
- seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state);
+ seq_put_decimal_ull(m, "CoreDumping:\t", !!task->signal->core_state);
seq_putc(m, '\n');
}
@@ -436,7 +436,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
if (mm) {
task_mem(m, mm);
- task_core_dumping(m, mm);
+ task_core_dumping(m, task);
task_thp_status(m, mm);
mmput(mm);
}
@@ -541,7 +541,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
}
if (permitted && (!whole || num_threads < 2))
- wchan = get_wchan(task);
+ wchan = !task_is_running(task);
if (!whole) {
min_flt = task->min_flt;
maj_flt = task->maj_flt;
@@ -606,10 +606,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
*
* This works with older implementations of procps as well.
*/
- if (wchan)
- seq_puts(m, " 1");
- else
- seq_puts(m, " 0");
+ seq_put_decimal_ull(m, " ", wchan);
seq_put_decimal_ull(m, " ", 0);
seq_put_decimal_ull(m, " ", 0);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 533d5836eb9a..13eda8de2998 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -67,6 +67,7 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/rcupdate.h>
+#include <linux/kallsyms.h>
#include <linux/stacktrace.h>
#include <linux/resource.h>
#include <linux/module.h>
@@ -386,17 +387,19 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
unsigned long wchan;
+ char symname[KSYM_NAME_LEN];
- if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
- wchan = get_wchan(task);
- else
- wchan = 0;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+ goto print0;
- if (wchan)
- seq_printf(m, "%ps", (void *) wchan);
- else
- seq_putc(m, '0');
+ wchan = get_wchan(task);
+ if (wchan && !lookup_symbol_name(wchan, symname)) {
+ seq_puts(m, symname);
+ return 0;
+ }
+print0:
+ seq_putc(m, '0');
return 0;
}
#endif /* CONFIG_KALLSYMS */
@@ -1979,19 +1982,21 @@ static int pid_revalidate(struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
struct task_struct *task;
+ int ret = 0;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
- inode = d_inode(dentry);
- task = get_proc_task(inode);
+ rcu_read_lock();
+ inode = d_inode_rcu(dentry);
+ if (!inode)
+ goto out;
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task) {
pid_update_inode(task, inode);
- put_task_struct(task);
- return 1;
+ ret = 1;
}
- return 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
static inline bool proc_inode_is_dead(struct inode *inode)
@@ -3799,7 +3804,10 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
task = next_tid(task), ctx->pos++) {
char name[10 + 1];
unsigned int len;
+
tid = task_pid_nr_ns(task, ns);
+ if (!tid)
+ continue; /* The task has just exited. */
len = snprintf(name, sizeof(name), "%u", tid);
if (!proc_fill_cache(file, ctx, name, len,
proc_task_instantiate, task, NULL)) {
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 6561a06ef905..4fb8729a68d4 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -24,7 +24,7 @@
#ifdef arch_idle_time
-static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
+u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle;
@@ -46,7 +46,7 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
#else
-static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
+u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle, idle_usecs = -1ULL;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index cf25be3e0321..ad667dbc96f5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -397,7 +397,6 @@ struct mem_size_stats {
u64 pss_shmem;
u64 pss_locked;
u64 swap_pss;
- bool check_shmem_swap;
};
static void smaps_page_accumulate(struct mem_size_stats *mss,
@@ -478,9 +477,11 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end,
__always_unused int depth, struct mm_walk *walk)
{
struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = walk->vma;
- mss->swap += shmem_partial_swap_usage(
- walk->vma->vm_file->f_mapping, addr, end);
+ mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping,
+ linear_page_index(vma, addr),
+ linear_page_index(vma, end));
return 0;
}
@@ -488,6 +489,16 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end,
#define smaps_pte_hole NULL
#endif /* CONFIG_SHMEM */
+static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk)
+{
+#ifdef CONFIG_SHMEM
+ if (walk->ops->pte_hole) {
+ /* depth is not used */
+ smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk);
+ }
+#endif
+}
+
static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct mm_walk *walk)
{
@@ -516,12 +527,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
}
} else if (is_pfn_swap_entry(swpent))
page = pfn_swap_entry_to_page(swpent);
- } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
- && pte_none(*pte))) {
- page = xa_load(&vma->vm_file->f_mapping->i_pages,
- linear_page_index(vma, addr));
- if (xa_is_value(page))
- mss->swap += PAGE_SIZE;
+ } else {
+ smaps_pte_hole_lookup(addr, walk);
return;
}
@@ -735,8 +742,6 @@ static void smap_gather_stats(struct vm_area_struct *vma,
return;
#ifdef CONFIG_SHMEM
- /* In case of smaps_rollup, reset the value from previous vma */
- mss->check_shmem_swap = false;
if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
/*
* For shared or readonly shmem mappings we know that all
@@ -754,7 +759,6 @@ static void smap_gather_stats(struct vm_area_struct *vma,
!(vma->vm_flags & VM_WRITE))) {
mss->swap += shmem_swapped;
} else {
- mss->check_shmem_swap = true;
ops = &smaps_shmem_walk_ops;
}
}
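
The smaps hunks switch shmem_partial_swap_usage() from raw virtual addresses to file page indices via linear_page_index(), i.e. the offset of the address inside the VMA (in pages) plus the VMA's starting file offset. A tiny userspace sketch of that calculation for an ordinary (non-hugetlb) mapping is shown below; the page size and the cut-down vma struct are simplified stand-ins.

#include <stdio.h>

#define PAGE_SHIFT 12UL			/* assume 4 KiB pages for the example */

struct vma {
	unsigned long vm_start;		/* first mapped virtual address */
	unsigned long vm_pgoff;		/* file offset of vm_start, in pages */
};

/* Same arithmetic as the kernel's linear_page_index() for ordinary VMAs:
 * pages into the mapping plus the mapping's starting page offset. */
static unsigned long linear_page_index(const struct vma *vma, unsigned long addr)
{
	return ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
}

int main(void)
{
	struct vma vma = { .vm_start = 0x7f0000000000UL, .vm_pgoff = 16 };
	unsigned long addr = vma.vm_start + 3 * (1UL << PAGE_SHIFT);

	/* 3 pages into the VMA, which itself starts at file page 16 -> 19 */
	printf("page index = %lu\n", linear_page_index(&vma, addr));
	return 0;
}
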
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 5a1b228964fb..deb99bc9b7e6 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -12,18 +12,22 @@ static int uptime_proc_show(struct seq_file *m, void *v)
{
struct timespec64 uptime;
struct timespec64 idle;
- u64 nsec;
+ u64 idle_nsec;
u32 rem;
int i;
- nsec = 0;
- for_each_possible_cpu(i)
- nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
+ idle_nsec = 0;
+ for_each_possible_cpu(i) {
+ struct kernel_cpustat kcs;
+
+ kcpustat_cpu_fetch(&kcs, i);
+ idle_nsec += get_idle_time(&kcs, i);
+ }
ktime_get_boottime_ts64(&uptime);
timens_add_boottime(&uptime);
- idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
+ idle.tv_sec = div_u64_rem(idle_nsec, NSEC_PER_SEC, &rem);
idle.tv_nsec = rem;
seq_printf(m, "%lu.%02lu %lu.%02lu\n",
(unsigned long) uptime.tv_sec,
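
The uptime hunk now sums per-CPU idle time through get_idle_time() (hence that helper losing its static in fs/proc/stat.c above) and then splits the nanosecond total into whole seconds plus a remainder used for the hundredths field. The division step looks roughly like this userspace sketch, with div_u64_rem replaced by plain C division and the per-CPU values invented for the example.

#include <inttypes.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	/* Pretend these came from kcpustat_cpu_fetch()/get_idle_time(). */
	uint64_t per_cpu_idle_nsec[] = { 1500000000ULL, 2750000000ULL };
	uint64_t idle_nsec = 0;
	size_t i;

	for (i = 0; i < sizeof(per_cpu_idle_nsec) / sizeof(per_cpu_idle_nsec[0]); i++)
		idle_nsec += per_cpu_idle_nsec[i];

	uint64_t sec = idle_nsec / NSEC_PER_SEC;	/* tv_sec  */
	uint32_t rem = idle_nsec % NSEC_PER_SEC;	/* tv_nsec */

	/* Same shape as /proc/uptime's idle field: seconds.hundredths */
	printf("%" PRIu64 ".%02u\n", sec, rem / (uint32_t)(NSEC_PER_SEC / 100));
	return 0;
}
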
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 9a15334da208..30a3b66f475a 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -26,7 +26,7 @@
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/uaccess.h>
-#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
#include <asm/io.h>
#include "internal.h"
@@ -62,46 +62,75 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0);
/* Device Dump Size */
static size_t vmcoredd_orig_sz;
-/*
- * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
- * The called function has to take care of module refcounting.
- */
-static int (*oldmem_pfn_is_ram)(unsigned long pfn);
+static DECLARE_RWSEM(vmcore_cb_rwsem);
+/* List of registered vmcore callbacks. */
+static LIST_HEAD(vmcore_cb_list);
+/* Whether we had a surprise unregistration of a callback. */
+static bool vmcore_cb_unstable;
+/* Whether the vmcore has been opened once. */
+static bool vmcore_opened;
-int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn))
+void register_vmcore_cb(struct vmcore_cb *cb)
{
- if (oldmem_pfn_is_ram)
- return -EBUSY;
- oldmem_pfn_is_ram = fn;
- return 0;
+ down_write(&vmcore_cb_rwsem);
+ INIT_LIST_HEAD(&cb->next);
+ list_add_tail(&cb->next, &vmcore_cb_list);
+ /*
+ * Registering a vmcore callback after the vmcore was opened is
+ * very unusual (e.g., manual driver loading).
+ */
+ if (vmcore_opened)
+ pr_warn_once("Unexpected vmcore callback registration\n");
+ up_write(&vmcore_cb_rwsem);
}
-EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram);
+EXPORT_SYMBOL_GPL(register_vmcore_cb);
-void unregister_oldmem_pfn_is_ram(void)
+void unregister_vmcore_cb(struct vmcore_cb *cb)
{
- oldmem_pfn_is_ram = NULL;
- wmb();
+ down_write(&vmcore_cb_rwsem);
+ list_del(&cb->next);
+ /*
+ * Unregistering a vmcore callback after the vmcore was opened is
+ * very unusual (e.g., forced driver removal), but we cannot stop
+ * unregistering.
+ */
+ if (vmcore_opened) {
+ pr_warn_once("Unexpected vmcore callback unregistration\n");
+ vmcore_cb_unstable = true;
+ }
+ up_write(&vmcore_cb_rwsem);
}
-EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram);
+EXPORT_SYMBOL_GPL(unregister_vmcore_cb);
-static int pfn_is_ram(unsigned long pfn)
+static bool pfn_is_ram(unsigned long pfn)
{
- int (*fn)(unsigned long pfn);
- /* pfn is ram unless fn() checks pagetype */
- int ret = 1;
+ struct vmcore_cb *cb;
+ bool ret = true;
- /*
- * Ask hypervisor if the pfn is really ram.
- * A ballooned page contains no data and reading from such a page
- * will cause high load in the hypervisor.
- */
- fn = oldmem_pfn_is_ram;
- if (fn)
- ret = fn(pfn);
+ lockdep_assert_held_read(&vmcore_cb_rwsem);
+ if (unlikely(vmcore_cb_unstable))
+ return false;
+
+ list_for_each_entry(cb, &vmcore_cb_list, next) {
+ if (unlikely(!cb->pfn_is_ram))
+ continue;
+ ret = cb->pfn_is_ram(cb, pfn);
+ if (!ret)
+ break;
+ }
return ret;
}
+static int open_vmcore(struct inode *inode, struct file *file)
+{
+ down_read(&vmcore_cb_rwsem);
+ vmcore_opened = true;
+ up_read(&vmcore_cb_rwsem);
+
+ return 0;
+}
+
/* Reads a page from the oldmem device from given offset. */
ssize_t read_from_oldmem(char *buf, size_t count,
u64 *ppos, int userbuf,
@@ -117,6 +146,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
offset = (unsigned long)(*ppos % PAGE_SIZE);
pfn = (unsigned long)(*ppos / PAGE_SIZE);
+ down_read(&vmcore_cb_rwsem);
do {
if (count > (PAGE_SIZE - offset))
nr_bytes = PAGE_SIZE - offset;
@@ -124,7 +154,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
nr_bytes = count;
/* If pfn is not ram, return zeros for sparse dump files */
- if (pfn_is_ram(pfn) == 0)
+ if (!pfn_is_ram(pfn))
memset(buf, 0, nr_bytes);
else {
if (encrypted)
@@ -136,8 +166,10 @@ ssize_t read_from_oldmem(char *buf, size_t count,
tmp = copy_oldmem_page(pfn, buf, nr_bytes,
offset, userbuf);
- if (tmp < 0)
+ if (tmp < 0) {
+ up_read(&vmcore_cb_rwsem);
return tmp;
+ }
}
*ppos += nr_bytes;
count -= nr_bytes;
@@ -147,6 +179,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
offset = 0;
} while (count);
+ up_read(&vmcore_cb_rwsem);
return read;
}
@@ -177,7 +210,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
*/
ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
{
- return read_from_oldmem(buf, count, ppos, 0, mem_encrypt_active());
+ return read_from_oldmem(buf, count, ppos, 0, cc_platform_has(CC_ATTR_MEM_ENCRYPT));
}
/*
@@ -378,7 +411,7 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
buflen);
start = m->paddr + *fpos - m->offset;
tmp = read_from_oldmem(buffer, tsz, &start,
- userbuf, mem_encrypt_active());
+ userbuf, cc_platform_has(CC_ATTR_MEM_ENCRYPT));
if (tmp < 0)
return tmp;
buflen -= tsz;
@@ -537,14 +570,19 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma,
unsigned long from, unsigned long pfn,
unsigned long size, pgprot_t prot)
{
+ int ret;
+
/*
* Check if oldmem_pfn_is_ram was registered to avoid
* looping over all pages without a reason.
*/
- if (oldmem_pfn_is_ram)
- return remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
+ down_read(&vmcore_cb_rwsem);
+ if (!list_empty(&vmcore_cb_list) || vmcore_cb_unstable)
+ ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
else
- return remap_oldmem_pfn_range(vma, from, pfn, size, prot);
+ ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot);
+ up_read(&vmcore_cb_rwsem);
+ return ret;
}
static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
@@ -668,6 +706,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
#endif
static const struct proc_ops vmcore_proc_ops = {
+ .proc_open = open_vmcore,
.proc_read = read_vmcore,
.proc_lseek = default_llseek,
.proc_mmap = mmap_vmcore,
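
The vmcore hunks replace the single oldmem_pfn_is_ram function pointer with a list of registered callbacks; pfn_is_ram() now walks that list and treats a pfn as non-RAM as soon as any callback says so (or unconditionally once a callback has been unregistered unexpectedly). A small userspace sketch of that consensus walk follows; the list handling is plain C rather than the kernel list API, and the balloon callback is a made-up example.

#include <stdbool.h>
#include <stdio.h>

struct vmcore_cb {
	bool (*pfn_is_ram)(struct vmcore_cb *cb, unsigned long pfn);
	struct vmcore_cb *next;
};

static struct vmcore_cb *cb_list;	/* head of registered callbacks */

static void register_cb(struct vmcore_cb *cb)
{
	cb->next = cb_list;
	cb_list = cb;
}

/* A pfn is treated as RAM unless some registered callback says otherwise. */
static bool pfn_is_ram(unsigned long pfn)
{
	struct vmcore_cb *cb;

	for (cb = cb_list; cb; cb = cb->next) {
		if (cb->pfn_is_ram && !cb->pfn_is_ram(cb, pfn))
			return false;
	}
	return true;
}

/* Example callback: treat pfn 5 as a ballooned (non-RAM) page. */
static bool balloon_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn)
{
	(void)cb;
	return pfn != 5;
}

int main(void)
{
	struct vmcore_cb balloon = { .pfn_is_ram = balloon_pfn_is_ram };

	register_cb(&balloon);
	printf("pfn 4 is ram: %d\n", pfn_is_ram(4));	/* 1 */
	printf("pfn 5 is ram: %d\n", pfn_is_ram(5));	/* 0 */
	return 0;
}
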
diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c
index 04ce58c939a0..5d1fbaffd66a 100644
--- a/fs/pstore/blk.c
+++ b/fs/pstore/blk.c
@@ -205,7 +205,6 @@ static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes,
static int __register_pstore_blk(struct pstore_device_info *dev,
const char *devpath)
{
- struct inode *inode;
int ret = -ENODEV;
lockdep_assert_held(&pstore_blk_lock);
@@ -217,14 +216,13 @@ static int __register_pstore_blk(struct pstore_device_info *dev,
goto err;
}
- inode = file_inode(psblk_file);
- if (!S_ISBLK(inode->i_mode)) {
+ if (!S_ISBLK(file_inode(psblk_file)->i_mode)) {
pr_err("'%s' is not block device!\n", devpath);
goto err_fput;
}
- inode = I_BDEV(psblk_file->f_mapping->host)->bd_inode;
- dev->zone.total_size = i_size_read(inode);
+ dev->zone.total_size =
+ bdev_nr_bytes(I_BDEV(psblk_file->f_mapping->host));
ret = __register_pstore_device(dev);
if (ret)
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 2bcc9a6f1bfc..052f143e2e0e 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -10,6 +10,7 @@
#include <linux/namei.h>
#include <linux/slab.h>
#include <asm/current.h>
+#include <linux/blkdev.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/security.h>
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index d3e995e1046f..5f2405994280 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -414,6 +414,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
quota_error(dquot->dq_sb, "Quota structure has offset to "
"other block (%u) than it should (%u)", blk,
(uint)(dquot->dq_off >> info->dqi_blocksize_bits));
+ ret = -EIO;
goto out_buf;
}
ret = read_blk(info, blk, buf);
@@ -479,6 +480,13 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
goto out_buf;
}
newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+ if (newblk < QT_TREEOFF || newblk >= info->dqi_blocks) {
+ quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)",
+ newblk, info->dqi_blocks);
+ ret = -EUCLEAN;
+ goto out_buf;
+ }
+
if (depth == info->dqi_qtree_depth - 1) {
ret = free_dqentry(info, dquot, newblk);
newblk = 0;
@@ -578,6 +586,13 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
if (!blk) /* No reference? */
goto out_buf;
+ if (blk < QT_TREEOFF || blk >= info->dqi_blocks) {
+ quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)",
+ blk, info->dqi_blocks);
+ ret = -EUCLEAN;
+ goto out_buf;
+ }
+
if (depth < info->dqi_qtree_depth - 1)
ret = find_tree_dqentry(info, dquot, blk, depth+1);
else
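
Both new checks in quota_tree.c guard against a corrupted on-disk tree handing back a block number outside the valid range [QT_TREEOFF, dqi_blocks) before it is followed. A standalone sketch of that validation is below; the tree offset and block count are assumed values for illustration, not taken from the real quota format code.

#include <stdbool.h>
#include <stdio.h>

#define QT_TREEOFF	1	/* first block of the quota tree (assumed) */

/* Reject any block reference outside [QT_TREEOFF, total_blocks), the same
 * sanity check the hunks above add before following an on-disk pointer. */
static bool quota_block_valid(unsigned int blk, unsigned int total_blocks)
{
	return blk >= QT_TREEOFF && blk < total_blocks;
}

int main(void)
{
	unsigned int dqi_blocks = 64;	/* pretend the quota file has 64 blocks */

	printf("blk 10: %s\n", quota_block_valid(10, dqi_blocks) ? "ok" : "corrupt");
	printf("blk 99: %s\n", quota_block_valid(99, dqi_blocks) ? "ok" : "corrupt");
	printf("blk  0: %s\n", quota_block_valid(0, dqi_blocks) ? "ok" : "corrupt");
	return 0;
}
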
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 65e7e56005b8..bc66d0173e33 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -38,6 +38,7 @@
#include <linux/uaccess.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
+#include <linux/seq_file.h>
#include "internal.h"
struct ramfs_mount_opts {
@@ -203,17 +204,20 @@ static int ramfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
int opt;
opt = fs_parse(fc, ramfs_fs_parameters, param, &result);
- if (opt < 0) {
+ if (opt == -ENOPARAM) {
+ opt = vfs_parse_fs_param_source(fc, param);
+ if (opt != -ENOPARAM)
+ return opt;
/*
* We might like to report bad mount options here;
* but traditionally ramfs has ignored all mount options,
* and as it is used as a !CONFIG_SHMEM simple substitute
* for tmpfs, better continue to ignore other mount options.
*/
- if (opt == -ENOPARAM)
- opt = 0;
- return opt;
+ return 0;
}
+ if (opt < 0)
+ return opt;
switch (opt) {
case Opt_mode:
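
The ramfs_parse_param() change routes options its own table does not recognize (-ENOPARAM) through vfs_parse_fs_param_source() first, and only then falls back to the traditional "silently ignore unknown options" behaviour. Here is a rough userspace sketch of that two-stage lookup; both parsers are stubs, not the fs_context API.

#include <stdio.h>
#include <string.h>

#define ENOPARAM 1	/* stand-in for the kernel's -ENOPARAM */

/* Primary table: only "mode" is a real ramfs option in this sketch. */
static int parse_ramfs_param(const char *key)
{
	return strcmp(key, "mode") == 0 ? 0 : -ENOPARAM;
}

/* Secondary parser: handles the generic "source" key. */
static int parse_source_param(const char *key)
{
	return strcmp(key, "source") == 0 ? 0 : -ENOPARAM;
}

static int parse_param(const char *key)
{
	int ret = parse_ramfs_param(key);

	if (ret == -ENOPARAM) {
		ret = parse_source_param(key);
		if (ret != -ENOPARAM)
			return ret;
		return 0;	/* unknown options are still ignored */
	}
	return ret;
}

int main(void)
{
	printf("mode   -> %d\n", parse_param("mode"));
	printf("source -> %d\n", parse_param("source"));
	printf("bogus  -> %d\n", parse_param("bogus"));
	return 0;
}
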
diff --git a/fs/read_write.c b/fs/read_write.c
index af057c57bdc6..0074afa7ecb3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -368,10 +368,6 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
if (unlikely((ssize_t) count < 0))
return -EINVAL;
- /*
- * ranged mandatory locking does not apply to streams - it makes sense
- * only for files where position has a meaning.
- */
if (ppos) {
loff_t pos = *ppos;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 58481f8d63d5..82e09901462e 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1199,9 +1199,7 @@ static int reiserfs_parse_options(struct super_block *s,
if (!strcmp(arg, "auto")) {
/* From JFS code, to auto-get the size. */
- *blocks =
- i_size_read(s->s_bdev->bd_inode) >> s->
- s_blocksize_bits;
+ *blocks = sb_bdev_nr_blocks(s);
} else {
*blocks = simple_strtoul(arg, &p, 0);
if (*p != '\0') {
@@ -1437,7 +1435,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
unsigned long safe_mask = 0;
unsigned int commit_max_age = (unsigned int)-1;
struct reiserfs_journal *journal = SB_JOURNAL(s);
- char *new_opts;
int err;
char *qf_names[REISERFS_MAXQUOTAS];
unsigned int qfmt = 0;
@@ -1445,10 +1442,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
int i;
#endif
- new_opts = kstrdup(arg, GFP_KERNEL);
- if (arg && !new_opts)
- return -ENOMEM;
-
sync_filesystem(s);
reiserfs_write_lock(s);
@@ -1599,7 +1592,6 @@ out_ok_unlocked:
out_err_unlock:
reiserfs_write_unlock(s);
out_err:
- kfree(new_opts);
return err;
}
@@ -1986,9 +1978,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
* smaller than the filesystem. If the check fails then abort and
* scream, because bad stuff will happen otherwise.
*/
- if (s->s_bdev && s->s_bdev->bd_inode
- && i_size_read(s->s_bdev->bd_inode) <
- sb_block_count(rs) * sb_blocksize(rs)) {
+ if (bdev_nr_bytes(s->s_bdev) < sb_block_count(rs) * sb_blocksize(rs)) {
SWARN(silent, s, "", "Filesystem cannot be "
"mounted because it is bigger than the device");
SWARN(silent, s, "", "You may need to run fsck "
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 4a2cda04d3e2..f8e1f4ee87ff 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -383,22 +383,6 @@ void seq_escape_mem(struct seq_file *m, const char *src, size_t len,
}
EXPORT_SYMBOL(seq_escape_mem);
-/**
- * seq_escape - print string into buffer, escaping some characters
- * @m: target buffer
- * @s: string
- * @esc: set of characters that need escaping
- *
- * Puts string into buffer, replacing each occurrence of character from
- * @esc with usual octal escape.
- * Use seq_has_overflowed() to check for errors.
- */
-void seq_escape(struct seq_file *m, const char *s, const char *esc)
-{
- seq_escape_str(m, s, ESCAPE_OCTAL, esc);
-}
-EXPORT_SYMBOL(seq_escape);
-
void seq_vprintf(struct seq_file *m, const char *f, va_list args)
{
int len;
diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h
new file mode 100644
index 000000000000..7ccadcbe684b
--- /dev/null
+++ b/fs/smbfs_common/smb2pdu.h
@@ -0,0 +1,989 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
+#ifndef _COMMON_SMB2PDU_H
+#define _COMMON_SMB2PDU_H
+
+/*
+ * Note that, due to trying to use names similar to the protocol specifications,
+ * there are many mixed case field names in the structures below. Although
+ * this does not match typical Linux kernel style, it is necessary to be
+ * able to match against the protocol specfication.
+ *
+ * SMB2 commands
+ * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
+ * (i.e. no useful data other than the SMB error code itself) and are marked as such.
+ * Knowing this helps avoid response buffer allocations and copy in some cases.
+ */
+
+/* List of commands in host endian */
+#define SMB2_NEGOTIATE_HE 0x0000
+#define SMB2_SESSION_SETUP_HE 0x0001
+#define SMB2_LOGOFF_HE 0x0002 /* trivial request/resp */
+#define SMB2_TREE_CONNECT_HE 0x0003
+#define SMB2_TREE_DISCONNECT_HE 0x0004 /* trivial req/resp */
+#define SMB2_CREATE_HE 0x0005
+#define SMB2_CLOSE_HE 0x0006
+#define SMB2_FLUSH_HE 0x0007 /* trivial resp */
+#define SMB2_READ_HE 0x0008
+#define SMB2_WRITE_HE 0x0009
+#define SMB2_LOCK_HE 0x000A
+#define SMB2_IOCTL_HE 0x000B
+#define SMB2_CANCEL_HE 0x000C
+#define SMB2_ECHO_HE 0x000D
+#define SMB2_QUERY_DIRECTORY_HE 0x000E
+#define SMB2_CHANGE_NOTIFY_HE 0x000F
+#define SMB2_QUERY_INFO_HE 0x0010
+#define SMB2_SET_INFO_HE 0x0011
+#define SMB2_OPLOCK_BREAK_HE 0x0012
+
+/* The same list in little endian */
+#define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE)
+#define SMB2_SESSION_SETUP cpu_to_le16(SMB2_SESSION_SETUP_HE)
+#define SMB2_LOGOFF cpu_to_le16(SMB2_LOGOFF_HE)
+#define SMB2_TREE_CONNECT cpu_to_le16(SMB2_TREE_CONNECT_HE)
+#define SMB2_TREE_DISCONNECT cpu_to_le16(SMB2_TREE_DISCONNECT_HE)
+#define SMB2_CREATE cpu_to_le16(SMB2_CREATE_HE)
+#define SMB2_CLOSE cpu_to_le16(SMB2_CLOSE_HE)
+#define SMB2_FLUSH cpu_to_le16(SMB2_FLUSH_HE)
+#define SMB2_READ cpu_to_le16(SMB2_READ_HE)
+#define SMB2_WRITE cpu_to_le16(SMB2_WRITE_HE)
+#define SMB2_LOCK cpu_to_le16(SMB2_LOCK_HE)
+#define SMB2_IOCTL cpu_to_le16(SMB2_IOCTL_HE)
+#define SMB2_CANCEL cpu_to_le16(SMB2_CANCEL_HE)
+#define SMB2_ECHO cpu_to_le16(SMB2_ECHO_HE)
+#define SMB2_QUERY_DIRECTORY cpu_to_le16(SMB2_QUERY_DIRECTORY_HE)
+#define SMB2_CHANGE_NOTIFY cpu_to_le16(SMB2_CHANGE_NOTIFY_HE)
+#define SMB2_QUERY_INFO cpu_to_le16(SMB2_QUERY_INFO_HE)
+#define SMB2_SET_INFO cpu_to_le16(SMB2_SET_INFO_HE)
+#define SMB2_OPLOCK_BREAK cpu_to_le16(SMB2_OPLOCK_BREAK_HE)
+
+#define SMB2_INTERNAL_CMD cpu_to_le16(0xFFFF)
+
+#define NUMBER_OF_SMB2_COMMANDS 0x0013
+
+/*
+ * SMB2 Header Definition
+ *
+ * "MBZ" : Must be Zero
+ * "BB" : BugBug, Something to check/review/analyze later
+ * "PDU" : "Protocol Data Unit" (ie a network "frame")
+ *
+ */
+
+#define __SMB2_HEADER_STRUCTURE_SIZE 64
+#define SMB2_HEADER_STRUCTURE_SIZE \
+ cpu_to_le16(__SMB2_HEADER_STRUCTURE_SIZE)
+
+#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
+#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
+#define SMB2_COMPRESSION_TRANSFORM_ID cpu_to_le32(0x424d53fc)
+
+/*
+ * SMB2 flag definitions
+ */
+#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001)
+#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002)
+#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004)
+#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008)
+#define SMB2_FLAGS_PRIORITY_MASK cpu_to_le32(0x00000070) /* SMB3.1.1 */
+#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000)
+#define SMB2_FLAGS_REPLAY_OPERATION cpu_to_le32(0x20000000) /* SMB3 & up */
+
+/* See MS-SMB2 section 2.2.1 */
+struct smb2_hdr {
+ __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */
+ __le16 StructureSize; /* 64 */
+ __le16 CreditCharge; /* MBZ */
+ __le32 Status; /* Error from server */
+ __le16 Command;
+ __le16 CreditRequest; /* CreditResponse */
+ __le32 Flags;
+ __le32 NextCommand;
+ __le64 MessageId;
+ union {
+ struct {
+ __le32 ProcessId;
+ __le32 TreeId;
+ } __packed SyncId;
+ __le64 AsyncId;
+ } __packed Id;
+ __le64 SessionId;
+ __u8 Signature[16];
+} __packed;
+
+struct smb2_pdu {
+ struct smb2_hdr hdr;
+ __le16 StructureSize2; /* size of wct area (varies, request specific) */
+} __packed;
+
+#define SMB3_AES_CCM_NONCE 11
+#define SMB3_AES_GCM_NONCE 12
+
+/* Transform flags (for 3.0 dialect this flag indicates CCM) */
+#define TRANSFORM_FLAG_ENCRYPTED 0x0001
+struct smb2_transform_hdr {
+ __le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */
+ __u8 Signature[16];
+ __u8 Nonce[16];
+ __le32 OriginalMessageSize;
+ __u16 Reserved1;
+ __le16 Flags; /* EncryptionAlgorithm for 3.0, enc enabled for 3.1.1 */
+ __le64 SessionId;
+} __packed;
+
+
+/* See MS-SMB2 2.2.42 */
+struct smb2_compression_transform_hdr_unchained {
+ __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */
+ __le32 OriginalCompressedSegmentSize;
+ __le16 CompressionAlgorithm;
+ __le16 Flags;
+ __le16 Length; /* if chained it is length, else offset */
+} __packed;
+
+/* See MS-SMB2 2.2.42.1 */
+#define SMB2_COMPRESSION_FLAG_NONE 0x0000
+#define SMB2_COMPRESSION_FLAG_CHAINED 0x0001
+
+struct compression_payload_header {
+ __le16 CompressionAlgorithm;
+ __le16 Flags;
+ __le32 Length; /* length of compressed payload including field below if present */
+ /* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */
+} __packed;
+
+/* See MS-SMB2 2.2.42.2 */
+struct smb2_compression_transform_hdr_chained {
+ __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */
+ __le32 OriginalCompressedSegmentSize;
+ /* struct compression_payload_header[] */
+} __packed;
+
+/* See MS-SMB2 2.2.42.2.2 */
+struct compression_pattern_payload_v1 {
+ __le16 Pattern;
+ __le16 Reserved1;
+ __le16 Reserved2;
+ __le32 Repetitions;
+} __packed;
+
+/* See MS-SMB2 section 2.2.9.2 */
+/* Context Types */
+#define SMB2_RESERVED_TREE_CONNECT_CONTEXT_ID 0x0000
+#define SMB2_REMOTED_IDENTITY_TREE_CONNECT_CONTEXT_ID cpu_to_le16(0x0001)
+
+struct tree_connect_contexts {
+ __le16 ContextType;
+ __le16 DataLength;
+ __le32 Reserved;
+ __u8 Data[];
+} __packed;
+
+/* Remoted identity tree connect context structures - see MS-SMB2 2.2.9.2.1 */
+struct smb3_blob_data {
+ __le16 BlobSize;
+ __u8 BlobData[];
+} __packed;
+
+/* Valid values for Attr */
+#define SE_GROUP_MANDATORY 0x00000001
+#define SE_GROUP_ENABLED_BY_DEFAULT 0x00000002
+#define SE_GROUP_ENABLED 0x00000004
+#define SE_GROUP_OWNER 0x00000008
+#define SE_GROUP_USE_FOR_DENY_ONLY 0x00000010
+#define SE_GROUP_INTEGRITY 0x00000020
+#define SE_GROUP_INTEGRITY_ENABLED 0x00000040
+#define SE_GROUP_RESOURCE 0x20000000
+#define SE_GROUP_LOGON_ID 0xC0000000
+
+/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */
+
+struct sid_array_data {
+ __le16 SidAttrCount;
+ /* SidAttrList - array of sid_attr_data structs */
+} __packed;
+
+struct luid_attr_data {
+
+} __packed;
+
+/*
+ * struct privilege_data is the same as BLOB_DATA - see MS-SMB2 2.2.9.2.1.5
+ * but with size of LUID_ATTR_DATA struct and BlobData set to LUID_ATTR DATA
+ */
+
+struct privilege_array_data {
+ __le16 PrivilegeCount;
+ /* array of privilege_data structs */
+} __packed;
+
+struct remoted_identity_tcon_context {
+ __le16 TicketType; /* must be 0x0001 */
+ __le16 TicketSize; /* total size of this struct */
+ __le16 User; /* offset to SID_ATTR_DATA struct with user info */
+ __le16 UserName; /* offset to null terminated Unicode username string */
+ __le16 Domain; /* offset to null terminated Unicode domain name */
+ __le16 Groups; /* offset to SID_ARRAY_DATA struct with group info */
+ __le16 RestrictedGroups; /* similar to above */
+ __le16 Privileges; /* offset to PRIVILEGE_ARRAY_DATA struct */
+ __le16 PrimaryGroup; /* offset to SID_ARRAY_DATA struct */
+ __le16 Owner; /* offset to BLOB_DATA struct */
+ __le16 DefaultDacl; /* offset to BLOB_DATA struct */
+ __le16 DeviceGroups; /* offset to SID_ARRAY_DATA struct */
+ __le16 UserClaims; /* offset to BLOB_DATA struct */
+ __le16 DeviceClaims; /* offset to BLOB_DATA struct */
+ __u8 TicketInfo[]; /* variable length buf - remoted identity data */
+} __packed;
+
+struct smb2_tree_connect_req_extension {
+ __le32 TreeConnectContextOffset;
+ __le16 TreeConnectContextCount;
+ __u8 Reserved[10];
+ __u8 PathName[]; /* variable sized array */
+ /* followed by array of TreeConnectContexts */
+} __packed;
+
+/* Flags/Reserved for SMB3.1.1 */
+#define SMB2_TREE_CONNECT_FLAG_CLUSTER_RECONNECT cpu_to_le16(0x0001)
+#define SMB2_TREE_CONNECT_FLAG_REDIRECT_TO_OWNER cpu_to_le16(0x0002)
+#define SMB2_TREE_CONNECT_FLAG_EXTENSION_PRESENT cpu_to_le16(0x0004)
+
+struct smb2_tree_connect_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 Flags; /* Flags in SMB3.1.1 */
+ __le16 PathOffset;
+ __le16 PathLength;
+ __u8 Buffer[1]; /* variable length */
+} __packed;
+
+/* Possible ShareType values */
+#define SMB2_SHARE_TYPE_DISK 0x01
+#define SMB2_SHARE_TYPE_PIPE 0x02
+#define SMB2_SHARE_TYPE_PRINT 0x03
+
+/*
+ * Possible ShareFlags - exactly one and only one of the first 4 caching flags
+ * must be set (any of the remaining, SHI1005, flags may be set individually
+ * or in combination.
+ */
+#define SMB2_SHAREFLAG_MANUAL_CACHING 0x00000000
+#define SMB2_SHAREFLAG_AUTO_CACHING 0x00000010
+#define SMB2_SHAREFLAG_VDO_CACHING 0x00000020
+#define SMB2_SHAREFLAG_NO_CACHING 0x00000030
+#define SHI1005_FLAGS_DFS 0x00000001
+#define SHI1005_FLAGS_DFS_ROOT 0x00000002
+#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100
+#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200
+#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400
+#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800
+#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000
+#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000
+#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000
+#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000
+#define SMB2_SHAREFLAG_IDENTITY_REMOTING 0x00040000 /* 3.1.1 */
+#define SMB2_SHAREFLAG_COMPRESS_DATA 0x00100000 /* 3.1.1 */
+#define SHI1005_FLAGS_ALL 0x0014FF33
+
+/* Possible share capabilities */
+#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */
+#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */
+#define SMB2_SHARE_CAP_SCALEOUT cpu_to_le32(0x00000020) /* 3.0 */
+#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */
+#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */
+#define SMB2_SHARE_CAP_REDIRECT_TO_OWNER cpu_to_le32(0x00000100) /* 3.1.1 */
+
+struct smb2_tree_connect_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 16 */
+ __u8 ShareType; /* see below */
+ __u8 Reserved;
+ __le32 ShareFlags; /* see below */
+ __le32 Capabilities; /* see below */
+ __le32 MaximalAccess;
+} __packed;
+
+struct smb2_tree_disconnect_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+struct smb2_tree_disconnect_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+
+/*
+ * SMB2_NEGOTIATE_PROTOCOL See MS-SMB2 section 2.2.3
+ */
+/* SecurityMode flags */
+#define SMB2_NEGOTIATE_SIGNING_ENABLED 0x0001
+#define SMB2_NEGOTIATE_SIGNING_ENABLED_LE cpu_to_le16(0x0001)
+#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002
+#define SMB2_NEGOTIATE_SIGNING_REQUIRED_LE cpu_to_le16(0x0002)
+#define SMB2_SEC_MODE_FLAGS_ALL 0x0003
+
+/* Capabilities flags */
+#define SMB2_GLOBAL_CAP_DFS 0x00000001
+#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */
+#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */
+#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */
+/* Internal types */
+#define SMB2_NT_FIND 0x00100000
+#define SMB2_LARGE_FILES 0x00200000
+
+#define SMB2_CLIENT_GUID_SIZE 16
+#define SMB2_CREATE_GUID_SIZE 16
+
+/* Dialects */
+#define SMB10_PROT_ID 0x0000 /* local only, not sent on wire w/CIFS negprot */
+#define SMB20_PROT_ID 0x0202
+#define SMB21_PROT_ID 0x0210
+#define SMB2X_PROT_ID 0x02FF
+#define SMB30_PROT_ID 0x0300
+#define SMB302_PROT_ID 0x0302
+#define SMB311_PROT_ID 0x0311
+#define BAD_PROT_ID 0xFFFF
+
+#define SMB311_SALT_SIZE 32
+/* Hash Algorithm Types */
+#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001)
+#define SMB2_PREAUTH_HASH_SIZE 64
+
+/* Negotiate Contexts - ContextTypes. See MS-SMB2 section 2.2.3.1 for details */
+#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1)
+#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2)
+#define SMB2_COMPRESSION_CAPABILITIES cpu_to_le16(3)
+#define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5)
+#define SMB2_TRANSPORT_CAPABILITIES cpu_to_le16(6)
+#define SMB2_RDMA_TRANSFORM_CAPABILITIES cpu_to_le16(7)
+#define SMB2_SIGNING_CAPABILITIES cpu_to_le16(8)
+#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100)
+
+struct smb2_neg_context {
+ __le16 ContextType;
+ __le16 DataLength;
+ __le32 Reserved;
+ /* Followed by array of data. NOTE: some servers require padding to 8 byte boundary */
+} __packed;
+
+/*
+ * SaltLength that the server send can be zero, so the only three required
+ * fields (all __le16) end up six bytes total, so the minimum context data len
+ * in the response is six bytes which accounts for
+ *
+ * HashAlgorithmCount, SaltLength, and 1 HashAlgorithm.
+ */
+#define MIN_PREAUTH_CTXT_DATA_LEN 6
+
+struct smb2_preauth_neg_context {
+ __le16 ContextType; /* 1 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le16 HashAlgorithmCount; /* 1 */
+ __le16 SaltLength;
+ __le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */
+ __u8 Salt[SMB311_SALT_SIZE];
+} __packed;
+
+/* Encryption Algorithms Ciphers */
+#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001)
+#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002)
+#define SMB2_ENCRYPTION_AES256_CCM cpu_to_le16(0x0003)
+#define SMB2_ENCRYPTION_AES256_GCM cpu_to_le16(0x0004)
+
+/* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */
+#define MIN_ENCRYPT_CTXT_DATA_LEN 4
+struct smb2_encryption_neg_context {
+ __le16 ContextType; /* 2 */
+ __le16 DataLength;
+ __le32 Reserved;
+ /* CipherCount usually 2, but can be 3 when AES256-GCM enabled */
+ __le16 CipherCount; /* AES128-GCM and AES128-CCM by default */
+ __le16 Ciphers[];
+} __packed;
+
+/* See MS-SMB2 2.2.3.1.3 */
+#define SMB3_COMPRESS_NONE cpu_to_le16(0x0000)
+#define SMB3_COMPRESS_LZNT1 cpu_to_le16(0x0001)
+#define SMB3_COMPRESS_LZ77 cpu_to_le16(0x0002)
+#define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003)
+/* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */
+#define SMB3_COMPRESS_PATTERN cpu_to_le16(0x0004) /* Pattern_V1 */
+
+/* Compression Flags */
+#define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE cpu_to_le32(0x00000000)
+#define SMB2_COMPRESSION_CAPABILITIES_FLAG_CHAINED cpu_to_le32(0x00000001)
+
+struct smb2_compression_capabilities_context {
+ __le16 ContextType; /* 3 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le16 CompressionAlgorithmCount;
+ __le16 Padding;
+ __le32 Flags;
+ __le16 CompressionAlgorithms[3];
+ __u16 Pad; /* Some servers require pad to DataLen multiple of 8 */
+ /* Check if pad needed */
+} __packed;
+
+/*
+ * For smb2_netname_negotiate_context_id See MS-SMB2 2.2.3.1.4.
+ * Its struct simply contains NetName, an array of Unicode characters
+ */
+struct smb2_netname_neg_context {
+ __le16 ContextType; /* 5 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le16 NetName[]; /* hostname of target converted to UCS-2 */
+} __packed;
+
+/*
+ * For smb2_transport_capabilities context see MS-SMB2 2.2.3.1.5
+ * and 2.2.4.1.5
+ */
+
+/* Flags */
+#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY 0x00000001
+
+struct smb2_transport_capabilities_context {
+ __le16 ContextType; /* 6 */
+ __le16 DataLength;
+ __u32 Reserved;
+ __le32 Flags;
+ __u32 Pad;
+} __packed;
+
+/*
+ * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6
+ * and 2.2.4.1.6
+ */
+
+/* RDMA Transform IDs */
+#define SMB2_RDMA_TRANSFORM_NONE 0x0000
+#define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001
+#define SMB2_RDMA_TRANSFORM_SIGNING 0x0002
+
+struct smb2_rdma_transform_capabilities_context {
+ __le16 ContextType; /* 7 */
+ __le16 DataLength;
+ __u32 Reserved;
+ __le16 TransformCount;
+ __u16 Reserved1;
+ __u32 Reserved2;
+ __le16 RDMATransformIds[];
+} __packed;
+
+/*
+ * For signing capabilities context see MS-SMB2 2.2.3.1.7
+ * and 2.2.4.1.7
+ */
+
+/* Signing algorithms */
+#define SIGNING_ALG_HMAC_SHA256 0
+#define SIGNING_ALG_HMAC_SHA256_LE cpu_to_le16(0)
+#define SIGNING_ALG_AES_CMAC 1
+#define SIGNING_ALG_AES_CMAC_LE cpu_to_le16(1)
+#define SIGNING_ALG_AES_GMAC 2
+#define SIGNING_ALG_AES_GMAC_LE cpu_to_le16(2)
+
+struct smb2_signing_capabilities {
+ __le16 ContextType; /* 8 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le16 SigningAlgorithmCount;
+ __le16 SigningAlgorithms[];
+ /* Followed by padding to 8 byte boundary (required by some servers) */
+} __packed;
+
+#define POSIX_CTXT_DATA_LEN 16
+struct smb2_posix_neg_context {
+ __le16 ContextType; /* 0x100 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */
+} __packed;
+
+struct smb2_negotiate_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 36 */
+ __le16 DialectCount;
+ __le16 SecurityMode;
+ __le16 Reserved; /* MBZ */
+ __le32 Capabilities;
+ __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE];
+ /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */
+ __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */
+ __le16 NegotiateContextCount; /* SMB3.1.1 only. MBZ earlier */
+ __le16 Reserved2;
+ __le16 Dialects[];
+} __packed;
+
+struct smb2_negotiate_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 65 */
+ __le16 SecurityMode;
+ __le16 DialectRevision;
+ __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */
+ __u8 ServerGUID[16];
+ __le32 Capabilities;
+ __le32 MaxTransactSize;
+ __le32 MaxReadSize;
+ __le32 MaxWriteSize;
+ __le64 SystemTime; /* MBZ */
+ __le64 ServerStartTime;
+ __le16 SecurityBufferOffset;
+ __le16 SecurityBufferLength;
+ __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */
+ __u8 Buffer[1]; /* variable length GSS security buffer */
+} __packed;
+
+
+/*
+ * SMB2_SESSION_SETUP See MS-SMB2 section 2.2.5
+ */
+/* Flags */
+#define SMB2_SESSION_REQ_FLAG_BINDING 0x01
+#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04
+
+struct smb2_sess_setup_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 25 */
+ __u8 Flags;
+ __u8 SecurityMode;
+ __le32 Capabilities;
+ __le32 Channel;
+ __le16 SecurityBufferOffset;
+ __le16 SecurityBufferLength;
+ __le64 PreviousSessionId;
+ __u8 Buffer[1]; /* variable length GSS security buffer */
+} __packed;
+
+/* Currently defined SessionFlags */
+#define SMB2_SESSION_FLAG_IS_GUEST 0x0001
+#define SMB2_SESSION_FLAG_IS_GUEST_LE cpu_to_le16(0x0001)
+#define SMB2_SESSION_FLAG_IS_NULL 0x0002
+#define SMB2_SESSION_FLAG_IS_NULL_LE cpu_to_le16(0x0002)
+#define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004
+#define SMB2_SESSION_FLAG_ENCRYPT_DATA_LE cpu_to_le16(0x0004)
+
+struct smb2_sess_setup_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 SessionFlags;
+ __le16 SecurityBufferOffset;
+ __le16 SecurityBufferLength;
+ __u8 Buffer[1]; /* variable length GSS security buffer */
+} __packed;
+
+
+/*
+ * SMB2_LOGOFF See MS-SMB2 section 2.2.7
+ */
+struct smb2_logoff_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+struct smb2_logoff_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 4 */
+ __le16 Reserved;
+} __packed;
+
+
+/*
+ * SMB2_CLOSE See MS-SMB2 section 2.2.15
+ */
+/* Currently defined values for close flags */
+#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001)
+struct smb2_close_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 24 */
+ __le16 Flags;
+ __le32 Reserved;
+ __le64 PersistentFileId; /* opaque endianness */
+ __le64 VolatileFileId; /* opaque endianness */
+} __packed;
+
+/*
+ * Maximum size of a SMB2_CLOSE response is 64 (smb2 header) + 60 (data)
+ */
+#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124
+
+struct smb2_close_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* 60 */
+ __le16 Flags;
+ __le32 Reserved;
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */
+ __le64 EndOfFile;
+ __le32 Attributes;
+} __packed;
+
+
+/*
+ * SMB2_READ See MS-SMB2 section 2.2.19
+ */
+/* For read request Flags field below, the following flag is defined for SMB3.02 */
+#define SMB2_READFLAG_READ_UNBUFFERED 0x01
+#define SMB2_READFLAG_REQUEST_COMPRESSED 0x02 /* See MS-SMB2 2.2.19 */
+
+/* Channel field for read and write: exactly one of following flags can be set*/
+#define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000)
+#define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001)
+#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002)
+#define SMB2_CHANNEL_RDMA_TRANSFORM cpu_to_le32(0x00000003)
+
+/* SMB2 read request without RFC1001 length at the beginning */
+struct smb2_read_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 49 */
+ __u8 Padding; /* offset from start of SMB2 header to place read */
+ __u8 Flags; /* MBZ unless SMB3.02 or later */
+ __le32 Length;
+ __le64 Offset;
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+ __le32 MinimumCount;
+ __le32 Channel; /* MBZ except for SMB3 or later */
+ __le32 RemainingBytes;
+ __le16 ReadChannelInfoOffset;
+ __le16 ReadChannelInfoLength;
+ __u8 Buffer[1];
+} __packed;
+
+/* Read flags */
+#define SMB2_READFLAG_RESPONSE_NONE cpu_to_le32(0x00000000)
+#define SMB2_READFLAG_RESPONSE_RDMA_TRANSFORM cpu_to_le32(0x00000001)
+
+struct smb2_read_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 17 */
+ __u8 DataOffset;
+ __u8 Reserved;
+ __le32 DataLength;
+ __le32 DataRemaining;
+ __le32 Flags;
+ __u8 Buffer[1];
+} __packed;
+
+
+/*
+ * SMB2_WRITE See MS-SMB2 section 2.2.21
+ */
+/* For write request Flags field below the following flags are defined: */
+#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 /* SMB2.1 or later */
+#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */
+
+struct smb2_write_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 49 */
+ __le16 DataOffset; /* offset from start of SMB2 header to write data */
+ __le32 Length;
+ __le64 Offset;
+ __le64 PersistentFileId; /* opaque endianness */
+ __le64 VolatileFileId; /* opaque endianness */
+ __le32 Channel; /* MBZ unless SMB3.02 or later */
+ __le32 RemainingBytes;
+ __le16 WriteChannelInfoOffset;
+ __le16 WriteChannelInfoLength;
+ __le32 Flags;
+ __u8 Buffer[1];
+} __packed;
+
+struct smb2_write_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 17 */
+ __u8 DataOffset;
+ __u8 Reserved;
+ __le32 DataLength;
+ __le32 DataRemaining;
+ __u32 Reserved2;
+ __u8 Buffer[1];
+} __packed;
+
+
+/*
+ * SMB2_FLUSH See MS-SMB2 section 2.2.17
+ */
+struct smb2_flush_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 24 */
+ __le16 Reserved1;
+ __le32 Reserved2;
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+} __packed;
+
+struct smb2_flush_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize;
+ __le16 Reserved;
+} __packed;
+
+
+/*
+ * SMB2_NOTIFY See MS-SMB2 section 2.2.35
+ */
+/* notify flags */
+#define SMB2_WATCH_TREE 0x0001
+
+/* notify completion filter flags. See MS-FSCC 2.6 and MS-SMB2 2.2.35 */
+#define FILE_NOTIFY_CHANGE_FILE_NAME 0x00000001
+#define FILE_NOTIFY_CHANGE_DIR_NAME 0x00000002
+#define FILE_NOTIFY_CHANGE_ATTRIBUTES 0x00000004
+#define FILE_NOTIFY_CHANGE_SIZE 0x00000008
+#define FILE_NOTIFY_CHANGE_LAST_WRITE 0x00000010
+#define FILE_NOTIFY_CHANGE_LAST_ACCESS 0x00000020
+#define FILE_NOTIFY_CHANGE_CREATION 0x00000040
+#define FILE_NOTIFY_CHANGE_EA 0x00000080
+#define FILE_NOTIFY_CHANGE_SECURITY 0x00000100
+#define FILE_NOTIFY_CHANGE_STREAM_NAME 0x00000200
+#define FILE_NOTIFY_CHANGE_STREAM_SIZE 0x00000400
+#define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800
+
+/* SMB2 Notify Action Flags */
+#define FILE_ACTION_ADDED 0x00000001
+#define FILE_ACTION_REMOVED 0x00000002
+#define FILE_ACTION_MODIFIED 0x00000003
+#define FILE_ACTION_RENAMED_OLD_NAME 0x00000004
+#define FILE_ACTION_RENAMED_NEW_NAME 0x00000005
+#define FILE_ACTION_ADDED_STREAM 0x00000006
+#define FILE_ACTION_REMOVED_STREAM 0x00000007
+#define FILE_ACTION_MODIFIED_STREAM 0x00000008
+#define FILE_ACTION_REMOVED_BY_DELETE 0x00000009
+
+struct smb2_change_notify_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize;
+ __le16 Flags;
+ __le32 OutputBufferLength;
+ __le64 PersistentFileId; /* opaque endianness */
+ __le64 VolatileFileId; /* opaque endianness */
+ __le32 CompletionFilter;
+ __u32 Reserved;
+} __packed;
+
+struct smb2_change_notify_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 9 */
+ __le16 OutputBufferOffset;
+ __le32 OutputBufferLength;
+ __u8 Buffer[1]; /* array of file notify structs */
+} __packed;
+
+
+/*
+ * SMB2_CREATE See MS-SMB2 section 2.2.13
+ */
+/* Oplock levels */
+#define SMB2_OPLOCK_LEVEL_NONE 0x00
+#define SMB2_OPLOCK_LEVEL_II 0x01
+#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08
+#define SMB2_OPLOCK_LEVEL_BATCH 0x09
+#define SMB2_OPLOCK_LEVEL_LEASE 0xFF
+/* Non-spec internal type */
+#define SMB2_OPLOCK_LEVEL_NOCHANGE 0x99
+
+/* Impersonation Levels. See MS-WPO section 9.7 and MSDN-IMPERS */
+#define IL_ANONYMOUS cpu_to_le32(0x00000000)
+#define IL_IDENTIFICATION cpu_to_le32(0x00000001)
+#define IL_IMPERSONATION cpu_to_le32(0x00000002)
+#define IL_DELEGATE cpu_to_le32(0x00000003)
+
+/* File Attributes */
+#define FILE_ATTRIBUTE_READONLY 0x00000001
+#define FILE_ATTRIBUTE_HIDDEN 0x00000002
+#define FILE_ATTRIBUTE_SYSTEM 0x00000004
+#define FILE_ATTRIBUTE_DIRECTORY 0x00000010
+#define FILE_ATTRIBUTE_ARCHIVE 0x00000020
+#define FILE_ATTRIBUTE_NORMAL 0x00000080
+#define FILE_ATTRIBUTE_TEMPORARY 0x00000100
+#define FILE_ATTRIBUTE_SPARSE_FILE 0x00000200
+#define FILE_ATTRIBUTE_REPARSE_POINT 0x00000400
+#define FILE_ATTRIBUTE_COMPRESSED 0x00000800
+#define FILE_ATTRIBUTE_OFFLINE 0x00001000
+#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x00002000
+#define FILE_ATTRIBUTE_ENCRYPTED 0x00004000
+#define FILE_ATTRIBUTE_INTEGRITY_STREAM 0x00008000
+#define FILE_ATTRIBUTE_NO_SCRUB_DATA 0x00020000
+#define FILE_ATTRIBUTE__MASK 0x00007FB7
+
+#define FILE_ATTRIBUTE_READONLY_LE cpu_to_le32(0x00000001)
+#define FILE_ATTRIBUTE_HIDDEN_LE cpu_to_le32(0x00000002)
+#define FILE_ATTRIBUTE_SYSTEM_LE cpu_to_le32(0x00000004)
+#define FILE_ATTRIBUTE_DIRECTORY_LE cpu_to_le32(0x00000010)
+#define FILE_ATTRIBUTE_ARCHIVE_LE cpu_to_le32(0x00000020)
+#define FILE_ATTRIBUTE_NORMAL_LE cpu_to_le32(0x00000080)
+#define FILE_ATTRIBUTE_TEMPORARY_LE cpu_to_le32(0x00000100)
+#define FILE_ATTRIBUTE_SPARSE_FILE_LE cpu_to_le32(0x00000200)
+#define FILE_ATTRIBUTE_REPARSE_POINT_LE cpu_to_le32(0x00000400)
+#define FILE_ATTRIBUTE_COMPRESSED_LE cpu_to_le32(0x00000800)
+#define FILE_ATTRIBUTE_OFFLINE_LE cpu_to_le32(0x00001000)
+#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED_LE cpu_to_le32(0x00002000)
+#define FILE_ATTRIBUTE_ENCRYPTED_LE cpu_to_le32(0x00004000)
+#define FILE_ATTRIBUTE_INTEGRITY_STREAM_LE cpu_to_le32(0x00008000)
+#define FILE_ATTRIBUTE_NO_SCRUB_DATA_LE cpu_to_le32(0x00020000)
+#define FILE_ATTRIBUTE_MASK_LE cpu_to_le32(0x00007FB7)
+
+/* Desired Access Flags */
+#define FILE_READ_DATA_LE cpu_to_le32(0x00000001)
+#define FILE_LIST_DIRECTORY_LE cpu_to_le32(0x00000001)
+#define FILE_WRITE_DATA_LE cpu_to_le32(0x00000002)
+#define FILE_APPEND_DATA_LE cpu_to_le32(0x00000004)
+#define FILE_ADD_SUBDIRECTORY_LE cpu_to_le32(0x00000004)
+#define FILE_READ_EA_LE cpu_to_le32(0x00000008)
+#define FILE_WRITE_EA_LE cpu_to_le32(0x00000010)
+#define FILE_EXECUTE_LE cpu_to_le32(0x00000020)
+#define FILE_DELETE_CHILD_LE cpu_to_le32(0x00000040)
+#define FILE_READ_ATTRIBUTES_LE cpu_to_le32(0x00000080)
+#define FILE_WRITE_ATTRIBUTES_LE cpu_to_le32(0x00000100)
+#define FILE_DELETE_LE cpu_to_le32(0x00010000)
+#define FILE_READ_CONTROL_LE cpu_to_le32(0x00020000)
+#define FILE_WRITE_DAC_LE cpu_to_le32(0x00040000)
+#define FILE_WRITE_OWNER_LE cpu_to_le32(0x00080000)
+#define FILE_SYNCHRONIZE_LE cpu_to_le32(0x00100000)
+#define FILE_ACCESS_SYSTEM_SECURITY_LE cpu_to_le32(0x01000000)
+#define FILE_MAXIMAL_ACCESS_LE cpu_to_le32(0x02000000)
+#define FILE_GENERIC_ALL_LE cpu_to_le32(0x10000000)
+#define FILE_GENERIC_EXECUTE_LE cpu_to_le32(0x20000000)
+#define FILE_GENERIC_WRITE_LE cpu_to_le32(0x40000000)
+#define FILE_GENERIC_READ_LE cpu_to_le32(0x80000000)
+#define DESIRED_ACCESS_MASK cpu_to_le32(0xF21F01FF)
+
+
+#define FILE_READ_DESIRED_ACCESS_LE (FILE_READ_DATA_LE | \
+ FILE_READ_EA_LE | \
+ FILE_GENERIC_READ_LE)
+#define FILE_WRITE_DESIRE_ACCESS_LE (FILE_WRITE_DATA_LE | \
+ FILE_APPEND_DATA_LE | \
+ FILE_WRITE_EA_LE | \
+ FILE_WRITE_ATTRIBUTES_LE | \
+ FILE_GENERIC_WRITE_LE)
+
+/* ShareAccess Flags */
+#define FILE_SHARE_READ_LE cpu_to_le32(0x00000001)
+#define FILE_SHARE_WRITE_LE cpu_to_le32(0x00000002)
+#define FILE_SHARE_DELETE_LE cpu_to_le32(0x00000004)
+#define FILE_SHARE_ALL_LE cpu_to_le32(0x00000007)
+
+/* CreateDisposition Flags */
+#define FILE_SUPERSEDE_LE cpu_to_le32(0x00000000)
+#define FILE_OPEN_LE cpu_to_le32(0x00000001)
+#define FILE_CREATE_LE cpu_to_le32(0x00000002)
+#define FILE_OPEN_IF_LE cpu_to_le32(0x00000003)
+#define FILE_OVERWRITE_LE cpu_to_le32(0x00000004)
+#define FILE_OVERWRITE_IF_LE cpu_to_le32(0x00000005)
+#define FILE_CREATE_MASK_LE cpu_to_le32(0x00000007)
+
+#define FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA \
+ | FILE_READ_ATTRIBUTES)
+#define FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \
+ | FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES)
+#define FILE_EXEC_RIGHTS (FILE_EXECUTE)
+
+/* CreateOptions Flags */
+#define FILE_DIRECTORY_FILE_LE cpu_to_le32(0x00000001)
+/* same as #define CREATE_NOT_FILE_LE cpu_to_le32(0x00000001) */
+#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002)
+#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004)
+#define FILE_NO_INTERMEDIATE_BUFFERING_LE cpu_to_le32(0x00000008)
+#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040)
+#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100)
+#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200)
+#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800)
+#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000)
+#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000)
+#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000)
+#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000)
+#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000)
+#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000)
+#define CREATE_OPTIONS_MASK_LE cpu_to_le32(0x00FFFFFF)
+
+#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \
+ | FILE_READ_ATTRIBUTES_LE)
+#define FILE_WRITE_RIGHTS_LE (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE \
+ | FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE)
+#define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE)
+
+/* Create Context Values */
+#define SMB2_CREATE_EA_BUFFER "ExtA" /* extended attributes */
+#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */
+#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ"
+#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC"
+#define SMB2_CREATE_ALLOCATION_SIZE "AISi"
+#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc"
+#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp"
+#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid"
+#define SMB2_CREATE_REQUEST_LEASE "RqLs"
+#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q"
+#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C"
+#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C"
+
+/* Flag (SMB3 open response) values */
+#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01
+
+struct create_context {
+ __le32 Next;
+ __le16 NameOffset;
+ __le16 NameLength;
+ __le16 Reserved;
+ __le16 DataOffset;
+ __le32 DataLength;
+ __u8 Buffer[];
+} __packed;
+
+struct smb2_create_req {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 57 */
+ __u8 SecurityFlags;
+ __u8 RequestedOplockLevel;
+ __le32 ImpersonationLevel;
+ __le64 SmbCreateFlags;
+ __le64 Reserved;
+ __le32 DesiredAccess;
+ __le32 FileAttributes;
+ __le32 ShareAccess;
+ __le32 CreateDisposition;
+ __le32 CreateOptions;
+ __le16 NameOffset;
+ __le16 NameLength;
+ __le32 CreateContextsOffset;
+ __le32 CreateContextsLength;
+ __u8 Buffer[];
+} __packed;
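
As a quick reference for how the constants above combine, here is a hypothetical sketch (not taken from the patch) of the fixed fields a client would set to open an existing file read-only; NameOffset/NameLength and create contexts are omitted, and the helper name is made up.

static void fill_create_open_for_read(struct smb2_create_req *req)
{
	req->StructureSize = cpu_to_le16(57);		/* must be 57 */
	req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_NONE;
	req->ImpersonationLevel = IL_IMPERSONATION;
	req->DesiredAccess = FILE_READ_DATA_LE | FILE_READ_ATTRIBUTES_LE |
			     FILE_READ_EA_LE;
	req->FileAttributes = FILE_ATTRIBUTE_NORMAL_LE;
	req->ShareAccess = FILE_SHARE_READ_LE | FILE_SHARE_WRITE_LE;
	req->CreateDisposition = FILE_OPEN_LE;		/* fail if absent */
	req->CreateOptions = FILE_NON_DIRECTORY_FILE_LE;
}
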
+
+struct smb2_create_rsp {
+ struct smb2_hdr hdr;
+ __le16 StructureSize; /* Must be 89 */
+ __u8 OplockLevel;
+ __u8 Flags; /* 0x01 if reparse point */
+ __le32 CreateAction;
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 AllocationSize;
+ __le64 EndofFile;
+ __le32 FileAttributes;
+ __le32 Reserved2;
+ __le64 PersistentFileId;
+ __le64 VolatileFileId;
+ __le32 CreateContextsOffset;
+ __le32 CreateContextsLength;
+ __u8 Buffer[1];
+} __packed;
+
+
+#endif /* _COMMON_SMB2PDU_H */
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 60d6951915f4..bb44ff4c5cc6 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -16,6 +16,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
@@ -179,8 +180,8 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
/* Check the filesystem does not extend beyond the end of the
block device */
msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
- if (msblk->bytes_used < 0 || msblk->bytes_used >
- i_size_read(sb->s_bdev->bd_inode))
+ if (msblk->bytes_used < 0 ||
+ msblk->bytes_used > bdev_nr_bytes(sb->s_bdev))
goto failed_mount;
/* Check block size for sanity */
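
For context, bdev_nr_bytes() used above is expected to be a trivial helper along these lines (sketch only; the real definition lives in linux/blkdev.h), which is why it can replace the i_size_read() on the block device inode:

static inline loff_t bdev_nr_bytes(struct block_device *bdev)
{
	/* device size in bytes, derived from its 512-byte sector count */
	return bdev_nr_sectors(bdev) << SECTOR_SHIFT;
}
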
diff --git a/fs/super.c b/fs/super.c
index bcef3a6f4c4b..3bfc0f8fbd5b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -476,6 +476,8 @@ void generic_shutdown_super(struct super_block *sb)
spin_unlock(&sb_lock);
up_write(&sb->s_umount);
if (sb->s_bdi != &noop_backing_dev_info) {
+ if (sb->s_iflags & SB_I_PERSB_BDI)
+ bdi_unregister(sb->s_bdi);
bdi_put(sb->s_bdi);
sb->s_bdi = &noop_backing_dev_info;
}
@@ -1562,6 +1564,7 @@ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
}
WARN_ON(sb->s_bdi != &noop_backing_dev_info);
sb->s_bdi = bdi;
+ sb->s_iflags |= SB_I_PERSB_BDI;
return 0;
}
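
Illustrative only: with SB_I_PERSB_BDI now set by super_setup_bdi_name(), a filesystem that allocates its own backing_dev_info in fill_super gets it unregistered automatically in generic_shutdown_super(), as shown in the hunk above. Sketch of the typical caller; "examplefs" is a made-up name.

static int examplefs_fill_super(struct super_block *sb, struct fs_context *fc)
{
	int err;

	/* registers a per-sb bdi and marks the sb with SB_I_PERSB_BDI */
	err = super_setup_bdi_name(sb, "examplefs-%llu", (u64)sb->s_dev);
	if (err)
		return err;
	/* ... rest of superblock setup ... */
	return 0;
}
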
diff --git a/fs/sync.c b/fs/sync.c
index 1373a610dc78..3ce8e2137f31 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -3,6 +3,7 @@
* High-level sync()-related operations
*/
+#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/fs.h>
@@ -22,25 +23,6 @@
SYNC_FILE_RANGE_WAIT_AFTER)
/*
- * Do the filesystem syncing work. For simple filesystems
- * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
- * submit IO for these buffers via __sync_blockdev(). This also speeds up the
- * wait == 1 case since in that case write_inode() functions do
- * sync_dirty_buffer() and thus effectively write one block at a time.
- */
-static int __sync_filesystem(struct super_block *sb, int wait)
-{
- if (wait)
- sync_inodes_sb(sb);
- else
- writeback_inodes_sb(sb, WB_REASON_SYNC);
-
- if (sb->s_op->sync_fs)
- sb->s_op->sync_fs(sb, wait);
- return __sync_blockdev(sb->s_bdev, wait);
-}
-
-/*
* Write out and wait upon all dirty data associated with this
* superblock. Filesystem data as well as the underlying block
* device. Takes the superblock lock.
@@ -61,10 +43,25 @@ int sync_filesystem(struct super_block *sb)
if (sb_rdonly(sb))
return 0;
- ret = __sync_filesystem(sb, 0);
+ /*
+ * Do the filesystem syncing work. For simple filesystems
+ * writeback_inodes_sb(sb) just dirties buffers with inodes so we have
+ * to submit I/O for these buffers via sync_blockdev(). This also
+ * speeds up the wait == 1 case since in that case write_inode()
+ * methods call sync_dirty_buffer() and thus effectively write one block
+ * at a time.
+ */
+ writeback_inodes_sb(sb, WB_REASON_SYNC);
+ if (sb->s_op->sync_fs)
+ sb->s_op->sync_fs(sb, 0);
+ ret = sync_blockdev_nowait(sb->s_bdev);
if (ret < 0)
return ret;
- return __sync_filesystem(sb, 1);
+
+ sync_inodes_sb(sb);
+ if (sb->s_op->sync_fs)
+ sb->s_op->sync_fs(sb, 1);
+ return sync_blockdev(sb->s_bdev);
}
EXPORT_SYMBOL(sync_filesystem);
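
Sketch of a typical caller (not part of this patch): sync_filesystem() is what remount-read-only and freeze paths use to flush everything before the filesystem is quiesced, now as one non-waiting pass followed by one waiting pass as inlined above. The helper name is hypothetical.

static int examplefs_prepare_ro(struct super_block *sb)
{
	int err;

	err = sync_filesystem(sb);	/* writeback pass, then waiting pass */
	if (err)
		return err;
	/* ... flip the filesystem to read-only state ... */
	return 0;
}
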
@@ -81,21 +78,6 @@ static void sync_fs_one_sb(struct super_block *sb, void *arg)
sb->s_op->sync_fs(sb, *(int *)arg);
}
-static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
-{
- filemap_fdatawrite(bdev->bd_inode->i_mapping);
-}
-
-static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
-{
- /*
- * We keep the error status of individual mapping so that
- * applications can catch the writeback error using fsync(2).
- * See filemap_fdatawait_keep_errors() for details.
- */
- filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
-}
-
/*
* Sync everything. We start by waking flusher threads so that most of
* writeback runs on all devices in parallel. Then we sync all inodes reliably
@@ -114,8 +96,8 @@ void ksys_sync(void)
iterate_supers(sync_inodes_one_sb, NULL);
iterate_supers(sync_fs_one_sb, &nowait);
iterate_supers(sync_fs_one_sb, &wait);
- iterate_bdevs(fdatawrite_one_bdev, NULL);
- iterate_bdevs(fdatawait_one_bdev, NULL);
+ sync_bdevs(false);
+ sync_bdevs(true);
if (unlikely(laptop_mode))
laptop_sync_completion();
}
@@ -136,10 +118,10 @@ static void do_sync_work(struct work_struct *work)
*/
iterate_supers(sync_inodes_one_sb, &nowait);
iterate_supers(sync_fs_one_sb, &nowait);
- iterate_bdevs(fdatawrite_one_bdev, NULL);
+ sync_bdevs(false);
iterate_supers(sync_inodes_one_sb, &nowait);
iterate_supers(sync_fs_one_sb, &nowait);
- iterate_bdevs(fdatawrite_one_bdev, NULL);
+ sync_bdevs(false);
printk("Emergency Sync complete\n");
kfree(work);
}
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 59dffd5ca517..b6b6796e1616 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -56,8 +56,7 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
kobject_get_ownership(kobj, &uid, &gid);
- kn = kernfs_create_dir_ns(parent, kobject_name(kobj),
- S_IRWXU | S_IRUGO | S_IXUGO, uid, gid,
+ kn = kernfs_create_dir_ns(parent, kobject_name(kobj), 0755, uid, gid,
kobj, ns);
if (IS_ERR(kn)) {
if (PTR_ERR(kn) == -EEXIST)
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index d019d6ac6ad0..42dcf96881b6 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -45,6 +45,9 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
ssize_t count;
char *buf;
+ if (WARN_ON_ONCE(!ops->show))
+ return -EINVAL;
+
/* acquire buffer and ensure that it's >= PAGE_SIZE and clear */
count = seq_get_buf(sf, &buf);
if (count < PAGE_SIZE) {
@@ -53,15 +56,9 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
}
memset(buf, 0, PAGE_SIZE);
- /*
- * Invoke show(). Control may reach here via seq file lseek even
- * if @ops->show() isn't implemented.
- */
- if (ops->show) {
- count = ops->show(kobj, of->kn->priv, buf);
- if (count < 0)
- return count;
- }
+ count = ops->show(kobj, of->kn->priv, buf);
+ if (count < 0)
+ return count;
/*
* The code works fine with PAGE_SIZE return but it's likely to
@@ -255,59 +252,74 @@ static const struct kernfs_ops sysfs_bin_kfops_mmap = {
};
int sysfs_add_file_mode_ns(struct kernfs_node *parent,
- const struct attribute *attr, bool is_bin,
- umode_t mode, kuid_t uid, kgid_t gid, const void *ns)
+ const struct attribute *attr, umode_t mode, kuid_t uid,
+ kgid_t gid, const void *ns)
{
+ struct kobject *kobj = parent->priv;
+ const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;
struct lock_class_key *key = NULL;
- const struct kernfs_ops *ops;
+ const struct kernfs_ops *ops = NULL;
struct kernfs_node *kn;
- loff_t size;
-
- if (!is_bin) {
- struct kobject *kobj = parent->priv;
- const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;
-
- /* every kobject with an attribute needs a ktype assigned */
- if (WARN(!sysfs_ops, KERN_ERR
- "missing sysfs attribute operations for kobject: %s\n",
- kobject_name(kobj)))
- return -EINVAL;
-
- if (sysfs_ops->show && sysfs_ops->store) {
- if (mode & SYSFS_PREALLOC)
- ops = &sysfs_prealloc_kfops_rw;
- else
- ops = &sysfs_file_kfops_rw;
- } else if (sysfs_ops->show) {
- if (mode & SYSFS_PREALLOC)
- ops = &sysfs_prealloc_kfops_ro;
- else
- ops = &sysfs_file_kfops_ro;
- } else if (sysfs_ops->store) {
- if (mode & SYSFS_PREALLOC)
- ops = &sysfs_prealloc_kfops_wo;
- else
- ops = &sysfs_file_kfops_wo;
- } else
- ops = &sysfs_file_kfops_empty;
-
- size = PAGE_SIZE;
+
+ /* every kobject with an attribute needs a ktype assigned */
+ if (WARN(!sysfs_ops, KERN_ERR
+ "missing sysfs attribute operations for kobject: %s\n",
+ kobject_name(kobj)))
+ return -EINVAL;
+
+ if (mode & SYSFS_PREALLOC) {
+ if (sysfs_ops->show && sysfs_ops->store)
+ ops = &sysfs_prealloc_kfops_rw;
+ else if (sysfs_ops->show)
+ ops = &sysfs_prealloc_kfops_ro;
+ else if (sysfs_ops->store)
+ ops = &sysfs_prealloc_kfops_wo;
} else {
- struct bin_attribute *battr = (void *)attr;
-
- if (battr->mmap)
- ops = &sysfs_bin_kfops_mmap;
- else if (battr->read && battr->write)
- ops = &sysfs_bin_kfops_rw;
- else if (battr->read)
- ops = &sysfs_bin_kfops_ro;
- else if (battr->write)
- ops = &sysfs_bin_kfops_wo;
- else
- ops = &sysfs_file_kfops_empty;
-
- size = battr->size;
+ if (sysfs_ops->show && sysfs_ops->store)
+ ops = &sysfs_file_kfops_rw;
+ else if (sysfs_ops->show)
+ ops = &sysfs_file_kfops_ro;
+ else if (sysfs_ops->store)
+ ops = &sysfs_file_kfops_wo;
+ }
+
+ if (!ops)
+ ops = &sysfs_file_kfops_empty;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ if (!attr->ignore_lockdep)
+ key = attr->key ?: (struct lock_class_key *)&attr->skey;
+#endif
+
+ kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid,
+ PAGE_SIZE, ops, (void *)attr, ns, key);
+ if (IS_ERR(kn)) {
+ if (PTR_ERR(kn) == -EEXIST)
+ sysfs_warn_dup(parent, attr->name);
+ return PTR_ERR(kn);
}
+ return 0;
+}
+
+int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent,
+ const struct bin_attribute *battr, umode_t mode,
+ kuid_t uid, kgid_t gid, const void *ns)
+{
+ const struct attribute *attr = &battr->attr;
+ struct lock_class_key *key = NULL;
+ const struct kernfs_ops *ops;
+ struct kernfs_node *kn;
+
+ if (battr->mmap)
+ ops = &sysfs_bin_kfops_mmap;
+ else if (battr->read && battr->write)
+ ops = &sysfs_bin_kfops_rw;
+ else if (battr->read)
+ ops = &sysfs_bin_kfops_ro;
+ else if (battr->write)
+ ops = &sysfs_bin_kfops_wo;
+ else
+ ops = &sysfs_file_kfops_empty;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
if (!attr->ignore_lockdep)
@@ -315,7 +327,7 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent,
#endif
kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid,
- size, ops, (void *)attr, ns, key);
+ battr->size, ops, (void *)attr, ns, key);
if (IS_ERR(kn)) {
if (PTR_ERR(kn) == -EEXIST)
sysfs_warn_dup(parent, attr->name);
@@ -340,9 +352,7 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
return -EINVAL;
kobject_get_ownership(kobj, &uid, &gid);
- return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode,
- uid, gid, ns);
-
+ return sysfs_add_file_mode_ns(kobj->sd, attr, attr->mode, uid, gid, ns);
}
EXPORT_SYMBOL_GPL(sysfs_create_file_ns);
@@ -385,8 +395,8 @@ int sysfs_add_file_to_group(struct kobject *kobj,
return -ENOENT;
kobject_get_ownership(kobj, &uid, &gid);
- error = sysfs_add_file_mode_ns(parent, attr, false,
- attr->mode, uid, gid, NULL);
+ error = sysfs_add_file_mode_ns(parent, attr, attr->mode, uid, gid,
+ NULL);
kernfs_put(parent);
return error;
@@ -555,8 +565,8 @@ int sysfs_create_bin_file(struct kobject *kobj,
return -EINVAL;
kobject_get_ownership(kobj, &uid, &gid);
- return sysfs_add_file_mode_ns(kobj->sd, &attr->attr, true,
- attr->attr.mode, uid, gid, NULL);
+ return sysfs_add_bin_file_mode_ns(kobj->sd, attr, attr->attr.mode, uid,
+ gid, NULL);
}
EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
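
Illustrative only: a driver-side binary attribute whose read callback routes through the sysfs_bin_kfops_ro ops chosen by sysfs_add_bin_file_mode_ns() above. The attribute name and size are made up; it would be registered with sysfs_create_bin_file(kobj, &bin_attr_fw_log).

static ssize_t fw_log_read(struct file *filp, struct kobject *kobj,
			   struct bin_attribute *attr, char *buf,
			   loff_t off, size_t count)
{
	/* copy at most @count bytes of the log starting at @off into @buf */
	return 0;
}
static BIN_ATTR_RO(fw_log, 4096);
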
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index f29d62004527..eeb0e3099421 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -61,8 +61,8 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
(*attr)->name, mode);
mode &= SYSFS_PREALLOC | 0664;
- error = sysfs_add_file_mode_ns(parent, *attr, false,
- mode, uid, gid, NULL);
+ error = sysfs_add_file_mode_ns(parent, *attr, mode, uid,
+ gid, NULL);
if (unlikely(error))
break;
}
@@ -90,10 +90,9 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
(*bin_attr)->attr.name, mode);
mode &= SYSFS_PREALLOC | 0664;
- error = sysfs_add_file_mode_ns(parent,
- &(*bin_attr)->attr, true,
- mode,
- uid, gid, NULL);
+ error = sysfs_add_bin_file_mode_ns(parent, *bin_attr,
+ mode, uid, gid,
+ NULL);
if (error)
break;
}
@@ -340,8 +339,8 @@ int sysfs_merge_group(struct kobject *kobj,
kobject_get_ownership(kobj, &uid, &gid);
for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
- error = sysfs_add_file_mode_ns(parent, *attr, false,
- (*attr)->mode, uid, gid, NULL);
+ error = sysfs_add_file_mode_ns(parent, *attr, (*attr)->mode,
+ uid, gid, NULL);
if (error) {
while (--i >= 0)
kernfs_remove_by_name(parent, (*--attr)->name);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 0050cc0c0236..3f28c9af5756 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -28,9 +28,11 @@ void sysfs_warn_dup(struct kernfs_node *parent, const char *name);
* file.c
*/
int sysfs_add_file_mode_ns(struct kernfs_node *parent,
- const struct attribute *attr, bool is_bin,
- umode_t amode, kuid_t uid, kgid_t gid,
- const void *ns);
+ const struct attribute *attr, umode_t amode, kuid_t uid,
+ kgid_t gid, const void *ns);
+int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent,
+ const struct bin_attribute *battr, umode_t mode,
+ kuid_t uid, kgid_t gid, const void *ns);
/*
* symlink.c
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index cc8e2ed155c8..d1def0771a40 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -474,10 +474,8 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
struct sysv_sb_info *sbi;
struct buffer_head *bh;
- if (440 != sizeof (struct v7_super_block))
- panic("V7 FS: bad super-block size");
- if (64 != sizeof (struct sysv_inode))
- panic("sysv fs: bad i-node size");
+ BUILD_BUG_ON(sizeof(struct v7_super_block) != 440);
+ BUILD_BUG_ON(sizeof(struct sysv_inode) != 64);
sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
if (!sbi)
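
The change above turns the old runtime panics into compile-time checks: if an on-disk structure ever changes size, the build fails instead of the kernel panicking at mount time. A minimal illustration with a made-up structure:

struct v7ish_super { char data[440]; };

static void check_layout(void)
{
	BUILD_BUG_ON(sizeof(struct v7ish_super) != 440);	/* compiles */
	/* BUILD_BUG_ON(sizeof(struct v7ish_super) != 512); would fail to build */
}
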
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 1261e8b41edb..925a621b432e 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -432,7 +432,8 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent,
if (unlikely(!inode))
return failed_creating(dentry);
- inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ /* Do not set bits for OTH */
+ inode->i_mode = S_IFDIR | S_IRWXU | S_IRUSR | S_IRGRP | S_IXUSR | S_IXGRP;
inode->i_op = ops;
inode->i_fop = &simple_dir_operations;
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 22be7aeb96c4..c57b46a352d8 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -82,5 +82,4 @@ const struct fscrypt_operations ubifs_crypt_operations = {
.get_context = ubifs_crypt_get_context,
.set_context = ubifs_crypt_set_context,
.empty_dir = ubifs_crypt_empty_dir,
- .max_namelen = UBIFS_MAX_NLEN,
};
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index f1094cdcd6cd..46d697172197 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -47,8 +47,7 @@ unsigned int udf_get_last_session(struct super_block *sb)
unsigned long udf_get_last_block(struct super_block *sb)
{
- struct block_device *bdev = sb->s_bdev;
- struct cdrom_device_info *cdi = disk_to_cdi(bdev->bd_disk);
+ struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk);
unsigned long lblock = 0;
/*
@@ -56,7 +55,7 @@ unsigned long udf_get_last_block(struct super_block *sb)
* Try using the device size...
*/
if (!cdi || cdrom_get_last_written(cdi, &lblock) || lblock == 0)
- lblock = i_size_read(bdev->bd_inode) >> sb->s_blocksize_bits;
+ lblock = sb_bdev_nr_blocks(sb);
if (lblock)
return lblock - 1;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index b2d7c57d0688..34247fba6df9 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1175,8 +1175,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
struct udf_inode_info *vati;
uint32_t pos;
struct virtualAllocationTable20 *vat20;
- sector_t blocks = i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits;
+ sector_t blocks = sb_bdev_nr_blocks(sb);
udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block);
if (!sbi->s_vat_inode &&
@@ -1838,8 +1837,7 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block,
int ret;
if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
- udf_fixed_to_variable(block) >=
- i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits)
+ udf_fixed_to_variable(block) >= sb_bdev_nr_blocks(sb))
return -EAGAIN;
bh = udf_read_tagged(sb, block, block, &ident);
@@ -1901,8 +1899,7 @@ static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock,
last[last_count++] = *lastblock - 152;
for (i = 0; i < last_count; i++) {
- if (last[i] >= i_size_read(sb->s_bdev->bd_inode) >>
- sb->s_blocksize_bits)
+ if (last[i] >= sb_bdev_nr_blocks(sb))
continue;
ret = udf_check_anchor_block(sb, last[i], fileset);
if (ret != -EAGAIN) {
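
Sketch of the helper used in the UDF hunks above (assumption: the real definition lives in linux/blkdev.h alongside bdev_nr_bytes()): the number of filesystem-blocksize blocks that fit on the backing device, replacing the open-coded i_size_read() shifts.

static inline sector_t sb_bdev_nr_blocks(struct super_block *sb)
{
	return bdev_nr_sectors(sb->s_bdev) >>
		(sb->s_blocksize_bits - SECTOR_SHIFT);
}
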
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 54da6d717a06..b987dc2c6851 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -72,10 +72,6 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags)
/*
* Zone interfaces
*/
-
-#define kmem_zone kmem_cache
-#define kmem_zone_t struct kmem_cache
-
static inline struct page *
kmem_to_page(void *addr)
{
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 005abfd9fd34..d7d875cef07a 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -850,7 +850,7 @@ xfs_ag_shrink_space(
if (err2 != -ENOSPC)
goto resv_err;
- __xfs_bmap_add_free(*tpp, args.fsbno, delta, NULL, true);
+ __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, true);
/*
* Roll the transaction before trying to re-init the per-ag
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 4c6f9045baca..3f597cad2c33 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -116,23 +116,29 @@ void xfs_perag_put(struct xfs_perag *pag);
/*
* Perag iteration APIs
- *
- * XXX: for_each_perag_range() usage really needs an iterator to clean up when
- * we terminate at end_agno because we may have taken a reference to the perag
- * beyond end_agno. Right now callers have to be careful to catch and clean that
- * up themselves. This is not necessary for the callers of for_each_perag() and
- * for_each_perag_from() because they terminate at sb_agcount where there are
- * no perag structures in tree beyond end_agno.
*/
-#define for_each_perag_range(mp, next_agno, end_agno, pag) \
- for ((pag) = xfs_perag_get((mp), (next_agno)); \
- (pag) != NULL && (next_agno) <= (end_agno); \
- (next_agno) = (pag)->pag_agno + 1, \
- xfs_perag_put(pag), \
- (pag) = xfs_perag_get((mp), (next_agno)))
+static inline struct xfs_perag *
+xfs_perag_next(
+ struct xfs_perag *pag,
+ xfs_agnumber_t *agno,
+ xfs_agnumber_t end_agno)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+
+ *agno = pag->pag_agno + 1;
+ xfs_perag_put(pag);
+ if (*agno > end_agno)
+ return NULL;
+ return xfs_perag_get(mp, *agno);
+}
+
+#define for_each_perag_range(mp, agno, end_agno, pag) \
+ for ((pag) = xfs_perag_get((mp), (agno)); \
+ (pag) != NULL; \
+ (pag) = xfs_perag_next((pag), &(agno), (end_agno)))
-#define for_each_perag_from(mp, next_agno, pag) \
- for_each_perag_range((mp), (next_agno), (mp)->m_sb.sb_agcount, (pag))
+#define for_each_perag_from(mp, agno, pag) \
+ for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount - 1, (pag))
#define for_each_perag(mp, agno, pag) \
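
Hypothetical usage sketch of the reworked iterator above: because xfs_perag_next() drops the reference as it advances and returns NULL once end_agno is passed, a caller no longer has to clean up a leftover perag reference when the walk terminates early. The function name is made up.

static void walk_ags(struct xfs_mount *mp, xfs_agnumber_t start,
		     xfs_agnumber_t end)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno = start;

	for_each_perag_range(mp, agno, end, pag) {
		/* operate on pag; the reference is held for this body only */
	}
}
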
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index 2aa2b3484c28..fe94058d4e9e 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -91,7 +91,8 @@ xfs_ag_resv_critical(
trace_xfs_ag_resv_critical(pag, type, avail);
/* Critically low if less than 10% or max btree height remains. */
- return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS,
+ return XFS_TEST_ERROR(avail < orig / 10 ||
+ avail < pag->pag_mount->m_agbtree_maxlevels,
pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
}
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 95157f5a5a6c..353e53b892e6 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -27,7 +27,7 @@
#include "xfs_ag_resv.h"
#include "xfs_bmap.h"
-extern kmem_zone_t *xfs_bmap_free_item_zone;
+struct kmem_cache *xfs_extfree_item_cache;
struct workqueue_struct *xfs_alloc_wq;
@@ -426,8 +426,8 @@ xfs_alloc_fix_len(
*/
STATIC int /* error code */
xfs_alloc_fixup_trees(
- xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */
- xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */
+ struct xfs_btree_cur *cnt_cur, /* cursor for by-size btree */
+ struct xfs_btree_cur *bno_cur, /* cursor for by-block btree */
xfs_agblock_t fbno, /* starting block of free extent */
xfs_extlen_t flen, /* length of free extent */
xfs_agblock_t rbno, /* starting block of returned extent */
@@ -488,8 +488,8 @@ xfs_alloc_fixup_trees(
struct xfs_btree_block *bnoblock;
struct xfs_btree_block *cntblock;
- bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
- cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
+ bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
+ cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);
if (XFS_IS_CORRUPT(mp,
bnoblock->bb_numrecs !=
@@ -1200,8 +1200,8 @@ xfs_alloc_ag_vextent_exact(
xfs_alloc_arg_t *args) /* allocation argument structure */
{
struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
- xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
- xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
+ struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
+ struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
int error;
xfs_agblock_t fbno; /* start block of found extent */
xfs_extlen_t flen; /* length of found extent */
@@ -1512,7 +1512,7 @@ xfs_alloc_ag_vextent_lastblock(
* than minlen.
*/
if (*len || args->alignment > 1) {
- acur->cnt->bc_ptrs[0] = 1;
+ acur->cnt->bc_levels[0].ptr = 1;
do {
error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
if (error)
@@ -1658,8 +1658,8 @@ xfs_alloc_ag_vextent_size(
xfs_alloc_arg_t *args) /* allocation argument structure */
{
struct xfs_agf *agf = args->agbp->b_addr;
- xfs_btree_cur_t *bno_cur; /* cursor for bno btree */
- xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */
+ struct xfs_btree_cur *bno_cur; /* cursor for bno btree */
+ struct xfs_btree_cur *cnt_cur; /* cursor for cnt btree */
int error; /* error result */
xfs_agblock_t fbno; /* start of found freespace */
xfs_extlen_t flen; /* length of found freespace */
@@ -2190,14 +2190,15 @@ xfs_free_ag_extent(
*/
/*
- * Compute and fill in value of m_ag_maxlevels.
+ * Compute and fill in value of m_alloc_maxlevels.
*/
void
xfs_alloc_compute_maxlevels(
xfs_mount_t *mp) /* file system mount structure */
{
- mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
+ mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
(mp->m_sb.sb_agblocks + 1) / 2);
+ ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk());
}
/*
@@ -2255,14 +2256,14 @@ xfs_alloc_min_freelist(
const uint8_t *levels = pag ? pag->pagf_levels : fake_levels;
unsigned int min_free;
- ASSERT(mp->m_ag_maxlevels > 0);
+ ASSERT(mp->m_alloc_maxlevels > 0);
/* space needed by-bno freespace btree */
min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
- mp->m_ag_maxlevels);
+ mp->m_alloc_maxlevels);
/* space needed by-size freespace btree */
min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
- mp->m_ag_maxlevels);
+ mp->m_alloc_maxlevels);
/* space needed reverse mapping used space btree */
if (xfs_has_rmapbt(mp))
min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
@@ -2439,7 +2440,7 @@ xfs_agfl_reset(
/*
* Defer an AGFL block free. This is effectively equivalent to
- * xfs_bmap_add_free() with some special handling particular to AGFL blocks.
+ * xfs_free_extent_later() with some special handling particular to AGFL blocks.
*
* Deferring AGFL frees helps prevent log reservation overruns due to too many
* allocation operations in a transaction. AGFL frees are prone to this problem
@@ -2458,21 +2459,74 @@ xfs_defer_agfl_block(
struct xfs_mount *mp = tp->t_mountp;
struct xfs_extent_free_item *new; /* new element */
- ASSERT(xfs_bmap_free_item_zone != NULL);
+ ASSERT(xfs_extfree_item_cache != NULL);
ASSERT(oinfo != NULL);
- new = kmem_cache_alloc(xfs_bmap_free_item_zone,
+ new = kmem_cache_zalloc(xfs_extfree_item_cache,
GFP_KERNEL | __GFP_NOFAIL);
new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
new->xefi_blockcount = 1;
- new->xefi_oinfo = *oinfo;
- new->xefi_skip_discard = false;
+ new->xefi_owner = oinfo->oi_owner;
trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
}
+/*
+ * Add the extent to the list of extents to be free at transaction end.
+ * The list is maintained sorted (by block number).
+ */
+void
+__xfs_free_extent_later(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ const struct xfs_owner_info *oinfo,
+ bool skip_discard)
+{
+ struct xfs_extent_free_item *new; /* new element */
+#ifdef DEBUG
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+
+ ASSERT(bno != NULLFSBLOCK);
+ ASSERT(len > 0);
+ ASSERT(len <= MAXEXTLEN);
+ ASSERT(!isnullstartblock(bno));
+ agno = XFS_FSB_TO_AGNO(mp, bno);
+ agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(agbno < mp->m_sb.sb_agblocks);
+ ASSERT(len < mp->m_sb.sb_agblocks);
+ ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
+#endif
+ ASSERT(xfs_extfree_item_cache != NULL);
+
+ new = kmem_cache_zalloc(xfs_extfree_item_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+ new->xefi_startblock = bno;
+ new->xefi_blockcount = (xfs_extlen_t)len;
+ if (skip_discard)
+ new->xefi_flags |= XFS_EFI_SKIP_DISCARD;
+ if (oinfo) {
+ ASSERT(oinfo->oi_offset == 0);
+
+ if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+ new->xefi_flags |= XFS_EFI_ATTR_FORK;
+ if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+ new->xefi_flags |= XFS_EFI_BMBT_BLOCK;
+ new->xefi_owner = oinfo->oi_owner;
+ } else {
+ new->xefi_owner = XFS_RMAP_OWN_NULL;
+ }
+ trace_xfs_bmap_free_defer(tp->t_mountp,
+ XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
+ XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
+}
+
#ifdef DEBUG
/*
* Check if an AGF has a free extent record whose length is equal to
@@ -2903,13 +2957,16 @@ xfs_agf_verify(
if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > mp->m_ag_maxlevels ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > mp->m_ag_maxlevels)
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) >
+ mp->m_alloc_maxlevels ||
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) >
+ mp->m_alloc_maxlevels)
return __this_address;
if (xfs_has_rmapbt(mp) &&
(be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > mp->m_rmap_maxlevels))
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) >
+ mp->m_rmap_maxlevels))
return __this_address;
if (xfs_has_rmapbt(mp) &&
@@ -3495,3 +3552,20 @@ xfs_agfl_walk(
return 0;
}
+
+int __init
+xfs_extfree_intent_init_cache(void)
+{
+ xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
+ sizeof(struct xfs_extent_free_item),
+ 0, 0, NULL);
+
+ return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_extfree_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_extfree_item_cache);
+ xfs_extfree_item_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index df4aefaf0046..1c14a0b1abea 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -98,7 +98,7 @@ unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
struct xfs_perag *pag);
/*
- * Compute and fill in value of m_ag_maxlevels.
+ * Compute and fill in value of m_alloc_maxlevels.
*/
void
xfs_alloc_compute_maxlevels(
@@ -248,4 +248,40 @@ xfs_buf_to_agfl_bno(
return bp->b_addr;
}
+void __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
+ xfs_filblks_t len, const struct xfs_owner_info *oinfo,
+ bool skip_discard);
+
+/*
+ * List of extents to be free "later".
+ * The list is kept sorted on xbf_startblock.
+ */
+struct xfs_extent_free_item {
+ struct list_head xefi_list;
+ uint64_t xefi_owner;
+ xfs_fsblock_t xefi_startblock;/* starting fs block number */
+ xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
+ unsigned int xefi_flags;
+};
+
+#define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */
+#define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */
+#define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */
+
+static inline void
+xfs_free_extent_later(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ const struct xfs_owner_info *oinfo)
+{
+ __xfs_free_extent_later(tp, bno, len, oinfo, false);
+}
+
+
+extern struct kmem_cache *xfs_extfree_item_cache;
+
+int __init xfs_extfree_intent_init_cache(void);
+void xfs_extfree_intent_destroy_cache(void);
+
#endif /* __XFS_ALLOC_H__ */
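
Sketch (not from the patch) of the common caller pattern for the renamed helper: defer freeing an extent and let the deferred-ops machinery turn it into EFI/EFD log items when the transaction rolls. Passing a NULL owner info maps to XFS_RMAP_OWN_NULL, as the __xfs_free_extent_later() body shows.

static void defer_free_example(struct xfs_trans *tp, xfs_fsblock_t bno,
			       xfs_filblks_t len)
{
	/* thin wrapper over __xfs_free_extent_later(..., false) */
	xfs_free_extent_later(tp, bno, len, NULL);
}
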
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 6746fd735550..8c9f73cc0bee 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -20,6 +20,7 @@
#include "xfs_trans.h"
#include "xfs_ag.h"
+static struct kmem_cache *xfs_allocbt_cur_cache;
STATIC struct xfs_btree_cur *
xfs_allocbt_dup_cursor(
@@ -316,7 +317,7 @@ xfs_allocbt_verify(
if (pag && pag->pagf_init) {
if (level >= pag->pagf_levels[btnum])
return __this_address;
- } else if (level >= mp->m_ag_maxlevels)
+ } else if (level >= mp->m_alloc_maxlevels)
return __this_address;
return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
@@ -477,12 +478,8 @@ xfs_allocbt_init_common(
ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
- cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
-
- cur->bc_tp = tp;
- cur->bc_mp = mp;
- cur->bc_btnum = btnum;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
+ cur = xfs_btree_alloc_cursor(mp, tp, btnum, mp->m_alloc_maxlevels,
+ xfs_allocbt_cur_cache);
cur->bc_ag.abt.active = false;
if (btnum == XFS_BTNUM_CNT) {
@@ -571,6 +568,17 @@ xfs_allocbt_commit_staged_btree(
}
}
+/* Calculate number of records in an alloc btree block. */
+static inline unsigned int
+xfs_allocbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(xfs_alloc_rec_t);
+ return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
+}
+
/*
* Calculate number of records in an alloc btree block.
*/
@@ -581,10 +589,26 @@ xfs_allocbt_maxrecs(
int leaf)
{
blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
+ return xfs_allocbt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(xfs_alloc_rec_t);
- return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
+/* Free space btrees are at their largest when every other block is free. */
+#define XFS_MAX_FREESP_RECORDS ((XFS_MAX_AG_BLOCKS + 1) / 2)
+
+/* Compute the max possible height for free space btrees. */
+unsigned int
+xfs_allocbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN,
+ XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN);
+
+ minrecs[0] = xfs_allocbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_allocbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_FREESP_RECORDS);
}
/* Calculate the freespace btree size for some records. */
@@ -595,3 +619,22 @@ xfs_allocbt_calc_size(
{
return xfs_btree_calc_size(mp->m_alloc_mnr, len);
}
+
+int __init
+xfs_allocbt_init_cur_cache(void)
+{
+ xfs_allocbt_cur_cache = kmem_cache_create("xfs_bnobt_cur",
+ xfs_btree_cur_sizeof(xfs_allocbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_allocbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_allocbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_allocbt_cur_cache);
+ xfs_allocbt_cur_cache = NULL;
+}
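
For intuition about the maxlevels helpers above, xfs_btree_compute_maxlevels() is essentially a ceiling-log computation with the minimum record counts as the radix: how many levels are needed before a single block can cover "len" records when every block is only half full. A sketch under that assumption, not the kernel's exact code:

static unsigned int compute_maxlevels(const unsigned int *minrecs,
				      unsigned long long len)
{
	unsigned long long	maxblocks;
	unsigned int		level;

	maxblocks = (len + minrecs[0] - 1) / minrecs[0];	/* leaf level */
	for (level = 1; maxblocks > 1; level++)
		maxblocks = (maxblocks + minrecs[1] - 1) / minrecs[1];
	return level;
}
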
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index 2f6b816aaf9f..45df893ef6bb 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -60,4 +60,9 @@ extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
void xfs_allocbt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, struct xfs_buf *agbp);
+unsigned int xfs_allocbt_maxlevels_ondisk(void);
+
+int __init xfs_allocbt_init_cur_cache(void);
+void xfs_allocbt_destroy_cur_cache(void);
+
#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index e1d11e314228..014daa8c542d 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -770,7 +770,7 @@ xfs_attr_fork_remove(
ASSERT(ip->i_afp->if_nextents == 0);
xfs_idestroy_fork(ip->i_afp);
- kmem_cache_free(xfs_ifork_zone, ip->i_afp);
+ kmem_cache_free(xfs_ifork_cache, ip->i_afp);
ip->i_afp = NULL;
ip->i_forkoff = 0;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index b48230f1a361..4dccd4d90622 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -37,8 +37,7 @@
#include "xfs_icache.h"
#include "xfs_iomap.h"
-
-kmem_zone_t *xfs_bmap_free_item_zone;
+struct kmem_cache *xfs_bmap_intent_cache;
/*
* Miscellaneous helper functions
@@ -93,6 +92,7 @@ xfs_bmap_compute_maxlevels(
maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
}
mp->m_bm_maxlevels[whichfork] = level;
+ ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk());
}
unsigned int
@@ -239,11 +239,11 @@ xfs_bmap_get_bp(
if (!cur)
return NULL;
- for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
- if (!cur->bc_bufs[i])
+ for (i = 0; i < cur->bc_maxlevels; i++) {
+ if (!cur->bc_levels[i].bp)
break;
- if (xfs_buf_daddr(cur->bc_bufs[i]) == bno)
- return cur->bc_bufs[i];
+ if (xfs_buf_daddr(cur->bc_levels[i].bp) == bno)
+ return cur->bc_levels[i].bp;
}
/* Chase down all the log items to see if the bp is there */
@@ -316,7 +316,7 @@ xfs_check_block(
*/
STATIC void
xfs_bmap_check_leaf_extents(
- xfs_btree_cur_t *cur, /* btree cursor or null */
+ struct xfs_btree_cur *cur, /* btree cursor or null */
xfs_inode_t *ip, /* incore inode pointer */
int whichfork) /* data or attr fork */
{
@@ -522,56 +522,6 @@ xfs_bmap_validate_ret(
#endif /* DEBUG */
/*
- * bmap free list manipulation functions
- */
-
-/*
- * Add the extent to the list of extents to be free at transaction end.
- * The list is maintained sorted (by block number).
- */
-void
-__xfs_bmap_add_free(
- struct xfs_trans *tp,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
- const struct xfs_owner_info *oinfo,
- bool skip_discard)
-{
- struct xfs_extent_free_item *new; /* new element */
-#ifdef DEBUG
- struct xfs_mount *mp = tp->t_mountp;
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
-
- ASSERT(bno != NULLFSBLOCK);
- ASSERT(len > 0);
- ASSERT(len <= MAXEXTLEN);
- ASSERT(!isnullstartblock(bno));
- agno = XFS_FSB_TO_AGNO(mp, bno);
- agbno = XFS_FSB_TO_AGBNO(mp, bno);
- ASSERT(agno < mp->m_sb.sb_agcount);
- ASSERT(agbno < mp->m_sb.sb_agblocks);
- ASSERT(len < mp->m_sb.sb_agblocks);
- ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
-#endif
- ASSERT(xfs_bmap_free_item_zone != NULL);
-
- new = kmem_cache_alloc(xfs_bmap_free_item_zone,
- GFP_KERNEL | __GFP_NOFAIL);
- new->xefi_startblock = bno;
- new->xefi_blockcount = (xfs_extlen_t)len;
- if (oinfo)
- new->xefi_oinfo = *oinfo;
- else
- new->xefi_oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
- new->xefi_skip_discard = skip_discard;
- trace_xfs_bmap_free_defer(tp->t_mountp,
- XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
- XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
-}
-
-/*
* Inode fork format manipulation functions
*/
@@ -625,12 +575,12 @@ xfs_bmap_btree_to_extents(
if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
return error;
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
- xfs_bmap_add_free(cur->bc_tp, cbno, 1, &oinfo);
+ xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo);
ip->i_nblocks--;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
xfs_trans_binval(tp, cbp);
- if (cur->bc_bufs[0] == cbp)
- cur->bc_bufs[0] = NULL;
+ if (cur->bc_levels[0].bp == cbp)
+ cur->bc_levels[0].bp = NULL;
xfs_iroot_realloc(ip, -1, whichfork);
ASSERT(ifp->if_broot == NULL);
ifp->if_format = XFS_DINODE_FMT_EXTENTS;
@@ -925,7 +875,7 @@ xfs_bmap_add_attrfork_btree(
int *flags) /* inode logging flags */
{
struct xfs_btree_block *block = ip->i_df.if_broot;
- xfs_btree_cur_t *cur; /* btree cursor */
+ struct xfs_btree_cur *cur; /* btree cursor */
int error; /* error return value */
xfs_mount_t *mp; /* file system mount struct */
int stat; /* newroot status */
@@ -968,7 +918,7 @@ xfs_bmap_add_attrfork_extents(
struct xfs_inode *ip, /* incore inode pointer */
int *flags) /* inode logging flags */
{
- xfs_btree_cur_t *cur; /* bmap btree cursor */
+ struct xfs_btree_cur *cur; /* bmap btree cursor */
int error; /* error return value */
if (ip->i_df.if_nextents * sizeof(struct xfs_bmbt_rec) <=
@@ -1988,11 +1938,11 @@ xfs_bmap_add_extent_unwritten_real(
xfs_inode_t *ip, /* incore inode pointer */
int whichfork,
struct xfs_iext_cursor *icur,
- xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
+ struct xfs_btree_cur **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
int *logflagsp) /* inode logging flags */
{
- xfs_btree_cur_t *cur; /* btree cursor */
+ struct xfs_btree_cur *cur; /* btree cursor */
int error; /* error return value */
int i; /* temp state */
struct xfs_ifork *ifp; /* inode fork pointer */
@@ -5045,7 +4995,7 @@ xfs_bmap_del_extent_real(
xfs_inode_t *ip, /* incore inode pointer */
xfs_trans_t *tp, /* current transaction pointer */
struct xfs_iext_cursor *icur,
- xfs_btree_cur_t *cur, /* if null, not a btree */
+ struct xfs_btree_cur *cur, /* if null, not a btree */
xfs_bmbt_irec_t *del, /* data to remove from extents */
int *logflagsp, /* inode logging flags */
int whichfork, /* data or attr fork */
@@ -5296,7 +5246,7 @@ xfs_bmap_del_extent_real(
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
xfs_refcount_decrease_extent(tp, del);
} else {
- __xfs_bmap_add_free(tp, del->br_startblock,
+ __xfs_free_extent_later(tp, del->br_startblock,
del->br_blockcount, NULL,
(bflags & XFS_BMAPI_NODISCARD) ||
del->br_state == XFS_EXT_UNWRITTEN);
@@ -6189,7 +6139,7 @@ __xfs_bmap_add(
bmap->br_blockcount,
bmap->br_state);
- bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_NOFS);
+ bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
INIT_LIST_HEAD(&bi->bi_list);
bi->bi_type = type;
bi->bi_owner = ip;
@@ -6300,3 +6250,20 @@ xfs_bmap_validate_extent(
return __this_address;
return NULL;
}
+
+int __init
+xfs_bmap_intent_init_cache(void)
+{
+ xfs_bmap_intent_cache = kmem_cache_create("xfs_bmap_intent",
+ sizeof(struct xfs_bmap_intent),
+ 0, 0, NULL);
+
+ return xfs_bmap_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_bmap_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_bmap_intent_cache);
+ xfs_bmap_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 67641f669918..03d9aaf87413 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -13,8 +13,6 @@ struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
-extern kmem_zone_t *xfs_bmap_free_item_zone;
-
/*
* Argument structure for xfs_bmap_alloc.
*/
@@ -44,19 +42,6 @@ struct xfs_bmalloca {
int flags;
};
-/*
- * List of extents to be free "later".
- * The list is kept sorted on xbf_startblock.
- */
-struct xfs_extent_free_item
-{
- xfs_fsblock_t xefi_startblock;/* starting fs block number */
- xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
- bool xefi_skip_discard;
- struct list_head xefi_list;
- struct xfs_owner_info xefi_oinfo; /* extent owner */
-};
-
#define XFS_BMAP_MAX_NMAP 4
/*
@@ -189,9 +174,6 @@ unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp,
struct xfs_inode *ip, int whichfork);
-void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno,
- xfs_filblks_t len, const struct xfs_owner_info *oinfo,
- bool skip_discard);
void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
@@ -239,16 +221,6 @@ int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
struct xfs_bmbt_irec *new, int *logflagsp);
-static inline void
-xfs_bmap_add_free(
- struct xfs_trans *tp,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
- const struct xfs_owner_info *oinfo)
-{
- __xfs_bmap_add_free(tp, bno, len, oinfo, false);
-}
-
enum xfs_bmap_intent_type {
XFS_BMAP_MAP = 1,
XFS_BMAP_UNMAP,
@@ -257,8 +229,8 @@ enum xfs_bmap_intent_type {
struct xfs_bmap_intent {
struct list_head bi_list;
enum xfs_bmap_intent_type bi_type;
- struct xfs_inode *bi_owner;
int bi_whichfork;
+ struct xfs_inode *bi_owner;
struct xfs_bmbt_irec bi_bmap;
};
@@ -290,4 +262,9 @@ int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock,
int flags);
+extern struct kmem_cache *xfs_bmap_intent_cache;
+
+int __init xfs_bmap_intent_init_cache(void);
+void xfs_bmap_intent_destroy_cache(void);
+
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 72444b8b38a6..453309fc85f2 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -22,6 +22,8 @@
#include "xfs_trace.h"
#include "xfs_rmap.h"
+static struct kmem_cache *xfs_bmbt_cur_cache;
+
/*
* Convert on-disk form of btree root to in-memory form.
*/
@@ -286,7 +288,7 @@ xfs_bmbt_free_block(
struct xfs_owner_info oinfo;
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
- xfs_bmap_add_free(cur->bc_tp, fsbno, 1, &oinfo);
+ xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo);
ip->i_nblocks--;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -552,13 +554,9 @@ xfs_bmbt_init_cursor(
struct xfs_btree_cur *cur;
ASSERT(whichfork != XFS_COW_FORK);
- cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
-
- cur->bc_tp = tp;
- cur->bc_mp = mp;
+ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP,
+ mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache);
cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
- cur->bc_btnum = XFS_BTNUM_BMAP;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2);
cur->bc_ops = &xfs_bmbt_ops;
@@ -575,6 +573,17 @@ xfs_bmbt_init_cursor(
return cur;
}
+/* Calculate number of records in a block mapping btree block. */
+static inline unsigned int
+xfs_bmbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(xfs_bmbt_rec_t);
+ return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
+}
+
/*
* Calculate number of records in a bmap btree block.
*/
@@ -585,10 +594,24 @@ xfs_bmbt_maxrecs(
int leaf)
{
blocklen -= XFS_BMBT_BLOCK_LEN(mp);
+ return xfs_bmbt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(xfs_bmbt_rec_t);
- return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
+/* Compute the max possible height for block mapping btrees. */
+unsigned int
+xfs_bmbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN,
+ XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN);
+
+ minrecs[0] = xfs_bmbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2;
+
+ /* One extra level for the inode root. */
+ return xfs_btree_compute_maxlevels(minrecs, MAXEXTNUM) + 1;
}
/*
@@ -654,3 +677,22 @@ xfs_bmbt_calc_size(
{
return xfs_btree_calc_size(mp->m_bmap_dmnr, len);
}
+
+int __init
+xfs_bmbt_init_cur_cache(void)
+{
+ xfs_bmbt_cur_cache = kmem_cache_create("xfs_bmbt_cur",
+ xfs_btree_cur_sizeof(xfs_bmbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_bmbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_bmbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_bmbt_cur_cache);
+ xfs_bmbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 729e3bc569be..3e7a40a83835 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -110,4 +110,9 @@ extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
+unsigned int xfs_bmbt_maxlevels_ondisk(void);
+
+int __init xfs_bmbt_init_cur_cache(void);
+void xfs_bmbt_destroy_cur_cache(void);
+
#endif /* __XFS_BMAP_BTREE_H__ */
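
Context for the xfs_btree.c hunks below: the fixed-size bc_bufs[]/bc_ptrs[]/bc_ra[] arrays in struct xfs_btree_cur are replaced by a per-level structure sized at cursor allocation time via xfs_btree_cur_sizeof(). A sketch of the presumed shape (the real definition lives in xfs_btree.h, which is not shown in this excerpt):

struct xfs_btree_level {
	struct xfs_buf	*bp;	/* buffer for this level, if any */
	uint16_t	ptr;	/* key/record index within the block */
	uint16_t	ra;	/* readahead state (XFS_BTCUR_*RA) */
};
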
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 298395481713..b4e19aacb9de 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -22,11 +22,11 @@
#include "xfs_log.h"
#include "xfs_btree_staging.h"
#include "xfs_ag.h"
-
-/*
- * Cursor allocation zone.
- */
-kmem_zone_t *xfs_btree_cur_zone;
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount_btree.h"
/*
* Btree magic numbers.
@@ -367,8 +367,8 @@ xfs_btree_del_cursor(
* way we won't have initialized all the entries down to 0.
*/
for (i = 0; i < cur->bc_nlevels; i++) {
- if (cur->bc_bufs[i])
- xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
+ if (cur->bc_levels[i].bp)
+ xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp);
else if (!error)
break;
}
@@ -379,7 +379,7 @@ xfs_btree_del_cursor(
kmem_free(cur->bc_ops);
if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag)
xfs_perag_put(cur->bc_ag.pag);
- kmem_cache_free(xfs_btree_cur_zone, cur);
+ kmem_cache_free(cur->bc_cache, cur);
}
/*
@@ -388,14 +388,14 @@ xfs_btree_del_cursor(
*/
int /* error */
xfs_btree_dup_cursor(
- xfs_btree_cur_t *cur, /* input cursor */
- xfs_btree_cur_t **ncur) /* output cursor */
+ struct xfs_btree_cur *cur, /* input cursor */
+ struct xfs_btree_cur **ncur) /* output cursor */
{
struct xfs_buf *bp; /* btree block's buffer pointer */
int error; /* error return value */
int i; /* level number of btree block */
xfs_mount_t *mp; /* mount structure for filesystem */
- xfs_btree_cur_t *new; /* new cursor value */
+ struct xfs_btree_cur *new; /* new cursor value */
xfs_trans_t *tp; /* transaction pointer, can be NULL */
tp = cur->bc_tp;
@@ -415,9 +415,9 @@ xfs_btree_dup_cursor(
* For each level current, re-get the buffer and copy the ptr value.
*/
for (i = 0; i < new->bc_nlevels; i++) {
- new->bc_ptrs[i] = cur->bc_ptrs[i];
- new->bc_ra[i] = cur->bc_ra[i];
- bp = cur->bc_bufs[i];
+ new->bc_levels[i].ptr = cur->bc_levels[i].ptr;
+ new->bc_levels[i].ra = cur->bc_levels[i].ra;
+ bp = cur->bc_levels[i].bp;
if (bp) {
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
xfs_buf_daddr(bp), mp->m_bsize,
@@ -429,7 +429,7 @@ xfs_btree_dup_cursor(
return error;
}
}
- new->bc_bufs[i] = bp;
+ new->bc_levels[i].bp = bp;
}
*ncur = new;
return 0;
@@ -681,7 +681,7 @@ xfs_btree_get_block(
return xfs_btree_get_iroot(cur);
}
- *bpp = cur->bc_bufs[level];
+ *bpp = cur->bc_levels[level].bp;
return XFS_BUF_TO_BLOCK(*bpp);
}
@@ -691,7 +691,7 @@ xfs_btree_get_block(
*/
STATIC int /* success=1, failure=0 */
xfs_btree_firstrec(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
int level) /* level to change */
{
struct xfs_btree_block *block; /* generic btree block pointer */
@@ -711,7 +711,7 @@ xfs_btree_firstrec(
/*
* Set the ptr value to 1, that's the first record/key.
*/
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
return 1;
}
@@ -721,7 +721,7 @@ xfs_btree_firstrec(
*/
STATIC int /* success=1, failure=0 */
xfs_btree_lastrec(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
int level) /* level to change */
{
struct xfs_btree_block *block; /* generic btree block pointer */
@@ -741,7 +741,7 @@ xfs_btree_lastrec(
/*
* Set the ptr value to numrecs, that's the last record/key.
*/
- cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
+ cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs);
return 1;
}
@@ -922,11 +922,11 @@ xfs_btree_readahead(
(lev == cur->bc_nlevels - 1))
return 0;
- if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
+ if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra)
return 0;
- cur->bc_ra[lev] |= lr;
- block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
+ cur->bc_levels[lev].ra |= lr;
+ block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
return xfs_btree_readahead_lblock(cur, lr, block);
@@ -985,28 +985,28 @@ xfs_btree_readahead_ptr(
*/
STATIC void
xfs_btree_setbuf(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
int lev, /* level in btree */
struct xfs_buf *bp) /* new buffer to set */
{
struct xfs_btree_block *b; /* btree block */
- if (cur->bc_bufs[lev])
- xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
- cur->bc_bufs[lev] = bp;
- cur->bc_ra[lev] = 0;
+ if (cur->bc_levels[lev].bp)
+ xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp);
+ cur->bc_levels[lev].bp = bp;
+ cur->bc_levels[lev].ra = 0;
b = XFS_BUF_TO_BLOCK(bp);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
- cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+ cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
- cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+ cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
} else {
if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
- cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+ cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
- cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+ cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
}
}
@@ -1548,7 +1548,7 @@ xfs_btree_increment(
#endif
/* We're done if we remain in the block after the increment. */
- if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
+ if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block))
goto out1;
/* Fail if we just went off the right edge of the tree. */
@@ -1571,7 +1571,7 @@ xfs_btree_increment(
goto error0;
#endif
- if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
+ if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block))
break;
/* Read-ahead the right block for the next loop. */
@@ -1598,14 +1598,14 @@ xfs_btree_increment(
for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
union xfs_btree_ptr *ptrp;
- ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+ ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
--lev;
error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
if (error)
goto error0;
xfs_btree_setbuf(cur, lev, bp);
- cur->bc_ptrs[lev] = 1;
+ cur->bc_levels[lev].ptr = 1;
}
out1:
*stat = 1;
@@ -1641,7 +1641,7 @@ xfs_btree_decrement(
xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
/* We're done if we remain in the block after the decrement. */
- if (--cur->bc_ptrs[level] > 0)
+ if (--cur->bc_levels[level].ptr > 0)
goto out1;
/* Get a pointer to the btree block. */
@@ -1665,7 +1665,7 @@ xfs_btree_decrement(
* Stop when we don't go off the left edge of a block.
*/
for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
- if (--cur->bc_ptrs[lev] > 0)
+ if (--cur->bc_levels[lev].ptr > 0)
break;
/* Read-ahead the left block for the next loop. */
xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
@@ -1691,13 +1691,13 @@ xfs_btree_decrement(
for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
union xfs_btree_ptr *ptrp;
- ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+ ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
--lev;
error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
if (error)
goto error0;
xfs_btree_setbuf(cur, lev, bp);
- cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
+ cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block);
}
out1:
*stat = 1;
@@ -1735,7 +1735,7 @@ xfs_btree_lookup_get_block(
*
* Otherwise throw it away and get a new one.
*/
- bp = cur->bc_bufs[level];
+ bp = cur->bc_levels[level].bp;
error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
if (error)
return error;
@@ -1864,7 +1864,7 @@ xfs_btree_lookup(
return -EFSCORRUPTED;
}
- cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
+ cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE;
*stat = 0;
return 0;
}
@@ -1916,7 +1916,7 @@ xfs_btree_lookup(
if (error)
goto error0;
- cur->bc_ptrs[level] = keyno;
+ cur->bc_levels[level].ptr = keyno;
}
}
@@ -1933,7 +1933,7 @@ xfs_btree_lookup(
!xfs_btree_ptr_is_null(cur, &ptr)) {
int i;
- cur->bc_ptrs[0] = keyno;
+ cur->bc_levels[0].ptr = keyno;
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto error0;
@@ -1944,7 +1944,7 @@ xfs_btree_lookup(
}
} else if (dir == XFS_LOOKUP_LE && diff > 0)
keyno--;
- cur->bc_ptrs[0] = keyno;
+ cur->bc_levels[0].ptr = keyno;
/* Return if we succeeded or not. */
if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
@@ -2104,7 +2104,7 @@ __xfs_btree_updkeys(
if (error)
return error;
#endif
- ptr = cur->bc_ptrs[level];
+ ptr = cur->bc_levels[level].ptr;
nlkey = xfs_btree_key_addr(cur, ptr, block);
nhkey = xfs_btree_high_key_addr(cur, ptr, block);
if (!force_all &&
@@ -2171,7 +2171,7 @@ xfs_btree_update_keys(
if (error)
return error;
#endif
- ptr = cur->bc_ptrs[level];
+ ptr = cur->bc_levels[level].ptr;
kp = xfs_btree_key_addr(cur, ptr, block);
xfs_btree_copy_keys(cur, kp, &key, 1);
xfs_btree_log_keys(cur, bp, ptr, ptr);
@@ -2205,7 +2205,7 @@ xfs_btree_update(
goto error0;
#endif
/* Get the address of the rec to be updated. */
- ptr = cur->bc_ptrs[0];
+ ptr = cur->bc_levels[0].ptr;
rp = xfs_btree_rec_addr(cur, ptr, block);
/* Fill in the new contents and log them. */
@@ -2280,7 +2280,7 @@ xfs_btree_lshift(
* If the cursor entry is the one that would be moved, don't
* do it... it's too complicated.
*/
- if (cur->bc_ptrs[level] <= 1)
+ if (cur->bc_levels[level].ptr <= 1)
goto out0;
/* Set up the left neighbor as "left". */
@@ -2414,7 +2414,7 @@ xfs_btree_lshift(
goto error0;
/* Slide the cursor value left one. */
- cur->bc_ptrs[level]--;
+ cur->bc_levels[level].ptr--;
*stat = 1;
return 0;
@@ -2476,7 +2476,7 @@ xfs_btree_rshift(
* do it... it's too complicated.
*/
lrecs = xfs_btree_get_numrecs(left);
- if (cur->bc_ptrs[level] >= lrecs)
+ if (cur->bc_levels[level].ptr >= lrecs)
goto out0;
/* Set up the right neighbor as "right". */
@@ -2664,7 +2664,7 @@ __xfs_btree_split(
*/
lrecs = xfs_btree_get_numrecs(left);
rrecs = lrecs / 2;
- if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
+ if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1)
rrecs++;
src_index = (lrecs - rrecs + 1);
@@ -2760,9 +2760,9 @@ __xfs_btree_split(
* If it's just pointing past the last entry in left, then we'll
* insert there, so don't change anything in that case.
*/
- if (cur->bc_ptrs[level] > lrecs + 1) {
+ if (cur->bc_levels[level].ptr > lrecs + 1) {
xfs_btree_setbuf(cur, level, rbp);
- cur->bc_ptrs[level] -= lrecs;
+ cur->bc_levels[level].ptr -= lrecs;
}
/*
* If there are more levels, we'll need another cursor which refers
@@ -2772,7 +2772,7 @@ __xfs_btree_split(
error = xfs_btree_dup_cursor(cur, curp);
if (error)
goto error0;
- (*curp)->bc_ptrs[level + 1]++;
+ (*curp)->bc_levels[level + 1].ptr++;
}
*ptrp = rptr;
*stat = 1;
@@ -2933,7 +2933,8 @@ xfs_btree_new_iroot(
be16_add_cpu(&block->bb_level, 1);
xfs_btree_set_numrecs(block, 1);
cur->bc_nlevels++;
- cur->bc_ptrs[level + 1] = 1;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
+ cur->bc_levels[level + 1].ptr = 1;
kp = xfs_btree_key_addr(cur, 1, block);
ckp = xfs_btree_key_addr(cur, 1, cblock);
@@ -3094,8 +3095,9 @@ xfs_btree_new_root(
/* Fix up the cursor. */
xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
- cur->bc_ptrs[cur->bc_nlevels] = nptr;
+ cur->bc_levels[cur->bc_nlevels].ptr = nptr;
cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
*stat = 1;
return 0;
error0:
@@ -3152,7 +3154,7 @@ xfs_btree_make_block_unfull(
return error;
if (*stat) {
- *oindex = *index = cur->bc_ptrs[level];
+ *oindex = *index = cur->bc_levels[level].ptr;
return 0;
}
@@ -3167,7 +3169,7 @@ xfs_btree_make_block_unfull(
return error;
- *index = cur->bc_ptrs[level];
+ *index = cur->bc_levels[level].ptr;
return 0;
}
@@ -3214,7 +3216,7 @@ xfs_btree_insrec(
}
/* If we're off the left edge, return failure. */
- ptr = cur->bc_ptrs[level];
+ ptr = cur->bc_levels[level].ptr;
if (ptr == 0) {
*stat = 0;
return 0;
@@ -3557,7 +3559,7 @@ xfs_btree_kill_iroot(
if (error)
return error;
- cur->bc_bufs[level - 1] = NULL;
+ cur->bc_levels[level - 1].bp = NULL;
be16_add_cpu(&block->bb_level, -1);
xfs_trans_log_inode(cur->bc_tp, ip,
XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
@@ -3590,8 +3592,8 @@ xfs_btree_kill_root(
if (error)
return error;
- cur->bc_bufs[level] = NULL;
- cur->bc_ra[level] = 0;
+ cur->bc_levels[level].bp = NULL;
+ cur->bc_levels[level].ra = 0;
cur->bc_nlevels--;
return 0;
@@ -3650,7 +3652,7 @@ xfs_btree_delrec(
tcur = NULL;
/* Get the index of the entry being deleted, check for nothing there. */
- ptr = cur->bc_ptrs[level];
+ ptr = cur->bc_levels[level].ptr;
if (ptr == 0) {
*stat = 0;
return 0;
@@ -3960,7 +3962,7 @@ xfs_btree_delrec(
xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
tcur = NULL;
if (level == 0)
- cur->bc_ptrs[0]++;
+ cur->bc_levels[0].ptr++;
*stat = 1;
return 0;
@@ -4097,9 +4099,9 @@ xfs_btree_delrec(
* cursor to the left block, and fix up the index.
*/
if (bp != lbp) {
- cur->bc_bufs[level] = lbp;
- cur->bc_ptrs[level] += lrecs;
- cur->bc_ra[level] = 0;
+ cur->bc_levels[level].bp = lbp;
+ cur->bc_levels[level].ptr += lrecs;
+ cur->bc_levels[level].ra = 0;
}
/*
* If we joined with the right neighbor and there's a level above
@@ -4119,16 +4121,16 @@ xfs_btree_delrec(
* We can't use decrement because it would change the next level up.
*/
if (level > 0)
- cur->bc_ptrs[level]--;
+ cur->bc_levels[level].ptr--;
/*
* We combined blocks, so we have to update the parent keys if the
- * btree supports overlapped intervals. However, bc_ptrs[level + 1]
- * points to the old block so that the caller knows which record to
- * delete. Therefore, the caller must be savvy enough to call updkeys
- * for us if we return stat == 2. The other exit points from this
- * function don't require deletions further up the tree, so they can
- * call updkeys directly.
+ * btree supports overlapped intervals. However,
+ * bc_levels[level + 1].ptr points to the old block so that the caller
+ * knows which record to delete. Therefore, the caller must be savvy
+ * enough to call updkeys for us if we return stat == 2. The other
+ * exit points from this function don't require deletions further up
+ * the tree, so they can call updkeys directly.
*/
/* Return value means the next level up has something to do. */
@@ -4182,7 +4184,7 @@ xfs_btree_delete(
if (i == 0) {
for (level = 1; level < cur->bc_nlevels; level++) {
- if (cur->bc_ptrs[level] == 0) {
+ if (cur->bc_levels[level].ptr == 0) {
error = xfs_btree_decrement(cur, level, &i);
if (error)
goto error0;
@@ -4213,7 +4215,7 @@ xfs_btree_get_rec(
int error; /* error return value */
#endif
- ptr = cur->bc_ptrs[0];
+ ptr = cur->bc_levels[0].ptr;
block = xfs_btree_get_block(cur, 0, &bp);
#ifdef DEBUG
@@ -4512,21 +4514,76 @@ xfs_btree_sblock_verify(
}
/*
- * Calculate the number of btree levels needed to store a given number of
- * records in a short-format btree.
+ * For the given limits on leaf and keyptr records per block, calculate the
+ * height of the tree needed to index the number of leaf records.
*/
-uint
+unsigned int
xfs_btree_compute_maxlevels(
- uint *limits,
- unsigned long len)
+ const unsigned int *limits,
+ unsigned long long records)
+{
+ unsigned long long level_blocks = howmany_64(records, limits[0]);
+ unsigned int height = 1;
+
+ while (level_blocks > 1) {
+ level_blocks = howmany_64(level_blocks, limits[1]);
+ height++;
+ }
+
+ return height;
+}
+
+/*
+ * For the given limits on leaf and keyptr records per block, calculate the
+ * number of blocks needed to index the given number of leaf records.
+ */
+unsigned long long
+xfs_btree_calc_size(
+ const unsigned int *limits,
+ unsigned long long records)
+{
+ unsigned long long level_blocks = howmany_64(records, limits[0]);
+ unsigned long long blocks = level_blocks;
+
+ while (level_blocks > 1) {
+ level_blocks = howmany_64(level_blocks, limits[1]);
+ blocks += level_blocks;
+ }
+
+ return blocks;
+}
+
+/*
+ * Given a number of available blocks for the btree to consume with records and
+ * pointers, calculate the height of the tree needed to index all the records
+ * that space can hold based on the number of pointers each interior node
+ * holds.
+ *
+ * We start by assuming a single level tree consumes a single block, then track
+ * the number of blocks each node level consumes until we no longer have space
+ * to store the next node level. At this point, we are indexing all the leaf
+ * blocks in the space, and there's no more free space to split the tree any
+ * further. That's our maximum btree height.
+ */
+unsigned int
+xfs_btree_space_to_height(
+ const unsigned int *limits,
+ unsigned long long leaf_blocks)
{
- uint level;
- unsigned long maxblocks;
+ unsigned long long node_blocks = limits[1];
+ unsigned long long blocks_left = leaf_blocks - 1;
+ unsigned int height = 1;
+
+ if (leaf_blocks < 1)
+ return 0;
+
+ while (node_blocks < blocks_left) {
+ blocks_left -= node_blocks;
+ node_blocks *= limits[1];
+ height++;
+ }
- maxblocks = (len + limits[0] - 1) / limits[0];
- for (level = 1; maxblocks > 1; level++)
- maxblocks = (maxblocks + limits[1] - 1) / limits[1];
- return level;
+ return height;
}
/*
@@ -4661,23 +4718,25 @@ xfs_btree_overlapped_query_range(
if (error)
goto out;
#endif
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
while (level < cur->bc_nlevels) {
block = xfs_btree_get_block(cur, level, &bp);
/* End of node, pop back towards the root. */
- if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
+ if (cur->bc_levels[level].ptr >
+ be16_to_cpu(block->bb_numrecs)) {
pop_up:
if (level < cur->bc_nlevels - 1)
- cur->bc_ptrs[level + 1]++;
+ cur->bc_levels[level + 1].ptr++;
level++;
continue;
}
if (level == 0) {
/* Handle a leaf node. */
- recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
+ recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
+ block);
cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
@@ -4700,14 +4759,15 @@ pop_up:
/* Record is larger than high key; pop. */
goto pop_up;
}
- cur->bc_ptrs[level]++;
+ cur->bc_levels[level].ptr++;
continue;
}
/* Handle an internal node. */
- lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
- hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
- pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
+ lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
+ hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr,
+ block);
+ pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
@@ -4730,13 +4790,13 @@ pop_up:
if (error)
goto out;
#endif
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
continue;
} else if (hdiff < 0) {
/* The low key is larger than the upper range; pop. */
goto pop_up;
}
- cur->bc_ptrs[level]++;
+ cur->bc_levels[level].ptr++;
}
out:
@@ -4747,13 +4807,14 @@ out:
* with a zero-results range query, so release the buffers if we
* failed to return any results.
*/
- if (cur->bc_bufs[0] == NULL) {
+ if (cur->bc_levels[0].bp == NULL) {
for (i = 0; i < cur->bc_nlevels; i++) {
- if (cur->bc_bufs[i]) {
- xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
- cur->bc_bufs[i] = NULL;
- cur->bc_ptrs[i] = 0;
- cur->bc_ra[i] = 0;
+ if (cur->bc_levels[i].bp) {
+ xfs_trans_brelse(cur->bc_tp,
+ cur->bc_levels[i].bp);
+ cur->bc_levels[i].bp = NULL;
+ cur->bc_levels[i].ptr = 0;
+ cur->bc_levels[i].ra = 0;
}
}
}
@@ -4816,29 +4877,6 @@ xfs_btree_query_all(
return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
}
-/*
- * Calculate the number of blocks needed to store a given number of records
- * in a short-format (per-AG metadata) btree.
- */
-unsigned long long
-xfs_btree_calc_size(
- uint *limits,
- unsigned long long len)
-{
- int level;
- int maxrecs;
- unsigned long long rval;
-
- maxrecs = limits[0];
- for (level = 0, rval = 0; len > 1; level++) {
- len += maxrecs - 1;
- do_div(len, maxrecs);
- maxrecs = limits[1];
- rval += len;
- }
- return rval;
-}
-
static int
xfs_btree_count_blocks_helper(
struct xfs_btree_cur *cur,
@@ -4915,7 +4953,7 @@ xfs_btree_has_more_records(
block = xfs_btree_get_block(cur, 0, &bp);
/* There are still records in this block. */
- if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block))
+ if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block))
return true;
/* There are more record blocks. */
@@ -4924,3 +4962,42 @@ xfs_btree_has_more_records(
else
return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
}
+
+/* Set up all the btree cursor caches. */
+int __init
+xfs_btree_init_cur_caches(void)
+{
+ int error;
+
+ error = xfs_allocbt_init_cur_cache();
+ if (error)
+ return error;
+ error = xfs_inobt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_bmbt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_rmapbt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_refcountbt_init_cur_cache();
+ if (error)
+ goto err;
+
+ return 0;
+err:
+ xfs_btree_destroy_cur_caches();
+ return error;
+}
+
+/* Destroy all the btree cursor caches, if they've been allocated. */
+void
+xfs_btree_destroy_cur_caches(void)
+{
+ xfs_allocbt_destroy_cur_cache();
+ xfs_inobt_destroy_cur_cache();
+ xfs_bmbt_destroy_cur_cache();
+ xfs_rmapbt_destroy_cur_cache();
+ xfs_refcountbt_destroy_cur_cache();
+}
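
The three geometry helpers added above (xfs_btree_compute_maxlevels, xfs_btree_calc_size and xfs_btree_space_to_height) all walk the tree level by level using the same limits[] convention: limits[0] is records per leaf block, limits[1] is keyptrs per node block, and callers pass half of the maxrecs values to model half-full (worst-case) blocks. The userspace sketch below is not part of the patch; it mirrors that arithmetic with made-up numbers purely for illustration.

/*
 * Illustrative sketch of the btree geometry math, assuming the limits[]
 * convention described above.  The concrete numbers in main() are invented.
 */
#include <stdio.h>

static unsigned long long howmany64(unsigned long long n, unsigned int d)
{
	return (n + d - 1) / d;		/* round-up division, like howmany_64() */
}

/* Height needed to index a given number of leaf records. */
static unsigned int compute_maxlevels(const unsigned int *limits,
				      unsigned long long records)
{
	unsigned long long level_blocks = howmany64(records, limits[0]);
	unsigned int height = 1;

	while (level_blocks > 1) {
		level_blocks = howmany64(level_blocks, limits[1]);
		height++;
	}
	return height;
}

/* Total blocks needed to index a given number of leaf records. */
static unsigned long long calc_size(const unsigned int *limits,
				    unsigned long long records)
{
	unsigned long long level_blocks = howmany64(records, limits[0]);
	unsigned long long blocks = level_blocks;

	while (level_blocks > 1) {
		level_blocks = howmany64(level_blocks, limits[1]);
		blocks += level_blocks;
	}
	return blocks;
}

/* Tallest tree that fits in a given budget of free blocks. */
static unsigned int space_to_height(const unsigned int *limits,
				    unsigned long long leaf_blocks)
{
	unsigned long long node_blocks = limits[1];
	unsigned long long blocks_left = leaf_blocks - 1;
	unsigned int height = 1;

	if (leaf_blocks < 1)
		return 0;

	while (node_blocks < blocks_left) {
		blocks_left -= node_blocks;
		node_blocks *= limits[1];
		height++;
	}
	return height;
}

int main(void)
{
	unsigned int limits[2] = { 100, 50 };	/* hypothetical minrecs */

	printf("height for 1M records:  %u\n",
	       compute_maxlevels(limits, 1000000ULL));
	printf("blocks for 1M records:  %llu\n",
	       calc_size(limits, 1000000ULL));
	printf("height for 2^20 blocks: %u\n",
	       space_to_height(limits, 1ULL << 20));
	return 0;
}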
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 4eaf8517f850..22d9f411fde6 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -13,8 +13,6 @@ struct xfs_trans;
struct xfs_ifork;
struct xfs_perag;
-extern kmem_zone_t *xfs_btree_cur_zone;
-
/*
* Generic key, ptr and record wrapper structures.
*
@@ -92,8 +90,6 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
#define XFS_BTREE_STATS_ADD(cur, stat, val) \
XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val)
-#define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */
-
struct xfs_btree_ops {
/* size of the key and record structures */
size_t key_len;
@@ -181,18 +177,18 @@ union xfs_btree_irec {
/* Per-AG btree information. */
struct xfs_btree_cur_ag {
- struct xfs_perag *pag;
+ struct xfs_perag *pag;
union {
struct xfs_buf *agbp;
struct xbtree_afakeroot *afake; /* for staging cursor */
};
union {
struct {
- unsigned long nr_ops; /* # record updates */
- int shape_changes; /* # of extent splits */
+ unsigned int nr_ops; /* # record updates */
+ unsigned int shape_changes; /* # of extent splits */
} refc;
struct {
- bool active; /* allocation cursor state */
+ bool active; /* allocation cursor state */
} abt;
};
};
@@ -212,26 +208,35 @@ struct xfs_btree_cur_ino {
#define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1)
};
+struct xfs_btree_level {
+ /* buffer pointer */
+ struct xfs_buf *bp;
+
+ /* key/record number */
+ uint16_t ptr;
+
+ /* readahead info */
+#define XFS_BTCUR_LEFTRA (1 << 0) /* left sibling has been read-ahead */
+#define XFS_BTCUR_RIGHTRA (1 << 1) /* right sibling has been read-ahead */
+ uint16_t ra;
+};
+
/*
* Btree cursor structure.
* This collects all information needed by the btree code in one place.
*/
-typedef struct xfs_btree_cur
+struct xfs_btree_cur
{
struct xfs_trans *bc_tp; /* transaction we're in, if any */
struct xfs_mount *bc_mp; /* file system mount struct */
const struct xfs_btree_ops *bc_ops;
- uint bc_flags; /* btree features - below */
+ struct kmem_cache *bc_cache; /* cursor cache */
+ unsigned int bc_flags; /* btree features - below */
+ xfs_btnum_t bc_btnum; /* identifies which btree type */
union xfs_btree_irec bc_rec; /* current insert/search record value */
- struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */
- int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */
- uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */
-#define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */
-#define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */
- uint8_t bc_nlevels; /* number of levels in the tree */
- uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */
- xfs_btnum_t bc_btnum; /* identifies which btree type */
- int bc_statoff; /* offset of btre stats array */
+ uint8_t bc_nlevels; /* number of levels in the tree */
+ uint8_t bc_maxlevels; /* maximum levels for this btree type */
+ int bc_statoff; /* offset of btree stats array */
/*
* Short btree pointers need an agno to be able to turn the pointers
@@ -243,7 +248,21 @@ typedef struct xfs_btree_cur
struct xfs_btree_cur_ag bc_ag;
struct xfs_btree_cur_ino bc_ino;
};
-} xfs_btree_cur_t;
+
+ /* Must be at the end of the struct! */
+ struct xfs_btree_level bc_levels[];
+};
+
+/*
+ * Compute the size of a btree cursor that can handle a btree of a given
+ * height. The bc_levels array handles node and leaf blocks, so its size
+ * is exactly nlevels.
+ */
+static inline size_t
+xfs_btree_cur_sizeof(unsigned int nlevels)
+{
+ return struct_size((struct xfs_btree_cur *)NULL, bc_levels, nlevels);
+}
/* cursor flags */
#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
@@ -258,7 +277,6 @@ typedef struct xfs_btree_cur
*/
#define XFS_BTREE_STAGING (1<<5)
-
#define XFS_BTREE_NOERROR 0
#define XFS_BTREE_ERROR 1
@@ -309,7 +327,7 @@ xfs_btree_check_sptr(
*/
void
xfs_btree_del_cursor(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
int error); /* del because of error */
/*
@@ -318,8 +336,8 @@ xfs_btree_del_cursor(
*/
int /* error */
xfs_btree_dup_cursor(
- xfs_btree_cur_t *cur, /* input cursor */
- xfs_btree_cur_t **ncur);/* output cursor */
+ struct xfs_btree_cur *cur, /* input cursor */
+ struct xfs_btree_cur **ncur);/* output cursor */
/*
* Compute first and last byte offsets for the fields given.
@@ -460,8 +478,12 @@ xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp,
xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp,
unsigned int max_recs);
-uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len);
-unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len);
+unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits,
+ unsigned long long records);
+unsigned long long xfs_btree_calc_size(const unsigned int *limits,
+ unsigned long long records);
+unsigned int xfs_btree_space_to_height(const unsigned int *limits,
+ unsigned long long blocks);
/*
* Return codes for the query range iterator function are 0 to continue
@@ -527,7 +549,7 @@ struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur);
/* Does this cursor point to the last block in the given level? */
static inline bool
xfs_btree_islastblock(
- xfs_btree_cur_t *cur,
+ struct xfs_btree_cur *cur,
int level)
{
struct xfs_btree_block *block;
@@ -558,4 +580,27 @@ void xfs_btree_copy_keys(struct xfs_btree_cur *cur,
union xfs_btree_key *dst_key,
const union xfs_btree_key *src_key, int numkeys);
+static inline struct xfs_btree_cur *
+xfs_btree_alloc_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_btnum_t btnum,
+ uint8_t maxlevels,
+ struct kmem_cache *cache)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = kmem_cache_zalloc(cache, GFP_NOFS | __GFP_NOFAIL);
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ cur->bc_btnum = btnum;
+ cur->bc_maxlevels = maxlevels;
+ cur->bc_cache = cache;
+
+ return cur;
+}
+
+int __init xfs_btree_init_cur_caches(void);
+void xfs_btree_destroy_cur_caches(void);
+
#endif /* __XFS_BTREE_H__ */
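
The header change above replaces the three fixed-size per-level arrays (bc_bufs, bc_ptrs, bc_ra) with a flexible bc_levels[] array of struct xfs_btree_level, sized at allocation time via xfs_btree_cur_sizeof(). The sketch below is a userspace stand-in, not the real xfs_btree_cur, showing how that struct_size()-style arithmetic and allocation work in practice.

/*
 * Toy sketch of flexible-array cursor sizing; the structs are illustrative
 * stand-ins and the field set is deliberately minimal.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_level {
	void		*bp;	/* buffer pointer */
	uint16_t	ptr;	/* key/record number */
	uint16_t	ra;	/* readahead flags */
};

struct toy_cur {
	uint8_t			nlevels;	/* levels currently in use */
	uint8_t			maxlevels;	/* levels this cursor can hold */
	struct toy_level	levels[];	/* one entry per level */
};

/* Equivalent of struct_size(): header plus nlevels array entries. */
static size_t toy_cur_sizeof(unsigned int nlevels)
{
	return offsetof(struct toy_cur, levels) +
	       nlevels * sizeof(struct toy_level);
}

int main(void)
{
	unsigned int maxlevels = 9;	/* pick a worst-case height */
	struct toy_cur *cur = calloc(1, toy_cur_sizeof(maxlevels));

	if (!cur)
		return 1;
	cur->maxlevels = maxlevels;
	cur->nlevels = 1;
	cur->levels[0].ptr = 1;		/* point at the first record */
	printf("cursor size for %u levels: %zu bytes\n",
	       maxlevels, toy_cur_sizeof(maxlevels));
	free(cur);
	return 0;
}

In the patch itself, each btree type creates its kmem_cache with xfs_btree_cur_sizeof(<type>_maxlevels_ondisk()), so every cursor allocated from that cache already has room for the deepest tree that btree can legally have, instead of always carrying XFS_BTREE_MAXLEVELS entries.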
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c
index ac9e80152b5c..dd75e208b543 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.c
+++ b/fs/xfs/libxfs/xfs_btree_staging.c
@@ -657,12 +657,12 @@ xfs_btree_bload_compute_geometry(
* checking levels 0 and 1 here, so set bc_nlevels such that the btree
* code doesn't interpret either as the root level.
*/
- cur->bc_nlevels = XFS_BTREE_MAXLEVELS - 1;
+ cur->bc_nlevels = cur->bc_maxlevels - 1;
xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);
bbl->nr_records = nr_this_level = nr_records;
- for (cur->bc_nlevels = 1; cur->bc_nlevels < XFS_BTREE_MAXLEVELS;) {
+ for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) {
uint64_t level_blocks;
uint64_t dontcare64;
unsigned int level = cur->bc_nlevels - 1;
@@ -703,6 +703,7 @@ xfs_btree_bload_compute_geometry(
* block-based btree level.
*/
cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
xfs_btree_bload_level_geometry(cur, bbl, level,
nr_this_level, &avg_per_block,
&level_blocks, &dontcare64);
@@ -718,13 +719,14 @@ xfs_btree_bload_compute_geometry(
/* Otherwise, we need another level of btree. */
cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
}
nr_blocks += level_blocks;
nr_this_level = level_blocks;
}
- if (cur->bc_nlevels == XFS_BTREE_MAXLEVELS)
+ if (cur->bc_nlevels > cur->bc_maxlevels)
return -EOVERFLOW;
bbl->btree_height = cur->bc_nlevels;
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index c062e2c85178..dd7a2dbce1d1 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -72,7 +72,7 @@ STATIC int xfs_da3_blk_unlink(xfs_da_state_t *state,
xfs_da_state_blk_t *save_blk);
-kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
+struct kmem_cache *xfs_da_state_cache; /* anchor for dir/attr state */
/*
* Allocate a dir-state structure.
@@ -84,7 +84,7 @@ xfs_da_state_alloc(
{
struct xfs_da_state *state;
- state = kmem_cache_zalloc(xfs_da_state_zone, GFP_NOFS | __GFP_NOFAIL);
+ state = kmem_cache_zalloc(xfs_da_state_cache, GFP_NOFS | __GFP_NOFAIL);
state->args = args;
state->mp = args->dp->i_mount;
return state;
@@ -113,7 +113,7 @@ xfs_da_state_free(xfs_da_state_t *state)
#ifdef DEBUG
memset((char *)state, 0, sizeof(*state));
#endif /* DEBUG */
- kmem_cache_free(xfs_da_state_zone, state);
+ kmem_cache_free(xfs_da_state_cache, state);
}
static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork)
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index ad5dd324631a..0faf7d9ac241 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -9,7 +9,6 @@
struct xfs_inode;
struct xfs_trans;
-struct zone;
/*
* Directory/attribute geometry information. There will be one of these for each
@@ -227,6 +226,6 @@ void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp,
void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp,
struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from);
-extern struct kmem_zone *xfs_da_state_zone;
+extern struct kmem_cache *xfs_da_state_cache;
#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index eff4a127188e..0805ade2d300 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -18,6 +18,12 @@
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_log.h"
+#include "xfs_rmap.h"
+#include "xfs_refcount.h"
+#include "xfs_bmap.h"
+#include "xfs_alloc.h"
+
+static struct kmem_cache *xfs_defer_pending_cache;
/*
* Deferred Operations in XFS
@@ -232,23 +238,20 @@ xfs_defer_trans_abort(
}
}
-/* Roll a transaction so we can do some deferred op processing. */
-STATIC int
-xfs_defer_trans_roll(
- struct xfs_trans **tpp)
+/*
+ * Capture resources that the caller said not to release ("held") when the
+ * transaction commits. Caller is responsible for zero-initializing @dres.
+ */
+static int
+xfs_defer_save_resources(
+ struct xfs_defer_resources *dres,
+ struct xfs_trans *tp)
{
- struct xfs_trans *tp = *tpp;
struct xfs_buf_log_item *bli;
struct xfs_inode_log_item *ili;
struct xfs_log_item *lip;
- struct xfs_buf *bplist[XFS_DEFER_OPS_NR_BUFS];
- struct xfs_inode *iplist[XFS_DEFER_OPS_NR_INODES];
- unsigned int ordered = 0; /* bitmap */
- int bpcount = 0, ipcount = 0;
- int i;
- int error;
- BUILD_BUG_ON(NBBY * sizeof(ordered) < XFS_DEFER_OPS_NR_BUFS);
+ BUILD_BUG_ON(NBBY * sizeof(dres->dr_ordered) < XFS_DEFER_OPS_NR_BUFS);
list_for_each_entry(lip, &tp->t_items, li_trans) {
switch (lip->li_type) {
@@ -256,28 +259,29 @@ xfs_defer_trans_roll(
bli = container_of(lip, struct xfs_buf_log_item,
bli_item);
if (bli->bli_flags & XFS_BLI_HOLD) {
- if (bpcount >= XFS_DEFER_OPS_NR_BUFS) {
+ if (dres->dr_bufs >= XFS_DEFER_OPS_NR_BUFS) {
ASSERT(0);
return -EFSCORRUPTED;
}
if (bli->bli_flags & XFS_BLI_ORDERED)
- ordered |= (1U << bpcount);
+ dres->dr_ordered |=
+ (1U << dres->dr_bufs);
else
xfs_trans_dirty_buf(tp, bli->bli_buf);
- bplist[bpcount++] = bli->bli_buf;
+ dres->dr_bp[dres->dr_bufs++] = bli->bli_buf;
}
break;
case XFS_LI_INODE:
ili = container_of(lip, struct xfs_inode_log_item,
ili_item);
if (ili->ili_lock_flags == 0) {
- if (ipcount >= XFS_DEFER_OPS_NR_INODES) {
+ if (dres->dr_inos >= XFS_DEFER_OPS_NR_INODES) {
ASSERT(0);
return -EFSCORRUPTED;
}
xfs_trans_log_inode(tp, ili->ili_inode,
XFS_ILOG_CORE);
- iplist[ipcount++] = ili->ili_inode;
+ dres->dr_ip[dres->dr_inos++] = ili->ili_inode;
}
break;
default:
@@ -285,7 +289,43 @@ xfs_defer_trans_roll(
}
}
- trace_xfs_defer_trans_roll(tp, _RET_IP_);
+ return 0;
+}
+
+/* Attach the held resources to the transaction. */
+static void
+xfs_defer_restore_resources(
+ struct xfs_trans *tp,
+ struct xfs_defer_resources *dres)
+{
+ unsigned short i;
+
+ /* Rejoin the joined inodes. */
+ for (i = 0; i < dres->dr_inos; i++)
+ xfs_trans_ijoin(tp, dres->dr_ip[i], 0);
+
+ /* Rejoin the buffers and dirty them so the log moves forward. */
+ for (i = 0; i < dres->dr_bufs; i++) {
+ xfs_trans_bjoin(tp, dres->dr_bp[i]);
+ if (dres->dr_ordered & (1U << i))
+ xfs_trans_ordered_buf(tp, dres->dr_bp[i]);
+ xfs_trans_bhold(tp, dres->dr_bp[i]);
+ }
+}
+
+/* Roll a transaction so we can do some deferred op processing. */
+STATIC int
+xfs_defer_trans_roll(
+ struct xfs_trans **tpp)
+{
+ struct xfs_defer_resources dres = { };
+ int error;
+
+ error = xfs_defer_save_resources(&dres, *tpp);
+ if (error)
+ return error;
+
+ trace_xfs_defer_trans_roll(*tpp, _RET_IP_);
/*
* Roll the transaction. Rolling always given a new transaction (even
@@ -295,22 +335,11 @@ xfs_defer_trans_roll(
* happened.
*/
error = xfs_trans_roll(tpp);
- tp = *tpp;
- /* Rejoin the joined inodes. */
- for (i = 0; i < ipcount; i++)
- xfs_trans_ijoin(tp, iplist[i], 0);
-
- /* Rejoin the buffers and dirty them so the log moves forward. */
- for (i = 0; i < bpcount; i++) {
- xfs_trans_bjoin(tp, bplist[i]);
- if (ordered & (1U << i))
- xfs_trans_ordered_buf(tp, bplist[i]);
- xfs_trans_bhold(tp, bplist[i]);
- }
+ xfs_defer_restore_resources(*tpp, &dres);
if (error)
- trace_xfs_defer_trans_roll_error(tp, error);
+ trace_xfs_defer_trans_roll_error(*tpp, error);
return error;
}
@@ -342,7 +371,7 @@ xfs_defer_cancel_list(
ops->cancel_item(pwi);
}
ASSERT(dfp->dfp_count == 0);
- kmem_free(dfp);
+ kmem_cache_free(xfs_defer_pending_cache, dfp);
}
}
@@ -439,7 +468,7 @@ xfs_defer_finish_one(
/* Done with the dfp, free it. */
list_del(&dfp->dfp_list);
- kmem_free(dfp);
+ kmem_cache_free(xfs_defer_pending_cache, dfp);
out:
if (ops->finish_cleanup)
ops->finish_cleanup(tp, state, error);
@@ -573,8 +602,8 @@ xfs_defer_add(
dfp = NULL;
}
if (!dfp) {
- dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
- KM_NOFS);
+ dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
+ GFP_NOFS | __GFP_NOFAIL);
dfp->dfp_type = type;
dfp->dfp_intent = NULL;
dfp->dfp_done = NULL;
@@ -627,10 +656,11 @@ xfs_defer_move(
*/
static struct xfs_defer_capture *
xfs_defer_ops_capture(
- struct xfs_trans *tp,
- struct xfs_inode *capture_ip)
+ struct xfs_trans *tp)
{
struct xfs_defer_capture *dfc;
+ unsigned short i;
+ int error;
if (list_empty(&tp->t_dfops))
return NULL;
@@ -654,27 +684,48 @@ xfs_defer_ops_capture(
/* Preserve the log reservation size. */
dfc->dfc_logres = tp->t_log_res;
+ error = xfs_defer_save_resources(&dfc->dfc_held, tp);
+ if (error) {
+ /*
+ * Resource capture should never fail, but if it does, we
+ * still have to shut down the log and release things
+ * properly.
+ */
+ xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE);
+ }
+
/*
- * Grab an extra reference to this inode and attach it to the capture
- * structure.
+ * Grab extra references to the inodes and buffers because callers are
+ * expected to release their held references after we commit the
+ * transaction.
*/
- if (capture_ip) {
- ihold(VFS_I(capture_ip));
- dfc->dfc_capture_ip = capture_ip;
+ for (i = 0; i < dfc->dfc_held.dr_inos; i++) {
+ ASSERT(xfs_isilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL));
+ ihold(VFS_I(dfc->dfc_held.dr_ip[i]));
}
+ for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+ xfs_buf_hold(dfc->dfc_held.dr_bp[i]);
+
return dfc;
}
/* Release all resources that we used to capture deferred ops. */
void
-xfs_defer_ops_release(
+xfs_defer_ops_capture_free(
struct xfs_mount *mp,
struct xfs_defer_capture *dfc)
{
+ unsigned short i;
+
xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
- if (dfc->dfc_capture_ip)
- xfs_irele(dfc->dfc_capture_ip);
+
+ for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+ xfs_buf_relse(dfc->dfc_held.dr_bp[i]);
+
+ for (i = 0; i < dfc->dfc_held.dr_inos; i++)
+ xfs_irele(dfc->dfc_held.dr_ip[i]);
+
kmem_free(dfc);
}
@@ -689,24 +740,21 @@ xfs_defer_ops_release(
int
xfs_defer_ops_capture_and_commit(
struct xfs_trans *tp,
- struct xfs_inode *capture_ip,
struct list_head *capture_list)
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_defer_capture *dfc;
int error;
- ASSERT(!capture_ip || xfs_isilocked(capture_ip, XFS_ILOCK_EXCL));
-
/* If we don't capture anything, commit transaction and exit. */
- dfc = xfs_defer_ops_capture(tp, capture_ip);
+ dfc = xfs_defer_ops_capture(tp);
if (!dfc)
return xfs_trans_commit(tp);
/* Commit the transaction and add the capture structure to the list. */
error = xfs_trans_commit(tp);
if (error) {
- xfs_defer_ops_release(mp, dfc);
+ xfs_defer_ops_capture_free(mp, dfc);
return error;
}
@@ -724,17 +772,19 @@ void
xfs_defer_ops_continue(
struct xfs_defer_capture *dfc,
struct xfs_trans *tp,
- struct xfs_inode **captured_ipp)
+ struct xfs_defer_resources *dres)
{
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
/* Lock and join the captured inode to the new transaction. */
- if (dfc->dfc_capture_ip) {
- xfs_ilock(dfc->dfc_capture_ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, dfc->dfc_capture_ip, 0);
- }
- *captured_ipp = dfc->dfc_capture_ip;
+ if (dfc->dfc_held.dr_inos == 2)
+ xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL,
+ dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL);
+ else if (dfc->dfc_held.dr_inos == 1)
+ xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL);
+ xfs_defer_restore_resources(tp, &dfc->dfc_held);
+ memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources));
/* Move captured dfops chain and state to the transaction. */
list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
@@ -742,3 +792,82 @@ xfs_defer_ops_continue(
kmem_free(dfc);
}
+
+/* Release the resources captured and continued during recovery. */
+void
+xfs_defer_resources_rele(
+ struct xfs_defer_resources *dres)
+{
+ unsigned short i;
+
+ for (i = 0; i < dres->dr_inos; i++) {
+ xfs_iunlock(dres->dr_ip[i], XFS_ILOCK_EXCL);
+ xfs_irele(dres->dr_ip[i]);
+ dres->dr_ip[i] = NULL;
+ }
+
+ for (i = 0; i < dres->dr_bufs; i++) {
+ xfs_buf_relse(dres->dr_bp[i]);
+ dres->dr_bp[i] = NULL;
+ }
+
+ dres->dr_inos = 0;
+ dres->dr_bufs = 0;
+ dres->dr_ordered = 0;
+}
+
+static inline int __init
+xfs_defer_init_cache(void)
+{
+ xfs_defer_pending_cache = kmem_cache_create("xfs_defer_pending",
+ sizeof(struct xfs_defer_pending),
+ 0, 0, NULL);
+
+ return xfs_defer_pending_cache != NULL ? 0 : -ENOMEM;
+}
+
+static inline void
+xfs_defer_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_defer_pending_cache);
+ xfs_defer_pending_cache = NULL;
+}
+
+/* Set up caches for deferred work items. */
+int __init
+xfs_defer_init_item_caches(void)
+{
+ int error;
+
+ error = xfs_defer_init_cache();
+ if (error)
+ return error;
+ error = xfs_rmap_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_refcount_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_bmap_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_extfree_intent_init_cache();
+ if (error)
+ goto err;
+
+ return 0;
+err:
+ xfs_defer_destroy_item_caches();
+ return error;
+}
+
+/* Destroy all the deferred work item caches, if they've been allocated. */
+void
+xfs_defer_destroy_item_caches(void)
+{
+ xfs_extfree_intent_destroy_cache();
+ xfs_bmap_intent_destroy_cache();
+ xfs_refcount_intent_destroy_cache();
+ xfs_rmap_intent_destroy_cache();
+ xfs_defer_destroy_cache();
+}
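
xfs_defer_init_item_caches() above (and xfs_btree_init_cur_caches() earlier in the series) both follow the same module-init pattern: create each cache in order and, on the first failure, unwind by destroying everything through the shared teardown helper. The teardown is safe to call with partially-created state because kmem_cache_destroy() ignores a NULL cache. The snippet below is a minimal userspace sketch of that pattern, not kernel code; the cache names and sizes are invented.

/*
 * Sketch of the "init all caches, unwind on first failure" pattern,
 * assuming a destroy helper that tolerates never-created caches.
 */
#include <stdlib.h>

static void *intent_cache_a;
static void *intent_cache_b;

static int create_cache(void **cachep, size_t size)
{
	*cachep = calloc(1, size);	/* stand-in for kmem_cache_create() */
	return *cachep ? 0 : -12;	/* -ENOMEM */
}

static void destroy_cache(void **cachep)
{
	free(*cachep);			/* free(NULL) is a no-op, like kmem_cache_destroy(NULL) */
	*cachep = NULL;
}

static void destroy_item_caches(void)
{
	destroy_cache(&intent_cache_b);
	destroy_cache(&intent_cache_a);
}

static int init_item_caches(void)
{
	int error;

	error = create_cache(&intent_cache_a, 64);
	if (error)
		return error;
	error = create_cache(&intent_cache_b, 96);
	if (error)
		goto err;
	return 0;
err:
	destroy_item_caches();		/* unwind whatever did get created */
	return error;
}

int main(void)
{
	int error = init_item_caches();

	if (!error)
		destroy_item_caches();
	return error ? 1 : 0;
}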
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 05472f71fffe..7bb8a31ad65b 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -65,6 +65,30 @@ extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
/*
+ * Deferred operation item relogging limits.
+ */
+#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */
+#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */
+
+/* Resources that must be held across a transaction roll. */
+struct xfs_defer_resources {
+ /* held buffers */
+ struct xfs_buf *dr_bp[XFS_DEFER_OPS_NR_BUFS];
+
+ /* inodes with no unlock flags */
+ struct xfs_inode *dr_ip[XFS_DEFER_OPS_NR_INODES];
+
+ /* number of held buffers */
+ unsigned short dr_bufs;
+
+ /* bitmap of ordered buffers */
+ unsigned short dr_ordered;
+
+ /* number of held inodes */
+ unsigned short dr_inos;
+};
+
+/*
* This structure enables a dfops user to detach the chain of deferred
* operations from a transaction so that they can be continued later.
*/
@@ -83,11 +107,7 @@ struct xfs_defer_capture {
/* Log reservation saved from the transaction. */
unsigned int dfc_logres;
- /*
- * An inode reference that must be maintained to complete the deferred
- * work.
- */
- struct xfs_inode *dfc_capture_ip;
+ struct xfs_defer_resources dfc_held;
};
/*
@@ -95,9 +115,14 @@ struct xfs_defer_capture {
* This doesn't normally happen except log recovery.
*/
int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
- struct xfs_inode *capture_ip, struct list_head *capture_list);
+ struct list_head *capture_list);
void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp,
- struct xfs_inode **captured_ipp);
-void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d);
+ struct xfs_defer_resources *dres);
+void xfs_defer_ops_capture_free(struct xfs_mount *mp,
+ struct xfs_defer_capture *d);
+void xfs_defer_resources_rele(struct xfs_defer_resources *dres);
+
+int __init xfs_defer_init_item_caches(void);
+void xfs_defer_destroy_item_caches(void);
#endif /* __XFS_DEFER_H__ */
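
struct xfs_defer_resources above keeps the held buffers and inodes in small fixed arrays (dr_bp/dr_ip with dr_bufs/dr_inos counts) and records which buffers were ordered in the dr_ordered bitmap, so they can be rejoined with the same state after a transaction roll or during recovery. The toy sketch below mirrors just the buffer/bitmap bookkeeping; the types are stand-ins, not the real xfs_buf or transaction code.

/*
 * Toy sketch of the dr_bp[]/dr_bufs/dr_ordered bookkeeping, with plain
 * ints standing in for struct xfs_buf pointers.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_BUFS	2	/* XFS_DEFER_OPS_NR_BUFS in the header above */

struct toy_resources {
	int		bufs[NR_BUFS];	/* stand-ins for struct xfs_buf * */
	unsigned short	nr_bufs;	/* number of held buffers */
	unsigned short	ordered;	/* bit i set => bufs[i] is ordered */
};

static int save_buf(struct toy_resources *dres, int buf, bool is_ordered)
{
	if (dres->nr_bufs >= NR_BUFS)
		return -1;		/* -EFSCORRUPTED in the real code */
	if (is_ordered)
		dres->ordered |= 1U << dres->nr_bufs;
	dres->bufs[dres->nr_bufs++] = buf;
	return 0;
}

static void restore_bufs(const struct toy_resources *dres)
{
	unsigned short i;

	/* Walk the held buffers in order, preserving the ordered flag. */
	for (i = 0; i < dres->nr_bufs; i++)
		printf("rejoin buf %d%s\n", dres->bufs[i],
		       (dres->ordered & (1U << i)) ? " (ordered)" : "");
}

int main(void)
{
	struct toy_resources dres = { 0 };

	save_buf(&dres, 7, false);
	save_buf(&dres, 9, true);	/* second buffer is "ordered" */
	restore_bufs(&dres);
	return 0;
}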
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index deeb74becabc..15a362e2f5ea 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -22,7 +22,7 @@ xfs_calc_dquots_per_chunk(
unsigned int nbblks) /* basic block units */
{
ASSERT(nbblks > 0);
- return BBTOB(nbblks) / sizeof(xfs_dqblk_t);
+ return BBTOB(nbblks) / sizeof(struct xfs_dqblk);
}
/*
@@ -127,7 +127,7 @@ xfs_dqblk_repair(
* Typically, a repair is only requested by quotacheck.
*/
ASSERT(id != -1);
- memset(dqb, 0, sizeof(xfs_dqblk_t));
+ memset(dqb, 0, sizeof(struct xfs_dqblk));
dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION;
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 2d7057b7984b..d665c04e69dd 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -184,7 +184,7 @@ typedef struct xfs_sb {
* Superblock - on disk version. Must match the in core version above.
* Must be padded to 64 bit alignment.
*/
-typedef struct xfs_dsb {
+struct xfs_dsb {
__be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */
__be32 sb_blocksize; /* logical block size, bytes */
__be64 sb_dblocks; /* number of data blocks */
@@ -263,7 +263,7 @@ typedef struct xfs_dsb {
uuid_t sb_meta_uuid; /* metadata file system unique id */
/* must be padded to 64 bit alignment */
-} xfs_dsb_t;
+};
/*
* Misc. Flags - warning - these will be cleared by xfs_repair unless
@@ -780,7 +780,7 @@ static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds)
* padding field for v3 inodes.
*/
#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
-typedef struct xfs_dinode {
+struct xfs_dinode {
__be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
__be16 di_mode; /* mode and type of file */
__u8 di_version; /* inode version */
@@ -825,7 +825,7 @@ typedef struct xfs_dinode {
uuid_t di_uuid; /* UUID of the filesystem */
/* structure must be padded to 64 bit alignment */
-} xfs_dinode_t;
+};
#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc)
@@ -1215,7 +1215,7 @@ struct xfs_disk_dquot {
* This is what goes on disk. This is separated from the xfs_disk_dquot because
* carrying the unnecessary padding would be a waste of memory.
*/
-typedef struct xfs_dqblk {
+struct xfs_dqblk {
struct xfs_disk_dquot dd_diskdq; /* portion living incore as well */
char dd_fill[4];/* filling for posterity */
@@ -1225,7 +1225,7 @@ typedef struct xfs_dqblk {
__be32 dd_crc; /* checksum */
__be64 dd_lsn; /* last modification in log */
uuid_t dd_uuid; /* location information */
-} xfs_dqblk_t;
+};
#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index bde2b4c64dbe..c43877c8a279 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -268,6 +268,8 @@ typedef struct xfs_fsop_resblks {
*/
#define XFS_MIN_AG_BYTES (1ULL << 24) /* 16 MB */
#define XFS_MAX_AG_BYTES (1ULL << 40) /* 1 TB */
+#define XFS_MAX_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_BLOCKSIZE)
+#define XFS_MAX_CRC_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_CRC_BLOCKSIZE)
/* keep the maximum size under 2^31 by a small amount */
#define XFS_MAX_LOG_BYTES \
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 994ad783d407..b418fe0c0679 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1827,7 +1827,7 @@ xfs_difree_inode_chunk(
if (!xfs_inobt_issparse(rec->ir_holemask)) {
/* not sparse, calculate extent info directly */
- xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
+ xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
M_IGEO(mp)->ialloc_blks,
&XFS_RMAP_OINFO_INODES);
return;
@@ -1872,7 +1872,7 @@ xfs_difree_inode_chunk(
ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
- xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
+ xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
contigblk, &XFS_RMAP_OINFO_INODES);
/* reset range to current bit and carry on... */
@@ -2793,6 +2793,7 @@ xfs_ialloc_setup_geometry(
inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
inodes);
+ ASSERT(igeo->inobt_maxlevels <= xfs_iallocbt_maxlevels_ondisk());
/*
* Set the maximum inode count for this filesystem, being careful not
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 27190840c5d8..b2ad2fdc40f5 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -22,6 +22,8 @@
#include "xfs_rmap.h"
#include "xfs_ag.h"
+static struct kmem_cache *xfs_inobt_cur_cache;
+
STATIC int
xfs_inobt_get_minrecs(
struct xfs_btree_cur *cur,
@@ -432,10 +434,8 @@ xfs_inobt_init_common(
{
struct xfs_btree_cur *cur;
- cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
- cur->bc_tp = tp;
- cur->bc_mp = mp;
- cur->bc_btnum = btnum;
+ cur = xfs_btree_alloc_cursor(mp, tp, btnum,
+ M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
if (btnum == XFS_BTNUM_INO) {
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2);
cur->bc_ops = &xfs_inobt_ops;
@@ -444,8 +444,6 @@ xfs_inobt_init_common(
cur->bc_ops = &xfs_finobt_ops;
}
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
-
if (xfs_has_crc(mp))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
@@ -530,6 +528,17 @@ xfs_inobt_commit_staged_btree(
}
}
+/* Calculate number of records in an inode btree block. */
+static inline unsigned int
+xfs_inobt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(xfs_inobt_rec_t);
+ return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
+}
+
/*
* Calculate number of records in an inobt btree block.
*/
@@ -540,10 +549,54 @@ xfs_inobt_maxrecs(
int leaf)
{
blocklen -= XFS_INOBT_BLOCK_LEN(mp);
+ return xfs_inobt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(xfs_inobt_rec_t);
- return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
+/*
+ * Maximum number of inode btree records per AG. Pretend that we can fill an
+ * entire AG completely full of inodes except for the AG headers.
+ */
+#define XFS_MAX_INODE_RECORDS \
+ ((XFS_MAX_AG_BYTES - (4 * BBSIZE)) / XFS_DINODE_MIN_SIZE) / \
+ XFS_INODES_PER_CHUNK
+
+/* Compute the max possible height for the inode btree. */
+static inline unsigned int
+xfs_inobt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN,
+ XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN);
+
+ minrecs[0] = xfs_inobt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_inobt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_INODE_RECORDS);
+}
+
+/* Compute the max possible height for the free inode btree. */
+static inline unsigned int
+xfs_finobt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_inobt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_inobt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_INODE_RECORDS);
+}
+
+/* Compute the max possible height for either inode btree. */
+unsigned int
+xfs_iallocbt_maxlevels_ondisk(void)
+{
+ return max(xfs_inobt_maxlevels_ondisk(),
+ xfs_finobt_maxlevels_ondisk());
}
/*
@@ -761,3 +814,22 @@ xfs_iallocbt_calc_size(
{
return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, len);
}
+
+int __init
+xfs_inobt_init_cur_cache(void)
+{
+ xfs_inobt_cur_cache = kmem_cache_create("xfs_inobt_cur",
+ xfs_btree_cur_sizeof(xfs_inobt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_inobt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_inobt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_inobt_cur_cache);
+ xfs_inobt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index 8a322d402e61..26451cb76b98 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -75,4 +75,9 @@ int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp,
void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, struct xfs_buf *agbp);
+unsigned int xfs_iallocbt_maxlevels_ondisk(void);
+
+int __init xfs_inobt_init_cur_cache(void);
+void xfs_inobt_destroy_cur_cache(void);
+
#endif /* __XFS_IALLOC_BTREE_H__ */
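
The worst-case sizing above pretends an AG is packed solid with inodes and then assumes every btree block is only half full (maxrecs / 2) to bound the cursor height. The snippet below is a back-of-the-envelope sketch, not kernel code; the on-disk constants (1TB max AG, four 512-byte AG headers, 256-byte minimum inodes, 64 inodes per chunk, a 512-byte minimum block with a 16-byte short-form header, 16-byte inobt records, 4-byte keys and pointers) are assumptions for illustration only.

/*
 * Rough arithmetic behind XFS_MAX_INODE_RECORDS and the inobt minrecs
 * used to size the cursor cache; all constants here are assumed.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long max_ag_bytes = 1ULL << 40;	/* assumed XFS_MAX_AG_BYTES */
	unsigned long long max_inode_records;
	unsigned int blocklen = 512 - 16;	/* min block minus short header */
	unsigned int minrecs[2];

	/* Pretend the whole AG, minus 4 sector-sized headers, holds inodes. */
	max_inode_records = ((max_ag_bytes - 4 * 512) / 256) / 64;

	/* Worst case assumes half-full blocks. */
	minrecs[0] = (blocklen / 16) / 2;	/* assumed 16-byte inobt records */
	minrecs[1] = (blocklen / (4 + 4)) / 2;	/* assumed 4-byte keys + ptrs */

	printf("max inode records per AG: %llu\n", max_inode_records);
	printf("minrecs: leaf=%u node=%u\n", minrecs[0], minrecs[1]);
	/*
	 * Feeding these into xfs_btree_compute_maxlevels() gives the height
	 * bound used to size the xfs_inobt_cur cache in the hunk above.
	 */
	return 0;
}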
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 3932b4ebf903..cae9708c8587 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -51,9 +51,9 @@ xfs_inode_buf_verify(
agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
for (i = 0; i < ni; i++) {
- int di_ok;
- xfs_dinode_t *dip;
- xfs_agino_t unlinked_ino;
+ struct xfs_dinode *dip;
+ xfs_agino_t unlinked_ino;
+ int di_ok;
dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 1d174909f9bd..9149f4f796fc 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -26,7 +26,7 @@
#include "xfs_types.h"
#include "xfs_errortag.h"
-kmem_zone_t *xfs_ifork_zone;
+struct kmem_cache *xfs_ifork_cache;
void
xfs_init_local_fork(
@@ -67,10 +67,10 @@ xfs_init_local_fork(
*/
STATIC int
xfs_iformat_local(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
- int whichfork,
- int size)
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
+ int whichfork,
+ int size)
{
/*
* If the size is unreasonable, then something
@@ -162,8 +162,8 @@ xfs_iformat_extents(
*/
STATIC int
xfs_iformat_btree(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
int whichfork)
{
struct xfs_mount *mp = ip->i_mount;
@@ -284,7 +284,7 @@ xfs_ifork_alloc(
{
struct xfs_ifork *ifp;
- ifp = kmem_cache_zalloc(xfs_ifork_zone, GFP_NOFS | __GFP_NOFAIL);
+ ifp = kmem_cache_zalloc(xfs_ifork_cache, GFP_NOFS | __GFP_NOFAIL);
ifp->if_format = format;
ifp->if_nextents = nextents;
return ifp;
@@ -325,7 +325,7 @@ xfs_iformat_attr_fork(
}
if (error) {
- kmem_cache_free(xfs_ifork_zone, ip->i_afp);
+ kmem_cache_free(xfs_ifork_cache, ip->i_afp);
ip->i_afp = NULL;
}
return error;
@@ -580,8 +580,8 @@ xfs_iextents_copy(
*/
void
xfs_iflush_fork(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
struct xfs_inode_log_item *iip,
int whichfork)
{
@@ -676,7 +676,7 @@ xfs_ifork_init_cow(
if (ip->i_cowfp)
return;
- ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_zone,
+ ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_cache,
GFP_NOFS | __GFP_NOFAIL);
ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS;
}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index a6f7897b6887..3d64a3acb0ed 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -221,7 +221,7 @@ static inline bool xfs_iext_peek_prev_extent(struct xfs_ifork *ifp,
xfs_iext_get_extent((ifp), (ext), (got)); \
xfs_iext_next((ifp), (ext)))
-extern struct kmem_zone *xfs_ifork_zone;
+extern struct kmem_cache *xfs_ifork_cache;
extern void xfs_ifork_init_cow(struct xfs_inode *ip);
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index e5d767a7fc5d..327ba25e9e17 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -24,6 +24,8 @@
#include "xfs_rmap.h"
#include "xfs_ag.h"
+struct kmem_cache *xfs_refcount_intent_cache;
+
/* Allowable refcount adjustment amounts. */
enum xfs_refc_adjust_op {
XFS_REFCOUNT_ADJUST_INCREASE = 1,
@@ -916,8 +918,7 @@ xfs_refcount_adjust_extents(
struct xfs_btree_cur *cur,
xfs_agblock_t *agbno,
xfs_extlen_t *aglen,
- enum xfs_refc_adjust_op adj,
- struct xfs_owner_info *oinfo)
+ enum xfs_refc_adjust_op adj)
{
struct xfs_refcount_irec ext, tmp;
int error;
@@ -974,8 +975,8 @@ xfs_refcount_adjust_extents(
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
cur->bc_ag.pag->pag_agno,
tmp.rc_startblock);
- xfs_bmap_add_free(cur->bc_tp, fsbno,
- tmp.rc_blockcount, oinfo);
+ xfs_free_extent_later(cur->bc_tp, fsbno,
+ tmp.rc_blockcount, NULL);
}
(*agbno) += tmp.rc_blockcount;
@@ -1019,8 +1020,8 @@ xfs_refcount_adjust_extents(
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
cur->bc_ag.pag->pag_agno,
ext.rc_startblock);
- xfs_bmap_add_free(cur->bc_tp, fsbno, ext.rc_blockcount,
- oinfo);
+ xfs_free_extent_later(cur->bc_tp, fsbno,
+ ext.rc_blockcount, NULL);
}
skip:
@@ -1048,8 +1049,7 @@ xfs_refcount_adjust(
xfs_extlen_t aglen,
xfs_agblock_t *new_agbno,
xfs_extlen_t *new_aglen,
- enum xfs_refc_adjust_op adj,
- struct xfs_owner_info *oinfo)
+ enum xfs_refc_adjust_op adj)
{
bool shape_changed;
int shape_changes = 0;
@@ -1092,8 +1092,7 @@ xfs_refcount_adjust(
cur->bc_ag.refc.shape_changes++;
/* Now that we've taken care of the ends, adjust the middle extents */
- error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen,
- adj, oinfo);
+ error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, adj);
if (error)
goto out_error;
@@ -1188,12 +1187,12 @@ xfs_refcount_finish_one(
switch (type) {
case XFS_REFCOUNT_INCREASE:
error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
- new_len, XFS_REFCOUNT_ADJUST_INCREASE, NULL);
+ new_len, XFS_REFCOUNT_ADJUST_INCREASE);
*new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
break;
case XFS_REFCOUNT_DECREASE:
error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
- new_len, XFS_REFCOUNT_ADJUST_DECREASE, NULL);
+ new_len, XFS_REFCOUNT_ADJUST_DECREASE);
*new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
break;
case XFS_REFCOUNT_ALLOC_COW:
@@ -1235,8 +1234,8 @@ __xfs_refcount_add(
type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
blockcount);
- ri = kmem_alloc(sizeof(struct xfs_refcount_intent),
- KM_NOFS);
+ ri = kmem_cache_alloc(xfs_refcount_intent_cache,
+ GFP_NOFS | __GFP_NOFAIL);
INIT_LIST_HEAD(&ri->ri_list);
ri->ri_type = type;
ri->ri_startblock = startblock;
@@ -1742,7 +1741,7 @@ xfs_refcount_recover_cow_leftovers(
rr->rr_rrec.rc_blockcount);
/* Free the block. */
- xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
+ xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
error = xfs_trans_commit(tp);
if (error)
@@ -1782,3 +1781,20 @@ xfs_refcount_has_record(
return xfs_btree_has_record(cur, &low, &high, exists);
}
+
+int __init
+xfs_refcount_intent_init_cache(void)
+{
+ xfs_refcount_intent_cache = kmem_cache_create("xfs_refc_intent",
+ sizeof(struct xfs_refcount_intent),
+ 0, 0, NULL);
+
+ return xfs_refcount_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_refcount_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_refcount_intent_cache);
+ xfs_refcount_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 02cb3aa405be..9eb01edbd89d 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -32,8 +32,8 @@ enum xfs_refcount_intent_type {
struct xfs_refcount_intent {
struct list_head ri_list;
enum xfs_refcount_intent_type ri_type;
- xfs_fsblock_t ri_startblock;
xfs_extlen_t ri_blockcount;
+ xfs_fsblock_t ri_startblock;
};
void xfs_refcount_increase_extent(struct xfs_trans *tp,
@@ -83,4 +83,9 @@ extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec, int *stat);
+extern struct kmem_cache *xfs_refcount_intent_cache;
+
+int __init xfs_refcount_intent_init_cache(void);
+void xfs_refcount_intent_destroy_cache(void);
+
#endif /* __XFS_REFCOUNT_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 1ef9b99962ab..d14c1720b0fb 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -21,6 +21,8 @@
#include "xfs_rmap.h"
#include "xfs_ag.h"
+static struct kmem_cache *xfs_refcountbt_cur_cache;
+
static struct xfs_btree_cur *
xfs_refcountbt_dup_cursor(
struct xfs_btree_cur *cur)
@@ -322,11 +324,8 @@ xfs_refcountbt_init_common(
ASSERT(pag->pag_agno < mp->m_sb.sb_agcount);
- cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
- cur->bc_tp = tp;
- cur->bc_mp = mp;
- cur->bc_btnum = XFS_BTNUM_REFC;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
+ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_REFC,
+ mp->m_refc_maxlevels, xfs_refcountbt_cur_cache);
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
@@ -396,6 +395,18 @@ xfs_refcountbt_commit_staged_btree(
xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops);
}
+/* Calculate number of records in a refcount btree block. */
+static inline unsigned int
+xfs_refcountbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct xfs_refcount_rec);
+ return blocklen / (sizeof(struct xfs_refcount_key) +
+ sizeof(xfs_refcount_ptr_t));
+}
+
/*
* Calculate the number of records in a refcount btree block.
*/
@@ -405,11 +416,22 @@ xfs_refcountbt_maxrecs(
bool leaf)
{
blocklen -= XFS_REFCOUNT_BLOCK_LEN;
+ return xfs_refcountbt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(struct xfs_refcount_rec);
- return blocklen / (sizeof(struct xfs_refcount_key) +
- sizeof(xfs_refcount_ptr_t));
+/* Compute the max possible height of the maximally sized refcount btree. */
+unsigned int
+xfs_refcountbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_refcountbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_refcountbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_CRC_AG_BLOCKS);
}
/* Compute the maximum height of a refcount btree. */
@@ -417,8 +439,14 @@ void
xfs_refcountbt_compute_maxlevels(
struct xfs_mount *mp)
{
+ if (!xfs_has_reflink(mp)) {
+ mp->m_refc_maxlevels = 0;
+ return;
+ }
+
mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(
mp->m_refc_mnr, mp->m_sb.sb_agblocks);
+ ASSERT(mp->m_refc_maxlevels <= xfs_refcountbt_maxlevels_ondisk());
}
/* Calculate the refcount btree size for some records. */
@@ -488,3 +516,22 @@ xfs_refcountbt_calc_reserves(
return error;
}
+
+int __init
+xfs_refcountbt_init_cur_cache(void)
+{
+ xfs_refcountbt_cur_cache = kmem_cache_create("xfs_refcbt_cur",
+ xfs_btree_cur_sizeof(xfs_refcountbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_refcountbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_refcountbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_refcountbt_cur_cache);
+ xfs_refcountbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
index bd9ed9e1e41f..d66b37259bed 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.h
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -65,4 +65,9 @@ extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp,
void xfs_refcountbt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, struct xfs_buf *agbp);
+unsigned int xfs_refcountbt_maxlevels_ondisk(void);
+
+int __init xfs_refcountbt_init_cur_cache(void);
+void xfs_refcountbt_destroy_cur_cache(void);
+
#endif /* __XFS_REFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index f45929b1b94a..cd322174dbff 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -24,6 +24,8 @@
#include "xfs_inode.h"
#include "xfs_ag.h"
+struct kmem_cache *xfs_rmap_intent_cache;
+
/*
* Lookup the first record less than or equal to [bno, len, owner, offset]
* in the btree given by cur.
@@ -2485,7 +2487,7 @@ __xfs_rmap_add(
bmap->br_blockcount,
bmap->br_state);
- ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_NOFS);
+ ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
INIT_LIST_HEAD(&ri->ri_list);
ri->ri_type = type;
ri->ri_owner = owner;
@@ -2779,3 +2781,20 @@ const struct xfs_owner_info XFS_RMAP_OINFO_REFC = {
const struct xfs_owner_info XFS_RMAP_OINFO_COW = {
.oi_owner = XFS_RMAP_OWN_COW,
};
+
+int __init
+xfs_rmap_intent_init_cache(void)
+{
+ xfs_rmap_intent_cache = kmem_cache_create("xfs_rmap_intent",
+ sizeof(struct xfs_rmap_intent),
+ 0, 0, NULL);
+
+ return xfs_rmap_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_rmap_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_rmap_intent_cache);
+ xfs_rmap_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index fd67904ed446..b718ebeda372 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -159,8 +159,8 @@ enum xfs_rmap_intent_type {
struct xfs_rmap_intent {
struct list_head ri_list;
enum xfs_rmap_intent_type ri_type;
- uint64_t ri_owner;
int ri_whichfork;
+ uint64_t ri_owner;
struct xfs_bmbt_irec ri_bmap;
};
@@ -215,4 +215,9 @@ extern const struct xfs_owner_info XFS_RMAP_OINFO_INODES;
extern const struct xfs_owner_info XFS_RMAP_OINFO_REFC;
extern const struct xfs_owner_info XFS_RMAP_OINFO_COW;
+extern struct kmem_cache *xfs_rmap_intent_cache;
+
+int __init xfs_rmap_intent_init_cache(void);
+void xfs_rmap_intent_destroy_cache(void);
+
#endif /* __XFS_RMAP_H__ */
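
For illustration (not part of the patch): the xfs_rmap_intent hunk above swaps ri_whichfork and ri_owner so the two 32-bit fields (the enum and the int) sit next to each other ahead of the 64-bit owner. On a typical LP64 build that removes two 4-byte alignment holes, which matters now that every intent comes out of the new xfs_rmap_intent slab cache. A stand-in userspace sketch of the effect (these are not the kernel types):

#include <stdio.h>
#include <stdint.h>

struct before {
	void		*list_prev, *list_next;	/* stands in for the list_head */
	int		type;			/* stands in for the enum */
	uint64_t	owner;			/* 8-byte aligned: 4-byte hole before it */
	int		whichfork;		/* and another hole after it */
	uint64_t	bmap[3];		/* stands in for the 8-byte-aligned irec */
};

struct after {
	void		*list_prev, *list_next;
	int		type;
	int		whichfork;		/* packs into the former hole */
	uint64_t	owner;
	uint64_t	bmap[3];
};

int main(void)
{
	printf("before: %zu bytes, after: %zu bytes\n",
	       sizeof(struct before), sizeof(struct after));
	return 0;
}
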
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index b7dbbfb3aeed..69e104d0277f 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -22,6 +22,8 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
+static struct kmem_cache *xfs_rmapbt_cur_cache;
+
/*
* Reverse map btree.
*
@@ -451,13 +453,10 @@ xfs_rmapbt_init_common(
{
struct xfs_btree_cur *cur;
- cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
- cur->bc_tp = tp;
- cur->bc_mp = mp;
/* Overlapping btree; 2 keys per pointer. */
- cur->bc_btnum = XFS_BTNUM_RMAP;
+ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP,
+ mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
cur->bc_ops = &xfs_rmapbt_ops;
@@ -522,6 +521,18 @@ xfs_rmapbt_commit_staged_btree(
xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops);
}
+/* Calculate number of records in a reverse mapping btree block. */
+static inline unsigned int
+xfs_rmapbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct xfs_rmap_rec);
+ return blocklen /
+ (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
+}
+
/*
* Calculate number of records in an rmap btree block.
*/
@@ -531,11 +542,33 @@ xfs_rmapbt_maxrecs(
int leaf)
{
blocklen -= XFS_RMAP_BLOCK_LEN;
+ return xfs_rmapbt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(struct xfs_rmap_rec);
- return blocklen /
- (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
+/* Compute the max possible height for reverse mapping btrees. */
+unsigned int
+xfs_rmapbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_rmapbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_rmapbt_block_maxrecs(blocklen, false) / 2;
+
+ /*
+ * Compute the asymptotic maxlevels for an rmapbt on any reflink fs.
+ *
+ * On a reflink filesystem, each AG block can have up to 2^32 (per the
+ * refcount record format) owners, which means that theoretically we
+ * could face up to 2^64 rmap records. However, we're likely to run
+ * out of blocks in the AG long before that happens, which means that
+ * we must compute the max height based on what the btree will look
+ * like if it consumes almost all the blocks in the AG due to maximal
+ * sharing factor.
+ */
+ return xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS);
}
/* Compute the maximum height of an rmap btree. */
@@ -543,26 +576,36 @@ void
xfs_rmapbt_compute_maxlevels(
struct xfs_mount *mp)
{
- /*
- * On a non-reflink filesystem, the maximum number of rmap
- * records is the number of blocks in the AG, hence the max
- * rmapbt height is log_$maxrecs($agblocks). However, with
- * reflink each AG block can have up to 2^32 (per the refcount
- * record format) owners, which means that theoretically we
- * could face up to 2^64 rmap records.
- *
- * That effectively means that the max rmapbt height must be
- * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG
- * blocks to feed the rmapbt long before the rmapbt reaches
- * maximum height. The reflink code uses ag_resv_critical to
- * disallow reflinking when less than 10% of the per-AG metadata
- * block reservation since the fallback is a regular file copy.
- */
- if (xfs_has_reflink(mp))
- mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
- else
+ if (!xfs_has_rmapbt(mp)) {
+ mp->m_rmap_maxlevels = 0;
+ return;
+ }
+
+ if (xfs_has_reflink(mp)) {
+ /*
+ * Compute the asymptotic maxlevels for an rmap btree on a
+ * filesystem that supports reflink.
+ *
+ * On a reflink filesystem, each AG block can have up to 2^32
+ * (per the refcount record format) owners, which means that
+ * theoretically we could face up to 2^64 rmap records.
+ * However, we're likely to run out of blocks in the AG long
+ * before that happens, which means that we must compute the
+ * max height based on what the btree will look like if it
+ * consumes almost all the blocks in the AG due to maximal
+ * sharing factor.
+ */
+ mp->m_rmap_maxlevels = xfs_btree_space_to_height(mp->m_rmap_mnr,
+ mp->m_sb.sb_agblocks);
+ } else {
+ /*
+ * If there's no block sharing, compute the maximum rmapbt
+ * height assuming one rmap record per AG block.
+ */
mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+ }
+ ASSERT(mp->m_rmap_maxlevels <= xfs_rmapbt_maxlevels_ondisk());
}
/* Calculate the refcount btree size for some records. */
@@ -633,3 +676,22 @@ xfs_rmapbt_calc_reserves(
return error;
}
+
+int __init
+xfs_rmapbt_init_cur_cache(void)
+{
+ xfs_rmapbt_cur_cache = kmem_cache_create("xfs_rmapbt_cur",
+ xfs_btree_cur_sizeof(xfs_rmapbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_rmapbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_rmapbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_rmapbt_cur_cache);
+ xfs_rmapbt_cur_cache = NULL;
+}
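
For illustration (not part of the patch): on reflink filesystems the hunk above bounds the rmapbt height with xfs_btree_space_to_height(), i.e. by the number of AG blocks the tree could consume, rather than by a record count. The idea, roughly: start with a one-block root and keep adding levels while the block budget can still pay for another layer of minimally filled blocks. A rough userspace sketch with made-up geometry (an illustration of the idea, not the kernel routine):

#include <stdio.h>

static unsigned int space_to_height(const unsigned int minrecs[2],
				    unsigned long long block_budget)
{
	unsigned long long next_level = minrecs[1];	/* blocks the next level would add */
	unsigned long long remaining;
	unsigned int height = 1;

	if (block_budget < 1)
		return 0;
	remaining = block_budget - 1;	/* one block is already spent on the root */

	while (next_level < remaining) {
		remaining -= next_level;
		next_level *= minrecs[1];	/* each added level fans out by at least minrecs[1] */
		height++;
	}

	return height;
}

int main(void)
{
	unsigned int minrecs[2] = { 40, 60 };	/* hypothetical geometry */

	printf("height within the budget: %u\n",
	       space_to_height(minrecs, 1ULL << 30));
	return 0;
}
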
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index f2eee6572af4..3244715dd111 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -59,4 +59,9 @@ extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp,
extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp,
struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used);
+unsigned int xfs_rmapbt_maxlevels_ondisk(void);
+
+int __init xfs_rmapbt_init_cur_cache(void);
+void xfs_rmapbt_destroy_cur_cache(void);
+
#endif /* __XFS_RMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index e58349be78bd..f4e84aa1d50a 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -495,7 +495,7 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp)
static void
__xfs_sb_from_disk(
struct xfs_sb *to,
- xfs_dsb_t *from,
+ struct xfs_dsb *from,
bool convert_xquota)
{
to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
@@ -571,7 +571,7 @@ __xfs_sb_from_disk(
void
xfs_sb_from_disk(
struct xfs_sb *to,
- xfs_dsb_t *from)
+ struct xfs_dsb *from)
{
__xfs_sb_from_disk(to, from, true);
}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 5e300daa2559..6f83d9b306ee 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -70,7 +70,7 @@ xfs_allocfree_log_count(
{
uint blocks;
- blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1);
+ blocks = num_ops * 2 * (2 * mp->m_alloc_maxlevels - 1);
if (xfs_has_rmapbt(mp))
blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1);
if (xfs_has_reflink(mp))
@@ -814,6 +814,19 @@ xfs_trans_resv_calc(
struct xfs_mount *mp,
struct xfs_trans_resv *resp)
{
+ unsigned int rmap_maxlevels = mp->m_rmap_maxlevels;
+
+ /*
+ * In the early days of rmap+reflink, we always set the rmap maxlevels
+ * to 9 even if the AG was small enough that it would never grow to
+ * that height. Transaction reservation sizes influence the minimum
+ * log size calculation, which influences the size of the log that mkfs
+ * creates. Use the old value here to ensure that newly formatted
+ * small filesystems will mount on older kernels.
+ */
+ if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp))
+ mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS;
+
/*
* The following transactions are logged in physical format and
* require a permanent reservation on space.
@@ -916,4 +929,7 @@ xfs_trans_resv_calc(
resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
+
+ /* Put everything back the way it was. This goes at the end. */
+ mp->m_rmap_maxlevels = rmap_maxlevels;
}
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index 50332be34388..87b31c69a773 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -17,6 +17,13 @@
/* Adding one rmap could split every level up to the top of the tree. */
#define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels)
+/*
+ * Note that we historically set m_rmap_maxlevels to 9 when reflink is enabled,
+ * so we must preserve this behavior to avoid changing the transaction space
+ * reservations and minimum log size calculations for existing filesystems.
+ */
+#define XFS_OLD_REFLINK_RMAP_MAXLEVELS 9
+
/* Blocks we might need to add "b" rmaps to a tree. */
#define XFS_NRMAPADD_SPACE_RES(mp, b)\
(((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
@@ -74,7 +81,7 @@
#define XFS_DIOSTRAT_SPACE_RES(mp, v) \
(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
#define XFS_GROWFS_SPACE_RES(mp) \
- (2 * (mp)->m_ag_maxlevels)
+ (2 * (mp)->m_alloc_maxlevels)
#define XFS_GROWFSRT_SPACE_RES(mp,b) \
((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
#define XFS_LINK_SPACE_RES(mp,nl) \
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index ae3c9f6e2c69..bed798792226 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -555,11 +555,11 @@ xchk_agf(
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > mp->m_alloc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > mp->m_alloc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
if (xfs_has_rmapbt(mp)) {
@@ -568,7 +568,7 @@ xchk_agf(
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > mp->m_rmap_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
}
@@ -578,7 +578,7 @@ xchk_agf(
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_refcount_level);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > mp->m_refc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
}
@@ -850,6 +850,7 @@ xchk_agi(
struct xfs_mount *mp = sc->mp;
struct xfs_agi *agi;
struct xfs_perag *pag;
+ struct xfs_ino_geometry *igeo = M_IGEO(sc->mp);
xfs_agnumber_t agno = sc->sm->sm_agno;
xfs_agblock_t agbno;
xfs_agblock_t eoag;
@@ -880,7 +881,7 @@ xchk_agi(
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
level = be32_to_cpu(agi->agi_level);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > igeo->inobt_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
if (xfs_has_finobt(mp)) {
@@ -889,7 +890,7 @@ xchk_agi(
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
level = be32_to_cpu(agi->agi_free_level);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > igeo->inobt_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
}
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 0f8deee66f15..d7bfed52f4cd 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -122,7 +122,7 @@ xrep_check_btree_root(
xfs_agnumber_t agno = sc->sm->sm_agno;
return xfs_verify_agbno(mp, agno, fab->root) &&
- fab->height <= XFS_BTREE_MAXLEVELS;
+ fab->height <= fab->maxlevels;
}
/*
@@ -339,18 +339,22 @@ xrep_agf(
[XREP_AGF_BNOBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
.buf_ops = &xfs_bnobt_buf_ops,
+ .maxlevels = sc->mp->m_alloc_maxlevels,
},
[XREP_AGF_CNTBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
.buf_ops = &xfs_cntbt_buf_ops,
+ .maxlevels = sc->mp->m_alloc_maxlevels,
},
[XREP_AGF_RMAPBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
.buf_ops = &xfs_rmapbt_buf_ops,
+ .maxlevels = sc->mp->m_rmap_maxlevels,
},
[XREP_AGF_REFCOUNTBT] = {
.rmap_owner = XFS_RMAP_OWN_REFC,
.buf_ops = &xfs_refcountbt_buf_ops,
+ .maxlevels = sc->mp->m_refc_maxlevels,
},
[XREP_AGF_END] = {
.buf_ops = NULL,
@@ -881,10 +885,12 @@ xrep_agi(
[XREP_AGI_INOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
.buf_ops = &xfs_inobt_buf_ops,
+ .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
},
[XREP_AGI_FINOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
.buf_ops = &xfs_finobt_buf_ops,
+ .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
},
[XREP_AGI_END] = {
.buf_ops = NULL
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index d6d24c866bc4..b89bf9de9b1c 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -222,21 +222,21 @@ out:
* 1 2 3
*
* Pretend for this example that each leaf block has 100 btree records. For
- * the first btree record, we'll observe that bc_ptrs[0] == 1, so we record
- * that we saw block 1. Then we observe that bc_ptrs[1] == 1, so we record
- * block 4. The list is [1, 4].
+ * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we
+ * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so
+ * we record block 4. The list is [1, 4].
*
- * For the second btree record, we see that bc_ptrs[0] == 2, so we exit the
- * loop. The list remains [1, 4].
+ * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit
+ * the loop. The list remains [1, 4].
*
* For the 101st btree record, we've moved onto leaf block 2. Now
- * bc_ptrs[0] == 1 again, so we record that we saw block 2. We see that
- * bc_ptrs[1] == 2, so we exit the loop. The list is now [1, 4, 2].
+ * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that
+ * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2].
*
- * For the 102nd record, bc_ptrs[0] == 2, so we continue.
+ * For the 102nd record, bc_levels[0].ptr == 2, so we continue.
*
- * For the 201st record, we've moved on to leaf block 3. bc_ptrs[0] == 1, so
- * we add 3 to the list. Now it is [1, 4, 2, 3].
+ * For the 201st record, we've moved on to leaf block 3.
+ * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3].
*
* For the 300th record we just exit, with the list being [1, 4, 2, 3].
*/
@@ -256,7 +256,7 @@ xbitmap_set_btcur_path(
int i;
int error;
- for (i = 0; i < cur->bc_nlevels && cur->bc_ptrs[i] == 1; i++) {
+ for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) {
xfs_btree_get_block(cur, i, &bp);
if (!bp)
continue;
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 017da9ceaee9..a4cbbc346f60 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -402,7 +402,7 @@ xchk_bmapbt_rec(
* the root since the verifiers don't do that.
*/
if (xfs_has_crc(bs->cur->bc_mp) &&
- bs->cur->bc_ptrs[0] == 1) {
+ bs->cur->bc_levels[0].ptr == 1) {
for (i = 0; i < bs->cur->bc_nlevels - 1; i++) {
block = xfs_btree_get_block(bs->cur, i, &bp);
owner = be64_to_cpu(block->bb_u.l.bb_owner);
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index eccb855dc904..39dd46f038fe 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -136,14 +136,14 @@ xchk_btree_rec(
struct xfs_buf *bp;
block = xfs_btree_get_block(cur, 0, &bp);
- rec = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
+ rec = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, block);
trace_xchk_btree_rec(bs->sc, cur, 0);
/* If this isn't the first record, are they in order? */
- if (!bs->firstrec && !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
+ if (cur->bc_levels[0].ptr > 1 &&
+ !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
xchk_btree_set_corrupt(bs->sc, cur, 0);
- bs->firstrec = false;
memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len);
if (cur->bc_nlevels == 1)
@@ -152,7 +152,7 @@ xchk_btree_rec(
/* Is this at least as large as the parent low key? */
cur->bc_ops->init_key_from_rec(&key, rec);
keyblock = xfs_btree_get_block(cur, 1, &bp);
- keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[1], keyblock);
+ keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0)
xchk_btree_set_corrupt(bs->sc, cur, 1);
@@ -161,7 +161,7 @@ xchk_btree_rec(
/* Is this no larger than the parent high key? */
cur->bc_ops->init_high_key_from_rec(&hkey, rec);
- keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[1], keyblock);
+ keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0)
xchk_btree_set_corrupt(bs->sc, cur, 1);
}
@@ -183,23 +183,22 @@ xchk_btree_key(
struct xfs_buf *bp;
block = xfs_btree_get_block(cur, level, &bp);
- key = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
+ key = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
trace_xchk_btree_key(bs->sc, cur, level);
/* If this isn't the first key, are they in order? */
- if (!bs->firstkey[level] &&
- !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level], key))
+ if (cur->bc_levels[level].ptr > 1 &&
+ !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1], key))
xchk_btree_set_corrupt(bs->sc, cur, level);
- bs->firstkey[level] = false;
- memcpy(&bs->lastkey[level], key, cur->bc_ops->key_len);
+ memcpy(&bs->lastkey[level - 1], key, cur->bc_ops->key_len);
if (level + 1 >= cur->bc_nlevels)
return;
/* Is this at least as large as the parent low key? */
keyblock = xfs_btree_get_block(cur, level + 1, &bp);
- keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1], keyblock);
+ keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock);
if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0)
xchk_btree_set_corrupt(bs->sc, cur, level);
@@ -207,8 +206,9 @@ xchk_btree_key(
return;
/* Is this no larger than the parent high key? */
- key = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
- keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1], keyblock);
+ key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block);
+ keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
+ keyblock);
if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0)
xchk_btree_set_corrupt(bs->sc, cur, level);
}
@@ -291,7 +291,7 @@ xchk_btree_block_check_sibling(
/* Compare upper level pointer to sibling pointer. */
pblock = xfs_btree_get_block(ncur, level + 1, &pbp);
- pp = xfs_btree_ptr_addr(ncur, ncur->bc_ptrs[level + 1], pblock);
+ pp = xfs_btree_ptr_addr(ncur, ncur->bc_levels[level + 1].ptr, pblock);
if (!xchk_btree_ptr_ok(bs, level + 1, pp))
goto out;
if (pbp)
@@ -596,7 +596,7 @@ xchk_btree_block_keys(
/* Obtain the parent's copy of the keys for this block. */
parent_block = xfs_btree_get_block(cur, level + 1, &bp);
- parent_keys = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1],
+ parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
parent_block);
if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0)
@@ -607,7 +607,7 @@ xchk_btree_block_keys(
/* Get high keys */
high_bk = xfs_btree_high_key_from_key(cur, &block_keys);
- high_pk = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1],
+ high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
parent_block);
if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0)
@@ -627,35 +627,39 @@ xchk_btree(
const struct xfs_owner_info *oinfo,
void *private)
{
- struct xchk_btree bs = {
- .cur = cur,
- .scrub_rec = scrub_fn,
- .oinfo = oinfo,
- .firstrec = true,
- .private = private,
- .sc = sc,
- };
union xfs_btree_ptr ptr;
+ struct xchk_btree *bs;
union xfs_btree_ptr *pp;
union xfs_btree_rec *recp;
struct xfs_btree_block *block;
- int level;
struct xfs_buf *bp;
struct check_owner *co;
struct check_owner *n;
- int i;
+ size_t cur_sz;
+ int level;
int error = 0;
- /* Initialize scrub state */
- for (i = 0; i < XFS_BTREE_MAXLEVELS; i++)
- bs.firstkey[i] = true;
- INIT_LIST_HEAD(&bs.to_check);
-
- /* Don't try to check a tree with a height we can't handle. */
- if (cur->bc_nlevels > XFS_BTREE_MAXLEVELS) {
+ /*
+ * Allocate the btree scrub context from the heap, because this
+ * structure can get rather large. Don't let a caller feed us a
+ * totally absurd size.
+ */
+ cur_sz = xchk_btree_sizeof(cur->bc_nlevels);
+ if (cur_sz > PAGE_SIZE) {
xchk_btree_set_corrupt(sc, cur, 0);
- goto out;
+ return 0;
}
+ bs = kmem_zalloc(cur_sz, KM_NOFS | KM_MAYFAIL);
+ if (!bs)
+ return -ENOMEM;
+ bs->cur = cur;
+ bs->scrub_rec = scrub_fn;
+ bs->oinfo = oinfo;
+ bs->private = private;
+ bs->sc = sc;
+
+ /* Initialize scrub state */
+ INIT_LIST_HEAD(&bs->to_check);
/*
* Load the root of the btree. The helper function absorbs
@@ -663,79 +667,82 @@ xchk_btree(
*/
level = cur->bc_nlevels - 1;
cur->bc_ops->init_ptr_from_cur(cur, &ptr);
- if (!xchk_btree_ptr_ok(&bs, cur->bc_nlevels, &ptr))
+ if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr))
goto out;
- error = xchk_btree_get_block(&bs, level, &ptr, &block, &bp);
+ error = xchk_btree_get_block(bs, level, &ptr, &block, &bp);
if (error || !block)
goto out;
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
while (level < cur->bc_nlevels) {
block = xfs_btree_get_block(cur, level, &bp);
if (level == 0) {
/* End of leaf, pop back towards the root. */
- if (cur->bc_ptrs[level] >
+ if (cur->bc_levels[level].ptr >
be16_to_cpu(block->bb_numrecs)) {
- xchk_btree_block_keys(&bs, level, block);
+ xchk_btree_block_keys(bs, level, block);
if (level < cur->bc_nlevels - 1)
- cur->bc_ptrs[level + 1]++;
+ cur->bc_levels[level + 1].ptr++;
level++;
continue;
}
/* Records in order for scrub? */
- xchk_btree_rec(&bs);
+ xchk_btree_rec(bs);
/* Call out to the record checker. */
- recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
- error = bs.scrub_rec(&bs, recp);
+ recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
+ block);
+ error = bs->scrub_rec(bs, recp);
if (error)
break;
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
break;
- cur->bc_ptrs[level]++;
+ cur->bc_levels[level].ptr++;
continue;
}
/* End of node, pop back towards the root. */
- if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
- xchk_btree_block_keys(&bs, level, block);
+ if (cur->bc_levels[level].ptr >
+ be16_to_cpu(block->bb_numrecs)) {
+ xchk_btree_block_keys(bs, level, block);
if (level < cur->bc_nlevels - 1)
- cur->bc_ptrs[level + 1]++;
+ cur->bc_levels[level + 1].ptr++;
level++;
continue;
}
/* Keys in order for scrub? */
- xchk_btree_key(&bs, level);
+ xchk_btree_key(bs, level);
/* Drill another level deeper. */
- pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
- if (!xchk_btree_ptr_ok(&bs, level, pp)) {
- cur->bc_ptrs[level]++;
+ pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
+ if (!xchk_btree_ptr_ok(bs, level, pp)) {
+ cur->bc_levels[level].ptr++;
continue;
}
level--;
- error = xchk_btree_get_block(&bs, level, pp, &block, &bp);
+ error = xchk_btree_get_block(bs, level, pp, &block, &bp);
if (error || !block)
goto out;
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
}
out:
/* Process deferred owner checks on btree blocks. */
- list_for_each_entry_safe(co, n, &bs.to_check, list) {
- if (!error && bs.cur)
- error = xchk_btree_check_block_owner(&bs,
- co->level, co->daddr);
+ list_for_each_entry_safe(co, n, &bs->to_check, list) {
+ if (!error && bs->cur)
+ error = xchk_btree_check_block_owner(bs, co->level,
+ co->daddr);
list_del(&co->list);
kmem_free(co);
}
+ kmem_free(bs);
return error;
}
diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h
index b7d2fc01fbf9..da61a53a0b61 100644
--- a/fs/xfs/scrub/btree.h
+++ b/fs/xfs/scrub/btree.h
@@ -39,11 +39,22 @@ struct xchk_btree {
/* internal scrub state */
union xfs_btree_rec lastrec;
- bool firstrec;
- union xfs_btree_key lastkey[XFS_BTREE_MAXLEVELS];
- bool firstkey[XFS_BTREE_MAXLEVELS];
struct list_head to_check;
+
+ /* this element must come last! */
+ union xfs_btree_key lastkey[];
};
+
+/*
+ * Calculate the size of an xchk_btree structure. There are nlevels-1 slots for
+ * keys because we track leaf records separately in lastrec.
+ */
+static inline size_t
+xchk_btree_sizeof(unsigned int nlevels)
+{
+ return struct_size((struct xchk_btree *)NULL, lastkey, nlevels - 1);
+}
+
int xchk_btree(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
xchk_btree_rec_fn scrub_fn, const struct xfs_owner_info *oinfo,
void *private);
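
For illustration (not part of the patch): xchk_btree now ends in a flexible lastkey[] array, and xchk_btree_sizeof() sizes the heap allocation with struct_size(), which amounts to the offset of the flexible member plus n array elements (the kernel macro additionally guards that arithmetic against overflow). A userspace stand-in showing the same calculation with nlevels - 1 key slots:

#include <stdio.h>
#include <stddef.h>

struct demo {
	int header_a;
	int header_b;
	unsigned long lastkey[];	/* flexible array member, sized at alloc time */
};

static size_t demo_sizeof(size_t nlevels)
{
	/* One key slot per level above the leaves, as in xchk_btree_sizeof(). */
	return offsetof(struct demo, lastkey) +
			(nlevels - 1) * sizeof(unsigned long);
}

int main(void)
{
	printf("2 levels -> %zu bytes, 5 levels -> %zu bytes\n",
	       demo_sizeof(2), demo_sizeof(5));
	return 0;
}
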
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index 8a52514bc1ff..b962cfbbd92b 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -473,7 +473,7 @@ xchk_da_btree(
xchk_da_btree_rec_fn scrub_fn,
void *private)
{
- struct xchk_da_btree ds = {};
+ struct xchk_da_btree *ds;
struct xfs_mount *mp = sc->mp;
struct xfs_da_state_blk *blks;
struct xfs_da_node_entry *key;
@@ -486,32 +486,35 @@ xchk_da_btree(
return 0;
/* Set up initial da state. */
- ds.dargs.dp = sc->ip;
- ds.dargs.whichfork = whichfork;
- ds.dargs.trans = sc->tp;
- ds.dargs.op_flags = XFS_DA_OP_OKNOENT;
- ds.state = xfs_da_state_alloc(&ds.dargs);
- ds.sc = sc;
- ds.private = private;
+ ds = kmem_zalloc(sizeof(struct xchk_da_btree), KM_NOFS | KM_MAYFAIL);
+ if (!ds)
+ return -ENOMEM;
+ ds->dargs.dp = sc->ip;
+ ds->dargs.whichfork = whichfork;
+ ds->dargs.trans = sc->tp;
+ ds->dargs.op_flags = XFS_DA_OP_OKNOENT;
+ ds->state = xfs_da_state_alloc(&ds->dargs);
+ ds->sc = sc;
+ ds->private = private;
if (whichfork == XFS_ATTR_FORK) {
- ds.dargs.geo = mp->m_attr_geo;
- ds.lowest = 0;
- ds.highest = 0;
+ ds->dargs.geo = mp->m_attr_geo;
+ ds->lowest = 0;
+ ds->highest = 0;
} else {
- ds.dargs.geo = mp->m_dir_geo;
- ds.lowest = ds.dargs.geo->leafblk;
- ds.highest = ds.dargs.geo->freeblk;
+ ds->dargs.geo = mp->m_dir_geo;
+ ds->lowest = ds->dargs.geo->leafblk;
+ ds->highest = ds->dargs.geo->freeblk;
}
- blkno = ds.lowest;
+ blkno = ds->lowest;
level = 0;
/* Find the root of the da tree, if present. */
- blks = ds.state->path.blk;
- error = xchk_da_btree_block(&ds, level, blkno);
+ blks = ds->state->path.blk;
+ error = xchk_da_btree_block(ds, level, blkno);
if (error)
goto out_state;
/*
- * We didn't find a block at ds.lowest, which means that there's
+ * We didn't find a block at ds->lowest, which means that there's
* no LEAF1/LEAFN tree (at least not where it's supposed to be),
* so jump out now.
*/
@@ -523,16 +526,16 @@ xchk_da_btree(
/* Handle leaf block. */
if (blks[level].magic != XFS_DA_NODE_MAGIC) {
/* End of leaf, pop back towards the root. */
- if (blks[level].index >= ds.maxrecs[level]) {
+ if (blks[level].index >= ds->maxrecs[level]) {
if (level > 0)
blks[level - 1].index++;
- ds.tree_level++;
+ ds->tree_level++;
level--;
continue;
}
/* Dispatch record scrubbing. */
- error = scrub_fn(&ds, level);
+ error = scrub_fn(ds, level);
if (error)
break;
if (xchk_should_terminate(sc, &error) ||
@@ -545,17 +548,17 @@ xchk_da_btree(
/* End of node, pop back towards the root. */
- if (blks[level].index >= ds.maxrecs[level]) {
+ if (blks[level].index >= ds->maxrecs[level]) {
if (level > 0)
blks[level - 1].index++;
- ds.tree_level++;
+ ds->tree_level++;
level--;
continue;
}
/* Hashes in order for scrub? */
- key = xchk_da_btree_node_entry(&ds, level);
- error = xchk_da_btree_hash(&ds, level, &key->hashval);
+ key = xchk_da_btree_node_entry(ds, level);
+ error = xchk_da_btree_hash(ds, level, &key->hashval);
if (error)
goto out;
@@ -564,11 +567,11 @@ xchk_da_btree(
level++;
if (level >= XFS_DA_NODE_MAXDEPTH) {
/* Too deep! */
- xchk_da_set_corrupt(&ds, level - 1);
+ xchk_da_set_corrupt(ds, level - 1);
break;
}
- ds.tree_level--;
- error = xchk_da_btree_block(&ds, level, blkno);
+ ds->tree_level--;
+ error = xchk_da_btree_block(ds, level, blkno);
if (error)
goto out;
if (blks[level].bp == NULL)
@@ -587,6 +590,7 @@ out:
}
out_state:
- xfs_da_state_free(ds.state);
+ xfs_da_state_free(ds->state);
+ kmem_free(ds);
return error;
}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 3bb152d52a07..840f74ec431c 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -44,6 +44,9 @@ struct xrep_find_ag_btree {
/* in: buffer ops */
const struct xfs_buf_ops *buf_ops;
+ /* in: maximum btree height */
+ unsigned int maxlevels;
+
/* out: the highest btree block found and the tree height */
xfs_agblock_t root;
unsigned int height;
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 51e4c61916d2..8d528d35b725 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -461,15 +461,10 @@ xfs_scrub_metadata(
struct file *file,
struct xfs_scrub_metadata *sm)
{
- struct xfs_scrub sc = {
- .file = file,
- .sm = sm,
- };
+ struct xfs_scrub *sc;
struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
int error = 0;
- sc.mp = mp;
-
BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
(sizeof(struct xchk_meta_ops) * XFS_SCRUB_TYPE_NR));
@@ -489,59 +484,68 @@ xfs_scrub_metadata(
xchk_experimental_warning(mp);
- sc.ops = &meta_scrub_ops[sm->sm_type];
- sc.sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
+ sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL);
+ if (!sc) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ sc->mp = mp;
+ sc->file = file;
+ sc->sm = sm;
+ sc->ops = &meta_scrub_ops[sm->sm_type];
+ sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
retry_op:
/*
* When repairs are allowed, prevent freezing or readonly remount while
* scrub is running with a real transaction.
*/
if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
- error = mnt_want_write_file(sc.file);
+ error = mnt_want_write_file(sc->file);
if (error)
- goto out;
+ goto out_sc;
}
/* Set up for the operation. */
- error = sc.ops->setup(&sc);
+ error = sc->ops->setup(sc);
if (error)
goto out_teardown;
/* Scrub for errors. */
- error = sc.ops->scrub(&sc);
- if (!(sc.flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
+ error = sc->ops->scrub(sc);
+ if (!(sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
/*
* Scrubbers return -EDEADLOCK to mean 'try harder'.
* Tear down everything we hold, then set up again with
* preparation for worst-case scenarios.
*/
- error = xchk_teardown(&sc, 0);
+ error = xchk_teardown(sc, 0);
if (error)
- goto out;
- sc.flags |= XCHK_TRY_HARDER;
+ goto out_sc;
+ sc->flags |= XCHK_TRY_HARDER;
goto retry_op;
} else if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE))
goto out_teardown;
- xchk_update_health(&sc);
+ xchk_update_health(sc);
- if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
- !(sc.flags & XREP_ALREADY_FIXED)) {
+ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
+ !(sc->flags & XREP_ALREADY_FIXED)) {
bool needs_fix;
/* Let debug users force us into the repair routines. */
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
- sc.sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
- needs_fix = (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
- XFS_SCRUB_OFLAG_XCORRUPT |
- XFS_SCRUB_OFLAG_PREEN));
+ needs_fix = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT |
+ XFS_SCRUB_OFLAG_PREEN));
/*
* If userspace asked for a repair but it wasn't necessary,
* report that back to userspace.
*/
if (!needs_fix) {
- sc.sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
goto out_nofix;
}
@@ -549,26 +553,28 @@ retry_op:
* If it's broken, userspace wants us to fix it, and we haven't
* already tried to fix it, then attempt a repair.
*/
- error = xrep_attempt(&sc);
+ error = xrep_attempt(sc);
if (error == -EAGAIN) {
/*
* Either the repair function succeeded or it couldn't
* get all the resources it needs; either way, we go
* back to the beginning and call the scrub function.
*/
- error = xchk_teardown(&sc, 0);
+ error = xchk_teardown(sc, 0);
if (error) {
xrep_failure(mp);
- goto out;
+ goto out_sc;
}
goto retry_op;
}
}
out_nofix:
- xchk_postmortem(&sc);
+ xchk_postmortem(sc);
out_teardown:
- error = xchk_teardown(&sc, error);
+ error = xchk_teardown(sc, error);
+out_sc:
+ kmem_free(sc);
out:
trace_xchk_done(XFS_I(file_inode(file)), sm, error);
if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index c0ef53fe6611..b5f94676c37c 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -21,13 +21,14 @@ xchk_btree_cur_fsbno(
struct xfs_btree_cur *cur,
int level)
{
- if (level < cur->bc_nlevels && cur->bc_bufs[level])
+ if (level < cur->bc_nlevels && cur->bc_levels[level].bp)
return XFS_DADDR_TO_FSB(cur->bc_mp,
- xfs_buf_daddr(cur->bc_bufs[level]));
- if (level == cur->bc_nlevels - 1 && cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ xfs_buf_daddr(cur->bc_levels[level].bp));
+
+ if (level == cur->bc_nlevels - 1 &&
+ (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino);
- if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS))
- return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, 0);
+
return NULLFSBLOCK;
}
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index a7bbb84f91a7..93ece6df02e3 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -348,7 +348,7 @@ TRACE_EVENT(xchk_btree_op_error,
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
__entry->error = error;
__entry->ret_ip = ret_ip;
),
@@ -389,7 +389,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
__entry->type = sc->sm->sm_type;
__entry->btnum = cur->bc_btnum;
__entry->level = level;
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->error = error;
@@ -431,7 +431,7 @@ TRACE_EVENT(xchk_btree_error,
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
__entry->ret_ip = ret_ip;
),
TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
@@ -471,7 +471,7 @@ TRACE_EVENT(xchk_ifork_btree_error,
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
__entry->ret_ip = ret_ip;
),
TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
@@ -511,7 +511,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class,
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
),
TP_printk("dev %d:%d type %s btree %s agno 0x%x agbno 0x%x level %d nlevels %d ptr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 34fc6148032a..c8c15c3c3147 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -82,6 +82,7 @@ xfs_end_ioend(
struct iomap_ioend *ioend)
{
struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
unsigned int nofs_flag;
@@ -97,18 +98,26 @@ xfs_end_ioend(
/*
* Just clean up the in-memory structures if the fs has been shut down.
*/
- if (xfs_is_shutdown(ip->i_mount)) {
+ if (xfs_is_shutdown(mp)) {
error = -EIO;
goto done;
}
/*
- * Clean up any COW blocks on an I/O error.
+ * Clean up all COW blocks and underlying data fork delalloc blocks on
+ * I/O error. The delalloc punch is required because this ioend was
+ * mapped to blocks in the COW fork and the associated pages are no
+ * longer dirty. If we don't remove delalloc blocks here, they become
+ * stale and can corrupt free space accounting on unmount.
*/
error = blk_status_to_errno(ioend->io_bio->bi_status);
if (unlikely(error)) {
- if (ioend->io_flags & IOMAP_F_SHARED)
+ if (ioend->io_flags & IOMAP_F_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true);
+ xfs_bmap_punch_delalloc_range(ip,
+ XFS_B_TO_FSBT(mp, offset),
+ XFS_B_TO_FSB(mp, size));
+ }
goto done;
}
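
For illustration (not part of the patch): the error path above converts the failed ioend's byte offset and size into filesystem blocks before punching the delalloc range. With the usual definitions, XFS_B_TO_FSBT() truncates a byte count down to a block number while XFS_B_TO_FSB() rounds it up, so the pair spans the failed range. A small userspace sketch of that rounding (the block size here is hypothetical):

#include <stdio.h>
#include <stdint.h>

#define BLOCKLOG	12			/* 4k blocks, for the example only */
#define BLOCKSIZE	(1ULL << BLOCKLOG)

static uint64_t b_to_fsbt(uint64_t bytes)	/* truncate, like XFS_B_TO_FSBT */
{
	return bytes >> BLOCKLOG;
}

static uint64_t b_to_fsb(uint64_t bytes)	/* round up, like XFS_B_TO_FSB */
{
	return (bytes + BLOCKSIZE - 1) >> BLOCKLOG;
}

int main(void)
{
	uint64_t offset = 6144, size = 10000;	/* an ioend straddling block boundaries */

	printf("punch from block %llu for %llu blocks\n",
	       (unsigned long long)b_to_fsbt(offset),
	       (unsigned long long)b_to_fsb(size));
	return 0;
}
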
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 2b5da6218977..27265771f247 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -390,7 +390,7 @@ out_destroy_fork:
/* kill the in-core attr fork before we drop the inode lock */
if (dp->i_afp) {
xfs_idestroy_fork(dp->i_afp);
- kmem_cache_free(xfs_ifork_zone, dp->i_afp);
+ kmem_cache_free(xfs_ifork_cache, dp->i_afp);
dp->i_afp = NULL;
}
if (lock_mode)
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 03159970133f..e1f4d7d5a011 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -25,8 +25,8 @@
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
-kmem_zone_t *xfs_bui_zone;
-kmem_zone_t *xfs_bud_zone;
+struct kmem_cache *xfs_bui_cache;
+struct kmem_cache *xfs_bud_cache;
static const struct xfs_item_ops xfs_bui_item_ops;
@@ -39,7 +39,7 @@ STATIC void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
- kmem_cache_free(xfs_bui_zone, buip);
+ kmem_cache_free(xfs_bui_cache, buip);
}
/*
@@ -138,7 +138,7 @@ xfs_bui_init(
{
struct xfs_bui_log_item *buip;
- buip = kmem_cache_zalloc(xfs_bui_zone, GFP_KERNEL | __GFP_NOFAIL);
+ buip = kmem_cache_zalloc(xfs_bui_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
@@ -198,7 +198,7 @@ xfs_bud_item_release(
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
- kmem_cache_free(xfs_bud_zone, budp);
+ kmem_cache_free(xfs_bud_cache, budp);
}
static const struct xfs_item_ops xfs_bud_item_ops = {
@@ -215,7 +215,7 @@ xfs_trans_get_bud(
{
struct xfs_bud_log_item *budp;
- budp = kmem_cache_zalloc(xfs_bud_zone, GFP_KERNEL | __GFP_NOFAIL);
+ budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
&xfs_bud_item_ops);
budp->bud_buip = buip;
@@ -384,7 +384,7 @@ xfs_bmap_update_finish_item(
bmap->bi_bmap.br_blockcount = count;
return -EAGAIN;
}
- kmem_free(bmap);
+ kmem_cache_free(xfs_bmap_intent_cache, bmap);
return error;
}
@@ -404,7 +404,7 @@ xfs_bmap_update_cancel_item(
struct xfs_bmap_intent *bmap;
bmap = container_of(item, struct xfs_bmap_intent, bi_list);
- kmem_free(bmap);
+ kmem_cache_free(xfs_bmap_intent_cache, bmap);
}
const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
@@ -532,7 +532,7 @@ xfs_bui_item_recover(
* Commit transaction, which frees the transaction and saves the inode
* for later replay activities.
*/
- error = xfs_defer_ops_capture_and_commit(tp, ip, capture_list);
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
if (error)
goto err_unlock;
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index b9be62f8bd52..3fafd3881a0b 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -25,7 +25,7 @@
/* kernel only BUI/BUD definitions */
struct xfs_mount;
-struct kmem_zone;
+struct kmem_cache;
/*
* Max number of extents in fast allocation path.
@@ -65,7 +65,7 @@ struct xfs_bud_log_item {
struct xfs_bud_log_format bud_format;
};
-extern struct kmem_zone *xfs_bui_zone;
-extern struct kmem_zone *xfs_bud_zone;
+extern struct kmem_cache *xfs_bui_cache;
+extern struct kmem_cache *xfs_bud_cache;
#endif /* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 5fa6cd947dd4..631c5a61d89b 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -20,7 +20,7 @@
#include "xfs_error.h"
#include "xfs_ag.h"
-static kmem_zone_t *xfs_buf_zone;
+static struct kmem_cache *xfs_buf_cache;
/*
* Locking orders
@@ -220,7 +220,7 @@ _xfs_buf_alloc(
int i;
*bpp = NULL;
- bp = kmem_cache_zalloc(xfs_buf_zone, GFP_NOFS | __GFP_NOFAIL);
+ bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL);
/*
* We don't want certain flags to appear in b_flags unless they are
@@ -247,7 +247,7 @@ _xfs_buf_alloc(
*/
error = xfs_buf_get_maps(bp, nmaps);
if (error) {
- kmem_cache_free(xfs_buf_zone, bp);
+ kmem_cache_free(xfs_buf_cache, bp);
return error;
}
@@ -307,7 +307,7 @@ xfs_buf_free(
kmem_free(bp->b_addr);
xfs_buf_free_maps(bp);
- kmem_cache_free(xfs_buf_zone, bp);
+ kmem_cache_free(xfs_buf_cache, bp);
}
static int
@@ -2258,12 +2258,12 @@ xfs_buf_delwri_pushbuf(
int __init
xfs_buf_init(void)
{
- xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
+ xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
SLAB_HWCACHE_ALIGN |
SLAB_RECLAIM_ACCOUNT |
SLAB_MEM_SPREAD,
NULL);
- if (!xfs_buf_zone)
+ if (!xfs_buf_cache)
goto out;
return 0;
@@ -2275,7 +2275,7 @@ xfs_buf_init(void)
void
xfs_buf_terminate(void)
{
- kmem_cache_destroy(xfs_buf_zone);
+ kmem_cache_destroy(xfs_buf_cache);
}
void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index b1ab100c09e1..a7a8e4528881 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -23,7 +23,7 @@
#include "xfs_log.h"
-kmem_zone_t *xfs_buf_item_zone;
+struct kmem_cache *xfs_buf_item_cache;
static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
{
@@ -804,7 +804,7 @@ xfs_buf_item_init(
return 0;
}
- bip = kmem_cache_zalloc(xfs_buf_item_zone, GFP_KERNEL | __GFP_NOFAIL);
+ bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
bip->bli_buf = bp;
@@ -825,7 +825,7 @@ xfs_buf_item_init(
map_size = DIV_ROUND_UP(chunks, NBWORD);
if (map_size > XFS_BLF_DATAMAP_SIZE) {
- kmem_cache_free(xfs_buf_item_zone, bip);
+ kmem_cache_free(xfs_buf_item_cache, bip);
xfs_err(mp,
"buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
map_size,
@@ -1002,7 +1002,7 @@ xfs_buf_item_free(
{
xfs_buf_item_free_format(bip);
kmem_free(bip->bli_item.li_lv_shadow);
- kmem_cache_free(xfs_buf_item_zone, bip);
+ kmem_cache_free(xfs_buf_item_cache, bip);
}
/*
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 50aa0f5ef959..e11e9ef2338f 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -71,6 +71,6 @@ static inline void xfs_buf_dquot_io_fail(struct xfs_buf *bp)
void xfs_buf_iodone(struct xfs_buf *);
bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);
-extern kmem_zone_t *xfs_buf_item_zone;
+extern struct kmem_cache *xfs_buf_item_cache;
#endif /* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index a476c7ef5d53..70ca5751b13e 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -603,7 +603,7 @@ xlog_recover_do_inode_buffer(
inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog;
for (i = 0; i < inodes_per_buf; i++) {
next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
- offsetof(xfs_dinode_t, di_next_unlinked);
+ offsetof(struct xfs_dinode, di_next_unlinked);
while (next_unlinked_offset >=
(reg_buf_offset + reg_buf_bytes)) {
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index c15d61d47a06..e48ae227bb11 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -38,8 +38,8 @@
* otherwise by the lowest id first, see xfs_dqlock2.
*/
-struct kmem_zone *xfs_qm_dqtrxzone;
-static struct kmem_zone *xfs_qm_dqzone;
+struct kmem_cache *xfs_dqtrx_cache;
+static struct kmem_cache *xfs_dquot_cache;
static struct lock_class_key xfs_dquot_group_class;
static struct lock_class_key xfs_dquot_project_class;
@@ -57,7 +57,7 @@ xfs_qm_dqdestroy(
mutex_destroy(&dqp->q_qlock);
XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
- kmem_cache_free(xfs_qm_dqzone, dqp);
+ kmem_cache_free(xfs_dquot_cache, dqp);
}
/*
@@ -458,7 +458,7 @@ xfs_dquot_alloc(
{
struct xfs_dquot *dqp;
- dqp = kmem_cache_zalloc(xfs_qm_dqzone, GFP_KERNEL | __GFP_NOFAIL);
+ dqp = kmem_cache_zalloc(xfs_dquot_cache, GFP_KERNEL | __GFP_NOFAIL);
dqp->q_type = type;
dqp->q_id = id;
@@ -471,7 +471,7 @@ xfs_dquot_alloc(
* Offset of dquot in the (fixed sized) dquot chunk.
*/
dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
- sizeof(xfs_dqblk_t);
+ sizeof(struct xfs_dqblk);
/*
* Because we want to use a counting completion, complete
@@ -1363,22 +1363,22 @@ xfs_dqlock2(
int __init
xfs_qm_init(void)
{
- xfs_qm_dqzone = kmem_cache_create("xfs_dquot",
+ xfs_dquot_cache = kmem_cache_create("xfs_dquot",
sizeof(struct xfs_dquot),
0, 0, NULL);
- if (!xfs_qm_dqzone)
+ if (!xfs_dquot_cache)
goto out;
- xfs_qm_dqtrxzone = kmem_cache_create("xfs_dqtrx",
+ xfs_dqtrx_cache = kmem_cache_create("xfs_dqtrx",
sizeof(struct xfs_dquot_acct),
0, 0, NULL);
- if (!xfs_qm_dqtrxzone)
- goto out_free_dqzone;
+ if (!xfs_dqtrx_cache)
+ goto out_free_dquot_cache;
return 0;
-out_free_dqzone:
- kmem_cache_destroy(xfs_qm_dqzone);
+out_free_dquot_cache:
+ kmem_cache_destroy(xfs_dquot_cache);
out:
return -ENOMEM;
}
@@ -1386,8 +1386,8 @@ out:
void
xfs_qm_exit(void)
{
- kmem_cache_destroy(xfs_qm_dqtrxzone);
- kmem_cache_destroy(xfs_qm_dqzone);
+ kmem_cache_destroy(xfs_dqtrx_cache);
+ kmem_cache_destroy(xfs_dquot_cache);
}
/*
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 3f8a0713573a..47ef9c9c5c17 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -25,8 +25,8 @@
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
-kmem_zone_t *xfs_efi_zone;
-kmem_zone_t *xfs_efd_zone;
+struct kmem_cache *xfs_efi_cache;
+struct kmem_cache *xfs_efd_cache;
static const struct xfs_item_ops xfs_efi_item_ops;
@@ -43,7 +43,7 @@ xfs_efi_item_free(
if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
kmem_free(efip);
else
- kmem_cache_free(xfs_efi_zone, efip);
+ kmem_cache_free(xfs_efi_cache, efip);
}
/*
@@ -161,7 +161,7 @@ xfs_efi_init(
((nextents - 1) * sizeof(xfs_extent_t)));
efip = kmem_zalloc(size, 0);
} else {
- efip = kmem_cache_zalloc(xfs_efi_zone,
+ efip = kmem_cache_zalloc(xfs_efi_cache,
GFP_KERNEL | __GFP_NOFAIL);
}
@@ -241,7 +241,7 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp)
if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
kmem_free(efdp);
else
- kmem_cache_free(xfs_efd_zone, efdp);
+ kmem_cache_free(xfs_efd_cache, efdp);
}
/*
@@ -333,7 +333,7 @@ xfs_trans_get_efd(
(nextents - 1) * sizeof(struct xfs_extent),
0);
} else {
- efdp = kmem_cache_zalloc(xfs_efd_zone,
+ efdp = kmem_cache_zalloc(xfs_efd_cache,
GFP_KERNEL | __GFP_NOFAIL);
}
@@ -474,15 +474,21 @@ xfs_extent_free_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
+ struct xfs_owner_info oinfo = { };
struct xfs_extent_free_item *free;
int error;
free = container_of(item, struct xfs_extent_free_item, xefi_list);
+ oinfo.oi_owner = free->xefi_owner;
+ if (free->xefi_flags & XFS_EFI_ATTR_FORK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+ if (free->xefi_flags & XFS_EFI_BMBT_BLOCK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
error = xfs_trans_free_extent(tp, EFD_ITEM(done),
free->xefi_startblock,
free->xefi_blockcount,
- &free->xefi_oinfo, free->xefi_skip_discard);
- kmem_free(free);
+ &oinfo, free->xefi_flags & XFS_EFI_SKIP_DISCARD);
+ kmem_cache_free(xfs_extfree_item_cache, free);
return error;
}
@@ -502,7 +508,7 @@ xfs_extent_free_cancel_item(
struct xfs_extent_free_item *free;
free = container_of(item, struct xfs_extent_free_item, xefi_list);
- kmem_free(free);
+ kmem_cache_free(xfs_extfree_item_cache, free);
}
const struct xfs_defer_op_type xfs_extent_free_defer_type = {
@@ -525,6 +531,7 @@ xfs_agfl_free_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
+ struct xfs_owner_info oinfo = { };
struct xfs_mount *mp = tp->t_mountp;
struct xfs_efd_log_item *efdp = EFD_ITEM(done);
struct xfs_extent_free_item *free;
@@ -539,13 +546,13 @@ xfs_agfl_free_finish_item(
ASSERT(free->xefi_blockcount == 1);
agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock);
agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock);
+ oinfo.oi_owner = free->xefi_owner;
trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount);
error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
if (!error)
- error = xfs_free_agfl_block(tp, agno, agbno, agbp,
- &free->xefi_oinfo);
+ error = xfs_free_agfl_block(tp, agno, agbno, agbp, &oinfo);
/*
* Mark the transaction dirty, even on error. This ensures the
@@ -564,7 +571,7 @@ xfs_agfl_free_finish_item(
extp->ext_len = free->xefi_blockcount;
efdp->efd_next_extent++;
- kmem_free(free);
+ kmem_cache_free(xfs_extfree_item_cache, free);
return error;
}
@@ -637,7 +644,7 @@ xfs_efi_item_recover(
}
- return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
+ return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error:
xfs_trans_cancel(tp);
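
For illustration (not part of the patch): the extent-free hunks above drop the embedded struct xfs_owner_info from each queued item and instead rebuild it at finish time from a bare owner value plus XFS_EFI_* flag bits, which shrinks the per-item allocation now served from xfs_extfree_item_cache. A userspace stand-in for that compact-item/rebuild split (all names below are stand-ins, not the kernel's):

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define DEMO_ATTR_FORK		(1u << 0)
#define DEMO_BMBT_BLOCK		(1u << 1)
#define DEMO_SKIP_DISCARD	(1u << 2)

struct demo_free_item {		/* compact queued item: owner plus flag bits */
	uint64_t	owner;
	unsigned int	flags;
};

struct demo_owner_info {	/* rebuilt only when the item is processed */
	uint64_t	oi_owner;
	bool		attr_fork;
	bool		bmbt_block;
};

static struct demo_owner_info demo_decode(const struct demo_free_item *item)
{
	struct demo_owner_info oinfo = { .oi_owner = item->owner };

	if (item->flags & DEMO_ATTR_FORK)
		oinfo.attr_fork = true;
	if (item->flags & DEMO_BMBT_BLOCK)
		oinfo.bmbt_block = true;
	return oinfo;
}

int main(void)
{
	struct demo_free_item item = { .owner = 128, .flags = DEMO_ATTR_FORK };
	struct demo_owner_info oinfo = demo_decode(&item);

	printf("owner %llu attr_fork %d\n",
	       (unsigned long long)oinfo.oi_owner, oinfo.attr_fork);
	return 0;
}
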
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index cd2860c875bf..186d0f2137f1 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -9,7 +9,7 @@
/* kernel only EFI/EFD definitions */
struct xfs_mount;
-struct kmem_zone;
+struct kmem_cache;
/*
* Max number of extents in fast allocation path.
@@ -69,7 +69,7 @@ struct xfs_efd_log_item {
*/
#define XFS_EFD_MAX_FAST_EXTENTS 16
-extern struct kmem_zone *xfs_efi_zone;
-extern struct kmem_zone *xfs_efd_zone;
+extern struct kmem_cache *xfs_efi_cache;
+extern struct kmem_cache *xfs_efd_cache;
#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7aa943edfc02..27594738b0d1 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -259,7 +259,7 @@ xfs_file_dio_read(
ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
if (ret)
return ret;
- ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
+ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -569,7 +569,7 @@ xfs_file_dio_write_aligned(
}
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops, 0);
+ &xfs_dio_write_ops, 0, 0);
out_unlock:
if (iolock)
xfs_iunlock(ip, iolock);
@@ -647,7 +647,7 @@ retry_exclusive:
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops, flags);
+ &xfs_dio_write_ops, flags, 0);
/*
* Retry unaligned I/O with exclusive blocking semantics if the DIO
@@ -1452,7 +1452,7 @@ const struct file_operations xfs_file_operations = {
.write_iter = xfs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index f2210d927481..e1472004170e 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -77,10 +77,10 @@ xfs_inode_alloc(
* XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
* and return NULL here on ENOMEM.
*/
- ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL);
+ ip = kmem_cache_alloc(xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL);
if (inode_init_always(mp->m_super, VFS_I(ip))) {
- kmem_cache_free(xfs_inode_zone, ip);
+ kmem_cache_free(xfs_inode_cache, ip);
return NULL;
}
@@ -130,11 +130,11 @@ xfs_inode_free_callback(
if (ip->i_afp) {
xfs_idestroy_fork(ip->i_afp);
- kmem_cache_free(xfs_ifork_zone, ip->i_afp);
+ kmem_cache_free(xfs_ifork_cache, ip->i_afp);
}
if (ip->i_cowfp) {
xfs_idestroy_fork(ip->i_cowfp);
- kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
+ kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
}
if (ip->i_itemp) {
ASSERT(!test_bit(XFS_LI_IN_AIL,
@@ -143,7 +143,7 @@ xfs_inode_free_callback(
ip->i_itemp = NULL;
}
- kmem_cache_free(xfs_inode_zone, ip);
+ kmem_cache_free(xfs_inode_cache, ip);
}
static void
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 017904a34c02..508e184e3b8f 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -20,7 +20,7 @@
#include "xfs_ialloc.h"
#include "xfs_trace.h"
-kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
+struct kmem_cache *xfs_icreate_cache; /* inode create item */
static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
{
@@ -63,7 +63,7 @@ STATIC void
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
- kmem_cache_free(xfs_icreate_zone, ICR_ITEM(lip));
+ kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip));
}
static const struct xfs_item_ops xfs_icreate_item_ops = {
@@ -97,7 +97,7 @@ xfs_icreate_log(
{
struct xfs_icreate_item *icp;
- icp = kmem_cache_zalloc(xfs_icreate_zone, GFP_KERNEL | __GFP_NOFAIL);
+ icp = kmem_cache_zalloc(xfs_icreate_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
&xfs_icreate_item_ops);
diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h
index a50d0b01e15a..64992823108a 100644
--- a/fs/xfs/xfs_icreate_item.h
+++ b/fs/xfs/xfs_icreate_item.h
@@ -12,7 +12,7 @@ struct xfs_icreate_item {
struct xfs_icreate_log ic_format;
};
-extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
+extern struct kmem_cache *xfs_icreate_cache; /* inode create item */
void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno,
xfs_agblock_t agbno, unsigned int count,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a4f6f034fb81..64b9bf334806 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -36,7 +36,7 @@
#include "xfs_reflink.h"
#include "xfs_ag.h"
-kmem_zone_t *xfs_inode_zone;
+struct kmem_cache *xfs_inode_cache;
/*
* Used in xfs_itruncate_extents(). This is the maximum number of extents
@@ -564,8 +564,6 @@ xfs_lock_two_inodes(
struct xfs_inode *ip1,
uint ip1_mode)
{
- struct xfs_inode *temp;
- uint mode_temp;
int attempts = 0;
struct xfs_log_item *lp;
@@ -578,12 +576,8 @@ xfs_lock_two_inodes(
ASSERT(ip0->i_ino != ip1->i_ino);
if (ip0->i_ino > ip1->i_ino) {
- temp = ip0;
- ip0 = ip1;
- ip1 = temp;
- mode_temp = ip0_mode;
- ip0_mode = ip1_mode;
- ip1_mode = mode_temp;
+ swap(ip0, ip1);
+ swap(ip0_mode, ip1_mode);
}
again:
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index b21b177832d1..e635a3d64cba 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -504,7 +504,7 @@ static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
void xfs_irele(struct xfs_inode *ip);
-extern struct kmem_zone *xfs_inode_zone;
+extern struct kmem_cache *xfs_inode_cache;
/* The default CoW extent size hint. */
#define XFS_DEFAULT_COWEXTSZ_HINT 32
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 0659d19c211e..90d8e591baf8 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -21,7 +21,7 @@
#include <linux/iversion.h>
-kmem_zone_t *xfs_ili_zone; /* inode log item zone */
+struct kmem_cache *xfs_ili_cache; /* inode log item */
static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
{
@@ -672,7 +672,7 @@ xfs_inode_item_init(
struct xfs_inode_log_item *iip;
ASSERT(ip->i_itemp == NULL);
- iip = ip->i_itemp = kmem_cache_zalloc(xfs_ili_zone,
+ iip = ip->i_itemp = kmem_cache_zalloc(xfs_ili_cache,
GFP_KERNEL | __GFP_NOFAIL);
iip->ili_inode = ip;
@@ -694,7 +694,7 @@ xfs_inode_item_destroy(
ip->i_itemp = NULL;
kmem_free(iip->ili_item.li_lv_shadow);
- kmem_cache_free(xfs_ili_zone, iip);
+ kmem_cache_free(xfs_ili_cache, iip);
}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 403b45ab9aa2..1a302000d604 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -47,6 +47,6 @@ extern void xfs_iflush_abort(struct xfs_inode *);
extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
struct xfs_inode_log_format *);
-extern struct kmem_zone *xfs_ili_zone;
+extern struct kmem_cache *xfs_ili_cache;
#endif /* __XFS_INODE_ITEM_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 0c795dc093ef..174cd8950cb6 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1547,7 +1547,7 @@ xfs_ioc_getbmap(
if (bmx.bmv_count > ULONG_MAX / recsize)
return -ENOMEM;
- buf = kvzalloc(bmx.bmv_count * sizeof(*buf), GFP_KERNEL);
+ buf = kvcalloc(bmx.bmv_count, sizeof(*buf), GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -1601,11 +1601,11 @@ xfs_ioc_getfsmap(
*/
count = min_t(unsigned int, head.fmh_count,
131072 / sizeof(struct fsmap));
- recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL);
+ recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL);
if (!recs) {
count = min_t(unsigned int, head.fmh_count,
PAGE_SIZE / sizeof(struct fsmap));
- recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL);
+ recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL);
if (!recs)
return -ENOMEM;
}
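
For illustration (not part of the patch): the two hunks above replace kvzalloc(count * size) with kvcalloc(count, size). The calloc-style form checks the count-times-size multiplication for overflow before allocating, so an oversized count coming from userspace fails cleanly instead of wrapping to a small allocation. A userspace analogue of that check:

#include <stdio.h>
#include <stdlib.h>

static void *checked_calloc(size_t n, size_t size)
{
	size_t bytes;

	if (__builtin_mul_overflow(n, size, &bytes))
		return NULL;		/* reject rather than allocate a wrapped size */
	return calloc(1, bytes);	/* zeroed, like kvzalloc()/kvcalloc() */
}

int main(void)
{
	void *p = checked_calloc((size_t)-1, 24);

	printf("oversized request %s\n", p ? "succeeded (bug)" : "rejected");
	free(p);
	return 0;
}
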
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f6cd2d4aa770..89fec9a18c34 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -21,7 +21,7 @@
#include "xfs_sb.h"
#include "xfs_health.h"
-kmem_zone_t *xfs_log_ticket_zone;
+struct kmem_cache *xfs_log_ticket_cache;
/* Local miscellaneous function prototypes */
STATIC struct xlog *
@@ -3487,7 +3487,7 @@ xfs_log_ticket_put(
{
ASSERT(atomic_read(&ticket->t_ref) > 0);
if (atomic_dec_and_test(&ticket->t_ref))
- kmem_cache_free(xfs_log_ticket_zone, ticket);
+ kmem_cache_free(xfs_log_ticket_cache, ticket);
}
xlog_ticket_t *
@@ -3611,7 +3611,7 @@ xlog_ticket_alloc(
struct xlog_ticket *tic;
int unit_res;
- tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL);
+ tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL);
unit_res = xlog_calc_unit_res(log, unit_bytes);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 844fbeec3545..23103d68423c 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -497,7 +497,7 @@ xlog_recover_cancel(struct xlog *);
extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
char *dp, int size);
-extern kmem_zone_t *xfs_log_ticket_zone;
+extern struct kmem_cache *xfs_log_ticket_cache;
struct xlog_ticket *
xlog_ticket_alloc(
struct xlog *log,
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 10562ecbd9ea..53366cc0bc9e 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2466,11 +2466,11 @@ xlog_finish_defer_ops(
{
struct xfs_defer_capture *dfc, *next;
struct xfs_trans *tp;
- struct xfs_inode *ip;
int error = 0;
list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
struct xfs_trans_res resv;
+ struct xfs_defer_resources dres;
/*
* Create a new transaction reservation from the captured
@@ -2494,13 +2494,9 @@ xlog_finish_defer_ops(
* from recovering a single intent item.
*/
list_del_init(&dfc->dfc_list);
- xfs_defer_ops_continue(dfc, tp, &ip);
-
+ xfs_defer_ops_continue(dfc, tp, &dres);
error = xfs_trans_commit(tp);
- if (ip) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_irele(ip);
- }
+ xfs_defer_resources_rele(&dres);
if (error)
return error;
}
@@ -2520,7 +2516,7 @@ xlog_abort_defer_ops(
list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
list_del_init(&dfc->dfc_list);
- xfs_defer_ops_release(mp, dfc);
+ xfs_defer_ops_capture_free(mp, dfc);
}
}
/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 06dac09eddbd..359109b6f0d3 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -567,6 +567,18 @@ xfs_mount_setup_inode_geom(
xfs_ialloc_setup_geometry(mp);
}
+/* Compute maximum possible height for per-AG btree types for this fs. */
+static inline void
+xfs_agbtree_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ unsigned int levels;
+
+ levels = max(mp->m_alloc_maxlevels, M_IGEO(mp)->inobt_maxlevels);
+ levels = max(levels, mp->m_rmap_maxlevels);
+ mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
+}
+
/*
* This function does the following on an initial mount of a file system:
* - reads the superblock from disk and init the mount struct
@@ -638,6 +650,8 @@ xfs_mountfs(
xfs_rmapbt_compute_maxlevels(mp);
xfs_refcountbt_compute_maxlevels(mp);
+ xfs_agbtree_compute_maxlevels(mp);
+
/*
* Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks
* is NOT aligned turn off m_dalign since allocator alignment is within
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index e091f3b3fa15..00720a02e761 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -128,10 +128,11 @@ typedef struct xfs_mount {
uint m_rmap_mnr[2]; /* min rmap btree records */
uint m_refc_mxr[2]; /* max refc btree records */
uint m_refc_mnr[2]; /* min refc btree records */
- uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
- uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
+ uint m_alloc_maxlevels; /* max alloc btree levels */
+ uint m_bm_maxlevels[2]; /* max bmap btree levels */
uint m_rmap_maxlevels; /* max rmap btree levels */
uint m_refc_maxlevels; /* max refcount btree level */
+ unsigned int m_agbtree_maxlevels; /* max level of all AG btrees */
xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */
uint m_alloc_set_aside; /* space we can't use */
uint m_ag_max_usable; /* max space per AG */
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 34c3b16f834f..f85e3b07ab44 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -219,7 +219,7 @@ _xfs_mru_cache_list_insert(
* When destroying or reaping, all the elements that were migrated to the reap
* list need to be deleted. For each element this involves removing it from the
* data store, removing it from the reap list, calling the client's free
- * function and deleting the element from the element zone.
+ * function and deleting the element from the element cache.
*
* We get called holding the mru->lock, which we drop and then reacquire.
* Sparse need special help with this to tell it we know what we are doing.
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5608066d6e53..32ac8d9c8940 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -850,7 +850,7 @@ xfs_qm_reset_dqcounts(
*/
#ifdef DEBUG
j = (int)XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) /
- sizeof(xfs_dqblk_t);
+ sizeof(struct xfs_dqblk);
ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
#endif
dqb = bp->b_addr;
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 442a0f97a9d4..5bb12717ea28 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -11,7 +11,7 @@
struct xfs_inode;
-extern struct kmem_zone *xfs_qm_dqtrxzone;
+extern struct kmem_cache *xfs_dqtrx_cache;
/*
* Number of bmaps that we ask from bmapi when doing a quotacheck.
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 46904b793bd4..d3da67772d57 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -21,8 +21,8 @@
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
-kmem_zone_t *xfs_cui_zone;
-kmem_zone_t *xfs_cud_zone;
+struct kmem_cache *xfs_cui_cache;
+struct kmem_cache *xfs_cud_cache;
static const struct xfs_item_ops xfs_cui_item_ops;
@@ -38,7 +38,7 @@ xfs_cui_item_free(
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
kmem_free(cuip);
else
- kmem_cache_free(xfs_cui_zone, cuip);
+ kmem_cache_free(xfs_cui_cache, cuip);
}
/*
@@ -143,7 +143,7 @@ xfs_cui_init(
cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
0);
else
- cuip = kmem_cache_zalloc(xfs_cui_zone,
+ cuip = kmem_cache_zalloc(xfs_cui_cache,
GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
@@ -204,7 +204,7 @@ xfs_cud_item_release(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
- kmem_cache_free(xfs_cud_zone, cudp);
+ kmem_cache_free(xfs_cud_cache, cudp);
}
static const struct xfs_item_ops xfs_cud_item_ops = {
@@ -221,7 +221,7 @@ xfs_trans_get_cud(
{
struct xfs_cud_log_item *cudp;
- cudp = kmem_cache_zalloc(xfs_cud_zone, GFP_KERNEL | __GFP_NOFAIL);
+ cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
&xfs_cud_item_ops);
cudp->cud_cuip = cuip;
@@ -384,7 +384,7 @@ xfs_refcount_update_finish_item(
refc->ri_blockcount = new_aglen;
return -EAGAIN;
}
- kmem_free(refc);
+ kmem_cache_free(xfs_refcount_intent_cache, refc);
return error;
}
@@ -404,7 +404,7 @@ xfs_refcount_update_cancel_item(
struct xfs_refcount_intent *refc;
refc = container_of(item, struct xfs_refcount_intent, ri_list);
- kmem_free(refc);
+ kmem_cache_free(xfs_refcount_intent_cache, refc);
}
const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
@@ -557,7 +557,7 @@ xfs_cui_item_recover(
}
xfs_refcount_finish_one_cleanup(tp, rcur, error);
- return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
+ return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error:
xfs_refcount_finish_one_cleanup(tp, rcur, error);
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
index f4f2e836540b..eb0ab13682d0 100644
--- a/fs/xfs/xfs_refcount_item.h
+++ b/fs/xfs/xfs_refcount_item.h
@@ -25,7 +25,7 @@
/* kernel only CUI/CUD definitions */
struct xfs_mount;
-struct kmem_zone;
+struct kmem_cache;
/*
* Max number of extents in fast allocation path.
@@ -68,7 +68,7 @@ struct xfs_cud_log_item {
struct xfs_cud_log_format cud_format;
};
-extern struct kmem_zone *xfs_cui_zone;
-extern struct kmem_zone *xfs_cud_zone;
+extern struct kmem_cache *xfs_cui_cache;
+extern struct kmem_cache *xfs_cud_cache;
#endif /* __XFS_REFCOUNT_ITEM_H__ */
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 76355f293488..cb0edb1d68ef 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -484,7 +484,7 @@ xfs_reflink_cancel_cow_blocks(
xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
del.br_blockcount);
- xfs_bmap_add_free(*tpp, del.br_startblock,
+ xfs_free_extent_later(*tpp, del.br_startblock,
del.br_blockcount, NULL);
/* Roll the transaction */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 5f0695980467..c3966b4c58ef 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -21,8 +21,8 @@
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
-kmem_zone_t *xfs_rui_zone;
-kmem_zone_t *xfs_rud_zone;
+struct kmem_cache *xfs_rui_cache;
+struct kmem_cache *xfs_rud_cache;
static const struct xfs_item_ops xfs_rui_item_ops;
@@ -38,7 +38,7 @@ xfs_rui_item_free(
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
kmem_free(ruip);
else
- kmem_cache_free(xfs_rui_zone, ruip);
+ kmem_cache_free(xfs_rui_cache, ruip);
}
/*
@@ -141,7 +141,7 @@ xfs_rui_init(
if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
else
- ruip = kmem_cache_zalloc(xfs_rui_zone,
+ ruip = kmem_cache_zalloc(xfs_rui_cache,
GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
@@ -227,7 +227,7 @@ xfs_rud_item_release(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
- kmem_cache_free(xfs_rud_zone, rudp);
+ kmem_cache_free(xfs_rud_cache, rudp);
}
static const struct xfs_item_ops xfs_rud_item_ops = {
@@ -244,7 +244,7 @@ xfs_trans_get_rud(
{
struct xfs_rud_log_item *rudp;
- rudp = kmem_cache_zalloc(xfs_rud_zone, GFP_KERNEL | __GFP_NOFAIL);
+ rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
&xfs_rud_item_ops);
rudp->rud_ruip = ruip;
@@ -427,7 +427,7 @@ xfs_rmap_update_finish_item(
rmap->ri_bmap.br_startoff, rmap->ri_bmap.br_startblock,
rmap->ri_bmap.br_blockcount, rmap->ri_bmap.br_state,
state);
- kmem_free(rmap);
+ kmem_cache_free(xfs_rmap_intent_cache, rmap);
return error;
}
@@ -447,7 +447,7 @@ xfs_rmap_update_cancel_item(
struct xfs_rmap_intent *rmap;
rmap = container_of(item, struct xfs_rmap_intent, ri_list);
- kmem_free(rmap);
+ kmem_cache_free(xfs_rmap_intent_cache, rmap);
}
const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
@@ -587,7 +587,7 @@ xfs_rui_item_recover(
}
xfs_rmap_finish_one_cleanup(tp, rcur, error);
- return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
+ return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error:
xfs_rmap_finish_one_cleanup(tp, rcur, error);
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index 31e6cdfff71f..802e5119eaca 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -28,7 +28,7 @@
/* kernel only RUI/RUD definitions */
struct xfs_mount;
-struct kmem_zone;
+struct kmem_cache;
/*
* Max number of extents in fast allocation path.
@@ -68,7 +68,7 @@ struct xfs_rud_log_item {
struct xfs_rud_log_format rud_format;
};
-extern struct kmem_zone *xfs_rui_zone;
-extern struct kmem_zone *xfs_rud_zone;
+extern struct kmem_cache *xfs_rui_cache;
+extern struct kmem_cache *xfs_rud_cache;
#endif /* __XFS_RMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c4e0cd1c1c8c..e21459f9923a 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -37,6 +37,7 @@
#include "xfs_reflink.h"
#include "xfs_pwork.h"
#include "xfs_ag.h"
+#include "xfs_defer.h"
#include <linux/magic.h>
#include <linux/fs_context.h>
@@ -1951,196 +1952,194 @@ static struct file_system_type xfs_fs_type = {
MODULE_ALIAS_FS("xfs");
STATIC int __init
-xfs_init_zones(void)
+xfs_init_caches(void)
{
- xfs_log_ticket_zone = kmem_cache_create("xfs_log_ticket",
+ int error;
+
+ xfs_log_ticket_cache = kmem_cache_create("xfs_log_ticket",
sizeof(struct xlog_ticket),
0, 0, NULL);
- if (!xfs_log_ticket_zone)
+ if (!xfs_log_ticket_cache)
goto out;
- xfs_bmap_free_item_zone = kmem_cache_create("xfs_bmap_free_item",
- sizeof(struct xfs_extent_free_item),
- 0, 0, NULL);
- if (!xfs_bmap_free_item_zone)
- goto out_destroy_log_ticket_zone;
+ error = xfs_btree_init_cur_caches();
+ if (error)
+ goto out_destroy_log_ticket_cache;
- xfs_btree_cur_zone = kmem_cache_create("xfs_btree_cur",
- sizeof(struct xfs_btree_cur),
- 0, 0, NULL);
- if (!xfs_btree_cur_zone)
- goto out_destroy_bmap_free_item_zone;
+ error = xfs_defer_init_item_caches();
+ if (error)
+ goto out_destroy_btree_cur_cache;
- xfs_da_state_zone = kmem_cache_create("xfs_da_state",
+ xfs_da_state_cache = kmem_cache_create("xfs_da_state",
sizeof(struct xfs_da_state),
0, 0, NULL);
- if (!xfs_da_state_zone)
- goto out_destroy_btree_cur_zone;
+ if (!xfs_da_state_cache)
+ goto out_destroy_defer_item_cache;
- xfs_ifork_zone = kmem_cache_create("xfs_ifork",
+ xfs_ifork_cache = kmem_cache_create("xfs_ifork",
sizeof(struct xfs_ifork),
0, 0, NULL);
- if (!xfs_ifork_zone)
- goto out_destroy_da_state_zone;
+ if (!xfs_ifork_cache)
+ goto out_destroy_da_state_cache;
- xfs_trans_zone = kmem_cache_create("xfs_trans",
+ xfs_trans_cache = kmem_cache_create("xfs_trans",
sizeof(struct xfs_trans),
0, 0, NULL);
- if (!xfs_trans_zone)
- goto out_destroy_ifork_zone;
+ if (!xfs_trans_cache)
+ goto out_destroy_ifork_cache;
/*
- * The size of the zone allocated buf log item is the maximum
+ * The size of the cache-allocated buf log item is the maximum
* size possible under XFS. This wastes a little bit of memory,
* but it is much faster.
*/
- xfs_buf_item_zone = kmem_cache_create("xfs_buf_item",
+ xfs_buf_item_cache = kmem_cache_create("xfs_buf_item",
sizeof(struct xfs_buf_log_item),
0, 0, NULL);
- if (!xfs_buf_item_zone)
- goto out_destroy_trans_zone;
+ if (!xfs_buf_item_cache)
+ goto out_destroy_trans_cache;
- xfs_efd_zone = kmem_cache_create("xfs_efd_item",
+ xfs_efd_cache = kmem_cache_create("xfs_efd_item",
(sizeof(struct xfs_efd_log_item) +
(XFS_EFD_MAX_FAST_EXTENTS - 1) *
sizeof(struct xfs_extent)),
0, 0, NULL);
- if (!xfs_efd_zone)
- goto out_destroy_buf_item_zone;
+ if (!xfs_efd_cache)
+ goto out_destroy_buf_item_cache;
- xfs_efi_zone = kmem_cache_create("xfs_efi_item",
+ xfs_efi_cache = kmem_cache_create("xfs_efi_item",
(sizeof(struct xfs_efi_log_item) +
(XFS_EFI_MAX_FAST_EXTENTS - 1) *
sizeof(struct xfs_extent)),
0, 0, NULL);
- if (!xfs_efi_zone)
- goto out_destroy_efd_zone;
+ if (!xfs_efi_cache)
+ goto out_destroy_efd_cache;
- xfs_inode_zone = kmem_cache_create("xfs_inode",
+ xfs_inode_cache = kmem_cache_create("xfs_inode",
sizeof(struct xfs_inode), 0,
(SLAB_HWCACHE_ALIGN |
SLAB_RECLAIM_ACCOUNT |
SLAB_MEM_SPREAD | SLAB_ACCOUNT),
xfs_fs_inode_init_once);
- if (!xfs_inode_zone)
- goto out_destroy_efi_zone;
+ if (!xfs_inode_cache)
+ goto out_destroy_efi_cache;
- xfs_ili_zone = kmem_cache_create("xfs_ili",
+ xfs_ili_cache = kmem_cache_create("xfs_ili",
sizeof(struct xfs_inode_log_item), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
NULL);
- if (!xfs_ili_zone)
- goto out_destroy_inode_zone;
+ if (!xfs_ili_cache)
+ goto out_destroy_inode_cache;
- xfs_icreate_zone = kmem_cache_create("xfs_icr",
+ xfs_icreate_cache = kmem_cache_create("xfs_icr",
sizeof(struct xfs_icreate_item),
0, 0, NULL);
- if (!xfs_icreate_zone)
- goto out_destroy_ili_zone;
+ if (!xfs_icreate_cache)
+ goto out_destroy_ili_cache;
- xfs_rud_zone = kmem_cache_create("xfs_rud_item",
+ xfs_rud_cache = kmem_cache_create("xfs_rud_item",
sizeof(struct xfs_rud_log_item),
0, 0, NULL);
- if (!xfs_rud_zone)
- goto out_destroy_icreate_zone;
+ if (!xfs_rud_cache)
+ goto out_destroy_icreate_cache;
- xfs_rui_zone = kmem_cache_create("xfs_rui_item",
+ xfs_rui_cache = kmem_cache_create("xfs_rui_item",
xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS),
0, 0, NULL);
- if (!xfs_rui_zone)
- goto out_destroy_rud_zone;
+ if (!xfs_rui_cache)
+ goto out_destroy_rud_cache;
- xfs_cud_zone = kmem_cache_create("xfs_cud_item",
+ xfs_cud_cache = kmem_cache_create("xfs_cud_item",
sizeof(struct xfs_cud_log_item),
0, 0, NULL);
- if (!xfs_cud_zone)
- goto out_destroy_rui_zone;
+ if (!xfs_cud_cache)
+ goto out_destroy_rui_cache;
- xfs_cui_zone = kmem_cache_create("xfs_cui_item",
+ xfs_cui_cache = kmem_cache_create("xfs_cui_item",
xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS),
0, 0, NULL);
- if (!xfs_cui_zone)
- goto out_destroy_cud_zone;
+ if (!xfs_cui_cache)
+ goto out_destroy_cud_cache;
- xfs_bud_zone = kmem_cache_create("xfs_bud_item",
+ xfs_bud_cache = kmem_cache_create("xfs_bud_item",
sizeof(struct xfs_bud_log_item),
0, 0, NULL);
- if (!xfs_bud_zone)
- goto out_destroy_cui_zone;
+ if (!xfs_bud_cache)
+ goto out_destroy_cui_cache;
- xfs_bui_zone = kmem_cache_create("xfs_bui_item",
+ xfs_bui_cache = kmem_cache_create("xfs_bui_item",
xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS),
0, 0, NULL);
- if (!xfs_bui_zone)
- goto out_destroy_bud_zone;
+ if (!xfs_bui_cache)
+ goto out_destroy_bud_cache;
return 0;
- out_destroy_bud_zone:
- kmem_cache_destroy(xfs_bud_zone);
- out_destroy_cui_zone:
- kmem_cache_destroy(xfs_cui_zone);
- out_destroy_cud_zone:
- kmem_cache_destroy(xfs_cud_zone);
- out_destroy_rui_zone:
- kmem_cache_destroy(xfs_rui_zone);
- out_destroy_rud_zone:
- kmem_cache_destroy(xfs_rud_zone);
- out_destroy_icreate_zone:
- kmem_cache_destroy(xfs_icreate_zone);
- out_destroy_ili_zone:
- kmem_cache_destroy(xfs_ili_zone);
- out_destroy_inode_zone:
- kmem_cache_destroy(xfs_inode_zone);
- out_destroy_efi_zone:
- kmem_cache_destroy(xfs_efi_zone);
- out_destroy_efd_zone:
- kmem_cache_destroy(xfs_efd_zone);
- out_destroy_buf_item_zone:
- kmem_cache_destroy(xfs_buf_item_zone);
- out_destroy_trans_zone:
- kmem_cache_destroy(xfs_trans_zone);
- out_destroy_ifork_zone:
- kmem_cache_destroy(xfs_ifork_zone);
- out_destroy_da_state_zone:
- kmem_cache_destroy(xfs_da_state_zone);
- out_destroy_btree_cur_zone:
- kmem_cache_destroy(xfs_btree_cur_zone);
- out_destroy_bmap_free_item_zone:
- kmem_cache_destroy(xfs_bmap_free_item_zone);
- out_destroy_log_ticket_zone:
- kmem_cache_destroy(xfs_log_ticket_zone);
+ out_destroy_bud_cache:
+ kmem_cache_destroy(xfs_bud_cache);
+ out_destroy_cui_cache:
+ kmem_cache_destroy(xfs_cui_cache);
+ out_destroy_cud_cache:
+ kmem_cache_destroy(xfs_cud_cache);
+ out_destroy_rui_cache:
+ kmem_cache_destroy(xfs_rui_cache);
+ out_destroy_rud_cache:
+ kmem_cache_destroy(xfs_rud_cache);
+ out_destroy_icreate_cache:
+ kmem_cache_destroy(xfs_icreate_cache);
+ out_destroy_ili_cache:
+ kmem_cache_destroy(xfs_ili_cache);
+ out_destroy_inode_cache:
+ kmem_cache_destroy(xfs_inode_cache);
+ out_destroy_efi_cache:
+ kmem_cache_destroy(xfs_efi_cache);
+ out_destroy_efd_cache:
+ kmem_cache_destroy(xfs_efd_cache);
+ out_destroy_buf_item_cache:
+ kmem_cache_destroy(xfs_buf_item_cache);
+ out_destroy_trans_cache:
+ kmem_cache_destroy(xfs_trans_cache);
+ out_destroy_ifork_cache:
+ kmem_cache_destroy(xfs_ifork_cache);
+ out_destroy_da_state_cache:
+ kmem_cache_destroy(xfs_da_state_cache);
+ out_destroy_defer_item_cache:
+ xfs_defer_destroy_item_caches();
+ out_destroy_btree_cur_cache:
+ xfs_btree_destroy_cur_caches();
+ out_destroy_log_ticket_cache:
+ kmem_cache_destroy(xfs_log_ticket_cache);
out:
return -ENOMEM;
}
STATIC void
-xfs_destroy_zones(void)
+xfs_destroy_caches(void)
{
/*
* Make sure all delayed rcu free are flushed before we
* destroy caches.
*/
rcu_barrier();
- kmem_cache_destroy(xfs_bui_zone);
- kmem_cache_destroy(xfs_bud_zone);
- kmem_cache_destroy(xfs_cui_zone);
- kmem_cache_destroy(xfs_cud_zone);
- kmem_cache_destroy(xfs_rui_zone);
- kmem_cache_destroy(xfs_rud_zone);
- kmem_cache_destroy(xfs_icreate_zone);
- kmem_cache_destroy(xfs_ili_zone);
- kmem_cache_destroy(xfs_inode_zone);
- kmem_cache_destroy(xfs_efi_zone);
- kmem_cache_destroy(xfs_efd_zone);
- kmem_cache_destroy(xfs_buf_item_zone);
- kmem_cache_destroy(xfs_trans_zone);
- kmem_cache_destroy(xfs_ifork_zone);
- kmem_cache_destroy(xfs_da_state_zone);
- kmem_cache_destroy(xfs_btree_cur_zone);
- kmem_cache_destroy(xfs_bmap_free_item_zone);
- kmem_cache_destroy(xfs_log_ticket_zone);
+ kmem_cache_destroy(xfs_bui_cache);
+ kmem_cache_destroy(xfs_bud_cache);
+ kmem_cache_destroy(xfs_cui_cache);
+ kmem_cache_destroy(xfs_cud_cache);
+ kmem_cache_destroy(xfs_rui_cache);
+ kmem_cache_destroy(xfs_rud_cache);
+ kmem_cache_destroy(xfs_icreate_cache);
+ kmem_cache_destroy(xfs_ili_cache);
+ kmem_cache_destroy(xfs_inode_cache);
+ kmem_cache_destroy(xfs_efi_cache);
+ kmem_cache_destroy(xfs_efd_cache);
+ kmem_cache_destroy(xfs_buf_item_cache);
+ kmem_cache_destroy(xfs_trans_cache);
+ kmem_cache_destroy(xfs_ifork_cache);
+ kmem_cache_destroy(xfs_da_state_cache);
+ xfs_defer_destroy_item_caches();
+ xfs_btree_destroy_cur_caches();
+ kmem_cache_destroy(xfs_log_ticket_cache);
}
STATIC int __init
@@ -2233,13 +2232,13 @@ init_xfs_fs(void)
if (error)
goto out;
- error = xfs_init_zones();
+ error = xfs_init_caches();
if (error)
goto out_destroy_hp;
error = xfs_init_workqueues();
if (error)
- goto out_destroy_zones;
+ goto out_destroy_caches;
error = xfs_mru_cache_init();
if (error)
@@ -2314,8 +2313,8 @@ init_xfs_fs(void)
xfs_mru_cache_uninit();
out_destroy_wq:
xfs_destroy_workqueues();
- out_destroy_zones:
- xfs_destroy_zones();
+ out_destroy_caches:
+ xfs_destroy_caches();
out_destroy_hp:
xfs_cpu_hotplug_destroy();
out:
@@ -2338,7 +2337,7 @@ exit_xfs_fs(void)
xfs_buf_terminate();
xfs_mru_cache_uninit();
xfs_destroy_workqueues();
- xfs_destroy_zones();
+ xfs_destroy_caches();
xfs_uuid_table_free();
xfs_cpu_hotplug_destroy();
}
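
The xfs_super.c hunk above renames the init and teardown helpers but keeps the usual slab-cache lifecycle: kmem_cache_create() at module init, kmem_cache_zalloc() and kmem_cache_free() per object, and kmem_cache_destroy() after an rcu_barrier() at exit, with a goto chain unwinding partial setup on failure. A stripped-down kernel-style sketch of that pattern; the item type and cache name are hypothetical and not part of the patch:

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/errno.h>

/* Hypothetical example item, used only for this illustration. */
struct example_item {
	int	id;
};

static struct kmem_cache *example_cache;

static int __init example_init_caches(void)
{
	example_cache = kmem_cache_create("example_item",
			sizeof(struct example_item), 0, 0, NULL);
	if (!example_cache)
		return -ENOMEM;
	return 0;
}

static struct example_item *example_alloc(void)
{
	/* Zeroed, cannot-fail allocation, as in the XFS call sites above. */
	return kmem_cache_zalloc(example_cache, GFP_KERNEL | __GFP_NOFAIL);
}

static void example_free(struct example_item *item)
{
	kmem_cache_free(example_cache, item);
}

static void example_destroy_caches(void)
{
	/* Flush any deferred RCU frees before tearing the cache down. */
	rcu_barrier();
	kmem_cache_destroy(example_cache);
}
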
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 18dc5eca6c04..8608f804388f 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -105,7 +105,7 @@ bug_on_assert_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bug_on_assert ? 1 : 0);
+ return sysfs_emit(buf, "%d\n", xfs_globals.bug_on_assert);
}
XFS_SYSFS_ATTR_RW(bug_on_assert);
@@ -135,7 +135,7 @@ log_recovery_delay_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay);
+ return sysfs_emit(buf, "%d\n", xfs_globals.log_recovery_delay);
}
XFS_SYSFS_ATTR_RW(log_recovery_delay);
@@ -165,7 +165,7 @@ mount_delay_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.mount_delay);
+ return sysfs_emit(buf, "%d\n", xfs_globals.mount_delay);
}
XFS_SYSFS_ATTR_RW(mount_delay);
@@ -188,7 +188,7 @@ always_cow_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
+ return sysfs_emit(buf, "%d\n", xfs_globals.always_cow);
}
XFS_SYSFS_ATTR_RW(always_cow);
@@ -224,7 +224,7 @@ pwork_threads_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads);
+ return sysfs_emit(buf, "%d\n", xfs_globals.pwork_threads);
}
XFS_SYSFS_ATTR_RW(pwork_threads);
#endif /* DEBUG */
@@ -327,7 +327,7 @@ log_head_lsn_show(
block = log->l_curr_block;
spin_unlock(&log->l_icloglock);
- return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
+ return sysfs_emit(buf, "%d:%d\n", cycle, block);
}
XFS_SYSFS_ATTR_RO(log_head_lsn);
@@ -341,7 +341,7 @@ log_tail_lsn_show(
struct xlog *log = to_xlog(kobject);
xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block);
- return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
+ return sysfs_emit(buf, "%d:%d\n", cycle, block);
}
XFS_SYSFS_ATTR_RO(log_tail_lsn);
@@ -356,7 +356,7 @@ reserve_grant_head_show(
struct xlog *log = to_xlog(kobject);
xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes);
- return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
+ return sysfs_emit(buf, "%d:%d\n", cycle, bytes);
}
XFS_SYSFS_ATTR_RO(reserve_grant_head);
@@ -370,7 +370,7 @@ write_grant_head_show(
struct xlog *log = to_xlog(kobject);
xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes);
- return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
+ return sysfs_emit(buf, "%d:%d\n", cycle, bytes);
}
XFS_SYSFS_ATTR_RO(write_grant_head);
@@ -425,7 +425,7 @@ max_retries_show(
else
retries = cfg->max_retries;
- return snprintf(buf, PAGE_SIZE, "%d\n", retries);
+ return sysfs_emit(buf, "%d\n", retries);
}
static ssize_t
@@ -466,7 +466,7 @@ retry_timeout_seconds_show(
else
timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC;
- return snprintf(buf, PAGE_SIZE, "%d\n", timeout);
+ return sysfs_emit(buf, "%d\n", timeout);
}
static ssize_t
@@ -504,7 +504,7 @@ fail_at_unmount_show(
{
struct xfs_mount *mp = err_to_mp(kobject);
- return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_unmount);
+ return sysfs_emit(buf, "%d\n", mp->m_fail_unmount);
}
static ssize_t
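
All of the xfs_sysfs.c hunks above replace snprintf(buf, PAGE_SIZE, ...) in sysfs show callbacks with sysfs_emit(buf, ...), which does the same formatting but bounds the output to one page and drops the explicit size argument. A kernel-style sketch of a show callback written that way; the attribute and tunable are hypothetical, and XFS uses its own attribute type rather than kobj_attribute:

#include <linux/kobject.h>
#include <linux/sysfs.h>

/* Hypothetical tunable used only for this illustration. */
static int example_tunable;

static ssize_t example_tunable_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	/*
	 * sysfs_emit() writes at most PAGE_SIZE bytes into the
	 * page-sized buffer sysfs hands to show callbacks.
	 */
	return sysfs_emit(buf, "%d\n", example_tunable);
}

static struct kobj_attribute example_tunable_attr =
	__ATTR(example_tunable, 0444, example_tunable_show, NULL);
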
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 1033a95fbf8e..4a8076ef8cb4 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2476,7 +2476,7 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class,
__entry->btnum = cur->bc_btnum;
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
__entry->daddr = bp ? xfs_buf_daddr(bp) : -1;
),
TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx",
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 67dec11e34c7..234a9d9c2f43 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -25,7 +25,7 @@
#include "xfs_dquot.h"
#include "xfs_icache.h"
-kmem_zone_t *xfs_trans_zone;
+struct kmem_cache *xfs_trans_cache;
#if defined(CONFIG_TRACEPOINTS)
static void
@@ -76,7 +76,7 @@ xfs_trans_free(
if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
sb_end_intwrite(tp->t_mountp->m_super);
xfs_trans_free_dqinfo(tp);
- kmem_cache_free(xfs_trans_zone, tp);
+ kmem_cache_free(xfs_trans_cache, tp);
}
/*
@@ -95,7 +95,7 @@ xfs_trans_dup(
trace_xfs_trans_dup(tp, _RET_IP_);
- ntp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);
+ ntp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL);
/*
* Initialize the new transaction structure.
@@ -263,7 +263,7 @@ xfs_trans_alloc(
* by doing GFP_KERNEL allocations inside sb_start_intwrite().
*/
retry:
- tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);
+ tp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL);
if (!(flags & XFS_TRANS_NO_WRITECOUNT))
sb_start_intwrite(mp->m_super);
xfs_trans_set_context(tp);
@@ -477,7 +477,7 @@ STATIC void
xfs_trans_apply_sb_deltas(
xfs_trans_t *tp)
{
- xfs_dsb_t *sbp;
+ struct xfs_dsb *sbp;
struct xfs_buf *bp;
int whole = 0;
@@ -541,14 +541,14 @@ xfs_trans_apply_sb_deltas(
/*
* Log the whole thing, the fields are noncontiguous.
*/
- xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_dsb_t) - 1);
+ xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
else
/*
* Since all the modifiable fields are contiguous, we
* can get away with this.
*/
- xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount),
- offsetof(xfs_dsb_t, sb_frextents) +
+ xfs_trans_log_buf(tp, bp, offsetof(struct xfs_dsb, sb_icount),
+ offsetof(struct xfs_dsb, sb_frextents) +
sizeof(sbp->sb_frextents) - 1);
}
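
The xfs_trans.c hunk above logs only the contiguous run of superblock counters by computing an inclusive byte range with offsetof(): from the first byte of sb_icount through the last byte of sb_frextents. A small userspace sketch of the same arithmetic on a made-up structure standing in for struct xfs_dsb:

#include <stddef.h>
#include <stdio.h>

/* Made-up layout used only to illustrate the offset arithmetic. */
struct dsb_like {
	unsigned long long sb_magic;
	unsigned long long sb_icount;	/* first logged field */
	unsigned long long sb_ifree;
	unsigned long long sb_fdblocks;
	unsigned long long sb_frextents;/* last logged field */
	unsigned long long sb_unrelated;
};

int main(void)
{
	size_t first = offsetof(struct dsb_like, sb_icount);
	size_t last = offsetof(struct dsb_like, sb_frextents)
			+ sizeof(((struct dsb_like *)0)->sb_frextents) - 1;

	/* First and last byte offsets the partial log call would cover. */
	printf("log bytes %zu..%zu of the buffer\n", first, last);
	return 0;
}
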
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 50da47f23a07..a487b264a9eb 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -113,12 +113,6 @@ void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
#define XFS_ITEM_FLUSHING 3
/*
- * Deferred operation item relogging limits.
- */
-#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */
-#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */
-
-/*
* This is the structure maintained for every active transaction.
*/
typedef struct xfs_trans {
@@ -243,7 +237,7 @@ void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
struct xfs_buf *src_bp);
-extern kmem_zone_t *xfs_trans_zone;
+extern struct kmem_cache *xfs_trans_cache;
static inline struct xfs_log_item *
xfs_trans_item_relog(
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 3872ce671411..9ba7e6b9bed3 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -846,7 +846,7 @@ STATIC void
xfs_trans_alloc_dqinfo(
xfs_trans_t *tp)
{
- tp->t_dqinfo = kmem_cache_zalloc(xfs_qm_dqtrxzone,
+ tp->t_dqinfo = kmem_cache_zalloc(xfs_dqtrx_cache,
GFP_KERNEL | __GFP_NOFAIL);
}
@@ -856,6 +856,6 @@ xfs_trans_free_dqinfo(
{
if (!tp->t_dqinfo)
return;
- kmem_cache_free(xfs_qm_dqtrxzone, tp->t_dqinfo);
+ kmem_cache_free(xfs_dqtrx_cache, tp->t_dqinfo);
tp->t_dqinfo = NULL;
}
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index ddc346a9df9b..259ee2bda492 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -852,7 +852,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
ret = zonefs_file_dio_append(iocb, from);
else
ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
- &zonefs_write_dio_ops, 0);
+ &zonefs_write_dio_ops, 0, 0);
if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
(ret > 0 || ret == -EIOCBQUEUED)) {
if (ret > 0)
@@ -987,7 +987,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
file_accessed(iocb->ki_filp);
ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
- &zonefs_read_dio_ops, 0);
+ &zonefs_read_dio_ops, 0, 0);
} else {
ret = generic_file_read_iter(iocb, to);
if (ret == -EIO)
@@ -1128,7 +1128,7 @@ static const struct file_operations zonefs_file_operations = {
.write_iter = zonefs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
};
static struct kmem_cache *zonefs_inode_cachep;