aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_file.c8
-rw-r--r--fs/9p/vfs_super.c6
-rw-r--r--fs/Makefile4
-rw-r--r--fs/adfs/dir_f.c12
-rw-r--r--fs/affs/amigaffs.c27
-rw-r--r--fs/affs/file.c26
-rw-r--r--fs/affs/inode.c2
-rw-r--r--fs/affs/super.c6
-rw-r--r--fs/afs/cell.c328
-rw-r--r--fs/afs/cmservice.c16
-rw-r--r--fs/afs/dynroot.c43
-rw-r--r--fs/afs/file.c2
-rw-r--r--fs/afs/flock.c1
-rw-r--r--fs/afs/fs_operation.c1
-rw-r--r--fs/afs/fs_probe.c4
-rw-r--r--fs/afs/fsclient.c42
-rw-r--r--fs/afs/inode.c47
-rw-r--r--fs/afs/internal.h35
-rw-r--r--fs/afs/main.c2
-rw-r--r--fs/afs/misc.c18
-rw-r--r--fs/afs/mntpt.c4
-rw-r--r--fs/afs/proc.c28
-rw-r--r--fs/afs/rotate.c2
-rw-r--r--fs/afs/rxrpc.c6
-rw-r--r--fs/afs/server.c7
-rw-r--r--fs/afs/super.c19
-rw-r--r--fs/afs/vl_alias.c8
-rw-r--r--fs/afs/vl_list.c1
-rw-r--r--fs/afs/vl_probe.c82
-rw-r--r--fs/afs/vl_rotate.c9
-rw-r--r--fs/afs/vlclient.c24
-rw-r--r--fs/afs/volume.c6
-rw-r--r--fs/afs/write.c13
-rw-r--r--fs/afs/yfsclient.c50
-rw-r--r--fs/aio.c10
-rw-r--r--fs/autofs/dev-ioctl.c8
-rw-r--r--fs/autofs/waitq.c2
-rw-r--r--fs/binfmt_elf.c266
-rw-r--r--fs/binfmt_elf_fdpic.c162
-rw-r--r--fs/binfmt_flat.c20
-rw-r--r--fs/block_dev.c184
-rw-r--r--fs/btrfs/Kconfig1
-rw-r--r--fs/btrfs/backref.c1
-rw-r--r--fs/btrfs/block-group.c70
-rw-r--r--fs/btrfs/btrfs_inode.h30
-rw-r--r--fs/btrfs/compression.c35
-rw-r--r--fs/btrfs/compression.h35
-rw-r--r--fs/btrfs/ctree.c212
-rw-r--r--fs/btrfs/ctree.h109
-rw-r--r--fs/btrfs/delalloc-space.c123
-rw-r--r--fs/btrfs/delayed-inode.c6
-rw-r--r--fs/btrfs/dev-replace.c118
-rw-r--r--fs/btrfs/disk-io.c173
-rw-r--r--fs/btrfs/disk-io.h9
-rw-r--r--fs/btrfs/extent-io-tree.h3
-rw-r--r--fs/btrfs/extent-tree.c244
-rw-r--r--fs/btrfs/extent_io.c224
-rw-r--r--fs/btrfs/extent_io.h29
-rw-r--r--fs/btrfs/file-item.c4
-rw-r--r--fs/btrfs/file.c318
-rw-r--r--fs/btrfs/free-space-cache.c25
-rw-r--r--fs/btrfs/free-space-tree.c4
-rw-r--r--fs/btrfs/inode.c817
-rw-r--r--fs/btrfs/ioctl.c96
-rw-r--r--fs/btrfs/locking.c45
-rw-r--r--fs/btrfs/locking.h78
-rw-r--r--fs/btrfs/ordered-data.c113
-rw-r--r--fs/btrfs/ordered-data.h24
-rw-r--r--fs/btrfs/print-tree.c50
-rw-r--r--fs/btrfs/print-tree.h4
-rw-r--r--fs/btrfs/qgroup.c2
-rw-r--r--fs/btrfs/reada.c30
-rw-r--r--fs/btrfs/reflink.c46
-rw-r--r--fs/btrfs/relocation.c11
-rw-r--r--fs/btrfs/root-tree.c13
-rw-r--r--fs/btrfs/scrub.c130
-rw-r--r--fs/btrfs/send.c365
-rw-r--r--fs/btrfs/send.h1
-rw-r--r--fs/btrfs/space-info.c323
-rw-r--r--fs/btrfs/space-info.h2
-rw-r--r--fs/btrfs/struct-funcs.c10
-rw-r--r--fs/btrfs/super.c7
-rw-r--r--fs/btrfs/sysfs.c249
-rw-r--r--fs/btrfs/sysfs.h11
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c3
-rw-r--r--fs/btrfs/tests/inode-tests.c7
-rw-r--r--fs/btrfs/transaction.c16
-rw-r--r--fs/btrfs/transaction.h8
-rw-r--r--fs/btrfs/tree-checker.c19
-rw-r--r--fs/btrfs/tree-log.c294
-rw-r--r--fs/btrfs/tree-log.h32
-rw-r--r--fs/btrfs/volumes.c429
-rw-r--r--fs/btrfs/volumes.h11
-rw-r--r--fs/buffer.c33
-rw-r--r--fs/ceph/addr.c416
-rw-r--r--fs/ceph/caps.c142
-rw-r--r--fs/ceph/debugfs.c20
-rw-r--r--fs/ceph/dir.c53
-rw-r--r--fs/ceph/file.c92
-rw-r--r--fs/ceph/inode.c29
-rw-r--r--fs/ceph/locks.c2
-rw-r--r--fs/ceph/mds_client.c109
-rw-r--r--fs/ceph/mds_client.h4
-rw-r--r--fs/ceph/metric.c14
-rw-r--r--fs/ceph/metric.h7
-rw-r--r--fs/ceph/quota.c14
-rw-r--r--fs/ceph/snap.c2
-rw-r--r--fs/ceph/super.c8
-rw-r--r--fs/ceph/super.h86
-rw-r--r--fs/ceph/xattr.c3
-rw-r--r--fs/cifs/cifsglob.h15
-rw-r--r--fs/cifs/cifssmb.c2
-rw-r--r--fs/cifs/connect.c12
-rw-r--r--fs/cifs/inode.c4
-rw-r--r--fs/cifs/sess.c6
-rw-r--r--fs/cifs/smb2ops.c2
-rw-r--r--fs/cifs/smb2pdu.c2
-rw-r--r--fs/compat.c132
-rw-r--r--fs/configfs/dir.c6
-rw-r--r--fs/configfs/file.c2
-rw-r--r--fs/coredump.c236
-rw-r--r--fs/crypto/crypto.c4
-rw-r--r--fs/crypto/fname.c60
-rw-r--r--fs/crypto/fscrypt_private.h10
-rw-r--r--fs/crypto/hooks.c80
-rw-r--r--fs/crypto/inline_crypt.c7
-rw-r--r--fs/crypto/keyring.c9
-rw-r--r--fs/crypto/keysetup.c182
-rw-r--r--fs/crypto/keysetup_v1.c8
-rw-r--r--fs/crypto/policy.c209
-rw-r--r--fs/d_path.c6
-rw-r--r--fs/dax.c44
-rw-r--r--fs/debugfs/file.c4
-rw-r--r--fs/direct-io.c88
-rw-r--r--fs/dlm/Kconfig1
-rw-r--r--fs/dlm/config.c66
-rw-r--r--fs/dlm/config.h4
-rw-r--r--fs/dlm/lock.c2
-rw-r--r--fs/dlm/lowcomms.c329
-rw-r--r--fs/dlm/midcomms.c136
-rw-r--r--fs/dlm/midcomms.h3
-rw-r--r--fs/dlm/netlink.c6
-rw-r--r--fs/efivarfs/super.c3
-rw-r--r--fs/erofs/data.c2
-rw-r--r--fs/erofs/super.c2
-rw-r--r--fs/erofs/xattr.c2
-rw-r--r--fs/erofs/zdata.c48
-rw-r--r--fs/erofs/zmap.c6
-rw-r--r--fs/eventpoll.c102
-rw-r--r--fs/exec.c155
-rw-r--r--fs/exfat/cache.c11
-rw-r--r--fs/exfat/dir.c29
-rw-r--r--fs/exfat/exfat_fs.h7
-rw-r--r--fs/exfat/file.c4
-rw-r--r--fs/exfat/inode.c7
-rw-r--r--fs/exfat/namei.c166
-rw-r--r--fs/exfat/nls.c2
-rw-r--r--fs/exfat/super.c6
-rw-r--r--fs/ext2/balloc.c6
-rw-r--r--fs/ext2/file.c6
-rw-r--r--fs/ext2/inode.c5
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext4/Kconfig2
-rw-r--r--fs/ext4/balloc.c16
-rw-r--r--fs/ext4/block_validity.c159
-rw-r--r--fs/ext4/dir.c2
-rw-r--r--fs/ext4/ext4.h97
-rw-r--r--fs/ext4/ext4_jbd2.c25
-rw-r--r--fs/ext4/extents.c42
-rw-r--r--fs/ext4/file.c11
-rw-r--r--fs/ext4/hash.c4
-rw-r--r--fs/ext4/ialloc.c119
-rw-r--r--fs/ext4/indirect.c20
-rw-r--r--fs/ext4/inline.c4
-rw-r--r--fs/ext4/inode.c32
-rw-r--r--fs/ext4/ioctl.c34
-rw-r--r--fs/ext4/mballoc.c289
-rw-r--r--fs/ext4/mballoc.h4
-rw-r--r--fs/ext4/move_extent.c4
-rw-r--r--fs/ext4/namei.c73
-rw-r--r--fs/ext4/readpage.c4
-rw-r--r--fs/ext4/super.c284
-rw-r--r--fs/ext4/sysfs.c13
-rw-r--r--fs/ext4/verity.c4
-rw-r--r--fs/ext4/xattr.c3
-rw-r--r--fs/f2fs/acl.c6
-rw-r--r--fs/f2fs/checkpoint.c17
-rw-r--r--fs/f2fs/compress.c242
-rw-r--r--fs/f2fs/data.c122
-rw-r--r--fs/f2fs/debug.c18
-rw-r--r--fs/f2fs/dir.c115
-rw-r--r--fs/f2fs/extent_cache.c37
-rw-r--r--fs/f2fs/f2fs.h145
-rw-r--r--fs/f2fs/file.c88
-rw-r--r--fs/f2fs/gc.c413
-rw-r--r--fs/f2fs/gc.h69
-rw-r--r--fs/f2fs/inline.c4
-rw-r--r--fs/f2fs/inode.c21
-rw-r--r--fs/f2fs/namei.c9
-rw-r--r--fs/f2fs/node.c14
-rw-r--r--fs/f2fs/segment.c528
-rw-r--r--fs/f2fs/segment.h71
-rw-r--r--fs/f2fs/super.c183
-rw-r--r--fs/f2fs/sysfs.c22
-rw-r--r--fs/f2fs/verity.c4
-rw-r--r--fs/f2fs/xattr.c8
-rw-r--r--fs/fcntl.c4
-rw-r--r--fs/file.c2
-rw-r--r--fs/fs-writeback.c112
-rw-r--r--fs/fs_context.c2
-rw-r--r--fs/fs_parser.c2
-rw-r--r--fs/fsopen.c2
-rw-r--r--fs/fuse/Kconfig16
-rw-r--r--fs/fuse/Makefile6
-rw-r--r--fs/fuse/control.c20
-rw-r--r--fs/fuse/cuse.c21
-rw-r--r--fs/fuse/dax.c1365
-rw-r--r--fs/fuse/dev.c189
-rw-r--r--fs/fuse/dir.c220
-rw-r--r--fs/fuse/file.c276
-rw-r--r--fs/fuse/fuse_i.h185
-rw-r--r--fs/fuse/inode.c395
-rw-r--r--fs/fuse/readdir.c10
-rw-r--r--fs/fuse/virtio_fs.c378
-rw-r--r--fs/fuse/xattr.c34
-rw-r--r--fs/gfs2/bmap.c4
-rw-r--r--fs/gfs2/log.c31
-rw-r--r--fs/gfs2/quota.c2
-rw-r--r--fs/gfs2/trans.c1
-rw-r--r--fs/hfsplus/wrapper.c2
-rw-r--r--fs/inode.c2
-rw-r--r--fs/internal.h3
-rw-r--r--fs/io-wq.c264
-rw-r--r--fs/io-wq.h14
-rw-r--r--fs/io_uring.c3061
-rw-r--r--fs/iomap/buffered-io.c194
-rw-r--r--fs/iomap/direct-io.c49
-rw-r--r--fs/iomap/seek.c4
-rw-r--r--fs/jbd2/journal.c16
-rw-r--r--fs/jbd2/recovery.c46
-rw-r--r--fs/jbd2/transaction.c33
-rw-r--r--fs/jffs2/fs.c2
-rw-r--r--fs/jffs2/readinode.c2
-rw-r--r--fs/jfs/jfs_metapage.c2
-rw-r--r--fs/kernel_read_file.c189
-rw-r--r--fs/libfs.c91
-rw-r--r--fs/lockd/mon.c2
-rw-r--r--fs/lockd/svc4proc.c248
-rw-r--r--fs/lockd/svcproc.c250
-rw-r--r--fs/locks.c6
-rw-r--r--fs/namei.c4
-rw-r--r--fs/namespace.c29
-rw-r--r--fs/nfs/blocklayout/blocklayout.c2
-rw-r--r--fs/nfs/dir.c5
-rw-r--r--fs/nfs/filelayout/filelayout.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c47
-rw-r--r--fs/nfs/fs_context.c218
-rw-r--r--fs/nfs/namespace.c12
-rw-r--r--fs/nfs/nfs3acl.c4
-rw-r--r--fs/nfs/nfs42proc.c10
-rw-r--r--fs/nfs/nfs42xattr.c5
-rw-r--r--fs/nfs/nfs42xdr.c167
-rw-r--r--fs/nfs/nfs4_fs.h8
-rw-r--r--fs/nfs/nfs4client.c2
-rw-r--r--fs/nfs/nfs4file.c43
-rw-r--r--fs/nfs/nfs4idmap.c19
-rw-r--r--fs/nfs/nfs4proc.c315
-rw-r--r--fs/nfs/nfs4state.c14
-rw-r--r--fs/nfs/nfs4super.c5
-rw-r--r--fs/nfs/nfs4trace.h1
-rw-r--r--fs/nfs/nfs4xdr.c7
-rw-r--r--fs/nfs/pagelist.c2
-rw-r--r--fs/nfs/pnfs.c4
-rw-r--r--fs/nfs/super.c28
-rw-r--r--fs/nfs/sysfs.c11
-rw-r--r--fs/nfs/sysfs.h2
-rw-r--r--fs/nfs_common/Makefile1
-rw-r--r--fs/nfs_common/nfs_ssc.c94
-rw-r--r--fs/nfs_common/nfsacl.c2
-rw-r--r--fs/nfsd/Kconfig12
-rw-r--r--fs/nfsd/Makefile1
-rw-r--r--fs/nfsd/blocklayout.c8
-rw-r--r--fs/nfsd/export.c2
-rw-r--r--fs/nfsd/filecache.c2
-rw-r--r--fs/nfsd/nfs2acl.c160
-rw-r--r--fs/nfsd/nfs3acl.c88
-rw-r--r--fs/nfsd/nfs3proc.c238
-rw-r--r--fs/nfsd/nfs3xdr.c25
-rw-r--r--fs/nfsd/nfs4callback.c2
-rw-r--r--fs/nfsd/nfs4layouts.c2
-rw-r--r--fs/nfsd/nfs4proc.c36
-rw-r--r--fs/nfsd/nfs4state.c617
-rw-r--r--fs/nfsd/nfs4xdr.c202
-rw-r--r--fs/nfsd/nfscache.c12
-rw-r--r--fs/nfsd/nfsctl.c3
-rw-r--r--fs/nfsd/nfsfh.c4
-rw-r--r--fs/nfsd/nfsproc.c285
-rw-r--r--fs/nfsd/nfssvc.c124
-rw-r--r--fs/nfsd/nfsxdr.c52
-rw-r--r--fs/nfsd/state.h27
-rw-r--r--fs/nfsd/trace.h4
-rw-r--r--fs/nfsd/vfs.c10
-rw-r--r--fs/nfsd/xdr.h16
-rw-r--r--fs/nfsd/xdr3.h1
-rw-r--r--fs/nfsd/xdr4.h1
-rw-r--r--fs/nilfs2/bmap.c4
-rw-r--r--fs/nilfs2/cpfile.c6
-rw-r--r--fs/nilfs2/page.c1
-rw-r--r--fs/nilfs2/recovery.c2
-rw-r--r--fs/nilfs2/segment.c19
-rw-r--r--fs/nilfs2/sufile.c4
-rw-r--r--fs/notify/fanotify/fanotify.c5
-rw-r--r--fs/notify/fanotify/fanotify_user.c2
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c5
-rw-r--r--fs/ntfs/inode.c6
-rw-r--r--fs/ocfs2/alloc.c6
-rw-r--r--fs/ocfs2/cluster/heartbeat.c28
-rw-r--r--fs/ocfs2/cluster/quorum.c2
-rw-r--r--fs/ocfs2/localalloc.c2
-rw-r--r--fs/overlayfs/copy_up.c59
-rw-r--r--fs/overlayfs/dir.c2
-rw-r--r--fs/overlayfs/export.c2
-rw-r--r--fs/overlayfs/file.c88
-rw-r--r--fs/overlayfs/inode.c32
-rw-r--r--fs/overlayfs/namei.c57
-rw-r--r--fs/overlayfs/overlayfs.h92
-rw-r--r--fs/overlayfs/ovl_entry.h6
-rw-r--r--fs/overlayfs/readdir.c76
-rw-r--r--fs/overlayfs/super.c117
-rw-r--r--fs/overlayfs/util.c96
-rw-r--r--fs/pipe.c73
-rw-r--r--fs/proc/base.c7
-rw-r--r--fs/proc/page.c3
-rw-r--r--fs/proc/task_mmu.c126
-rw-r--r--fs/pstore/zone.c1
-rw-r--r--fs/quota/Kconfig5
-rw-r--r--fs/quota/Makefile1
-rw-r--r--fs/quota/compat.c120
-rw-r--r--fs/quota/compat.h34
-rw-r--r--fs/quota/quota.c117
-rw-r--r--fs/quota/quota_v2.c1
-rw-r--r--fs/ramfs/file-nommu.c2
-rw-r--r--fs/read_write.c370
-rw-r--r--fs/reiserfs/inode.c9
-rw-r--r--fs/reiserfs/super.c8
-rw-r--r--fs/reiserfs/xattr.c7
-rw-r--r--fs/romfs/storage.c4
-rw-r--r--fs/romfs/super.c1
-rw-r--r--fs/seq_file.c2
-rw-r--r--fs/signalfd.c2
-rw-r--r--fs/splice.c85
-rw-r--r--fs/squashfs/block.c6
-rw-r--r--fs/super.c2
-rw-r--r--fs/sysfs/file.c55
-rw-r--r--fs/ubifs/auth.c2
-rw-r--r--fs/ubifs/debug.c1
-rw-r--r--fs/ubifs/dir.c40
-rw-r--r--fs/ubifs/gc.c4
-rw-r--r--fs/ubifs/ioctl.c1
-rw-r--r--fs/ubifs/journal.c7
-rw-r--r--fs/ubifs/lprops.c4
-rw-r--r--fs/ubifs/orphan.c2
-rw-r--r--fs/ubifs/replay.c2
-rw-r--r--fs/ubifs/super.c46
-rw-r--r--fs/ubifs/tnc.c6
-rw-r--r--fs/ubifs/xattr.c2
-rw-r--r--fs/udf/directory.c2
-rw-r--r--fs/udf/file.c7
-rw-r--r--fs/udf/ialloc.c14
-rw-r--r--fs/udf/inode.c61
-rw-r--r--fs/udf/misc.c6
-rw-r--r--fs/udf/namei.c7
-rw-r--r--fs/udf/partition.c2
-rw-r--r--fs/udf/super.c47
-rw-r--r--fs/udf/symlink.c4
-rw-r--r--fs/udf/udf_i.h6
-rw-r--r--fs/ufs/util.h12
-rw-r--r--fs/unicode/utf8-core.c23
-rw-r--r--fs/userfaultfd.c28
-rw-r--r--fs/vboxsf/dir.c2
-rw-r--r--fs/vboxsf/super.c4
-rw-r--r--fs/vboxsf/utils.c2
-rw-r--r--fs/xattr.c22
-rw-r--r--fs/xfs/Kconfig25
-rw-r--r--fs/xfs/kmem.c22
-rw-r--r--fs/xfs/kmem.h7
-rw-r--r--fs/xfs/libxfs/xfs_ag.c5
-rw-r--r--fs/xfs/libxfs/xfs_attr.c14
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c47
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c2
-rw-r--r--fs/xfs/libxfs/xfs_attr_sf.h29
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c21
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h24
-rw-r--r--fs/xfs/libxfs/xfs_defer.c232
-rw-r--r--fs/xfs/libxfs/xfs_defer.h37
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c35
-rw-r--r--fs/xfs/libxfs/xfs_format.h211
-rw-r--r--fs/xfs/libxfs/xfs_fs.h1
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c9
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c65
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c2
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c130
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h17
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c8
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h7
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h1
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h8
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c27
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c11
-rw-r--r--fs/xfs/libxfs/xfs_sb.c6
-rw-r--r--fs/xfs/libxfs/xfs_shared.h3
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c21
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h2
-rw-r--r--fs/xfs/scrub/agheader.c30
-rw-r--r--fs/xfs/scrub/agheader_repair.c24
-rw-r--r--fs/xfs/scrub/dabtree.c14
-rw-r--r--fs/xfs/scrub/inode.c31
-rw-r--r--fs/xfs/scrub/symlink.c2
-rw-r--r--fs/xfs/xfs_acl.c2
-rw-r--r--fs/xfs/xfs_aops.c2
-rw-r--r--fs/xfs/xfs_attr_list.c6
-rw-r--r--fs/xfs/xfs_bmap_item.c132
-rw-r--r--fs/xfs/xfs_bmap_util.c18
-rw-r--r--fs/xfs/xfs_buf.c208
-rw-r--r--fs/xfs/xfs_buf.h17
-rw-r--r--fs/xfs/xfs_buf_item.c264
-rw-r--r--fs/xfs/xfs_buf_item.h12
-rw-r--r--fs/xfs/xfs_buf_item_recover.c4
-rw-r--r--fs/xfs/xfs_dquot.c70
-rw-r--r--fs/xfs/xfs_dquot.h3
-rw-r--r--fs/xfs/xfs_extfree_item.c44
-rw-r--r--fs/xfs/xfs_file.c29
-rw-r--r--fs/xfs/xfs_filestream.c34
-rw-r--r--fs/xfs/xfs_fsmap.c48
-rw-r--r--fs/xfs/xfs_fsmap.h6
-rw-r--r--fs/xfs/xfs_icache.c19
-rw-r--r--fs/xfs/xfs_inode.c206
-rw-r--r--fs/xfs/xfs_inode.h38
-rw-r--r--fs/xfs/xfs_inode_item.c61
-rw-r--r--fs/xfs/xfs_inode_item.h5
-rw-r--r--fs/xfs/xfs_inode_item_recover.c76
-rw-r--r--fs/xfs/xfs_ioctl.c151
-rw-r--r--fs/xfs/xfs_iops.c4
-rw-r--r--fs/xfs/xfs_linux.h1
-rw-r--r--fs/xfs/xfs_log.c44
-rw-r--r--fs/xfs/xfs_log.h2
-rw-r--r--fs/xfs/xfs_log_recover.c281
-rw-r--r--fs/xfs/xfs_mount.c32
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_ondisk.h38
-rw-r--r--fs/xfs/xfs_qm.c29
-rw-r--r--fs/xfs/xfs_qm.h4
-rw-r--r--fs/xfs/xfs_qm_syscalls.c18
-rw-r--r--fs/xfs/xfs_quota.h8
-rw-r--r--fs/xfs/xfs_refcount_item.c51
-rw-r--r--fs/xfs/xfs_rmap_item.c42
-rw-r--r--fs/xfs/xfs_rtalloc.c44
-rw-r--r--fs/xfs/xfs_stats.c4
-rw-r--r--fs/xfs/xfs_stats.h1
-rw-r--r--fs/xfs/xfs_super.c72
-rw-r--r--fs/xfs/xfs_sysctl.c36
-rw-r--r--fs/xfs/xfs_trace.h30
-rw-r--r--fs/xfs/xfs_trans.c4
-rw-r--r--fs/xfs/xfs_trans.h35
-rw-r--r--fs/xfs/xfs_trans_buf.c46
-rw-r--r--fs/xfs/xfs_trans_dquot.c49
-rw-r--r--fs/zonefs/super.c221
-rw-r--r--fs/zonefs/zonefs.h10
468 files changed, 18586 insertions, 11209 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 92cd1d80218d..b177fd3b1eb3 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -213,7 +213,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
break;
default:
WARN_ONCE(1, "unknown lock status code: %d\n", status);
- /* fall through */
+ fallthrough;
case P9_LOCK_ERROR:
case P9_LOCK_GRACE:
res = -ENOLCK;
@@ -612,9 +612,9 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
struct writeback_control wbc = {
.nr_to_write = LONG_MAX,
.sync_mode = WB_SYNC_ALL,
- .range_start = vma->vm_pgoff * PAGE_SIZE,
+ .range_start = (loff_t)vma->vm_pgoff * PAGE_SIZE,
/* absolute end, byte at end included */
- .range_end = vma->vm_pgoff * PAGE_SIZE +
+ .range_end = (loff_t)vma->vm_pgoff * PAGE_SIZE +
(vma->vm_end - vma->vm_start - 1),
};
@@ -625,7 +625,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
inode = file_inode(vma->vm_file);
- if (!mapping_cap_writeback_dirty(inode->i_mapping))
+ if (!mapping_can_writeback(inode->i_mapping))
wbc.nr_to_write = 0;
might_sleep();
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 74df32be4c6a..e34fa20acf61 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -80,8 +80,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
if (ret)
return ret;
- if (v9ses->cache)
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
+ if (!v9ses->cache) {
+ sb->s_bdi->ra_pages = 0;
+ sb->s_bdi->io_pages = 0;
+ }
sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
if (!v9ses->cache)
diff --git a/fs/Makefile b/fs/Makefile
index 1c7b0e3f6daa..7bb2a05fda1f 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -13,7 +13,8 @@ obj-y := open.o read_write.o file_table.o super.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o d_path.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
- fs_types.o fs_context.o fs_parser.o fsopen.o init.o
+ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
+ kernel_read_file.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o block_dev.o direct-io.o mpage.o
@@ -37,7 +38,6 @@ obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/
obj-$(CONFIG_FILE_LOCKING) += locks.o
-obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index 30d526fecc3f..05e963402e25 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -18,11 +18,11 @@ static inline unsigned int adfs_readval(unsigned char *p, int len)
switch (len) {
case 4: val |= p[3] << 24;
- /* fall through */
+ fallthrough;
case 3: val |= p[2] << 16;
- /* fall through */
+ fallthrough;
case 2: val |= p[1] << 8;
- /* fall through */
+ fallthrough;
default: val |= p[0];
}
return val;
@@ -32,11 +32,11 @@ static inline void adfs_writeval(unsigned char *p, int len, unsigned int val)
{
switch (len) {
case 4: p[3] = val >> 24;
- /* fall through */
+ fallthrough;
case 3: p[2] = val >> 16;
- /* fall through */
+ fallthrough;
case 2: p[1] = val >> 8;
- /* fall through */
+ fallthrough;
default: p[0] = val;
}
}
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index f708c45d5f66..29f11e10a7c7 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -420,24 +420,51 @@ affs_mode_to_prot(struct inode *inode)
u32 prot = AFFS_I(inode)->i_protect;
umode_t mode = inode->i_mode;
+ /*
+ * First, clear all RWED bits for owner, group, other.
+ * Then, recalculate them afresh.
+ *
+ * We'll always clear the delete-inhibit bit for the owner, as that is
+ * the classic single-user mode AmigaOS protection bit and we need to
+ * stay compatible with all scenarios.
+ *
+ * Since multi-user AmigaOS is an extension, we'll only set the
+ * delete-allow bit if any of the other bits in the same user class
+ * (group/other) are used.
+ */
+ prot &= ~(FIBF_NOEXECUTE | FIBF_NOREAD
+ | FIBF_NOWRITE | FIBF_NODELETE
+ | FIBF_GRP_EXECUTE | FIBF_GRP_READ
+ | FIBF_GRP_WRITE | FIBF_GRP_DELETE
+ | FIBF_OTR_EXECUTE | FIBF_OTR_READ
+ | FIBF_OTR_WRITE | FIBF_OTR_DELETE);
+
+ /* Classic single-user AmigaOS flags. These are inverted. */
if (!(mode & 0100))
prot |= FIBF_NOEXECUTE;
if (!(mode & 0400))
prot |= FIBF_NOREAD;
if (!(mode & 0200))
prot |= FIBF_NOWRITE;
+
+ /* Multi-user extended flags. Not inverted. */
if (mode & 0010)
prot |= FIBF_GRP_EXECUTE;
if (mode & 0040)
prot |= FIBF_GRP_READ;
if (mode & 0020)
prot |= FIBF_GRP_WRITE;
+ if (mode & 0070)
+ prot |= FIBF_GRP_DELETE;
+
if (mode & 0001)
prot |= FIBF_OTR_EXECUTE;
if (mode & 0004)
prot |= FIBF_OTR_READ;
if (mode & 0002)
prot |= FIBF_OTR_WRITE;
+ if (mode & 0007)
+ prot |= FIBF_OTR_DELETE;
AFFS_I(inode)->i_protect = prot;
}
diff --git a/fs/affs/file.c b/fs/affs/file.c
index a26a0f96c119..d91b0133d95d 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -429,6 +429,24 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
+static int affs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned int len, unsigned int copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ int ret;
+
+ ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+ /* Clear Archived bit on file writes, as AmigaOS would do */
+ if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) {
+ AFFS_I(inode)->i_protect &= ~FIBF_ARCHIVED;
+ mark_inode_dirty(inode);
+ }
+
+ return ret;
+}
+
static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
{
return generic_block_bmap(mapping,block,affs_get_block);
@@ -438,7 +456,7 @@ const struct address_space_operations affs_aops = {
.readpage = affs_readpage,
.writepage = affs_writepage,
.write_begin = affs_write_begin,
- .write_end = generic_write_end,
+ .write_end = affs_write_end,
.direct_IO = affs_direct_IO,
.bmap = _affs_bmap
};
@@ -795,6 +813,12 @@ done:
if (tmp > inode->i_size)
inode->i_size = AFFS_I(inode)->mmu_private = tmp;
+ /* Clear Archived bit on file writes, as AmigaOS would do */
+ if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) {
+ AFFS_I(inode)->i_protect &= ~FIBF_ARCHIVED;
+ mark_inode_dirty(inode);
+ }
+
err_first_bh:
unlock_page(page);
put_page(page);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index a346cf7659f1..044412110b52 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -93,7 +93,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
case ST_ROOT:
inode->i_uid = sbi->s_uid;
inode->i_gid = sbi->s_gid;
- /* fall through */
+ fallthrough;
case ST_USERDIR:
if (be32_to_cpu(tail->stype) == ST_USERDIR ||
affs_test_opt(sbi->s_flags, SF_SETMODE)) {
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 47107c6712a6..a100cd9950c8 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -474,7 +474,7 @@ got_root:
case MUFS_INTLFFS:
case MUFS_DCFFS:
affs_set_opt(sbi->s_flags, SF_MUFS);
- /* fall thru */
+ fallthrough;
case FS_INTLFFS:
case FS_DCFFS:
affs_set_opt(sbi->s_flags, SF_INTL);
@@ -486,7 +486,7 @@ got_root:
break;
case MUFS_OFS:
affs_set_opt(sbi->s_flags, SF_MUFS);
- /* fall through */
+ fallthrough;
case FS_OFS:
affs_set_opt(sbi->s_flags, SF_OFS);
sb->s_flags |= SB_NOEXEC;
@@ -494,7 +494,7 @@ got_root:
case MUFS_DCOFS:
case MUFS_INTLOFS:
affs_set_opt(sbi->s_flags, SF_MUFS);
- /* fall through */
+ fallthrough;
case FS_DCOFS:
case FS_INTLOFS:
affs_set_opt(sbi->s_flags, SF_INTL);
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 5b79cdceefa0..52233fa6195f 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -18,8 +18,10 @@
static unsigned __read_mostly afs_cell_gc_delay = 10;
static unsigned __read_mostly afs_cell_min_ttl = 10 * 60;
static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60;
+static atomic_t cell_debug_id;
-static void afs_manage_cell(struct work_struct *);
+static void afs_queue_cell_manager(struct afs_net *);
+static void afs_manage_cell_work(struct work_struct *);
static void afs_dec_cells_outstanding(struct afs_net *net)
{
@@ -37,19 +39,22 @@ static void afs_set_cell_timer(struct afs_net *net, time64_t delay)
atomic_inc(&net->cells_outstanding);
if (timer_reduce(&net->cells_timer, jiffies + delay * HZ))
afs_dec_cells_outstanding(net);
+ } else {
+ afs_queue_cell_manager(net);
}
}
/*
- * Look up and get an activation reference on a cell record under RCU
- * conditions. The caller must hold the RCU read lock.
+ * Look up and get an activation reference on a cell record. The caller must
+ * hold net->cells_lock at least read-locked.
*/
-struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net,
- const char *name, unsigned int namesz)
+static struct afs_cell *afs_find_cell_locked(struct afs_net *net,
+ const char *name, unsigned int namesz,
+ enum afs_cell_trace reason)
{
struct afs_cell *cell = NULL;
struct rb_node *p;
- int n, seq = 0, ret = 0;
+ int n;
_enter("%*.*s", namesz, namesz, name);
@@ -58,61 +63,48 @@ struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net,
if (namesz > AFS_MAXCELLNAME)
return ERR_PTR(-ENAMETOOLONG);
- do {
- /* Unfortunately, rbtree walking doesn't give reliable results
- * under just the RCU read lock, so we have to check for
- * changes.
- */
- if (cell)
- afs_put_cell(net, cell);
- cell = NULL;
- ret = -ENOENT;
-
- read_seqbegin_or_lock(&net->cells_lock, &seq);
-
- if (!name) {
- cell = rcu_dereference_raw(net->ws_cell);
- if (cell) {
- afs_get_cell(cell);
- ret = 0;
- break;
- }
- ret = -EDESTADDRREQ;
- continue;
- }
+ if (!name) {
+ cell = net->ws_cell;
+ if (!cell)
+ return ERR_PTR(-EDESTADDRREQ);
+ goto found;
+ }
- p = rcu_dereference_raw(net->cells.rb_node);
- while (p) {
- cell = rb_entry(p, struct afs_cell, net_node);
-
- n = strncasecmp(cell->name, name,
- min_t(size_t, cell->name_len, namesz));
- if (n == 0)
- n = cell->name_len - namesz;
- if (n < 0) {
- p = rcu_dereference_raw(p->rb_left);
- } else if (n > 0) {
- p = rcu_dereference_raw(p->rb_right);
- } else {
- if (atomic_inc_not_zero(&cell->usage)) {
- ret = 0;
- break;
- }
- /* We want to repeat the search, this time with
- * the lock properly locked.
- */
- }
- cell = NULL;
- }
+ p = net->cells.rb_node;
+ while (p) {
+ cell = rb_entry(p, struct afs_cell, net_node);
- } while (need_seqretry(&net->cells_lock, seq));
+ n = strncasecmp(cell->name, name,
+ min_t(size_t, cell->name_len, namesz));
+ if (n == 0)
+ n = cell->name_len - namesz;
+ if (n < 0)
+ p = p->rb_left;
+ else if (n > 0)
+ p = p->rb_right;
+ else
+ goto found;
+ }
- done_seqretry(&net->cells_lock, seq);
+ return ERR_PTR(-ENOENT);
- if (ret != 0 && cell)
- afs_put_cell(net, cell);
+found:
+ return afs_use_cell(cell, reason);
+}
- return ret == 0 ? cell : ERR_PTR(ret);
+/*
+ * Look up and get an activation reference on a cell record.
+ */
+struct afs_cell *afs_find_cell(struct afs_net *net,
+ const char *name, unsigned int namesz,
+ enum afs_cell_trace reason)
+{
+ struct afs_cell *cell;
+
+ down_read(&net->cells_lock);
+ cell = afs_find_cell_locked(net, name, namesz, reason);
+ up_read(&net->cells_lock);
+ return cell;
}
/*
@@ -166,8 +158,9 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
cell->name[i] = tolower(name[i]);
cell->name[i] = 0;
- atomic_set(&cell->usage, 2);
- INIT_WORK(&cell->manager, afs_manage_cell);
+ atomic_set(&cell->ref, 1);
+ atomic_set(&cell->active, 0);
+ INIT_WORK(&cell->manager, afs_manage_cell_work);
cell->volumes = RB_ROOT;
INIT_HLIST_HEAD(&cell->proc_volumes);
seqlock_init(&cell->volume_lock);
@@ -206,6 +199,9 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
cell->dns_source = vllist->source;
cell->dns_status = vllist->status;
smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */
+ atomic_inc(&net->cells_outstanding);
+ cell->debug_id = atomic_inc_return(&cell_debug_id);
+ trace_afs_cell(cell->debug_id, 1, 0, afs_cell_trace_alloc);
_leave(" = %p", cell);
return cell;
@@ -245,9 +241,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
_enter("%s,%s", name, vllist);
if (!excl) {
- rcu_read_lock();
- cell = afs_lookup_cell_rcu(net, name, namesz);
- rcu_read_unlock();
+ cell = afs_find_cell(net, name, namesz, afs_cell_trace_use_lookup);
if (!IS_ERR(cell))
goto wait_for_cell;
}
@@ -268,7 +262,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
/* Find the insertion point and check to see if someone else added a
* cell whilst we were allocating.
*/
- write_seqlock(&net->cells_lock);
+ down_write(&net->cells_lock);
pp = &net->cells.rb_node;
parent = NULL;
@@ -290,23 +284,26 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
cell = candidate;
candidate = NULL;
+ atomic_set(&cell->active, 2);
+ trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), 2, afs_cell_trace_insert);
rb_link_node_rcu(&cell->net_node, parent, pp);
rb_insert_color(&cell->net_node, &net->cells);
- atomic_inc(&net->cells_outstanding);
- write_sequnlock(&net->cells_lock);
+ up_write(&net->cells_lock);
- queue_work(afs_wq, &cell->manager);
+ afs_queue_cell(cell, afs_cell_trace_get_queue_new);
wait_for_cell:
+ trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), atomic_read(&cell->active),
+ afs_cell_trace_wait);
_debug("wait_for_cell");
wait_var_event(&cell->state,
({
state = smp_load_acquire(&cell->state); /* vs error */
- state == AFS_CELL_ACTIVE || state == AFS_CELL_FAILED;
+ state == AFS_CELL_ACTIVE || state == AFS_CELL_REMOVED;
}));
/* Check the state obtained from the wait check. */
- if (state == AFS_CELL_FAILED) {
+ if (state == AFS_CELL_REMOVED) {
ret = cell->error;
goto error;
}
@@ -320,16 +317,17 @@ cell_already_exists:
if (excl) {
ret = -EEXIST;
} else {
- afs_get_cell(cursor);
+ afs_use_cell(cursor, afs_cell_trace_use_lookup);
ret = 0;
}
- write_sequnlock(&net->cells_lock);
- kfree(candidate);
+ up_write(&net->cells_lock);
+ if (candidate)
+ afs_put_cell(candidate, afs_cell_trace_put_candidate);
if (ret == 0)
goto wait_for_cell;
goto error_noput;
error:
- afs_put_cell(net, cell);
+ afs_unuse_cell(net, cell, afs_cell_trace_unuse_lookup);
error_noput:
_leave(" = %d [error]", ret);
return ERR_PTR(ret);
@@ -374,15 +372,16 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
}
if (!test_and_set_bit(AFS_CELL_FL_NO_GC, &new_root->flags))
- afs_get_cell(new_root);
+ afs_use_cell(new_root, afs_cell_trace_use_pin);
/* install the new cell */
- write_seqlock(&net->cells_lock);
- old_root = rcu_access_pointer(net->ws_cell);
- rcu_assign_pointer(net->ws_cell, new_root);
- write_sequnlock(&net->cells_lock);
+ down_write(&net->cells_lock);
+ afs_see_cell(new_root, afs_cell_trace_see_ws);
+ old_root = net->ws_cell;
+ net->ws_cell = new_root;
+ up_write(&net->cells_lock);
- afs_put_cell(net, old_root);
+ afs_unuse_cell(net, old_root, afs_cell_trace_unuse_ws);
_leave(" = 0");
return 0;
}
@@ -488,18 +487,22 @@ out_wake:
static void afs_cell_destroy(struct rcu_head *rcu)
{
struct afs_cell *cell = container_of(rcu, struct afs_cell, rcu);
+ struct afs_net *net = cell->net;
+ int u;
_enter("%p{%s}", cell, cell->name);
- ASSERTCMP(atomic_read(&cell->usage), ==, 0);
+ u = atomic_read(&cell->ref);
+ ASSERTCMP(u, ==, 0);
+ trace_afs_cell(cell->debug_id, u, atomic_read(&cell->active), afs_cell_trace_free);
- afs_put_volume(cell->net, cell->root_volume, afs_volume_trace_put_cell_root);
- afs_put_vlserverlist(cell->net, rcu_access_pointer(cell->vl_servers));
- afs_put_cell(cell->net, cell->alias_of);
+ afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers));
+ afs_unuse_cell(net, cell->alias_of, afs_cell_trace_unuse_alias);
key_put(cell->anonymous_key);
kfree(cell->name);
kfree(cell);
+ afs_dec_cells_outstanding(net);
_leave(" [destroyed]");
}
@@ -532,18 +535,63 @@ void afs_cells_timer(struct timer_list *timer)
/*
* Get a reference on a cell record.
*/
-struct afs_cell *afs_get_cell(struct afs_cell *cell)
+struct afs_cell *afs_get_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
- atomic_inc(&cell->usage);
+ int u;
+
+ if (atomic_read(&cell->ref) <= 0)
+ BUG();
+
+ u = atomic_inc_return(&cell->ref);
+ trace_afs_cell(cell->debug_id, u, atomic_read(&cell->active), reason);
return cell;
}
/*
* Drop a reference on a cell record.
*/
-void afs_put_cell(struct afs_net *net, struct afs_cell *cell)
+void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
+ if (cell) {
+ unsigned int debug_id = cell->debug_id;
+ unsigned int u, a;
+
+ a = atomic_read(&cell->active);
+ u = atomic_dec_return(&cell->ref);
+ trace_afs_cell(debug_id, u, a, reason);
+ if (u == 0) {
+ a = atomic_read(&cell->active);
+ WARN(a != 0, "Cell active count %u > 0\n", a);
+ call_rcu(&cell->rcu, afs_cell_destroy);
+ }
+ }
+}
+
+/*
+ * Note a cell becoming more active.
+ */
+struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason)
+{
+ int u, a;
+
+ if (atomic_read(&cell->ref) <= 0)
+ BUG();
+
+ u = atomic_read(&cell->ref);
+ a = atomic_inc_return(&cell->active);
+ trace_afs_cell(cell->debug_id, u, a, reason);
+ return cell;
+}
+
+/*
+ * Record a cell becoming less active. When the active counter reaches 1, it
+ * is scheduled for destruction, but may get reactivated.
+ */
+void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_trace reason)
+{
+ unsigned int debug_id = cell->debug_id;
time64_t now, expire_delay;
+ int u, a;
if (!cell)
return;
@@ -556,11 +604,35 @@ void afs_put_cell(struct afs_net *net, struct afs_cell *cell)
if (cell->vl_servers->nr_servers)
expire_delay = afs_cell_gc_delay;
- if (atomic_dec_return(&cell->usage) > 1)
- return;
+ u = atomic_read(&cell->ref);
+ a = atomic_dec_return(&cell->active);
+ trace_afs_cell(debug_id, u, a, reason);
+ WARN_ON(a == 0);
+ if (a == 1)
+ /* 'cell' may now be garbage collected. */
+ afs_set_cell_timer(net, expire_delay);
+}
+
+/*
+ * Note that a cell has been seen.
+ */
+void afs_see_cell(struct afs_cell *cell, enum afs_cell_trace reason)
+{
+ int u, a;
+
+ u = atomic_read(&cell->ref);
+ a = atomic_read(&cell->active);
+ trace_afs_cell(cell->debug_id, u, a, reason);
+}
- /* 'cell' may now be garbage collected. */
- afs_set_cell_timer(net, expire_delay);
+/*
+ * Queue a cell for management, giving the workqueue a ref to hold.
+ */
+void afs_queue_cell(struct afs_cell *cell, enum afs_cell_trace reason)
+{
+ afs_get_cell(cell, reason);
+ if (!queue_work(afs_wq, &cell->manager))
+ afs_put_cell(cell, afs_cell_trace_put_queue_fail);
}
/*
@@ -660,12 +732,10 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell)
* Manage a cell record, initialising and destroying it, maintaining its DNS
* records.
*/
-static void afs_manage_cell(struct work_struct *work)
+static void afs_manage_cell(struct afs_cell *cell)
{
- struct afs_cell *cell = container_of(work, struct afs_cell, manager);
struct afs_net *net = cell->net;
- bool deleted;
- int ret, usage;
+ int ret, active;
_enter("%s", cell->name);
@@ -674,14 +744,19 @@ again:
switch (cell->state) {
case AFS_CELL_INACTIVE:
case AFS_CELL_FAILED:
- write_seqlock(&net->cells_lock);
- usage = 1;
- deleted = atomic_try_cmpxchg_relaxed(&cell->usage, &usage, 0);
- if (deleted)
+ down_write(&net->cells_lock);
+ active = 1;
+ if (atomic_try_cmpxchg_relaxed(&cell->active, &active, 0)) {
rb_erase(&cell->net_node, &net->cells);
- write_sequnlock(&net->cells_lock);
- if (deleted)
+ trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), 0,
+ afs_cell_trace_unuse_delete);
+ smp_store_release(&cell->state, AFS_CELL_REMOVED);
+ }
+ up_write(&net->cells_lock);
+ if (cell->state == AFS_CELL_REMOVED) {
+ wake_up_var(&cell->state);
goto final_destruction;
+ }
if (cell->state == AFS_CELL_FAILED)
goto done;
smp_store_release(&cell->state, AFS_CELL_UNSET);
@@ -703,7 +778,7 @@ again:
goto again;
case AFS_CELL_ACTIVE:
- if (atomic_read(&cell->usage) > 1) {
+ if (atomic_read(&cell->active) > 1) {
if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) {
ret = afs_update_cell(cell);
if (ret < 0)
@@ -716,13 +791,16 @@ again:
goto again;
case AFS_CELL_DEACTIVATING:
- if (atomic_read(&cell->usage) > 1)
+ if (atomic_read(&cell->active) > 1)
goto reverse_deactivation;
afs_deactivate_cell(net, cell);
smp_store_release(&cell->state, AFS_CELL_INACTIVE);
wake_up_var(&cell->state);
goto again;
+ case AFS_CELL_REMOVED:
+ goto done;
+
default:
break;
}
@@ -748,9 +826,18 @@ done:
return;
final_destruction:
- call_rcu(&cell->rcu, afs_cell_destroy);
- afs_dec_cells_outstanding(net);
- _leave(" [destruct %d]", atomic_read(&net->cells_outstanding));
+ /* The root volume is pinning the cell */
+ afs_put_volume(cell->net, cell->root_volume, afs_volume_trace_put_cell_root);
+ cell->root_volume = NULL;
+ afs_put_cell(cell, afs_cell_trace_put_destroy);
+}
+
+static void afs_manage_cell_work(struct work_struct *work)
+{
+ struct afs_cell *cell = container_of(work, struct afs_cell, manager);
+
+ afs_manage_cell(cell);
+ afs_put_cell(cell, afs_cell_trace_put_queue_work);
}
/*
@@ -779,26 +866,29 @@ void afs_manage_cells(struct work_struct *work)
* lack of use and cells whose DNS results have expired and dispatch
* their managers.
*/
- read_seqlock_excl(&net->cells_lock);
+ down_read(&net->cells_lock);
for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) {
struct afs_cell *cell =
rb_entry(cursor, struct afs_cell, net_node);
- unsigned usage;
+ unsigned active;
bool sched_cell = false;
- usage = atomic_read(&cell->usage);
- _debug("manage %s %u", cell->name, usage);
+ active = atomic_read(&cell->active);
+ trace_afs_cell(cell->debug_id, atomic_read(&cell->ref),
+ active, afs_cell_trace_manage);
- ASSERTCMP(usage, >=, 1);
+ ASSERTCMP(active, >=, 1);
if (purging) {
- if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags))
- usage = atomic_dec_return(&cell->usage);
- ASSERTCMP(usage, ==, 1);
+ if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) {
+ active = atomic_dec_return(&cell->active);
+ trace_afs_cell(cell->debug_id, atomic_read(&cell->ref),
+ active, afs_cell_trace_unuse_pin);
+ }
}
- if (usage == 1) {
+ if (active == 1) {
struct afs_vlserver_list *vllist;
time64_t expire_at = cell->last_inactive;
@@ -821,10 +911,10 @@ void afs_manage_cells(struct work_struct *work)
}
if (sched_cell)
- queue_work(afs_wq, &cell->manager);
+ afs_queue_cell(cell, afs_cell_trace_get_queue_manage);
}
- read_sequnlock_excl(&net->cells_lock);
+ up_read(&net->cells_lock);
/* Update the timer on the way out. We have to pass an increment on
* cells_outstanding in the namespace that we are in to the timer or
@@ -854,11 +944,11 @@ void afs_cell_purge(struct afs_net *net)
_enter("");
- write_seqlock(&net->cells_lock);
- ws = rcu_access_pointer(net->ws_cell);
- RCU_INIT_POINTER(net->ws_cell, NULL);
- write_sequnlock(&net->cells_lock);
- afs_put_cell(net, ws);
+ down_write(&net->cells_lock);
+ ws = net->ws_cell;
+ net->ws_cell = NULL;
+ up_write(&net->cells_lock);
+ afs_unuse_cell(net, ws, afs_cell_trace_unuse_ws);
_debug("del timer");
if (del_timer_sync(&net->cells_timer))
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index bef413818af7..a4e9e6e07e93 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -252,7 +252,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
call->unmarshall++;
/* extract the FID array and its count in two steps */
- /* fall through */
+ fallthrough;
case 1:
_debug("extract FID count");
ret = afs_extract_data(call, true);
@@ -271,7 +271,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
afs_extract_to_buf(call, call->count * 3 * 4);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 2:
_debug("extract FID array");
ret = afs_extract_data(call, true);
@@ -297,7 +297,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
call->unmarshall++;
/* extract the callback array and its count in two steps */
- /* fall through */
+ fallthrough;
case 3:
_debug("extract CB count");
ret = afs_extract_data(call, true);
@@ -312,7 +312,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
iov_iter_discard(&call->def_iter, READ, call->count2 * 3 * 4);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 4:
_debug("extract discard %zu/%u",
iov_iter_count(call->iter), call->count2 * 3 * 4);
@@ -391,7 +391,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
afs_extract_to_buf(call, 11 * sizeof(__be32));
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 1:
_debug("extract UUID");
ret = afs_extract_data(call, false);
@@ -503,7 +503,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
afs_extract_to_buf(call, 11 * sizeof(__be32));
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 1:
_debug("extract UUID");
ret = afs_extract_data(call, false);
@@ -618,7 +618,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call)
call->unmarshall++;
/* extract the FID array and its count in two steps */
- /* Fall through */
+ fallthrough;
case 1:
_debug("extract FID count");
ret = afs_extract_data(call, true);
@@ -637,7 +637,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call)
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 2:
_debug("extract FID array");
ret = afs_extract_data(call, false);
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index b79879aacc02..db832cc931c8 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -123,9 +123,9 @@ static int afs_probe_cell_name(struct dentry *dentry)
len--;
}
- cell = afs_lookup_cell_rcu(net, name, len);
+ cell = afs_find_cell(net, name, len, afs_cell_trace_use_probe);
if (!IS_ERR(cell)) {
- afs_put_cell(net, cell);
+ afs_unuse_cell(net, cell, afs_cell_trace_unuse_probe);
return 0;
}
@@ -179,7 +179,6 @@ static struct dentry *afs_lookup_atcell(struct dentry *dentry)
struct afs_cell *cell;
struct afs_net *net = afs_d2net(dentry);
struct dentry *ret;
- unsigned int seq = 0;
char *name;
int len;
@@ -191,17 +190,13 @@ static struct dentry *afs_lookup_atcell(struct dentry *dentry)
if (!name)
goto out_p;
- rcu_read_lock();
- do {
- read_seqbegin_or_lock(&net->cells_lock, &seq);
- cell = rcu_dereference_raw(net->ws_cell);
- if (cell) {
- len = cell->name_len;
- memcpy(name, cell->name, len + 1);
- }
- } while (need_seqretry(&net->cells_lock, seq));
- done_seqretry(&net->cells_lock, seq);
- rcu_read_unlock();
+ down_read(&net->cells_lock);
+ cell = net->ws_cell;
+ if (cell) {
+ len = cell->name_len;
+ memcpy(name, cell->name, len + 1);
+ }
+ up_read(&net->cells_lock);
ret = ERR_PTR(-ENOENT);
if (!cell)
@@ -382,15 +377,17 @@ void afs_dynroot_depopulate(struct super_block *sb)
net->dynroot_sb = NULL;
mutex_unlock(&net->proc_cells_lock);
- inode_lock(root->d_inode);
-
- /* Remove all the pins for dirs created for manually added cells */
- list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) {
- if (subdir->d_fsdata) {
- subdir->d_fsdata = NULL;
- dput(subdir);
+ if (root) {
+ inode_lock(root->d_inode);
+
+ /* Remove all the pins for dirs created for manually added cells */
+ list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) {
+ if (subdir->d_fsdata) {
+ subdir->d_fsdata = NULL;
+ dput(subdir);
+ }
}
- }
- inode_unlock(root->d_inode);
+ inode_unlock(root->d_inode);
+ }
}
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 6f6ed1605cfe..371d1488cc54 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -311,7 +311,7 @@ int afs_page_filler(void *data, struct page *page)
case -ENOBUFS:
_debug("cache said ENOBUFS");
- /* fall through */
+ fallthrough;
default:
go_on:
req = kzalloc(struct_size(req, array, 1), GFP_KERNEL);
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index ffb8575345ca..cb3054c7843e 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -376,7 +376,6 @@ again:
spin_unlock(&vnode->lock);
return;
- /* Fall through */
default:
/* Looks like a lock request was withdrawn. */
spin_unlock(&vnode->lock);
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index 24fd163c6323..97cab12b0a6c 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -235,6 +235,7 @@ int afs_put_operation(struct afs_operation *op)
afs_end_cursor(&op->ac);
afs_put_serverlist(op->net, op->server_list);
afs_put_volume(op->net, op->volume, afs_volume_trace_put_put_op);
+ key_put(op->key);
kfree(op);
return ret;
}
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index 5d9ef517cf81..e7e98ad63a91 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -161,8 +161,8 @@ responded:
}
}
- rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall);
- if (rtt_us < server->probe.rtt) {
+ if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) &&
+ rtt_us < server->probe.rtt) {
server->probe.rtt = rtt_us;
server->rtt = rtt_us;
alist->preferred = index;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index acb4d0ca2649..1d95ed9dd86e 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -320,7 +320,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
call->tmp_u = htonl(0);
afs_extract_to_tmp(call);
}
- /* Fall through */
+ fallthrough;
/* extract the returned data length */
case 1:
@@ -348,7 +348,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
call->bvec[0].bv_page = req->pages[req->index];
iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size);
ASSERTCMP(size, <=, PAGE_SIZE);
- /* Fall through */
+ fallthrough;
/* extract the returned data */
case 2:
@@ -375,7 +375,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
/* Discard any excess data the server gave us */
afs_extract_discard(call, req->actual_len - req->len);
call->unmarshall = 3;
- /* Fall through */
+ fallthrough;
case 3:
_debug("extract discard %zu/%llu",
@@ -388,7 +388,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
no_more_data:
call->unmarshall = 4;
afs_extract_to_buf(call, (21 + 3 + 6) * 4);
- /* Fall through */
+ fallthrough;
/* extract the metadata */
case 4:
@@ -1343,7 +1343,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
case 0:
call->unmarshall++;
afs_extract_to_buf(call, 12 * 4);
- /* Fall through */
+ fallthrough;
/* extract the returned status record */
case 1:
@@ -1356,7 +1356,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
xdr_decode_AFSFetchVolumeStatus(&bp, &op->volstatus.vs);
call->unmarshall++;
afs_extract_to_tmp(call);
- /* Fall through */
+ fallthrough;
/* extract the volume name length */
case 2:
@@ -1371,7 +1371,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
size = (call->count + 3) & ~3; /* It's padded */
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the volume name */
case 3:
@@ -1385,7 +1385,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
_debug("volname '%s'", p);
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the offline message length */
case 4:
@@ -1400,7 +1400,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
size = (call->count + 3) & ~3; /* It's padded */
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the offline message */
case 5:
@@ -1415,7 +1415,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the message of the day length */
case 6:
@@ -1430,7 +1430,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
size = (call->count + 3) & ~3; /* It's padded */
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the message of the day */
case 7:
@@ -1682,7 +1682,7 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call)
case 0:
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the capabilities word count */
case 1:
@@ -1696,7 +1696,7 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call)
call->count2 = count;
afs_extract_discard(call, count * sizeof(__be32));
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract capabilities words */
case 2:
@@ -1776,7 +1776,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
case 0:
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the file status count and array in two steps */
case 1:
@@ -1794,7 +1794,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->unmarshall++;
more_counts:
afs_extract_to_buf(call, 21 * sizeof(__be32));
- /* Fall through */
+ fallthrough;
case 2:
_debug("extract status array %u", call->count);
@@ -1824,7 +1824,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->count = 0;
call->unmarshall++;
afs_extract_to_tmp(call);
- /* Fall through */
+ fallthrough;
/* Extract the callback count and array in two steps */
case 3:
@@ -1841,7 +1841,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->unmarshall++;
more_cbs:
afs_extract_to_buf(call, 3 * sizeof(__be32));
- /* Fall through */
+ fallthrough;
case 4:
_debug("extract CB array");
@@ -1870,7 +1870,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
afs_extract_to_buf(call, 6 * sizeof(__be32));
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 5:
ret = afs_extract_data(call, false);
@@ -1974,7 +1974,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call)
case 0:
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the returned data length */
case 1:
@@ -1992,7 +1992,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call)
acl->size = call->count2;
afs_extract_begin(call, acl->data, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the returned data */
case 2:
@@ -2002,7 +2002,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call)
afs_extract_to_buf(call, (21 + 6) * 4);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the metadata */
case 3:
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 1d13d2e882ad..0fe8844b4bee 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -810,14 +810,32 @@ void afs_evict_inode(struct inode *inode)
static void afs_setattr_success(struct afs_operation *op)
{
- struct inode *inode = &op->file[0].vnode->vfs_inode;
+ struct afs_vnode_param *vp = &op->file[0];
+ struct inode *inode = &vp->vnode->vfs_inode;
+ loff_t old_i_size = i_size_read(inode);
+
+ op->setattr.old_i_size = old_i_size;
+ afs_vnode_commit_status(op, vp);
+ /* inode->i_size has now been changed. */
+
+ if (op->setattr.attr->ia_valid & ATTR_SIZE) {
+ loff_t size = op->setattr.attr->ia_size;
+ if (size > old_i_size)
+ pagecache_isize_extended(inode, old_i_size, size);
+ }
+}
+
+static void afs_setattr_edit_file(struct afs_operation *op)
+{
+ struct afs_vnode_param *vp = &op->file[0];
+ struct inode *inode = &vp->vnode->vfs_inode;
- afs_vnode_commit_status(op, &op->file[0]);
if (op->setattr.attr->ia_valid & ATTR_SIZE) {
- loff_t i_size = inode->i_size, size = op->setattr.attr->ia_size;
- if (size > i_size)
- pagecache_isize_extended(inode, i_size, size);
- truncate_pagecache(inode, size);
+ loff_t size = op->setattr.attr->ia_size;
+ loff_t i_size = op->setattr.old_i_size;
+
+ if (size < i_size)
+ truncate_pagecache(inode, size);
}
}
@@ -825,6 +843,7 @@ static const struct afs_operation_ops afs_setattr_operation = {
.issue_afs_rpc = afs_fs_setattr,
.issue_yfs_rpc = yfs_fs_setattr,
.success = afs_setattr_success,
+ .edit_dir = afs_setattr_edit_file,
};
/*
@@ -863,11 +882,16 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
if (S_ISREG(vnode->vfs_inode.i_mode))
filemap_write_and_wait(vnode->vfs_inode.i_mapping);
+ /* Prevent any new writebacks from starting whilst we do this. */
+ down_write(&vnode->validate_lock);
+
op = afs_alloc_operation(((attr->ia_valid & ATTR_FILE) ?
afs_file_key(attr->ia_file) : NULL),
vnode->volume);
- if (IS_ERR(op))
- return PTR_ERR(op);
+ if (IS_ERR(op)) {
+ ret = PTR_ERR(op);
+ goto out_unlock;
+ }
afs_op_set_vnode(op, 0, vnode);
op->setattr.attr = attr;
@@ -880,5 +904,10 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
op->file[0].update_ctime = 1;
op->ops = &afs_setattr_operation;
- return afs_do_sync_operation(op);
+ ret = afs_do_sync_operation(op);
+
+out_unlock:
+ up_write(&vnode->validate_lock);
+ _leave(" = %d", ret);
+ return ret;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 792ac711985e..81b0485fd22a 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -263,11 +263,11 @@ struct afs_net {
/* Cell database */
struct rb_root cells;
- struct afs_cell __rcu *ws_cell;
+ struct afs_cell *ws_cell;
struct work_struct cells_manager;
struct timer_list cells_timer;
atomic_t cells_outstanding;
- seqlock_t cells_lock;
+ struct rw_semaphore cells_lock;
struct mutex cells_alias_lock;
struct mutex proc_cells_lock;
@@ -326,6 +326,7 @@ enum afs_cell_state {
AFS_CELL_DEACTIVATING,
AFS_CELL_INACTIVE,
AFS_CELL_FAILED,
+ AFS_CELL_REMOVED,
};
/*
@@ -363,7 +364,8 @@ struct afs_cell {
#endif
time64_t dns_expiry; /* Time AFSDB/SRV record expires */
time64_t last_inactive; /* Time of last drop of usage count */
- atomic_t usage;
+ atomic_t ref; /* Struct refcount */
+ atomic_t active; /* Active usage counter */
unsigned long flags;
#define AFS_CELL_FL_NO_GC 0 /* The cell was added manually, don't auto-gc */
#define AFS_CELL_FL_DO_LOOKUP 1 /* DNS lookup requested */
@@ -373,6 +375,7 @@ struct afs_cell {
enum dns_record_source dns_source:8; /* Latest source of data from lookup */
enum dns_lookup_status dns_status:8; /* Latest status of data from lookup */
unsigned int dns_lookup_count; /* Counter of DNS lookups */
+ unsigned int debug_id;
/* The volumes belonging to this cell */
struct rb_root volumes; /* Tree of volumes on this server */
@@ -401,22 +404,24 @@ struct afs_vlserver {
#define AFS_VLSERVER_FL_PROBED 0 /* The VL server has been probed */
#define AFS_VLSERVER_FL_PROBING 1 /* VL server is being probed */
#define AFS_VLSERVER_FL_IS_YFS 2 /* Server is YFS not AFS */
+#define AFS_VLSERVER_FL_RESPONDING 3 /* VL server is responding */
rwlock_t lock; /* Lock on addresses */
atomic_t usage;
+ unsigned int rtt; /* Server's current RTT in uS */
/* Probe state */
wait_queue_head_t probe_wq;
atomic_t probe_outstanding;
spinlock_t probe_lock;
struct {
- unsigned int rtt; /* RTT as ktime/64 */
+ unsigned int rtt; /* RTT in uS */
u32 abort_code;
short error;
- bool have_result;
- bool responded:1;
- bool is_yfs:1;
- bool not_yfs:1;
- bool local_failure:1;
+ unsigned short flags;
+#define AFS_VLSERVER_PROBE_RESPONDED 0x01 /* At least once response (may be abort) */
+#define AFS_VLSERVER_PROBE_IS_YFS 0x02 /* The peer appears to be YFS */
+#define AFS_VLSERVER_PROBE_NOT_YFS 0x04 /* The peer appears not to be YFS */
+#define AFS_VLSERVER_PROBE_LOCAL_FAILURE 0x08 /* A local failure prevented a probe */
} probe;
u16 port;
@@ -810,6 +815,7 @@ struct afs_operation {
} store;
struct {
struct iattr *attr;
+ loff_t old_i_size;
} setattr;
struct afs_acl *acl;
struct yfs_acl *yacl;
@@ -914,11 +920,16 @@ static inline bool afs_cb_is_broken(unsigned int cb_break,
* cell.c
*/
extern int afs_cell_init(struct afs_net *, const char *);
-extern struct afs_cell *afs_lookup_cell_rcu(struct afs_net *, const char *, unsigned);
+extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned,
+ enum afs_cell_trace);
extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned,
const char *, bool);
-extern struct afs_cell *afs_get_cell(struct afs_cell *);
-extern void afs_put_cell(struct afs_net *, struct afs_cell *);
+extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace);
+extern void afs_unuse_cell(struct afs_net *, struct afs_cell *, enum afs_cell_trace);
+extern struct afs_cell *afs_get_cell(struct afs_cell *, enum afs_cell_trace);
+extern void afs_see_cell(struct afs_cell *, enum afs_cell_trace);
+extern void afs_put_cell(struct afs_cell *, enum afs_cell_trace);
+extern void afs_queue_cell(struct afs_cell *, enum afs_cell_trace);
extern void afs_manage_cells(struct work_struct *);
extern void afs_cells_timer(struct timer_list *);
extern void __net_exit afs_cell_purge(struct afs_net *);
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 31b472f7c734..accdd8970e7c 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -78,7 +78,7 @@ static int __net_init afs_net_init(struct net *net_ns)
mutex_init(&net->socket_mutex);
net->cells = RB_ROOT;
- seqlock_init(&net->cells_lock);
+ init_rwsem(&net->cells_lock);
INIT_WORK(&net->cells_manager, afs_manage_cells);
timer_setup(&net->cells_timer, afs_cells_timer, 0);
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 5334f1bd2bca..1d1a8debe472 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -120,42 +120,42 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
if (e->error == -ETIMEDOUT ||
e->error == -ETIME)
return;
- /* Fall through */
+ fallthrough;
case -ETIMEDOUT:
case -ETIME:
if (e->error == -ENOMEM ||
e->error == -ENONET)
return;
- /* Fall through */
+ fallthrough;
case -ENOMEM:
case -ENONET:
if (e->error == -ERFKILL)
return;
- /* Fall through */
+ fallthrough;
case -ERFKILL:
if (e->error == -EADDRNOTAVAIL)
return;
- /* Fall through */
+ fallthrough;
case -EADDRNOTAVAIL:
if (e->error == -ENETUNREACH)
return;
- /* Fall through */
+ fallthrough;
case -ENETUNREACH:
if (e->error == -EHOSTUNREACH)
return;
- /* Fall through */
+ fallthrough;
case -EHOSTUNREACH:
if (e->error == -EHOSTDOWN)
return;
- /* Fall through */
+ fallthrough;
case -EHOSTDOWN:
if (e->error == -ECONNREFUSED)
return;
- /* Fall through */
+ fallthrough;
case -ECONNREFUSED:
if (e->error == -ECONNRESET)
return;
- /* Fall through */
+ fallthrough;
case -ECONNRESET: /* Responded, but call expired. */
if (e->responded)
return;
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 79bc5f1338ed..052dab2f5c03 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -88,7 +88,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
ctx->force = true;
}
if (ctx->cell) {
- afs_put_cell(ctx->net, ctx->cell);
+ afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_mntpt);
ctx->cell = NULL;
}
if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
@@ -124,7 +124,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
char *buf;
if (src_as->cell)
- ctx->cell = afs_get_cell(src_as->cell);
+ ctx->cell = afs_use_cell(src_as->cell, afs_cell_trace_use_mntpt);
if (size < 2 || size > PAGE_SIZE - 1)
return -EINVAL;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index e817fc740ba0..065a28bfa3f1 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -38,7 +38,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v)
if (v == SEQ_START_TOKEN) {
/* display header on line 1 */
- seq_puts(m, "USE TTL SV ST NAME\n");
+ seq_puts(m, "USE ACT TTL SV ST NAME\n");
return 0;
}
@@ -46,10 +46,11 @@ static int afs_proc_cells_show(struct seq_file *m, void *v)
vllist = rcu_dereference(cell->vl_servers);
/* display one cell per line on subsequent lines */
- seq_printf(m, "%3u %6lld %2u %2u %s\n",
- atomic_read(&cell->usage),
+ seq_printf(m, "%3u %3u %6lld %2u %2u %s\n",
+ atomic_read(&cell->ref),
+ atomic_read(&cell->active),
cell->dns_expiry - ktime_get_real_seconds(),
- vllist->nr_servers,
+ vllist ? vllist->nr_servers : 0,
cell->state,
cell->name);
return 0;
@@ -128,7 +129,7 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size)
}
if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags))
- afs_put_cell(net, cell);
+ afs_unuse_cell(net, cell, afs_cell_trace_unuse_no_pin);
} else {
goto inval;
}
@@ -154,13 +155,11 @@ static int afs_proc_rootcell_show(struct seq_file *m, void *v)
struct afs_net *net;
net = afs_seq2net_single(m);
- if (rcu_access_pointer(net->ws_cell)) {
- rcu_read_lock();
- cell = rcu_dereference(net->ws_cell);
- if (cell)
- seq_printf(m, "%s\n", cell->name);
- rcu_read_unlock();
- }
+ down_read(&net->cells_lock);
+ cell = net->ws_cell;
+ if (cell)
+ seq_printf(m, "%s\n", cell->name);
+ up_read(&net->cells_lock);
return 0;
}
@@ -310,6 +309,11 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
alist->preferred == i ? '>' : '-',
&alist->addrs[i].transport);
}
+ seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt);
+ seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n",
+ vlserver->probe.flags, vlserver->probe.error,
+ vlserver->probe.abort_code,
+ atomic_read(&vlserver->probe_outstanding));
return 0;
}
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 6a0935cb822f..d83f13c44b92 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -281,7 +281,7 @@ bool afs_select_fileserver(struct afs_operation *op)
case -ETIME:
if (op->error != -EDESTADDRREQ)
goto iterate_address;
- /* Fall through */
+ fallthrough;
case -ERFKILL:
case -EADDRNOTAVAIL:
case -ENETUNREACH:
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 8fc8fb406a5a..8be709cb8542 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -568,7 +568,7 @@ static void afs_deliver_to_call(struct afs_call *call)
case -EIO:
pr_err("kAFS: Call %u in bad state %u\n",
call->debug_id, state);
- /* Fall through */
+ fallthrough;
case -ENODATA:
case -EBADMSG:
case -EMSGSIZE:
@@ -669,7 +669,7 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
ret = call->ret0;
call->ret0 = 0;
- /* Fall through */
+ fallthrough;
case -ECONNABORTED:
ac->responded = true;
break;
@@ -872,7 +872,7 @@ void afs_send_empty_reply(struct afs_call *call)
_debug("oom");
rxrpc_kernel_abort_call(net->socket, call->rxcall,
RX_USER_ABORT, -ENOMEM, "KOO");
- /* Fall through */
+ fallthrough;
default:
_leave(" [error]");
return;
diff --git a/fs/afs/server.c b/fs/afs/server.c
index e82e452e2612..684a2b02b9ff 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -550,7 +550,12 @@ void afs_manage_servers(struct work_struct *work)
_debug("manage %pU %u", &server->uuid, active);
- ASSERTIFCMP(purging, active, ==, 0);
+ if (purging) {
+ trace_afs_server(server, atomic_read(&server->ref),
+ active, afs_server_trace_purging);
+ if (active != 0)
+ pr_notice("Can't purge s=%08x\n", server->debug_id);
+ }
if (active == 0) {
time64_t expire_at = server->unuse_time;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index b552357b1d13..6c5900df6aa5 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -294,7 +294,8 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param)
cellnamesz, cellnamesz, cellname ?: "");
return PTR_ERR(cell);
}
- afs_put_cell(ctx->net, ctx->cell);
+ afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_parse);
+ afs_see_cell(cell, afs_cell_trace_see_source);
ctx->cell = cell;
}
@@ -389,8 +390,9 @@ static int afs_validate_fc(struct fs_context *fc)
_debug("switch to alias");
key_put(ctx->key);
ctx->key = NULL;
- cell = afs_get_cell(ctx->cell->alias_of);
- afs_put_cell(ctx->net, ctx->cell);
+ cell = afs_use_cell(ctx->cell->alias_of,
+ afs_cell_trace_use_fc_alias);
+ afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc);
ctx->cell = cell;
goto reget_key;
}
@@ -456,7 +458,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
ret = super_setup_bdi(sb);
if (ret)
return ret;
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
/* allocate the root inode and dentry */
if (as->dyn_root) {
@@ -508,7 +509,7 @@ static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc)
if (ctx->dyn_root) {
as->dyn_root = true;
} else {
- as->cell = afs_get_cell(ctx->cell);
+ as->cell = afs_use_cell(ctx->cell, afs_cell_trace_use_sbi);
as->volume = afs_get_volume(ctx->volume,
afs_volume_trace_get_alloc_sbi);
}
@@ -521,7 +522,7 @@ static void afs_destroy_sbi(struct afs_super_info *as)
if (as) {
struct afs_net *net = afs_net(as->net_ns);
afs_put_volume(net, as->volume, afs_volume_trace_put_destroy_sbi);
- afs_put_cell(net, as->cell);
+ afs_unuse_cell(net, as->cell, afs_cell_trace_unuse_sbi);
put_net(as->net_ns);
kfree(as);
}
@@ -607,7 +608,7 @@ static void afs_free_fc(struct fs_context *fc)
afs_destroy_sbi(fc->s_fs_info);
afs_put_volume(ctx->net, ctx->volume, afs_volume_trace_put_free_fc);
- afs_put_cell(ctx->net, ctx->cell);
+ afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc);
key_put(ctx->key);
kfree(ctx);
}
@@ -634,9 +635,7 @@ static int afs_init_fs_context(struct fs_context *fc)
ctx->net = afs_net(fc->net_ns);
/* Default to the workstation cell. */
- rcu_read_lock();
- cell = afs_lookup_cell_rcu(ctx->net, NULL, 0);
- rcu_read_unlock();
+ cell = afs_find_cell(ctx->net, NULL, 0, afs_cell_trace_use_fc);
if (IS_ERR(cell))
cell = NULL;
ctx->cell = cell;
diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c
index 5082ef04e99c..f04a80e4f5c3 100644
--- a/fs/afs/vl_alias.c
+++ b/fs/afs/vl_alias.c
@@ -177,7 +177,7 @@ static int afs_compare_cell_roots(struct afs_cell *cell)
is_alias:
rcu_read_unlock();
- cell->alias_of = afs_get_cell(p);
+ cell->alias_of = afs_use_cell(p, afs_cell_trace_use_alias);
return 1;
}
@@ -247,18 +247,18 @@ static int afs_query_for_alias(struct afs_cell *cell, struct key *key)
continue;
if (p->root_volume)
continue; /* Ignore cells that have a root.cell volume. */
- afs_get_cell(p);
+ afs_use_cell(p, afs_cell_trace_use_check_alias);
mutex_unlock(&cell->net->proc_cells_lock);
if (afs_query_for_alias_one(cell, key, p) != 0)
goto is_alias;
if (mutex_lock_interruptible(&cell->net->proc_cells_lock) < 0) {
- afs_put_cell(cell->net, p);
+ afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias);
return -ERESTARTSYS;
}
- afs_put_cell(cell->net, p);
+ afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias);
}
mutex_unlock(&cell->net->proc_cells_lock);
diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c
index 8fea54eba0c2..38b2ba1d9ec0 100644
--- a/fs/afs/vl_list.c
+++ b/fs/afs/vl_list.c
@@ -21,6 +21,7 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len,
rwlock_init(&vlserver->lock);
init_waitqueue_head(&vlserver->probe_wq);
spin_lock_init(&vlserver->probe_lock);
+ vlserver->rtt = UINT_MAX;
vlserver->name_len = name_len;
vlserver->port = port;
memcpy(vlserver->name, name, name_len);
diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
index e3aa013c2177..d1c7068b4346 100644
--- a/fs/afs/vl_probe.c
+++ b/fs/afs/vl_probe.c
@@ -11,15 +11,33 @@
#include "internal.h"
#include "protocol_yfs.h"
-static bool afs_vl_probe_done(struct afs_vlserver *server)
+
+/*
+ * Handle the completion of a set of probes.
+ */
+static void afs_finished_vl_probe(struct afs_vlserver *server)
{
- if (!atomic_dec_and_test(&server->probe_outstanding))
- return false;
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)) {
+ server->rtt = UINT_MAX;
+ clear_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags);
+ }
- wake_up_var(&server->probe_outstanding);
clear_bit_unlock(AFS_VLSERVER_FL_PROBING, &server->flags);
wake_up_bit(&server->flags, AFS_VLSERVER_FL_PROBING);
- return true;
+}
+
+/*
+ * Handle the completion of a probe RPC call.
+ */
+static void afs_done_one_vl_probe(struct afs_vlserver *server, bool wake_up)
+{
+ if (atomic_dec_and_test(&server->probe_outstanding)) {
+ afs_finished_vl_probe(server);
+ wake_up = true;
+ }
+
+ if (wake_up)
+ wake_up_all(&server->probe_wq);
}
/*
@@ -45,15 +63,20 @@ void afs_vlserver_probe_result(struct afs_call *call)
server->probe.error = 0;
goto responded;
case -ECONNABORTED:
- if (!server->probe.responded) {
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)) {
server->probe.abort_code = call->abort_code;
server->probe.error = ret;
}
goto responded;
case -ENOMEM:
case -ENONET:
- server->probe.local_failure = true;
- afs_io_error(call, afs_io_error_vl_probe_fail);
+ case -EKEYEXPIRED:
+ case -EKEYREVOKED:
+ case -EKEYREJECTED:
+ server->probe.flags |= AFS_VLSERVER_PROBE_LOCAL_FAILURE;
+ if (server->probe.error == 0)
+ server->probe.error = ret;
+ trace_afs_io_error(call->debug_id, ret, afs_io_error_vl_probe_fail);
goto out;
case -ECONNRESET: /* Responded, but call expired. */
case -ERFKILL:
@@ -67,12 +90,12 @@ void afs_vlserver_probe_result(struct afs_call *call)
default:
clear_bit(index, &alist->responded);
set_bit(index, &alist->failed);
- if (!server->probe.responded &&
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) &&
(server->probe.error == 0 ||
server->probe.error == -ETIMEDOUT ||
server->probe.error == -ETIME))
server->probe.error = ret;
- afs_io_error(call, afs_io_error_vl_probe_fail);
+ trace_afs_io_error(call->debug_id, ret, afs_io_error_vl_probe_fail);
goto out;
}
@@ -81,39 +104,36 @@ responded:
clear_bit(index, &alist->failed);
if (call->service_id == YFS_VL_SERVICE) {
- server->probe.is_yfs = true;
+ server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS;
set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
alist->addrs[index].srx_service = call->service_id;
} else {
- server->probe.not_yfs = true;
- if (!server->probe.is_yfs) {
+ server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS;
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) {
clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
alist->addrs[index].srx_service = call->service_id;
}
}
- rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall);
- if (rtt_us < server->probe.rtt) {
+ if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) &&
+ rtt_us < server->probe.rtt) {
server->probe.rtt = rtt_us;
+ server->rtt = rtt_us;
alist->preferred = index;
- have_result = true;
}
smp_wmb(); /* Set rtt before responded. */
- server->probe.responded = true;
+ server->probe.flags |= AFS_VLSERVER_PROBE_RESPONDED;
set_bit(AFS_VLSERVER_FL_PROBED, &server->flags);
+ set_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags);
+ have_result = true;
out:
spin_unlock(&server->probe_lock);
_debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
server_index, index, &alist->addrs[index].transport, rtt_us, ret);
- have_result |= afs_vl_probe_done(server);
- if (have_result) {
- server->probe.have_result = true;
- wake_up_var(&server->probe.have_result);
- wake_up_all(&server->probe_wq);
- }
+ afs_done_one_vl_probe(server, have_result);
}
/*
@@ -151,11 +171,10 @@ static bool afs_do_probe_vlserver(struct afs_net *net,
in_progress = true;
} else {
afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code);
+ afs_done_one_vl_probe(server, false);
}
}
- if (!in_progress)
- afs_vl_probe_done(server);
return in_progress;
}
@@ -193,7 +212,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist,
{
struct wait_queue_entry *waits;
struct afs_vlserver *server;
- unsigned int rtt = UINT_MAX;
+ unsigned int rtt = UINT_MAX, rtt_s;
bool have_responders = false;
int pref = -1, i;
@@ -205,7 +224,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist,
server = vllist->servers[i].server;
if (!test_bit(AFS_VLSERVER_FL_PROBING, &server->flags))
__clear_bit(i, &untried);
- if (server->probe.responded)
+ if (server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)
have_responders = true;
}
}
@@ -231,7 +250,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist,
for (i = 0; i < vllist->nr_servers; i++) {
if (test_bit(i, &untried)) {
server = vllist->servers[i].server;
- if (server->probe.responded)
+ if (server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)
goto stop;
if (test_bit(AFS_VLSERVER_FL_PROBING, &server->flags))
still_probing = true;
@@ -249,10 +268,11 @@ stop:
for (i = 0; i < vllist->nr_servers; i++) {
if (test_bit(i, &untried)) {
server = vllist->servers[i].server;
- if (server->probe.responded &&
- server->probe.rtt < rtt) {
+ rtt_s = READ_ONCE(server->rtt);
+ if (test_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags) &&
+ rtt_s < rtt) {
pref = i;
- rtt = server->probe.rtt;
+ rtt = rtt_s;
}
remove_wait_queue(&server->probe_wq, &waits[i]);
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
index f405ca8b240a..488e58490b16 100644
--- a/fs/afs/vl_rotate.c
+++ b/fs/afs/vl_rotate.c
@@ -45,7 +45,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
cell->dns_expiry <= ktime_get_real_seconds()) {
dns_lookup_count = smp_load_acquire(&cell->dns_lookup_count);
set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags);
- queue_work(afs_wq, &cell->manager);
+ afs_queue_cell(cell, afs_cell_trace_get_queue_dns);
if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
if (wait_var_event_interruptible(
@@ -192,7 +192,8 @@ pick_server:
for (i = 0; i < vc->server_list->nr_servers; i++) {
struct afs_vlserver *s = vc->server_list->servers[i].server;
- if (!test_bit(i, &vc->untried) || !s->probe.responded)
+ if (!test_bit(i, &vc->untried) ||
+ !test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
continue;
if (s->probe.rtt < rtt) {
vc->index = i;
@@ -262,10 +263,14 @@ no_more_servers:
for (i = 0; i < vc->server_list->nr_servers; i++) {
struct afs_vlserver *s = vc->server_list->servers[i].server;
+ if (test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
+ e.responded = true;
afs_prioritise_error(&e, READ_ONCE(s->probe.error),
s->probe.abort_code);
}
+ error = e.error;
+
failed_set_error:
vc->error = error;
failed:
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index fd82850cd424..dc9327332f06 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -196,7 +196,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
/* Extract the returned uuid, uniquifier, nentries and
* blkaddrs size */
- /* Fall through */
+ fallthrough;
case 1:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -221,7 +221,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
count = min(call->count, 4U);
afs_extract_to_buf(call, count * sizeof(__be32));
- /* Fall through - and extract entries */
+ fallthrough; /* and extract entries */
case 2:
ret = afs_extract_data(call, call->count > 4);
if (ret < 0)
@@ -324,7 +324,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call)
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through - and extract the capabilities word count */
+ fallthrough; /* and extract the capabilities word count */
case 1:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -337,7 +337,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call)
call->unmarshall++;
afs_extract_discard(call, count * sizeof(__be32));
- /* Fall through - and extract capabilities words */
+ fallthrough; /* and extract capabilities words */
case 2:
ret = afs_extract_data(call, false);
if (ret < 0)
@@ -436,7 +436,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
/* Extract the returned uuid, uniquifier, fsEndpoints count and
* either the first fsEndpoint type or the volEndpoints
* count if there are no fsEndpoints. */
- /* Fall through */
+ fallthrough;
case 1:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -475,7 +475,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
afs_extract_to_buf(call, size);
call->unmarshall = 2;
- /* Fall through - and extract fsEndpoints[] entries */
+ fallthrough; /* and extract fsEndpoints[] entries */
case 2:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -526,7 +526,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
* extract the type of the next endpoint when we extract the
* data of the current one, but this is the first...
*/
- /* Fall through */
+ fallthrough;
case 3:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -552,7 +552,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
afs_extract_to_buf(call, size);
call->unmarshall = 4;
- /* Fall through - and extract volEndpoints[] entries */
+ fallthrough; /* and extract volEndpoints[] entries */
case 4:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -587,7 +587,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
afs_extract_discard(call, 0);
call->unmarshall = 5;
- /* Fall through - Done */
+ fallthrough; /* Done */
case 5:
ret = afs_extract_data(call, false);
if (ret < 0)
@@ -663,7 +663,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call)
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through - and extract the cell name length */
+ fallthrough; /* and extract the cell name length */
case 1:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -685,7 +685,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call)
afs_extract_begin(call, cell_name, namesz);
call->unmarshall++;
- /* Fall through - and extract cell name */
+ fallthrough; /* and extract cell name */
case 2:
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -694,7 +694,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call)
afs_extract_discard(call, call->count2);
call->unmarshall++;
- /* Fall through - and extract padding */
+ fallthrough; /* and extract padding */
case 3:
ret = afs_extract_data(call, false);
if (ret < 0)
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 9bc0509e3634..f84194b791d3 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -83,7 +83,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
volume->vid = vldb->vid[params->type];
volume->update_at = ktime_get_real_seconds() + afs_volume_record_life;
- volume->cell = afs_get_cell(params->cell);
+ volume->cell = afs_get_cell(params->cell, afs_cell_trace_get_vol);
volume->type = params->type;
volume->type_force = params->force;
volume->name_len = vldb->name_len;
@@ -106,7 +106,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
return volume;
error_1:
- afs_put_cell(params->net, volume->cell);
+ afs_put_cell(volume->cell, afs_cell_trace_put_vol);
kfree(volume);
error_0:
return ERR_PTR(ret);
@@ -228,7 +228,7 @@ static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume)
afs_remove_volume_from_cell(volume);
afs_put_serverlist(net, rcu_access_pointer(volume->servers));
- afs_put_cell(net, volume->cell);
+ afs_put_cell(volume->cell, afs_cell_trace_put_vol);
trace_afs_volume(volume->vid, atomic_read(&volume->usage),
afs_volume_trace_free);
kfree_rcu(volume, rcu);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index a121c247d95a..da12abd6db21 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -609,7 +609,7 @@ no_more:
default:
pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret);
- /* Fall through */
+ fallthrough;
case -EACCES:
case -EPERM:
case -ENOKEY:
@@ -738,11 +738,21 @@ static int afs_writepages_region(struct address_space *mapping,
int afs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct afs_vnode *vnode = AFS_FS_I(mapping->host);
pgoff_t start, end, next;
int ret;
_enter("");
+ /* We have to be careful as we can end up racing with setattr()
+ * truncating the pagecache since the caller doesn't take a lock here
+ * to prevent it.
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ down_read(&vnode->validate_lock);
+ else if (!down_read_trylock(&vnode->validate_lock))
+ return 0;
+
if (wbc->range_cyclic) {
start = mapping->writeback_index;
end = -1;
@@ -762,6 +772,7 @@ int afs_writepages(struct address_space *mapping,
ret = afs_writepages_region(mapping, wbc, start, end, &next);
}
+ up_read(&vnode->validate_lock);
_leave(" = %d", ret);
return ret;
}
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 8c24fdc899e3..3b1239b7e90d 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -373,7 +373,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
req->offset = req->pos & (PAGE_SIZE - 1);
afs_extract_to_tmp64(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the returned data length */
case 1:
@@ -401,7 +401,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
call->bvec[0].bv_page = req->pages[req->index];
iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size);
ASSERTCMP(size, <=, PAGE_SIZE);
- /* Fall through */
+ fallthrough;
/* extract the returned data */
case 2:
@@ -428,7 +428,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
/* Discard any excess data the server gave us */
afs_extract_discard(call, req->actual_len - req->len);
call->unmarshall = 3;
- /* Fall through */
+ fallthrough;
case 3:
_debug("extract discard %zu/%llu",
@@ -444,7 +444,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
sizeof(struct yfs_xdr_YFSFetchStatus) +
sizeof(struct yfs_xdr_YFSCallBack) +
sizeof(struct yfs_xdr_YFSVolSync));
- /* Fall through */
+ fallthrough;
/* extract the metadata */
case 4:
@@ -461,7 +461,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
req->file_size = vp->scb.status.size;
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 5:
break;
@@ -1262,7 +1262,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
case 0:
call->unmarshall++;
afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchVolumeStatus));
- /* Fall through */
+ fallthrough;
/* extract the returned status record */
case 1:
@@ -1275,7 +1275,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
xdr_decode_YFSFetchVolumeStatus(&bp, &op->volstatus.vs);
call->unmarshall++;
afs_extract_to_tmp(call);
- /* Fall through */
+ fallthrough;
/* extract the volume name length */
case 2:
@@ -1290,7 +1290,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
size = (call->count + 3) & ~3; /* It's padded */
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the volume name */
case 3:
@@ -1304,7 +1304,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
_debug("volname '%s'", p);
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the offline message length */
case 4:
@@ -1319,7 +1319,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
size = (call->count + 3) & ~3; /* It's padded */
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the offline message */
case 5:
@@ -1334,7 +1334,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the message of the day length */
case 6:
@@ -1349,7 +1349,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
size = (call->count + 3) & ~3; /* It's padded */
afs_extract_to_buf(call, size);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the message of the day */
case 7:
@@ -1363,7 +1363,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
_debug("motd '%s'", p);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 8:
break;
@@ -1622,7 +1622,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
case 0:
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the file status count and array in two steps */
case 1:
@@ -1640,7 +1640,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->unmarshall++;
more_counts:
afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchStatus));
- /* Fall through */
+ fallthrough;
case 2:
_debug("extract status array %u", call->count);
@@ -1670,7 +1670,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->count = 0;
call->unmarshall++;
afs_extract_to_tmp(call);
- /* Fall through */
+ fallthrough;
/* Extract the callback count and array in two steps */
case 3:
@@ -1687,7 +1687,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->unmarshall++;
more_cbs:
afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSCallBack));
- /* Fall through */
+ fallthrough;
case 4:
_debug("extract CB array");
@@ -1716,7 +1716,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSVolSync));
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 5:
ret = afs_extract_data(call, false);
@@ -1727,7 +1727,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
xdr_decode_YFSVolSync(&bp, &op->volsync);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 6:
break;
@@ -1804,7 +1804,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
case 0:
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the file ACL length */
case 1:
@@ -1826,7 +1826,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
afs_extract_discard(call, size);
}
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the file ACL */
case 2:
@@ -1836,7 +1836,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
afs_extract_to_tmp(call);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the volume ACL length */
case 3:
@@ -1858,7 +1858,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
afs_extract_discard(call, size);
}
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* Extract the volume ACL */
case 4:
@@ -1871,7 +1871,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
sizeof(struct yfs_xdr_YFSFetchStatus) +
sizeof(struct yfs_xdr_YFSVolSync));
call->unmarshall++;
- /* Fall through */
+ fallthrough;
/* extract the metadata */
case 5:
@@ -1886,7 +1886,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
xdr_decode_YFSVolSync(&bp, &op->volsync);
call->unmarshall++;
- /* Fall through */
+ fallthrough;
case 6:
break;
diff --git a/fs/aio.c b/fs/aio.c
index 5736bff48e9e..c45c20d87538 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1489,12 +1489,8 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
*iovec = NULL;
return ret;
}
-#ifdef CONFIG_COMPAT
- if (compat)
- return compat_import_iovec(rw, buf, len, UIO_FASTIOV, iovec,
- iter);
-#endif
- return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter);
+
+ return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat);
}
static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
@@ -1511,7 +1507,7 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
* may be already running. Just fail this IO with EINTR.
*/
ret = -EINTR;
- /*FALLTHRU*/
+ fallthrough;
default:
req->ki_complete(req, ret, 0);
}
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index 75105f45c51a..322b7dfb4ea0 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -8,6 +8,7 @@
#include <linux/compat.h>
#include <linux/syscalls.h>
#include <linux/magic.h>
+#include <linux/nospec.h>
#include "autofs_i.h"
@@ -563,7 +564,7 @@ out:
static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
{
- static ioctl_fn _ioctls[] = {
+ static const ioctl_fn _ioctls[] = {
autofs_dev_ioctl_version,
autofs_dev_ioctl_protover,
autofs_dev_ioctl_protosubver,
@@ -581,7 +582,10 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
};
unsigned int idx = cmd_idx(cmd);
- return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx];
+ if (idx >= ARRAY_SIZE(_ioctls))
+ return NULL;
+ idx = array_index_nospec(idx, ARRAY_SIZE(_ioctls));
+ return _ioctls[idx];
}
/* ioctl dispatcher */
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
index 74c886f7c51c..5ced859dac53 100644
--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -53,7 +53,7 @@ static int autofs_write(struct autofs_sb_info *sbi,
mutex_lock(&sbi->pipe_mutex);
while (bytes) {
- wr = kernel_write(file, data, bytes, &file->f_pos);
+ wr = __kernel_write(file, data, bytes, NULL);
if (wr <= 0)
break;
data += wr;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 13d053982dd7..b6b3d052ca86 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/fs.h>
+#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/errno.h>
@@ -309,7 +310,10 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
* Grow the stack manually; some architectures have a limit on how
* far ahead a user-space access may be in order to grow the stack.
*/
+ if (mmap_read_lock_killable(mm))
+ return -EINTR;
vma = find_extend_vma(mm, bprm->p);
+ mmap_read_unlock(mm);
if (!vma)
return -EFAULT;
@@ -421,6 +425,26 @@ static int elf_read(struct file *file, void *buf, size_t len, loff_t pos)
return 0;
}
+static unsigned long maximum_alignment(struct elf_phdr *cmds, int nr)
+{
+ unsigned long alignment = 0;
+ int i;
+
+ for (i = 0; i < nr; i++) {
+ if (cmds[i].p_type == PT_LOAD) {
+ unsigned long p_align = cmds[i].p_align;
+
+ /* skip non-power of two alignments as invalid */
+ if (!is_power_of_2(p_align))
+ continue;
+ alignment = max(alignment, p_align);
+ }
+ }
+
+ /* ensure we align to at least one page */
+ return ELF_PAGEALIGN(alignment);
+}
+
/**
* load_elf_phdrs() - load ELF program headers
* @elf_ex: ELF header of the binary whose program headers should be loaded
@@ -1008,6 +1032,7 @@ out_free_interp:
int elf_prot, elf_flags;
unsigned long k, vaddr;
unsigned long total_size = 0;
+ unsigned long alignment;
if (elf_ppnt->p_type != PT_LOAD)
continue;
@@ -1086,6 +1111,9 @@ out_free_interp:
load_bias = ELF_ET_DYN_BASE;
if (current->flags & PF_RANDOMIZE)
load_bias += arch_mmap_rnd();
+ alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
+ if (alignment)
+ load_bias &= ~(alignment - 1);
elf_flags |= MAP_FIXED;
} else
load_bias = 0;
@@ -1389,126 +1417,6 @@ out:
* Jeremy Fitzhardinge <jeremy@sw.oz.au>
*/
-/*
- * The purpose of always_dump_vma() is to make sure that special kernel mappings
- * that are useful for post-mortem analysis are included in every core dump.
- * In that way we ensure that the core dump is fully interpretable later
- * without matching up the same kernel and hardware config to see what PC values
- * meant. These special mappings include - vDSO, vsyscall, and other
- * architecture specific mappings
- */
-static bool always_dump_vma(struct vm_area_struct *vma)
-{
- /* Any vsyscall mappings? */
- if (vma == get_gate_vma(vma->vm_mm))
- return true;
-
- /*
- * Assume that all vmas with a .name op should always be dumped.
- * If this changes, a new vm_ops field can easily be added.
- */
- if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma))
- return true;
-
- /*
- * arch_vma_name() returns non-NULL for special architecture mappings,
- * such as vDSO sections.
- */
- if (arch_vma_name(vma))
- return true;
-
- return false;
-}
-
-/*
- * Decide what to dump of a segment, part, all or none.
- */
-static unsigned long vma_dump_size(struct vm_area_struct *vma,
- unsigned long mm_flags)
-{
-#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type))
-
- /* always dump the vdso and vsyscall sections */
- if (always_dump_vma(vma))
- goto whole;
-
- if (vma->vm_flags & VM_DONTDUMP)
- return 0;
-
- /* support for DAX */
- if (vma_is_dax(vma)) {
- if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED))
- goto whole;
- if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE))
- goto whole;
- return 0;
- }
-
- /* Hugetlb memory check */
- if (is_vm_hugetlb_page(vma)) {
- if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
- goto whole;
- if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
- goto whole;
- return 0;
- }
-
- /* Do not dump I/O mapped devices or special mappings */
- if (vma->vm_flags & VM_IO)
- return 0;
-
- /* By default, dump shared memory if mapped from an anonymous file. */
- if (vma->vm_flags & VM_SHARED) {
- if (file_inode(vma->vm_file)->i_nlink == 0 ?
- FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED))
- goto whole;
- return 0;
- }
-
- /* Dump segments that have been written to. */
- if (vma->anon_vma && FILTER(ANON_PRIVATE))
- goto whole;
- if (vma->vm_file == NULL)
- return 0;
-
- if (FILTER(MAPPED_PRIVATE))
- goto whole;
-
- /*
- * If this looks like the beginning of a DSO or executable mapping,
- * check for an ELF header. If we find one, dump the first page to
- * aid in determining what was mapped here.
- */
- if (FILTER(ELF_HEADERS) &&
- vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
- u32 __user *header = (u32 __user *) vma->vm_start;
- u32 word;
- /*
- * Doing it this way gets the constant folded by GCC.
- */
- union {
- u32 cmp;
- char elfmag[SELFMAG];
- } magic;
- BUILD_BUG_ON(SELFMAG != sizeof word);
- magic.elfmag[EI_MAG0] = ELFMAG0;
- magic.elfmag[EI_MAG1] = ELFMAG1;
- magic.elfmag[EI_MAG2] = ELFMAG2;
- magic.elfmag[EI_MAG3] = ELFMAG3;
- if (unlikely(get_user(word, header)))
- word = 0;
- if (word == magic.cmp)
- return PAGE_SIZE;
- }
-
-#undef FILTER
-
- return 0;
-
-whole:
- return vma->vm_end - vma->vm_start;
-}
-
/* An ELF note in memory */
struct memelfnote
{
@@ -2220,32 +2128,6 @@ static void free_note_info(struct elf_note_info *info)
#endif
-static struct vm_area_struct *first_vma(struct task_struct *tsk,
- struct vm_area_struct *gate_vma)
-{
- struct vm_area_struct *ret = tsk->mm->mmap;
-
- if (ret)
- return ret;
- return gate_vma;
-}
-/*
- * Helper function for iterating across a vma list. It ensures that the caller
- * will visit `gate_vma' prior to terminating the search.
- */
-static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
- struct vm_area_struct *gate_vma)
-{
- struct vm_area_struct *ret;
-
- ret = this_vma->vm_next;
- if (ret)
- return ret;
- if (this_vma == gate_vma)
- return NULL;
- return gate_vma;
-}
-
static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
elf_addr_t e_shoff, int segs)
{
@@ -2272,9 +2154,8 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
static int elf_core_dump(struct coredump_params *cprm)
{
int has_dumped = 0;
- int segs, i;
- size_t vma_data_size = 0;
- struct vm_area_struct *vma, *gate_vma;
+ int vma_count, segs, i;
+ size_t vma_data_size;
struct elfhdr elf;
loff_t offset = 0, dataoff;
struct elf_note_info info = { };
@@ -2282,30 +2163,16 @@ static int elf_core_dump(struct coredump_params *cprm)
struct elf_shdr *shdr4extnum = NULL;
Elf_Half e_phnum;
elf_addr_t e_shoff;
- elf_addr_t *vma_filesz = NULL;
+ struct core_vma_metadata *vma_meta;
+
+ if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
+ return 0;
- /*
- * We no longer stop all VM operations.
- *
- * This is because those proceses that could possibly change map_count
- * or the mmap / vma pages are now blocked in do_exit on current
- * finishing this core dump.
- *
- * Only ptrace can touch these memory addresses, but it doesn't change
- * the map_count or the pages allocated. So no possibility of crashing
- * exists while dumping the mm->vm_next areas to the core file.
- */
-
/*
* The number of segs are recored into ELF header as 16bit value.
* Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
*/
- segs = current->mm->map_count;
- segs += elf_core_extra_phdrs();
-
- gate_vma = get_gate_vma(current->mm);
- if (gate_vma != NULL)
- segs++;
+ segs = vma_count + elf_core_extra_phdrs();
/* for notes section */
segs++;
@@ -2343,24 +2210,6 @@ static int elf_core_dump(struct coredump_params *cprm)
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- /*
- * Zero vma process will get ZERO_SIZE_PTR here.
- * Let coredump continue for register state at least.
- */
- vma_filesz = kvmalloc(array_size(sizeof(*vma_filesz), (segs - 1)),
- GFP_KERNEL);
- if (!vma_filesz)
- goto end_coredump;
-
- for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
- vma = next_vma(vma, gate_vma)) {
- unsigned long dump_size;
-
- dump_size = vma_dump_size(vma, cprm->mm_flags);
- vma_filesz[i++] = dump_size;
- vma_data_size += dump_size;
- }
-
offset += vma_data_size;
offset += elf_core_extra_data_size();
e_shoff = offset;
@@ -2381,21 +2230,23 @@ static int elf_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* Write program headers for segments dump */
- for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
- vma = next_vma(vma, gate_vma)) {
+ for (i = 0; i < vma_count; i++) {
+ struct core_vma_metadata *meta = vma_meta + i;
struct elf_phdr phdr;
phdr.p_type = PT_LOAD;
phdr.p_offset = offset;
- phdr.p_vaddr = vma->vm_start;
+ phdr.p_vaddr = meta->start;
phdr.p_paddr = 0;
- phdr.p_filesz = vma_filesz[i++];
- phdr.p_memsz = vma->vm_end - vma->vm_start;
+ phdr.p_filesz = meta->dump_size;
+ phdr.p_memsz = meta->end - meta->start;
offset += phdr.p_filesz;
- phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
- if (vma->vm_flags & VM_WRITE)
+ phdr.p_flags = 0;
+ if (meta->flags & VM_READ)
+ phdr.p_flags |= PF_R;
+ if (meta->flags & VM_WRITE)
phdr.p_flags |= PF_W;
- if (vma->vm_flags & VM_EXEC)
+ if (meta->flags & VM_EXEC)
phdr.p_flags |= PF_X;
phdr.p_align = ELF_EXEC_PAGESIZE;
@@ -2417,28 +2268,11 @@ static int elf_core_dump(struct coredump_params *cprm)
if (!dump_skip(cprm, dataoff - cprm->pos))
goto end_coredump;
- for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
- vma = next_vma(vma, gate_vma)) {
- unsigned long addr;
- unsigned long end;
+ for (i = 0; i < vma_count; i++) {
+ struct core_vma_metadata *meta = vma_meta + i;
- end = vma->vm_start + vma_filesz[i++];
-
- for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
- struct page *page;
- int stop;
-
- page = get_dump_page(addr);
- if (page) {
- void *kaddr = kmap(page);
- stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
- kunmap(page);
- put_page(page);
- } else
- stop = !dump_skip(cprm, PAGE_SIZE);
- if (stop)
- goto end_coredump;
- }
+ if (!dump_user_range(cprm, meta->start, meta->dump_size))
+ goto end_coredump;
}
dump_truncate(cprm);
@@ -2453,7 +2287,7 @@ static int elf_core_dump(struct coredump_params *cprm)
end_coredump:
free_note_info(&info);
kfree(shdr4extnum);
- kvfree(vma_filesz);
+ kvfree(vma_meta);
kfree(phdr4note);
return has_dumped;
}
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 50f845702b92..be4062b8ba75 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1215,76 +1215,6 @@ struct elf_prstatus_fdpic
int pr_fpvalid; /* True if math co-processor being used. */
};
-/*
- * Decide whether a segment is worth dumping; default is yes to be
- * sure (missing info is worse than too much; etc).
- * Personally I'd include everything, and use the coredump limit...
- *
- * I think we should skip something. But I am not sure how. H.J.
- */
-static int maydump(struct vm_area_struct *vma, unsigned long mm_flags)
-{
- int dump_ok;
-
- /* Do not dump I/O mapped devices or special mappings */
- if (vma->vm_flags & VM_IO) {
- kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags);
- return 0;
- }
-
- /* If we may not read the contents, don't allow us to dump
- * them either. "dump_write()" can't handle it anyway.
- */
- if (!(vma->vm_flags & VM_READ)) {
- kdcore("%08lx: %08lx: no (!read)", vma->vm_start, vma->vm_flags);
- return 0;
- }
-
- /* support for DAX */
- if (vma_is_dax(vma)) {
- if (vma->vm_flags & VM_SHARED) {
- dump_ok = test_bit(MMF_DUMP_DAX_SHARED, &mm_flags);
- kdcore("%08lx: %08lx: %s (DAX shared)", vma->vm_start,
- vma->vm_flags, dump_ok ? "yes" : "no");
- } else {
- dump_ok = test_bit(MMF_DUMP_DAX_PRIVATE, &mm_flags);
- kdcore("%08lx: %08lx: %s (DAX private)", vma->vm_start,
- vma->vm_flags, dump_ok ? "yes" : "no");
- }
- return dump_ok;
- }
-
- /* By default, dump shared memory if mapped from an anonymous file. */
- if (vma->vm_flags & VM_SHARED) {
- if (file_inode(vma->vm_file)->i_nlink == 0) {
- dump_ok = test_bit(MMF_DUMP_ANON_SHARED, &mm_flags);
- kdcore("%08lx: %08lx: %s (share)", vma->vm_start,
- vma->vm_flags, dump_ok ? "yes" : "no");
- return dump_ok;
- }
-
- dump_ok = test_bit(MMF_DUMP_MAPPED_SHARED, &mm_flags);
- kdcore("%08lx: %08lx: %s (share)", vma->vm_start,
- vma->vm_flags, dump_ok ? "yes" : "no");
- return dump_ok;
- }
-
-#ifdef CONFIG_MMU
- /* By default, if it hasn't been written to, don't write it out */
- if (!vma->anon_vma) {
- dump_ok = test_bit(MMF_DUMP_MAPPED_PRIVATE, &mm_flags);
- kdcore("%08lx: %08lx: %s (!anon)", vma->vm_start,
- vma->vm_flags, dump_ok ? "yes" : "no");
- return dump_ok;
- }
-#endif
-
- dump_ok = test_bit(MMF_DUMP_ANON_PRIVATE, &mm_flags);
- kdcore("%08lx: %08lx: %s", vma->vm_start, vma->vm_flags,
- dump_ok ? "yes" : "no");
- return dump_ok;
-}
-
/* An ELF note in memory */
struct memelfnote
{
@@ -1524,54 +1454,21 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
/*
* dump the segments for an MMU process
*/
-static bool elf_fdpic_dump_segments(struct coredump_params *cprm)
+static bool elf_fdpic_dump_segments(struct coredump_params *cprm,
+ struct core_vma_metadata *vma_meta,
+ int vma_count)
{
- struct vm_area_struct *vma;
+ int i;
- for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
-#ifdef CONFIG_MMU
- unsigned long addr;
-#endif
+ for (i = 0; i < vma_count; i++) {
+ struct core_vma_metadata *meta = vma_meta + i;
- if (!maydump(vma, cprm->mm_flags))
- continue;
-
-#ifdef CONFIG_MMU
- for (addr = vma->vm_start; addr < vma->vm_end;
- addr += PAGE_SIZE) {
- bool res;
- struct page *page = get_dump_page(addr);
- if (page) {
- void *kaddr = kmap(page);
- res = dump_emit(cprm, kaddr, PAGE_SIZE);
- kunmap(page);
- put_page(page);
- } else {
- res = dump_skip(cprm, PAGE_SIZE);
- }
- if (!res)
- return false;
- }
-#else
- if (!dump_emit(cprm, (void *) vma->vm_start,
- vma->vm_end - vma->vm_start))
+ if (!dump_user_range(cprm, meta->start, meta->dump_size))
return false;
-#endif
}
return true;
}
-static size_t elf_core_vma_data_size(unsigned long mm_flags)
-{
- struct vm_area_struct *vma;
- size_t size = 0;
-
- for (vma = current->mm->mmap; vma; vma = vma->vm_next)
- if (maydump(vma, mm_flags))
- size += vma->vm_end - vma->vm_start;
- return size;
-}
-
/*
* Actual dumper
*
@@ -1582,9 +1479,8 @@ static size_t elf_core_vma_data_size(unsigned long mm_flags)
static int elf_fdpic_core_dump(struct coredump_params *cprm)
{
int has_dumped = 0;
- int segs;
+ int vma_count, segs;
int i;
- struct vm_area_struct *vma;
struct elfhdr *elf = NULL;
loff_t offset = 0, dataoff;
struct memelfnote psinfo_note, auxv_note;
@@ -1598,18 +1494,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
elf_addr_t e_shoff;
struct core_thread *ct;
struct elf_thread_status *tmp;
-
- /*
- * We no longer stop all VM operations.
- *
- * This is because those proceses that could possibly change map_count
- * or the mmap / vma pages are now blocked in do_exit on current
- * finishing this core dump.
- *
- * Only ptrace can touch these memory addresses, but it doesn't change
- * the map_count or the pages allocated. So no possibility of crashing
- * exists while dumping the mm->vm_next areas to the core file.
- */
+ struct core_vma_metadata *vma_meta = NULL;
+ size_t vma_data_size;
/* alloc memory for large data structures: too large to be on stack */
elf = kmalloc(sizeof(*elf), GFP_KERNEL);
@@ -1619,6 +1505,9 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
if (!psinfo)
goto end_coredump;
+ if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
+ goto end_coredump;
+
for (ct = current->mm->core_state->dumper.next;
ct; ct = ct->next) {
tmp = elf_dump_thread_status(cprm->siginfo->si_signo,
@@ -1638,8 +1527,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
tmp->next = thread_list;
thread_list = tmp;
- segs = current->mm->map_count;
- segs += elf_core_extra_phdrs();
+ segs = vma_count + elf_core_extra_phdrs();
/* for notes section */
segs++;
@@ -1684,7 +1572,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
/* Page-align dumped data */
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- offset += elf_core_vma_data_size(cprm->mm_flags);
+ offset += vma_data_size;
offset += elf_core_extra_data_size();
e_shoff = offset;
@@ -1704,23 +1592,26 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* write program headers for segments dump */
- for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
+ for (i = 0; i < vma_count; i++) {
+ struct core_vma_metadata *meta = vma_meta + i;
struct elf_phdr phdr;
size_t sz;
- sz = vma->vm_end - vma->vm_start;
+ sz = meta->end - meta->start;
phdr.p_type = PT_LOAD;
phdr.p_offset = offset;
- phdr.p_vaddr = vma->vm_start;
+ phdr.p_vaddr = meta->start;
phdr.p_paddr = 0;
- phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0;
+ phdr.p_filesz = meta->dump_size;
phdr.p_memsz = sz;
offset += phdr.p_filesz;
- phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
- if (vma->vm_flags & VM_WRITE)
+ phdr.p_flags = 0;
+ if (meta->flags & VM_READ)
+ phdr.p_flags |= PF_R;
+ if (meta->flags & VM_WRITE)
phdr.p_flags |= PF_W;
- if (vma->vm_flags & VM_EXEC)
+ if (meta->flags & VM_EXEC)
phdr.p_flags |= PF_X;
phdr.p_align = ELF_EXEC_PAGESIZE;
@@ -1752,7 +1643,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
if (!dump_skip(cprm, dataoff - cprm->pos))
goto end_coredump;
- if (!elf_fdpic_dump_segments(cprm))
+ if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count))
goto end_coredump;
if (!elf_core_write_extra_data(cprm))
@@ -1776,6 +1667,7 @@ end_coredump:
thread_list = thread_list->next;
kfree(tmp);
}
+ kvfree(vma_meta);
kfree(phdr4note);
kfree(elf);
kfree(psinfo);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index f2f9086ebe98..b9c658e0548e 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -576,7 +576,7 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
- len = data_len + extra;
+ len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
len = PAGE_ALIGN(len);
realdatastart = vm_mmap(NULL, 0, len,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
@@ -590,7 +590,9 @@ static int load_flat_file(struct linux_binprm *bprm,
vm_munmap(textpos, text_len);
goto err;
}
- datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN);
+ datapos = ALIGN(realdatastart +
+ MAX_SHARED_LIBS * sizeof(unsigned long),
+ FLAT_DATA_ALIGN);
pr_debug("Allocated data+bss+stack (%u bytes): %lx\n",
data_len + bss_len + stack_len, datapos);
@@ -620,7 +622,7 @@ static int load_flat_file(struct linux_binprm *bprm,
memp_size = len;
} else {
- len = text_len + data_len + extra;
+ len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(u32);
len = PAGE_ALIGN(len);
textpos = vm_mmap(NULL, 0, len,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
@@ -635,7 +637,9 @@ static int load_flat_file(struct linux_binprm *bprm,
}
realdatastart = textpos + ntohl(hdr->data_start);
- datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN);
+ datapos = ALIGN(realdatastart +
+ MAX_SHARED_LIBS * sizeof(u32),
+ FLAT_DATA_ALIGN);
reloc = (__be32 __user *)
(datapos + (ntohl(hdr->reloc_start) - text_len));
@@ -652,9 +656,8 @@ static int load_flat_file(struct linux_binprm *bprm,
(text_len + full_data
- sizeof(struct flat_hdr)),
0);
- if (datapos != realdatastart)
- memmove((void *)datapos, (void *)realdatastart,
- full_data);
+ memmove((void *) datapos, (void *) realdatastart,
+ full_data);
#else
/*
* This is used on MMU systems mainly for testing.
@@ -710,7 +713,8 @@ static int load_flat_file(struct linux_binprm *bprm,
if (IS_ERR_VALUE(result)) {
ret = result;
pr_err("Unable to read code+data+bss, errno %d\n", ret);
- vm_munmap(textpos, text_len + data_len + extra);
+ vm_munmap(textpos, text_len + data_len + extra +
+ MAX_SHARED_LIBS * sizeof(u32));
goto err;
}
}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8ae833e00443..9e84b1928b94 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -103,6 +103,35 @@ void invalidate_bdev(struct block_device *bdev)
}
EXPORT_SYMBOL(invalidate_bdev);
+/*
+ * Drop all buffers & page cache for given bdev range. This function bails
+ * with error if bdev has other exclusive owner (such as filesystem).
+ */
+int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
+ loff_t lstart, loff_t lend)
+{
+ struct block_device *claimed_bdev = NULL;
+ int err;
+
+ /*
+ * If we don't hold exclusive handle for the device, upgrade to it
+ * while we discard the buffer cache to avoid discarding buffers
+ * under live filesystem.
+ */
+ if (!(mode & FMODE_EXCL)) {
+ claimed_bdev = bdev->bd_contains;
+ err = bd_prepare_to_claim(bdev, claimed_bdev,
+ truncate_bdev_range);
+ if (err)
+ return err;
+ }
+ truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
+ if (claimed_bdev)
+ bd_abort_claiming(bdev, claimed_bdev, truncate_bdev_range);
+ return 0;
+}
+EXPORT_SYMBOL(truncate_bdev_range);
+
static void set_init_blocksize(struct block_device *bdev)
{
bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev));
@@ -862,7 +891,7 @@ static int bdev_set(struct inode *inode, void *data)
return 0;
}
-struct block_device *bdget(dev_t dev)
+static struct block_device *bdget(dev_t dev)
{
struct block_device *bdev;
struct inode *inode;
@@ -876,11 +905,11 @@ struct block_device *bdget(dev_t dev)
bdev = &BDEV_I(inode)->bdev;
if (inode->i_state & I_NEW) {
+ spin_lock_init(&bdev->bd_size_lock);
bdev->bd_contains = NULL;
bdev->bd_super = NULL;
bdev->bd_inode = inode;
bdev->bd_part_count = 0;
- bdev->bd_invalidated = 0;
inode->i_mode = S_IFBLK;
inode->i_rdev = dev;
inode->i_bdev = bdev;
@@ -891,8 +920,6 @@ struct block_device *bdget(dev_t dev)
return bdev;
}
-EXPORT_SYMBOL(bdget);
-
/**
* bdgrab -- Grab a reference to an already referenced block device
* @bdev: Block device to grab a reference to.
@@ -904,6 +931,11 @@ struct block_device *bdgrab(struct block_device *bdev)
}
EXPORT_SYMBOL(bdgrab);
+struct block_device *bdget_part(struct hd_struct *part)
+{
+ return bdget(part_devt(part));
+}
+
long nr_blockdev_pages(void)
{
struct inode *inode;
@@ -1290,6 +1322,7 @@ static void check_disk_size_change(struct gendisk *disk,
{
loff_t disk_size, bdev_size;
+ spin_lock(&bdev->bd_size_lock);
disk_size = (loff_t)get_capacity(disk) << 9;
bdev_size = i_size_read(bdev->bd_inode);
if (disk_size != bdev_size) {
@@ -1299,85 +1332,51 @@ static void check_disk_size_change(struct gendisk *disk,
disk->disk_name, bdev_size, disk_size);
}
i_size_write(bdev->bd_inode, disk_size);
- if (bdev_size > disk_size && __invalidate_device(bdev, false))
+ }
+ spin_unlock(&bdev->bd_size_lock);
+
+ if (bdev_size > disk_size) {
+ if (__invalidate_device(bdev, false))
pr_warn("VFS: busy inodes on resized disk %s\n",
disk->disk_name);
}
- bdev->bd_invalidated = 0;
}
/**
- * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
- * @disk: struct gendisk to be revalidated
+ * revalidate_disk_size - checks for disk size change and adjusts bdev size.
+ * @disk: struct gendisk to check
+ * @verbose: if %true log a message about a size change if there is any
*
- * This routine is a wrapper for lower-level driver's revalidate_disk
- * call-backs. It is used to do common pre and post operations needed
- * for all revalidate_disk operations.
+ * This routine checks to see if the bdev size does not match the disk size
+ * and adjusts it if it differs. When shrinking the bdev size, its all caches
+ * are freed.
*/
-int revalidate_disk(struct gendisk *disk)
+void revalidate_disk_size(struct gendisk *disk, bool verbose)
{
- int ret = 0;
-
- if (disk->fops->revalidate_disk)
- ret = disk->fops->revalidate_disk(disk);
+ struct block_device *bdev;
/*
* Hidden disks don't have associated bdev so there's no point in
- * revalidating it.
+ * revalidating them.
*/
- if (!(disk->flags & GENHD_FL_HIDDEN)) {
- struct block_device *bdev = bdget_disk(disk, 0);
-
- if (!bdev)
- return ret;
+ if (disk->flags & GENHD_FL_HIDDEN)
+ return;
- mutex_lock(&bdev->bd_mutex);
- check_disk_size_change(disk, bdev, ret == 0);
- mutex_unlock(&bdev->bd_mutex);
+ bdev = bdget_disk(disk, 0);
+ if (bdev) {
+ check_disk_size_change(disk, bdev, verbose);
bdput(bdev);
}
- return ret;
}
-EXPORT_SYMBOL(revalidate_disk);
+EXPORT_SYMBOL(revalidate_disk_size);
-/*
- * This routine checks whether a removable media has been changed,
- * and invalidates all buffer-cache-entries in that case. This
- * is a relatively slow routine, so we have to try to minimize using
- * it. Thus it is called only upon a 'mount' or 'open'. This
- * is the best way of combining speed and utility, I think.
- * People changing diskettes in the middle of an operation deserve
- * to lose :-)
- */
-int check_disk_change(struct block_device *bdev)
+void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
- struct gendisk *disk = bdev->bd_disk;
- const struct block_device_operations *bdops = disk->fops;
- unsigned int events;
-
- events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
- DISK_EVENT_EJECT_REQUEST);
- if (!(events & DISK_EVENT_MEDIA_CHANGE))
- return 0;
-
- if (__invalidate_device(bdev, true))
- pr_warn("VFS: busy inodes on changed media %s\n",
- disk->disk_name);
- bdev->bd_invalidated = 1;
- if (bdops->revalidate_disk)
- bdops->revalidate_disk(bdev->bd_disk);
- return 1;
-}
-
-EXPORT_SYMBOL(check_disk_change);
-
-void bd_set_size(struct block_device *bdev, loff_t size)
-{
- inode_lock(bdev->bd_inode);
- i_size_write(bdev->bd_inode, size);
- inode_unlock(bdev->bd_inode);
+ spin_lock(&bdev->bd_size_lock);
+ i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
+ spin_unlock(&bdev->bd_size_lock);
}
-EXPORT_SYMBOL(bd_set_size);
+EXPORT_SYMBOL(bd_set_nr_sectors);
static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
@@ -1388,6 +1387,8 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate)
lockdep_assert_held(&bdev->bd_mutex);
+ clear_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
+
rescan:
ret = blk_drop_partitions(bdev);
if (ret)
@@ -1446,22 +1447,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
struct gendisk *disk;
int ret;
int partno;
- int perm = 0;
bool first_open = false, unblock_events = true, need_restart;
- if (mode & FMODE_READ)
- perm |= MAY_READ;
- if (mode & FMODE_WRITE)
- perm |= MAY_WRITE;
- /*
- * hooks: /n/, see "layering violations".
- */
- if (!for_part) {
- ret = devcgroup_inode_permission(bdev->bd_inode, perm);
- if (ret != 0)
- return ret;
- }
-
restart:
need_restart = false;
ret = -ENXIO;
@@ -1514,7 +1501,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
}
if (!ret) {
- bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+ bd_set_nr_sectors(bdev, get_capacity(disk));
set_init_blocksize(bdev);
}
@@ -1524,7 +1511,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
* The latter is necessary to prevent ghost
* partitions on a removed medium.
*/
- if (bdev->bd_invalidated &&
+ if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
(!ret || ret == -ENOMEDIUM))
bdev_disk_changed(bdev, ret == -ENOMEDIUM);
@@ -1542,7 +1529,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
ret = -ENXIO;
goto out_clear;
}
- bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
+ bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects);
set_init_blocksize(bdev);
}
@@ -1554,7 +1541,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
if (bdev->bd_disk->fops->open)
ret = bdev->bd_disk->fops->open(bdev, mode);
/* the same as first opener case, read comment there */
- if (bdev->bd_invalidated &&
+ if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
(!ret || ret == -ENOMEDIUM))
bdev_disk_changed(bdev, ret == -ENOMEDIUM);
if (ret)
@@ -1632,16 +1619,27 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
* RETURNS:
* 0 on success, -errno on failure.
*/
-int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
+static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
{
- int res;
+ int ret, perm = 0;
- res =__blkdev_get(bdev, mode, holder, 0);
- if (res)
- bdput(bdev);
- return res;
+ if (mode & FMODE_READ)
+ perm |= MAY_READ;
+ if (mode & FMODE_WRITE)
+ perm |= MAY_WRITE;
+ ret = devcgroup_inode_permission(bdev->bd_inode, perm);
+ if (ret)
+ goto bdput;
+
+ ret =__blkdev_get(bdev, mode, holder, 0);
+ if (ret)
+ goto bdput;
+ return 0;
+
+bdput:
+ bdput(bdev);
+ return ret;
}
-EXPORT_SYMBOL(blkdev_get);
/**
* blkdev_get_by_path - open a block device by name
@@ -1889,7 +1887,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (bdev_read_only(I_BDEV(bd_inode)))
return -EPERM;
- if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode))
+ if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
return -ETXTBSY;
if (!iov_iter_count(from))
@@ -1969,7 +1967,6 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
loff_t len)
{
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
- struct address_space *mapping;
loff_t end = start + len - 1;
loff_t isize;
int error;
@@ -1997,8 +1994,9 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
return -EINVAL;
/* Invalidate the page cache, including dirty pages. */
- mapping = bdev->bd_inode->i_mapping;
- truncate_inode_pages_range(mapping, start, end);
+ error = truncate_bdev_range(bdev, file->f_mode, start, end);
+ if (error)
+ return error;
switch (mode) {
case FALLOC_FL_ZERO_RANGE:
@@ -2025,7 +2023,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
* the caller will be given -EBUSY. The third argument is
* inclusive, so the rounding here is safe.
*/
- return invalidate_inode_pages2_range(mapping,
+ return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
start >> PAGE_SHIFT,
end >> PAGE_SHIFT);
}
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 575636f6491e..68b95ad82126 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -14,6 +14,7 @@ config BTRFS_FS
select LZO_DECOMPRESS
select ZSTD_COMPRESS
select ZSTD_DECOMPRESS
+ select FS_IOMAP
select RAID6_PQ
select XOR_BLOCKS
select SRCU
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ea1c28ccb44f..b3268f4ea5f3 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -2997,7 +2997,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
while (!list_empty(&pending_edge)) {
struct btrfs_backref_node *upper;
struct btrfs_backref_node *lower;
- struct rb_node *rb_node;
edge = list_first_entry(&pending_edge,
struct btrfs_backref_edge, list[UPPER]);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 613920c17ac1..c0f1d6818df7 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1766,16 +1766,10 @@ static void link_block_group(struct btrfs_block_group *cache)
{
struct btrfs_space_info *space_info = cache->space_info;
int index = btrfs_bg_flags_to_raid_index(cache->flags);
- bool first = false;
down_write(&space_info->groups_sem);
- if (list_empty(&space_info->block_groups[index]))
- first = true;
list_add_tail(&cache->list, &space_info->block_groups[index]);
up_write(&space_info->groups_sem);
-
- if (first)
- btrfs_sysfs_add_block_group_type(cache);
}
static struct btrfs_block_group *btrfs_create_block_group_cache(
@@ -1798,7 +1792,6 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
cache->fs_info = fs_info;
cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
- set_free_space_tree_thresholds(cache);
cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
@@ -1874,7 +1867,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
return ret;
}
-static int read_block_group_item(struct btrfs_block_group *cache,
+static void read_block_group_item(struct btrfs_block_group *cache,
struct btrfs_path *path,
const struct btrfs_key *key)
{
@@ -1888,8 +1881,6 @@ static int read_block_group_item(struct btrfs_block_group *cache,
sizeof(bgi));
cache->used = btrfs_stack_block_group_used(&bgi);
cache->flags = btrfs_stack_block_group_flags(&bgi);
-
- return 0;
}
static int read_one_block_group(struct btrfs_fs_info *info,
@@ -1908,9 +1899,9 @@ static int read_one_block_group(struct btrfs_fs_info *info,
if (!cache)
return -ENOMEM;
- ret = read_block_group_item(cache, path, key);
- if (ret < 0)
- goto error;
+ read_block_group_item(cache, path, key);
+
+ set_free_space_tree_thresholds(cache);
if (need_clear) {
/*
@@ -2034,8 +2025,18 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
btrfs_release_path(path);
}
- rcu_read_lock();
- list_for_each_entry_rcu(space_info, &info->space_info, list) {
+ list_for_each_entry(space_info, &info->space_info, list) {
+ int i;
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ if (list_empty(&space_info->block_groups[i]))
+ continue;
+ cache = list_first_entry(&space_info->block_groups[i],
+ struct btrfs_block_group,
+ list);
+ btrfs_sysfs_add_block_group_type(cache);
+ }
+
if (!(btrfs_get_alloc_profile(info, space_info->flags) &
(BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID1_MASK |
@@ -2055,7 +2056,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
list)
inc_block_group_ro(cache, 1);
}
- rcu_read_unlock();
btrfs_init_global_block_rsv(info);
ret = check_chunk_block_group_mappings(info);
@@ -2096,12 +2096,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
return;
while (!list_empty(&trans->new_bgs)) {
+ int index;
+
block_group = list_first_entry(&trans->new_bgs,
struct btrfs_block_group,
bg_list);
if (ret)
goto next;
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
+
ret = insert_block_group_item(trans, block_group);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -2110,6 +2114,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
if (ret)
btrfs_abort_transaction(trans, ret);
add_block_group_free_space(trans, block_group);
+
+ /*
+ * If we restriped during balance, we may have added a new raid
+ * type, so now add the sysfs entries when it is safe to do so.
+ * We don't have to worry about locking here as it's handled in
+ * btrfs_sysfs_add_block_group_type.
+ */
+ if (block_group->space_info->block_group_kobjs[index] == NULL)
+ btrfs_sysfs_add_block_group_type(block_group);
+
/* Already aborted the transaction if it failed. */
next:
btrfs_delayed_refs_rsv_release(fs_info, 1);
@@ -2132,6 +2146,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
return -ENOMEM;
cache->length = size;
+ set_free_space_tree_thresholds(cache);
cache->used = bytes_used;
cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
@@ -2783,7 +2798,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
* finished yet (no block group item in the extent tree
* yet, etc). If this is the case, wait for all free
* space endio workers to finish and retry. This is a
- * a very rare case so no need for a more efficient and
+ * very rare case so no need for a more efficient and
* complex approach.
*/
if (ret == -ENOENT) {
@@ -2959,6 +2974,13 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
space_info, -ram_bytes);
if (delalloc)
cache->delalloc_bytes += num_bytes;
+
+ /*
+ * Compression can use less space than we reserved, so wake
+ * tickets if that happens
+ */
+ if (num_bytes < ram_bytes)
+ btrfs_try_granting_tickets(cache->fs_info, space_info);
}
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
@@ -2992,6 +3014,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
if (delalloc)
cache->delalloc_bytes -= num_bytes;
spin_unlock(&cache->lock);
+
+ btrfs_try_granting_tickets(cache->fs_info, space_info);
spin_unlock(&space_info->lock);
}
@@ -3000,12 +3024,10 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
+ list_for_each_entry(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
found->force_alloc = CHUNK_ALLOC_FORCE;
}
- rcu_read_unlock();
}
static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
@@ -3336,14 +3358,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
spin_unlock(&info->block_group_cache_lock);
- /*
- * Now that all the block groups are freed, go through and free all the
- * space_info structs. This is only called during the final stages of
- * unmount, and so we know nobody is using them. We call
- * synchronize_rcu() once before we start, just to be on the safe side.
- */
- synchronize_rcu();
-
btrfs_release_global_block_rsv(info);
while (!list_empty(&info->space_info)) {
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index c47b6c6fea9f..92dd86bceae3 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -21,14 +21,18 @@
* new data the application may have written before commit.
*/
enum {
- BTRFS_INODE_ORDERED_DATA_CLOSE,
+ BTRFS_INODE_FLUSH_ON_CLOSE,
BTRFS_INODE_DUMMY,
BTRFS_INODE_IN_DEFRAG,
BTRFS_INODE_HAS_ASYNC_EXTENT,
+ /*
+ * Always set under the VFS' inode lock, otherwise it can cause races
+ * during fsync (we start as a fast fsync and then end up in a full
+ * fsync racing with ordered extent completion).
+ */
BTRFS_INODE_NEEDS_FULL_SYNC,
BTRFS_INODE_COPY_EVERYTHING,
BTRFS_INODE_IN_DELALLOC_LIST,
- BTRFS_INODE_READDIO_NEED_LOCK,
BTRFS_INODE_HAS_PROPS,
BTRFS_INODE_SNAPSHOT_FLUSH,
};
@@ -212,6 +216,11 @@ struct btrfs_inode {
struct inode vfs_inode;
};
+static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode)
+{
+ return inode->root->fs_info->sectorsize;
+}
+
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
{
return container_of(inode, struct btrfs_inode, vfs_inode);
@@ -324,23 +333,6 @@ struct btrfs_dio_private {
u8 csums[];
};
-/*
- * Disable DIO read nolock optimization, so new dio readers will be forced
- * to grab i_mutex. It is used to avoid the endless truncate due to
- * nonlocked dio read.
- */
-static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode)
-{
- set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
- smp_mb();
-}
-
-static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode)
-{
- smp_mb__before_atomic();
- clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
-}
-
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1ab56a734e70..eeface30facd 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -29,41 +29,6 @@
#include "extent_io.h"
#include "extent_map.h"
-int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out);
-int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int zlib_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
- size_t destlen);
-struct list_head *zlib_alloc_workspace(unsigned int level);
-void zlib_free_workspace(struct list_head *ws);
-struct list_head *zlib_get_workspace(unsigned int level);
-
-int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out);
-int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int lzo_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
- size_t destlen);
-struct list_head *lzo_alloc_workspace(unsigned int level);
-void lzo_free_workspace(struct list_head *ws);
-
-int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out);
-int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int zstd_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
- size_t destlen);
-void zstd_init_workspace_manager(void);
-void zstd_cleanup_workspace_manager(void);
-struct list_head *zstd_alloc_workspace(unsigned int level);
-void zstd_free_workspace(struct list_head *ws);
-struct list_head *zstd_get_workspace(unsigned int level);
-void zstd_put_workspace(struct list_head *ws);
-
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
const char* btrfs_compress_type2str(enum btrfs_compression_type type)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 9f3dbe372631..8001b700ea3a 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -144,4 +144,39 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len);
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
+int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *zlib_alloc_workspace(unsigned int level);
+void zlib_free_workspace(struct list_head *ws);
+struct list_head *zlib_get_workspace(unsigned int level);
+
+int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *lzo_alloc_workspace(unsigned int level);
+void lzo_free_workspace(struct list_head *ws);
+
+int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+void zstd_init_workspace_manager(void);
+void zstd_cleanup_workspace_manager(void);
+struct list_head *zstd_alloc_workspace(unsigned int level);
+void zstd_free_workspace(struct list_head *ws);
+struct list_head *zstd_get_workspace(unsigned int level);
+void zstd_put_workspace(struct list_head *ws);
+
#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 70e49d8d4f6c..113da62dc17f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -68,7 +68,7 @@ const char *btrfs_super_csum_driver(u16 csum_type)
btrfs_csums[csum_type].name;
}
-size_t __const btrfs_get_num_csums(void)
+size_t __attribute_const__ btrfs_get_num_csums(void)
{
return ARRAY_SIZE(btrfs_csums);
}
@@ -198,7 +198,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
btrfs_node_key(buf, &disk_key, 0);
cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
- &disk_key, level, buf->start, 0);
+ &disk_key, level, buf->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -957,7 +958,8 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush(
const struct btrfs_disk_key *disk_key,
int level,
u64 hint,
- u64 empty_size)
+ u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *ret;
@@ -986,7 +988,7 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush(
ret = btrfs_alloc_tree_block(trans, root, parent_start,
root->root_key.objectid, disk_key, level,
- hint, empty_size);
+ hint, empty_size, nest);
trans->can_flush_pending_bgs = true;
return ret;
@@ -1009,7 +1011,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
- u64 search_start, u64 empty_size)
+ u64 search_start, u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_disk_key disk_key;
@@ -1040,7 +1043,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = parent->start;
cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
- level, search_start, empty_size);
+ level, search_start, empty_size, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -1061,6 +1064,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1068,6 +1073,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1100,6 +1107,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (last_ref) {
ret = tree_mod_log_free_eb(buf);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1297,6 +1306,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
+ eb_rewin, btrfs_header_level(eb_rewin));
btrfs_tree_read_lock(eb_rewin);
__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
WARN_ON(btrfs_header_nritems(eb_rewin) >
@@ -1370,7 +1381,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
if (!eb)
return NULL;
- btrfs_tree_read_lock(eb);
if (old_root) {
btrfs_set_header_bytenr(eb, eb->start);
btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
@@ -1378,6 +1388,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
btrfs_set_header_level(eb, old_root->level);
btrfs_set_header_generation(eb, old_generation);
}
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
+ btrfs_header_level(eb));
+ btrfs_tree_read_lock(eb);
if (tm)
__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
else
@@ -1442,7 +1455,8 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret)
+ struct extent_buffer **cow_ret,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 search_start;
@@ -1481,7 +1495,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
*/
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
- parent_slot, cow_ret, search_start, 0);
+ parent_slot, cow_ret, search_start, 0, nest);
trace_btrfs_cow_block(root, buf, *cow_ret);
@@ -1653,7 +1667,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
- (end_slot - i) * blocksize));
+ (end_slot - i) * blocksize),
+ BTRFS_NESTING_COW);
if (err) {
btrfs_tree_unlock(cur);
free_extent_buffer(cur);
@@ -1851,7 +1866,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_lock(child);
btrfs_set_lock_blocking_write(child);
- ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
+ ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
+ BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(child);
free_extent_buffer(child);
@@ -1887,10 +1903,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
left = NULL;
if (left) {
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
wret = btrfs_cow_block(trans, root, left,
- parent, pslot - 1, &left);
+ parent, pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (wret) {
ret = wret;
goto enospc;
@@ -1902,10 +1919,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
right = NULL;
if (right) {
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
wret = btrfs_cow_block(trans, root, right,
- parent, pslot + 1, &right);
+ parent, pslot + 1, &right,
+ BTRFS_NESTING_RIGHT_COW);
if (wret) {
ret = wret;
goto enospc;
@@ -2065,7 +2083,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (left) {
u32 left_nr;
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
left_nr = btrfs_header_nritems(left);
@@ -2073,7 +2091,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
wret = 1;
} else {
ret = btrfs_cow_block(trans, root, left, parent,
- pslot - 1, &left);
+ pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (ret)
wret = 1;
else {
@@ -2119,7 +2138,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (right) {
u32 right_nr;
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
right_nr = btrfs_header_nritems(right);
@@ -2128,7 +2147,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_cow_block(trans, root, right,
parent, pslot + 1,
- &right);
+ &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
wret = 1;
else {
@@ -2597,7 +2616,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
* We don't know the level of the root node until we actually
* have it read locked
*/
- b = btrfs_read_lock_root_node(root);
+ b = __btrfs_read_lock_root_node(root, p->recurse);
level = btrfs_header_level(b);
if (level > write_lock_level)
goto out;
@@ -2736,11 +2755,13 @@ again:
btrfs_set_path_blocking(p);
if (last_level)
err = btrfs_cow_block(trans, root, b, NULL, 0,
- &b);
+ &b,
+ BTRFS_NESTING_COW);
else
err = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
- p->slots[level + 1], &b);
+ p->slots[level + 1], &b,
+ BTRFS_NESTING_COW);
if (err) {
ret = err;
goto done;
@@ -2871,7 +2892,8 @@ cow_done:
} else {
if (!btrfs_tree_read_lock_atomic(b)) {
btrfs_set_path_blocking(p);
- btrfs_tree_read_lock(b);
+ __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL,
+ p->recurse);
}
p->locks[level] = BTRFS_READ_LOCK;
}
@@ -3160,6 +3182,58 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
}
/*
+ * Check key order of two sibling extent buffers.
+ *
+ * Return true if something is wrong.
+ * Return false if everything is fine.
+ *
+ * Tree-checker only works inside one tree block, thus the following
+ * corruption can not be detected by tree-checker:
+ *
+ * Leaf @left | Leaf @right
+ * --------------------------------------------------------------
+ * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
+ *
+ * Key f6 in leaf @left itself is valid, but not valid when the next
+ * key in leaf @right is 7.
+ * This can only be checked at tree block merge time.
+ * And since tree checker has ensured all key order in each tree block
+ * is correct, we only need to bother the last key of @left and the first
+ * key of @right.
+ */
+static bool check_sibling_keys(struct extent_buffer *left,
+ struct extent_buffer *right)
+{
+ struct btrfs_key left_last;
+ struct btrfs_key right_first;
+ int level = btrfs_header_level(left);
+ int nr_left = btrfs_header_nritems(left);
+ int nr_right = btrfs_header_nritems(right);
+
+ /* No key to check in one of the tree blocks */
+ if (!nr_left || !nr_right)
+ return false;
+
+ if (level) {
+ btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_node_key_to_cpu(right, &right_first, 0);
+ } else {
+ btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_item_key_to_cpu(right, &right_first, 0);
+ }
+
+ if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
+ btrfs_crit(left->fs_info,
+"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
+ left_last.objectid, left_last.type,
+ left_last.offset, right_first.objectid,
+ right_first.type, right_first.offset);
+ return true;
+ }
+ return false;
+}
+
+/*
* try to push data from one node into the next node left in the
* tree.
*
@@ -3203,6 +3277,12 @@ static int push_node_left(struct btrfs_trans_handle *trans,
} else
push_items = min(src_nritems - 8, push_items);
+ /* dst is the left eb, src is the middle eb */
+ if (check_sibling_keys(dst, src)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -3271,6 +3351,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
if (max_push < push_items)
push_items = max_push;
+ /* dst is the right eb, src is the middle eb */
+ if (check_sibling_keys(src, dst)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
BUG_ON(ret < 0);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
@@ -3327,7 +3413,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_node_key(lower, &lower_key, 0);
c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
- root->node->start, 0);
+ root->node->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -3457,7 +3544,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_node_key(c, &disk_key, mid);
split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
- c->start, 0);
+ c->start, 0, BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
return PTR_ERR(split);
@@ -3726,7 +3813,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(right))
return 1;
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
free_space = btrfs_leaf_free_space(right);
@@ -3735,7 +3822,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
/* cow and double check */
ret = btrfs_cow_block(trans, root, right, upper,
- slot + 1, &right);
+ slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
goto out_unlock;
@@ -3747,6 +3834,12 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (left_nritems == 0)
goto out_unlock;
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return ret;
+ }
if (path->slots[0] == left_nritems && !empty) {
/* Key greater than all keys in the leaf, right neighbor has
* enough room for it and we're not emptying our leaf to delete
@@ -3959,7 +4052,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(left))
return 1;
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
free_space = btrfs_leaf_free_space(left);
@@ -3970,7 +4063,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
/* cow and double check */
ret = btrfs_cow_block(trans, root, left,
- path->nodes[1], slot - 1, &left);
+ path->nodes[1], slot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (ret) {
/* we hit -ENOSPC, but it isn't fatal here */
if (ret == -ENOSPC)
@@ -3984,6 +4078,10 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
return __push_leaf_left(path, min_data_size,
empty, left, free_space, right_nritems,
max_slot);
@@ -4232,8 +4330,18 @@ again:
else
btrfs_item_key(l, &disk_key, mid);
+ /*
+ * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
+ * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
+ * subclasses, which is 8 at the time of this patch, and we've maxed it
+ * out. In the future we could add a
+ * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
+ * use BTRFS_NESTING_NEW_ROOT.
+ */
right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
- l->start, 0);
+ l->start, 0, num_doubles ?
+ BTRFS_NESTING_NEW_ROOT :
+ BTRFS_NESTING_SPLIT);
if (IS_ERR(right))
return PTR_ERR(right);
@@ -4478,9 +4586,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
return ret;
path->slots[0]++;
- setup_items_for_insert(root, path, new_key, &item_size,
- item_size, item_size +
- sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, path, new_key, &item_size, 1);
leaf = path->nodes[0];
memcpy_extent_buffer(leaf,
btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -4653,14 +4759,20 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
}
}
-/*
- * this is a helper for btrfs_insert_empty_items, the main goal here is
- * to save stack depth by doing the bulk of the work in a function
- * that doesn't call btrfs_search_slot
+/**
+ * setup_items_for_insert - Helper called before inserting one or more items
+ * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
+ * in a function that doesn't call btrfs_search_slot
+ *
+ * @root: root we are inserting items to
+ * @path: points to the leaf/slot where we are going to insert new items
+ * @cpu_key: array of keys for items to be inserted
+ * @data_size: size of the body of each item we are going to insert
+ * @nr: size of @cpu_key/@data_size arrays
*/
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr)
+ int nr)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_item *item;
@@ -4671,6 +4783,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
struct extent_buffer *leaf;
int slot;
struct btrfs_map_token token;
+ u32 total_size;
+ u32 total_data = 0;
+
+ for (i = 0; i < nr; i++)
+ total_data += data_size[i];
+ total_size = total_data + (nr * sizeof(struct btrfs_item));
if (path->slots[0] == 0) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
@@ -4697,7 +4815,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
if (old_data < data_end) {
btrfs_print_leaf(leaf);
- btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
+ btrfs_crit(fs_info,
+ "item at slot %d with data offset %u beyond data end of leaf %u",
slot, old_data, data_end);
BUG();
}
@@ -4730,8 +4849,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
btrfs_set_item_key(leaf, &disk_key, slot + i);
item = btrfs_item_nr(slot + i);
- btrfs_set_token_item_offset(&token, item, data_end - data_size[i]);
data_end -= data_size[i];
+ btrfs_set_token_item_offset(&token, item, data_end);
btrfs_set_token_item_size(&token, item, data_size[i]);
}
@@ -4773,8 +4892,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
slot = path->slots[0];
BUG_ON(slot < 0);
- setup_items_for_insert(root, path, cpu_key, data_size,
- total_data, total_size, nr);
+ setup_items_for_insert(root, path, cpu_key, data_size, nr);
return 0;
}
@@ -5111,7 +5229,7 @@ again:
slot--;
/*
* check this node pointer against the min_trans parameters.
- * If it is too old, old, skip to the next one.
+ * If it is too old, skip to the next one.
*/
while (slot < nritems) {
u64 gen;
@@ -5375,7 +5493,9 @@ again:
}
if (!ret) {
btrfs_set_path_blocking(path);
- btrfs_tree_read_lock(next);
+ __btrfs_tree_read_lock(next,
+ BTRFS_NESTING_RIGHT,
+ path->recurse);
}
next_rw_lock = BTRFS_READ_LOCK;
}
@@ -5410,7 +5530,9 @@ again:
ret = btrfs_try_tree_read_lock(next);
if (!ret) {
btrfs_set_path_blocking(path);
- btrfs_tree_read_lock(next);
+ __btrfs_tree_read_lock(next,
+ BTRFS_NESTING_RIGHT,
+ path->recurse);
}
next_rw_lock = BTRFS_READ_LOCK;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9c7e466f27a9..aac3d6f4e35b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -374,6 +374,7 @@ struct btrfs_path {
unsigned int search_commit_root:1;
unsigned int need_commit_sem:1;
unsigned int skip_release_on_error:1;
+ unsigned int recurse:1;
};
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
sizeof(struct btrfs_item))
@@ -494,7 +495,7 @@ enum btrfs_orphan_cleanup_state {
ORPHAN_CLEANUP_DONE = 2,
};
-void btrfs_init_async_reclaim_work(struct work_struct *work);
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
/* fs_info */
struct reloc_control;
@@ -541,11 +542,6 @@ enum {
/* Used to record internally whether fs has been frozen */
BTRFS_FS_FROZEN,
/*
- * Indicate that a whole-filesystem exclusive operation is running
- * (device replace, resize, device add/delete, balance)
- */
- BTRFS_FS_EXCL_OP,
- /*
* Indicate that balance has been set up from the ioctl and is in the
* main phase. The fs_info::balance_ctl is initialized.
* Set and cleared while holding fs_info::balance_mutex.
@@ -565,6 +561,19 @@ enum {
BTRFS_FS_DISCARD_RUNNING,
};
+/*
+ * Exclusive operations (device replace, resize, device add/remove, balance)
+ */
+enum btrfs_exclusive_operation {
+ BTRFS_EXCLOP_NONE,
+ BTRFS_EXCLOP_BALANCE,
+ BTRFS_EXCLOP_DEV_ADD,
+ BTRFS_EXCLOP_DEV_REMOVE,
+ BTRFS_EXCLOP_DEV_REPLACE,
+ BTRFS_EXCLOP_RESIZE,
+ BTRFS_EXCLOP_SWAP_ACTIVATE,
+};
+
struct btrfs_fs_info {
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
@@ -912,6 +921,7 @@ struct btrfs_fs_info {
/* Used to reclaim the metadata space in the background. */
struct work_struct async_reclaim_work;
+ struct work_struct async_data_reclaim_work;
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
@@ -935,6 +945,9 @@ struct btrfs_fs_info {
*/
int send_in_progress;
+ /* Type of exclusive operation running */
+ unsigned long exclusive_operation;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -1181,24 +1194,40 @@ struct btrfs_root {
#endif
};
-struct btrfs_clone_extent_info {
+/*
+ * Structure that conveys information about an extent that is going to replace
+ * all the extents in a file range.
+ */
+struct btrfs_replace_extent_info {
u64 disk_offset;
u64 disk_len;
u64 data_offset;
u64 data_len;
u64 file_offset;
+ /* Pointer to a file extent item of type regular or prealloc. */
char *extent_buf;
- u32 item_size;
+ /*
+ * Set to true when attempting to replace a file range with a new extent
+ * described by this structure, set to false when attempting to clone an
+ * existing extent into a file range.
+ */
+ bool is_new_extent;
+ /* Meaningful only if is_new_extent is true. */
+ int qgroup_reserved;
+ /*
+ * Meaningful only if is_new_extent is true.
+ * Used to track how many extent items we have already inserted in a
+ * subvolume tree that refer to the extent described by this structure,
+ * so that we know when to create a new delayed ref or update an existing
+ * one.
+ */
+ int insertions;
};
struct btrfs_file_private {
void *filldir_buf;
};
-static inline u32 btrfs_inode_sectorsize(const struct inode *inode)
-{
- return btrfs_sb(inode->i_sb)->sectorsize;
-}
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
{
@@ -1391,6 +1420,16 @@ static inline void btrfs_init_map_token(struct btrfs_map_token *token,
#define cpu_to_le8(v) (v)
#define __le8 u8
+static inline u8 get_unaligned_le8(const void *p)
+{
+ return *(u8 *)p;
+}
+
+static inline void put_unaligned_le8(u8 val, void *p)
+{
+ *(u8 *)p = val;
+}
+
#define read_eb_member(eb, ptr, type, member, result) (\
read_extent_buffer(eb, (char *)(result), \
((unsigned long)(ptr)) + \
@@ -1449,27 +1488,25 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
{ \
const type *p = page_address(eb->pages[0]); \
- u##bits res = le##bits##_to_cpu(p->member); \
- return res; \
+ return get_unaligned_le##bits(&p->member); \
} \
static inline void btrfs_set_##name(const struct extent_buffer *eb, \
u##bits val) \
{ \
type *p = page_address(eb->pages[0]); \
- p->member = cpu_to_le##bits(val); \
+ put_unaligned_le##bits(val, &p->member); \
}
#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(const type *s) \
{ \
- return le##bits##_to_cpu(s->member); \
+ return get_unaligned_le##bits(&s->member); \
} \
static inline void btrfs_set_##name(type *s, u##bits val) \
{ \
- s->member = cpu_to_le##bits(val); \
+ put_unaligned_le##bits(val, &s->member); \
}
-
static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
struct btrfs_dev_item *s)
{
@@ -2262,7 +2299,7 @@ BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
const char *btrfs_super_csum_driver(u16 csum_type);
-size_t __const btrfs_get_num_csums(void);
+size_t __attribute_const__ btrfs_get_num_csums(void);
/*
@@ -2518,13 +2555,14 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
- u64 objectid, u64 offset, u64 bytenr);
+ u64 objectid, u64 offset, u64 bytenr, bool strict);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
const struct btrfs_disk_key *key,
int level, u64 hint,
- u64 empty_size);
+ u64 empty_size,
+ enum btrfs_lock_nesting nest);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
@@ -2592,6 +2630,8 @@ enum btrfs_reserve_flush_enum {
*
* Can be interruped by fatal signal.
*/
+ BTRFS_RESERVE_FLUSH_DATA,
+ BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE,
BTRFS_RESERVE_FLUSH_ALL,
/*
@@ -2619,7 +2659,7 @@ enum btrfs_flush_state {
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int nitems, bool use_global_rsv);
-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
@@ -2651,8 +2691,6 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
const struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
-struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *key, int lowest_level,
u64 min_trans);
@@ -2665,7 +2703,8 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret);
+ struct extent_buffer **cow_ret,
+ enum btrfs_lock_nesting nest);
int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
@@ -2713,7 +2752,7 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr);
+ int nr);
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, void *data, u32 data_size);
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
@@ -2930,11 +2969,15 @@ void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size);
u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
+blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
+int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end, int mirror);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes);
+ u64 *ram_bytes, bool strict);
void __btrfs_del_delalloc_inode(struct btrfs_root *root,
struct btrfs_inode *inode);
@@ -2956,7 +2999,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u32 min_type);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr);
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
@@ -3017,6 +3060,7 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
u64 end, int uptodate);
extern const struct dentry_operations btrfs_dentry_operations;
+ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -3031,6 +3075,9 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
/* file.c */
int __init btrfs_auto_defrag_init(void);
@@ -3053,9 +3100,9 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode, u64 start,
u64 end, int drop_cache);
-int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
+int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
const u64 start, const u64 end,
- struct btrfs_clone_extent_info *clone_info,
+ struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end);
@@ -3536,9 +3583,7 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode);
void btrfs_test_destroy_inode(struct inode *inode);
-
static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 0e354e9e57d0..bacee09b7bfd 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -115,126 +115,15 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
- u64 used;
- int ret = 0;
- int need_commit = 2;
- int have_pinned_space;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
/* Make sure bytes are sectorsize aligned */
bytes = ALIGN(bytes, fs_info->sectorsize);
- if (btrfs_is_free_space_inode(inode)) {
- need_commit = 0;
- ASSERT(current->journal_info);
- }
-
-again:
- /* Make sure we have enough space to handle the data first */
- spin_lock(&data_sinfo->lock);
- used = btrfs_space_info_used(data_sinfo, true);
-
- if (used + bytes > data_sinfo->total_bytes) {
- struct btrfs_trans_handle *trans;
-
- /*
- * If we don't have enough free bytes in this space then we need
- * to alloc a new chunk.
- */
- if (!data_sinfo->full) {
- u64 alloc_target;
-
- data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
- spin_unlock(&data_sinfo->lock);
-
- alloc_target = btrfs_data_alloc_profile(fs_info);
- /*
- * It is ugly that we don't call nolock join
- * transaction for the free space inode case here.
- * But it is safe because we only do the data space
- * reservation for the free space cache in the
- * transaction context, the common join transaction
- * just increase the counter of the current transaction
- * handler, doesn't try to acquire the trans_lock of
- * the fs.
- */
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- ret = btrfs_chunk_alloc(trans, alloc_target,
- CHUNK_ALLOC_NO_FORCE);
- btrfs_end_transaction(trans);
- if (ret < 0) {
- if (ret != -ENOSPC)
- return ret;
- else {
- have_pinned_space = 1;
- goto commit_trans;
- }
- }
-
- goto again;
- }
-
- /*
- * If we don't have enough pinned space to deal with this
- * allocation, and no removed chunk in current transaction,
- * don't bother committing the transaction.
- */
- have_pinned_space = __percpu_counter_compare(
- &data_sinfo->total_bytes_pinned,
- used + bytes - data_sinfo->total_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
- spin_unlock(&data_sinfo->lock);
-
- /* Commit the current transaction and try again */
-commit_trans:
- if (need_commit) {
- need_commit--;
-
- if (need_commit > 0) {
- btrfs_start_delalloc_roots(fs_info, -1);
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
- (u64)-1);
- }
-
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- if (have_pinned_space >= 0 ||
- test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
- &trans->transaction->flags) ||
- need_commit > 0) {
- ret = btrfs_commit_transaction(trans);
- if (ret)
- return ret;
- /*
- * The cleaner kthread might still be doing iput
- * operations. Wait for it to finish so that
- * more space is released. We don't need to
- * explicitly run the delayed iputs here because
- * the commit_transaction would have woken up
- * the cleaner.
- */
- ret = btrfs_wait_on_delayed_iputs(fs_info);
- if (ret)
- return ret;
- goto again;
- } else {
- btrfs_end_transaction(trans);
- }
- }
-
- trace_btrfs_space_reservation(fs_info,
- "space_info:enospc",
- data_sinfo->flags, bytes, 1);
- return -ENOSPC;
- }
- btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes);
- spin_unlock(&data_sinfo->lock);
+ if (btrfs_is_free_space_inode(inode))
+ flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
- return 0;
+ return btrfs_reserve_data_bytes(fs_info, bytes, flush);
}
int btrfs_check_data_free_space(struct btrfs_inode *inode,
@@ -277,9 +166,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
data_sinfo = fs_info->data_sinfo;
- spin_lock(&data_sinfo->lock);
- btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len);
- spin_unlock(&data_sinfo->lock);
+ btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len);
}
/*
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index bf1595a42a98..5aba81e16113 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -627,8 +627,7 @@ static int btrfs_delayed_inode_reserve_metadata(
*/
if (!src_rsv || (!trans->bytes_reserved &&
src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
- ret = btrfs_qgroup_reserve_meta_prealloc(root,
- fs_info->nodesize, true);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
if (ret < 0)
return ret;
ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
@@ -769,8 +768,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
}
/* insert the keys of the items */
- setup_items_for_insert(root, path, keys, data_size,
- total_data_size, total_size, nitems);
+ setup_items_for_insert(root, path, keys, data_size, nitems);
/* insert the dir index items */
slot = path->slots[0];
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index db93909b25e0..4a0243cb9d97 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -64,10 +64,6 @@
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret);
-static void btrfs_dev_replace_update_device_in_mapping_tree(
- struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev,
- struct btrfs_device *tgtdev);
static int btrfs_dev_replace_kthread(void *data);
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
@@ -224,13 +220,12 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_device *device;
struct block_device *bdev;
- struct list_head *devices;
struct rcu_string *name;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
*device_out = NULL;
- if (fs_info->fs_devices->seeding) {
+ if (srcdev->fs_devices->seeding) {
btrfs_err(fs_info, "the filesystem is a seed filesystem!");
return -EINVAL;
}
@@ -244,8 +239,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
sync_blockdev(bdev);
- devices = &fs_info->fs_devices->devices;
- list_for_each_entry(device, devices, dev_list) {
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
btrfs_err(fs_info,
"target device is in the filesystem!");
@@ -512,7 +506,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
up_write(&dev_replace->rwsem);
- ret = btrfs_sysfs_add_devices_dir(tgt_device->fs_devices, tgt_device);
+ ret = btrfs_sysfs_add_device(tgt_device);
if (ret)
btrfs_err(fs_info, "kobj add dev failed %d", ret);
@@ -599,6 +593,63 @@ static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
wake_up(&fs_info->dev_replace.replace_wait);
}
+/*
+ * When finishing the device replace, before swapping the source device with the
+ * target device we must update the chunk allocation state in the target device,
+ * as it is empty because replace works by directly copying the chunks and not
+ * through the normal chunk allocation path.
+ */
+static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_state *cached_state = NULL;
+ u64 start = 0;
+ u64 found_start;
+ u64 found_end;
+ int ret = 0;
+
+ lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
+
+ while (!find_first_extent_bit(&srcdev->alloc_state, start,
+ &found_start, &found_end,
+ CHUNK_ALLOCATED, &cached_state)) {
+ ret = set_extent_bits(&tgtdev->alloc_state, found_start,
+ found_end, CHUNK_ALLOCATED);
+ if (ret)
+ break;
+ start = found_end + 1;
+ }
+
+ free_extent_state(cached_state);
+ return ret;
+}
+
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 start = 0;
+ int i;
+
+ write_lock(&em_tree->lock);
+ do {
+ em = lookup_extent_mapping(em_tree, start, (u64)-1);
+ if (!em)
+ break;
+ map = em->map_lookup;
+ for (i = 0; i < map->num_stripes; i++)
+ if (srcdev == map->stripes[i].dev)
+ map->stripes[i].dev = tgtdev;
+ start = em->start + em->len;
+ free_extent_map(em);
+ } while (start);
+ write_unlock(&em_tree->lock);
+}
+
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
@@ -630,7 +681,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- ret = btrfs_start_delalloc_roots(fs_info, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, U64_MAX);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
@@ -673,8 +724,14 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
dev_replace->time_stopped = ktime_get_real_seconds();
dev_replace->item_needs_writeback = 1;
- /* replace old device with new one in mapping tree */
+ /*
+ * Update allocation state in the new device and replace the old device
+ * with the new one in the mapping tree.
+ */
if (!scrub_ret) {
+ scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
+ if (scrub_ret)
+ goto error;
btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
src_device,
tgt_device);
@@ -685,6 +742,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
+error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -743,9 +801,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
/* replace the sysfs entry */
- btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device);
+ btrfs_sysfs_remove_device(src_device);
btrfs_sysfs_update_devid(tgt_device);
- btrfs_rm_dev_replace_free_srcdev(src_device);
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
+ btrfs_scratch_superblocks(fs_info, src_device->bdev,
+ src_device->name->str);
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
@@ -754,33 +814,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
- return 0;
-}
-
-static void btrfs_dev_replace_update_device_in_mapping_tree(
- struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev,
- struct btrfs_device *tgtdev)
-{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
- u64 start = 0;
- int i;
+ btrfs_rm_dev_replace_free_srcdev(src_device);
- write_lock(&em_tree->lock);
- do {
- em = lookup_extent_mapping(em_tree, start, (u64)-1);
- if (!em)
- break;
- map = em->map_lookup;
- for (i = 0; i < map->num_stripes; i++)
- if (srcdev == map->stripes[i].dev)
- map->stripes[i].dev = tgtdev;
- start = em->start + em->len;
- free_extent_map(em);
- } while (start);
- write_unlock(&em_tree->lock);
+ return 0;
}
/*
@@ -983,7 +1019,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
* should never allow both to start and pause. We don't want to allow
* dev-replace to start anyway.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
down_write(&dev_replace->rwsem);
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
@@ -1020,7 +1056,7 @@ static int btrfs_dev_replace_kthread(void *data)
ret = btrfs_dev_replace_finishing(fs_info, ret);
WARN_ON(ret && ret != -ECANCELED);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return 0;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9ae25f632157..8e3438672a82 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,7 +50,6 @@
BTRFS_SUPER_FLAG_METADUMP |\
BTRFS_SUPER_FLAG_METADUMP_V2)
-static const struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -205,53 +204,6 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
#endif
/*
- * extents on the btree inode are pretty simple, there's one extent
- * that covers the entire device
- */
-struct extent_map *btree_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len)
-{
- struct extent_map_tree *em_tree = &inode->extent_tree;
- struct extent_map *em;
- int ret;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
- if (em) {
- read_unlock(&em_tree->lock);
- goto out;
- }
- read_unlock(&em_tree->lock);
-
- em = alloc_extent_map();
- if (!em) {
- em = ERR_PTR(-ENOMEM);
- goto out;
- }
- em->start = 0;
- em->len = (u64)-1;
- em->block_len = (u64)-1;
- em->block_start = 0;
-
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
- if (ret == -EEXIST) {
- free_extent_map(em);
- em = lookup_extent_mapping(em_tree, start, len);
- if (!em)
- em = ERR_PTR(-EIO);
- } else if (ret) {
- free_extent_map(em);
- em = ERR_PTR(ret);
- }
- write_unlock(&em_tree->lock);
-
-out:
- return em;
-}
-
-/*
* Compute the csum of a btree block and store the result to provided buffer.
*/
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
@@ -545,38 +497,35 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
static int check_tree_block_fsid(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
u8 fsid[BTRFS_FSID_SIZE];
- int ret = 1;
+ u8 *metadata_uuid;
read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
BTRFS_FSID_SIZE);
- while (fs_devices) {
- u8 *metadata_uuid;
+ /*
+ * Checking the incompat flag is only valid for the current fs. For
+ * seed devices it's forbidden to have their uuid changed so reading
+ * ->fsid in this case is fine
+ */
+ if (btrfs_fs_incompat(fs_info, METADATA_UUID))
+ metadata_uuid = fs_devices->metadata_uuid;
+ else
+ metadata_uuid = fs_devices->fsid;
- /*
- * Checking the incompat flag is only valid for the current
- * fs. For seed devices it's forbidden to have their uuid
- * changed so reading ->fsid in this case is fine
- */
- if (fs_devices == fs_info->fs_devices &&
- btrfs_fs_incompat(fs_info, METADATA_UUID))
- metadata_uuid = fs_devices->metadata_uuid;
- else
- metadata_uuid = fs_devices->fsid;
+ if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
+ return 0;
- if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) {
- ret = 0;
- break;
- }
- fs_devices = fs_devices->seed;
- }
- return ret;
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
+ if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
+ return 0;
+
+ return 1;
}
-static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
- u64 phy_offset, struct page *page,
- u64 start, u64 end, int mirror)
+int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end,
+ int mirror)
{
u64 found_start;
int found_level;
@@ -636,16 +585,15 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
csum_tree_block(eb, result);
if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
- u32 val;
- u32 found = 0;
-
- memcpy(&found, result, csum_size);
+ u8 val[BTRFS_CSUM_SIZE] = { 0 };
read_extent_buffer(eb, &val, 0, csum_size);
btrfs_warn_rl(fs_info,
- "%s checksum verify failed on %llu wanted %x found %x level %d",
+ "%s checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
fs_info->sb->s_id, eb->start,
- val, found, btrfs_header_level(eb));
+ CSUM_FMT_VALUE(csum_size, val),
+ CSUM_FMT_VALUE(csum_size, result),
+ btrfs_header_level(eb));
ret = -EUCLEAN;
goto err;
}
@@ -865,9 +813,8 @@ static int check_async_write(struct btrfs_fs_info *fs_info,
return 1;
}
-static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio,
- int mirror_num,
- unsigned long bio_flags)
+blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int async = check_async_write(fs_info, BTRFS_I(inode));
@@ -952,11 +899,6 @@ static int btree_writepages(struct address_space *mapping,
return btree_write_cache_pages(mapping, wbc);
}
-static int btree_readpage(struct file *file, struct page *page)
-{
- return extent_read_full_page(page, btree_get_extent, 0);
-}
-
static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
if (PageWriteback(page) || PageDirty(page))
@@ -996,7 +938,6 @@ static int btree_set_page_dirty(struct page *page)
}
static const struct address_space_operations btree_aops = {
- .readpage = btree_readpage,
.writepages = btree_writepages,
.releasepage = btree_releasepage,
.invalidatepage = btree_invalidatepage,
@@ -1209,7 +1150,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = 0;
- leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
+ BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
@@ -1281,7 +1223,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
*/
leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
- NULL, 0, 0, 0);
+ NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
btrfs_put_root(root);
return ERR_CAST(leaf);
@@ -1506,10 +1448,12 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
struct btrfs_root *root;
while (!list_empty(&fs_info->allocated_roots)) {
+ char buf[BTRFS_ROOT_NAME_BUF_LEN];
+
root = list_first_entry(&fs_info->allocated_roots,
struct btrfs_root, leak_list);
- btrfs_err(fs_info, "leaked root %llu-%llu refcount %d",
- root->root_key.objectid, root->root_key.offset,
+ btrfs_err(fs_info, "leaked root %s refcount %d",
+ btrfs_root_name(root->root_key.objectid, buf),
refcount_read(&root->refs));
while (refcount_read(&root->refs) > 1)
btrfs_put_root(root);
@@ -2116,12 +2060,10 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
- IO_TREE_INODE_IO, inode);
+ IO_TREE_BTREE_INODE_IO, inode);
BTRFS_I(inode)->io_tree.track_uptodate = false;
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
- BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops;
-
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
@@ -2627,18 +2569,17 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
level = btrfs_super_root_level(sb);
tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
generation, level, NULL);
- if (IS_ERR(tree_root->node) ||
- !extent_buffer_uptodate(tree_root->node)) {
+ if (IS_ERR(tree_root->node)) {
handle_error = true;
+ ret = PTR_ERR(tree_root->node);
+ tree_root->node = NULL;
+ btrfs_warn(fs_info, "couldn't read tree root");
+ continue;
- if (IS_ERR(tree_root->node)) {
- ret = PTR_ERR(tree_root->node);
- tree_root->node = NULL;
- } else if (!extent_buffer_uptodate(tree_root->node)) {
- ret = -EUCLEAN;
- }
-
- btrfs_warn(fs_info, "failed to read tree root");
+ } else if (!extent_buffer_uptodate(tree_root->node)) {
+ handle_error = true;
+ ret = -EIO;
+ btrfs_warn(fs_info, "error while reading tree root");
continue;
}
@@ -2754,7 +2695,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->check_integrity_print_mask = 0;
#endif
btrfs_init_balance(fs_info);
- btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
+ btrfs_init_async_reclaim_work(fs_info);
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT;
@@ -2929,7 +2870,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
/*
- * Verify the type first, if that or the the checksum value are
+ * Verify the type first, if that or the checksum value are
* corrupted, we'll find out
*/
csum_type = btrfs_super_csum_type(disk_super);
@@ -3091,8 +3032,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_sb_buffer;
}
- sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
@@ -3418,6 +3357,8 @@ fail_block_groups:
btrfs_put_block_group_cache(fs_info);
fail_tree_roots:
+ if (fs_info->data_reloc_root)
+ btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
free_root_pointers(fs_info, true);
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
@@ -3481,8 +3422,12 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
return ERR_CAST(page);
super = page_address(page);
- if (btrfs_super_bytenr(super) != bytenr ||
- btrfs_super_magic(super) != BTRFS_MAGIC) {
+ if (btrfs_super_magic(super) != BTRFS_MAGIC) {
+ btrfs_release_disk_super(super);
+ return ERR_PTR(-ENODATA);
+ }
+
+ if (btrfs_super_bytenr(super) != bytenr) {
btrfs_release_disk_super(super);
return ERR_PTR(-EINVAL);
}
@@ -4055,6 +4000,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_cleanup_defrag_inodes(fs_info);
cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
@@ -4551,6 +4497,7 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
cache->io_ctl.inode = NULL;
iput(inode);
}
+ ASSERT(cache->io_ctl.pages == NULL);
btrfs_put_block_group(cache);
}
@@ -4685,9 +4632,3 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
return 0;
}
-
-static const struct extent_io_ops btree_extent_io_ops = {
- /* mandatory callbacks */
- .submit_bio_hook = btree_submit_bio_hook,
- .readpage_end_io_hook = btree_readpage_end_io_hook,
-};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 00dc39d47ed3..fee69ced58b4 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -76,7 +76,11 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
-
+int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end,
+ int mirror);
+blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
@@ -123,9 +127,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid);
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
-struct extent_map *btree_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len);
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int __init btrfs_end_io_wq_init(void);
void __cold btrfs_end_io_wq_exit(void);
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 219a09a2b734..9800a8306368 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -40,6 +40,7 @@ struct io_failure_record;
enum {
IO_TREE_FS_PINNED_EXTENTS,
IO_TREE_FS_EXCLUDED_EXTENTS,
+ IO_TREE_BTREE_INODE_IO,
IO_TREE_INODE_IO,
IO_TREE_INODE_IO_FAILURE,
IO_TREE_RELOC_BLOCKS,
@@ -48,6 +49,7 @@ enum {
IO_TREE_INODE_FILE_EXTENT,
IO_TREE_LOG_CSUM_RANGE,
IO_TREE_SELFTEST,
+ IO_TREE_DEVICE_ALLOC_STATE,
};
struct extent_io_tree {
@@ -61,7 +63,6 @@ struct extent_io_tree {
u8 owner;
spinlock_t lock;
- const struct extent_io_ops *ops;
};
struct extent_state {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index de6fe176fdfb..3b21fee13e77 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -400,12 +400,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
ASSERT(eb->fs_info);
/*
- * Every shared one has parent tree
- * block, which must be aligned to
- * nodesize.
+ * Every shared one has parent tree block,
+ * which must be aligned to sector size.
*/
if (offset &&
- IS_ALIGNED(offset, eb->fs_info->nodesize))
+ IS_ALIGNED(offset, eb->fs_info->sectorsize))
return type;
}
} else if (is_data == BTRFS_REF_TYPE_DATA) {
@@ -414,12 +413,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
if (type == BTRFS_SHARED_DATA_REF_KEY) {
ASSERT(eb->fs_info);
/*
- * Every shared one has parent tree
- * block, which must be aligned to
- * nodesize.
+ * Every shared one has parent tree block,
+ * which must be aligned to sector size.
*/
if (offset &&
- IS_ALIGNED(offset, eb->fs_info->nodesize))
+ IS_ALIGNED(offset, eb->fs_info->sectorsize))
return type;
}
} else {
@@ -429,8 +427,9 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
}
btrfs_print_leaf((struct extent_buffer *)eb);
- btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
- eb->start, type);
+ btrfs_err(eb->fs_info,
+ "eb %llu iref 0x%lx invalid extent inline ref type %d",
+ eb->start, (unsigned long)iref, type);
WARN_ON(1);
return BTRFS_REF_TYPE_INVALID;
@@ -1178,7 +1177,22 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
num_bytes, parent, root_objectid,
owner, offset, 1);
if (ret == 0) {
- BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
+ /*
+ * We're adding refs to a tree block we already own, this
+ * should not happen at all.
+ */
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_crit(trans->fs_info,
+"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
+ bytenr, num_bytes, root_objectid);
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
+ WARN_ON(1);
+ btrfs_crit(trans->fs_info,
+ "path->slots[0]=%d path->nodes[0]:", path->slots[0]);
+ btrfs_print_leaf(path->nodes[0]);
+ }
+ return -EUCLEAN;
+ }
update_inline_extent_backref(path, iref, refs_to_add,
extent_op, NULL);
} else if (ret == -ENOENT) {
@@ -1398,6 +1412,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
/*
* __btrfs_inc_extent_ref - insert backreference for a given extent
*
+ * The counterpart is in __btrfs_free_extent(), with examples and more details
+ * how it works.
+ *
* @trans: Handle of transaction
*
* @node: The delayed ref node used to get the bytenr/length for
@@ -2306,7 +2323,8 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
static noinline int check_committed_ref(struct btrfs_root *root,
struct btrfs_path *path,
- u64 objectid, u64 offset, u64 bytenr)
+ u64 objectid, u64 offset, u64 bytenr,
+ bool strict)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *extent_root = fs_info->extent_root;
@@ -2348,9 +2366,13 @@ static noinline int check_committed_ref(struct btrfs_root *root,
btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
goto out;
- /* If extent created before last snapshot => it's definitely shared */
- if (btrfs_extent_generation(leaf, ei) <=
- btrfs_root_last_snapshot(&root->root_item))
+ /*
+ * If extent created before last snapshot => it's shared unless the
+ * snapshot has been deleted. Use the heuristic if strict is false.
+ */
+ if (!strict &&
+ (btrfs_extent_generation(leaf, ei) <=
+ btrfs_root_last_snapshot(&root->root_item)))
goto out;
iref = (struct btrfs_extent_inline_ref *)(ei + 1);
@@ -2375,7 +2397,7 @@ out:
}
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
- u64 bytenr)
+ u64 bytenr, bool strict)
{
struct btrfs_path *path;
int ret;
@@ -2386,7 +2408,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
do {
ret = check_committed_ref(root, path, objectid,
- offset, bytenr);
+ offset, bytenr, strict);
if (ret && ret != -ENOENT)
goto out;
@@ -2845,11 +2867,10 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
len -= to_add;
}
spin_unlock(&global_rsv->lock);
- /* Add to any tickets we may have */
- if (len)
- btrfs_try_granting_tickets(fs_info,
- space_info);
}
+ /* Add to any tickets we may have */
+ if (!readonly && return_free_space && len)
+ btrfs_try_granting_tickets(fs_info, space_info);
spin_unlock(&space_info->lock);
}
@@ -2931,6 +2952,65 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
return 0;
}
+/*
+ * Drop one or more refs of @node.
+ *
+ * 1. Locate the extent refs.
+ * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
+ * Locate it, then reduce the refs number or remove the ref line completely.
+ *
+ * 2. Update the refs count in EXTENT/METADATA_ITEM
+ *
+ * Inline backref case:
+ *
+ * in extent tree we have:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
+ * refs 2 gen 6 flags DATA
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
+ * extent data backref root FS_TREE objectid 257 offset 0 count 1
+ *
+ * This function gets called with:
+ *
+ * node->bytenr = 13631488
+ * node->num_bytes = 1048576
+ * root_objectid = FS_TREE
+ * owner_objectid = 257
+ * owner_offset = 0
+ * refs_to_drop = 1
+ *
+ * Then we should get some like:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
+ * refs 1 gen 6 flags DATA
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
+ *
+ * Keyed backref case:
+ *
+ * in extent tree we have:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
+ * refs 754 gen 6 flags DATA
+ * [...]
+ * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
+ * extent data backref root FS_TREE objectid 866 offset 0 count 1
+ *
+ * This function get called with:
+ *
+ * node->bytenr = 13631488
+ * node->num_bytes = 1048576
+ * root_objectid = FS_TREE
+ * owner_objectid = 866
+ * owner_offset = 0
+ * refs_to_drop = 1
+ *
+ * Then we should get some like:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
+ * refs 753 gen 6 flags DATA
+ *
+ * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
+ */
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
@@ -2963,7 +3043,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
- BUG_ON(!is_data && refs_to_drop != 1);
+
+ if (!is_data && refs_to_drop != 1) {
+ btrfs_crit(info,
+"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
+ node->bytenr, refs_to_drop);
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
if (is_data)
skinny_metadata = false;
@@ -2972,6 +3060,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
parent, root_objectid, owner_objectid,
owner_offset);
if (ret == 0) {
+ /*
+ * Either the inline backref or the SHARED_DATA_REF/
+ * SHARED_BLOCK_REF is found
+ *
+ * Here is a quick path to locate EXTENT/METADATA_ITEM.
+ * It's possible the EXTENT/METADATA_ITEM is near current slot.
+ */
extent_slot = path->slots[0];
while (extent_slot >= 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key,
@@ -2988,13 +3083,21 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
found_extent = 1;
break;
}
+
+ /* Quick path didn't find the EXTEMT/METADATA_ITEM */
if (path->slots[0] - extent_slot > 5)
break;
extent_slot--;
}
if (!found_extent) {
- BUG_ON(iref);
+ if (iref) {
+ btrfs_crit(info,
+"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
+ /* Must be SHARED_* item, remove the backref first */
ret = remove_extent_backref(trans, path, NULL,
refs_to_drop,
is_data, &last_ref);
@@ -3005,6 +3108,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
path->leave_spinning = 1;
+ /* Slow path to locate EXTENT/METADATA_ITEM */
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = num_bytes;
@@ -3079,19 +3183,26 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
key.type == BTRFS_EXTENT_ITEM_KEY) {
struct btrfs_tree_block_info *bi;
- BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
+ if (item_size < sizeof(*ei) + sizeof(*bi)) {
+ btrfs_crit(info,
+"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %lu",
+ key.objectid, key.type, key.offset,
+ owner_objectid, item_size,
+ sizeof(*ei) + sizeof(*bi));
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
bi = (struct btrfs_tree_block_info *)(ei + 1);
WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
}
refs = btrfs_extent_refs(leaf, ei);
if (refs < refs_to_drop) {
- btrfs_err(info,
- "trying to drop %d refs but we only have %Lu for bytenr %Lu",
+ btrfs_crit(info,
+ "trying to drop %d refs but we only have %llu for bytenr %llu",
refs_to_drop, refs, bytenr);
- ret = -EINVAL;
- btrfs_abort_transaction(trans, ret);
- goto out;
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
}
refs -= refs_to_drop;
@@ -3103,7 +3214,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
* be updated by remove_extent_backref
*/
if (iref) {
- BUG_ON(!found_extent);
+ if (!found_extent) {
+ btrfs_crit(info,
+"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
} else {
btrfs_set_extent_refs(leaf, ei, refs);
btrfs_mark_buffer_dirty(leaf);
@@ -3118,13 +3234,39 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
} else {
+ /* In this branch refs == 1 */
if (found_extent) {
- BUG_ON(is_data && refs_to_drop !=
- extent_data_ref_count(path, iref));
+ if (is_data && refs_to_drop !=
+ extent_data_ref_count(path, iref)) {
+ btrfs_crit(info,
+ "invalid refs_to_drop, current refs %u refs_to_drop %u",
+ extent_data_ref_count(path, iref),
+ refs_to_drop);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
if (iref) {
- BUG_ON(path->slots[0] != extent_slot);
+ if (path->slots[0] != extent_slot) {
+ btrfs_crit(info,
+"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
+ key.objectid, key.type,
+ key.offset);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
} else {
- BUG_ON(path->slots[0] != extent_slot + 1);
+ /*
+ * No inline ref, we must be at SHARED_* item,
+ * And it's single ref, it must be:
+ * | extent_slot ||extent_slot + 1|
+ * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
+ */
+ if (path->slots[0] != extent_slot + 1) {
+ btrfs_crit(info,
+ "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
path->slots[0] = extent_slot;
num_to_del = 2;
}
@@ -3165,6 +3307,19 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(path);
return ret;
+err_dump:
+ /*
+ * Leaf dump can take up a lot of log buffer, so we only do full leaf
+ * dump for debug build.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
+ btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
+ path->slots[0], extent_slot);
+ btrfs_print_leaf(path->nodes[0]);
+ }
+
+ btrfs_free_path(path);
+ return -EUCLEAN;
}
/*
@@ -3914,11 +4069,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
* |- Push harder to find free extents
* |- If not found, re-iterate all block groups
*/
-static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
+static noinline int find_free_extent(struct btrfs_root *root,
u64 ram_bytes, u64 num_bytes, u64 empty_size,
u64 hint_byte_orig, struct btrfs_key *ins,
u64 flags, int delalloc)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
int cache_block_group_error = 0;
struct btrfs_block_group *block_group = NULL;
@@ -3950,7 +4106,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
ins->objectid = 0;
ins->offset = 0;
- trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
+ trace_find_free_extent(root, num_bytes, empty_size, flags);
space_info = btrfs_find_space_info(fs_info, flags);
if (!space_info) {
@@ -4199,7 +4355,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
flags = get_alloc_profile_by_root(root, is_data);
again:
WARN_ON(num_bytes < fs_info->sectorsize);
- ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
+ ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
hint_byte, ins, flags, delalloc);
if (!ret && !is_data) {
btrfs_dec_block_group_reservations(fs_info, ins->objectid);
@@ -4500,7 +4656,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- u64 bytenr, int level, u64 owner)
+ u64 bytenr, int level, u64 owner,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *buf;
@@ -4522,8 +4679,8 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return ERR_PTR(-EUCLEAN);
}
- btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
- btrfs_tree_lock(buf);
+ btrfs_set_buffer_lockdep_class(owner, buf, level);
+ __btrfs_tree_lock(buf, nest);
btrfs_clean_tree_block(buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
@@ -4569,7 +4726,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
const struct btrfs_disk_key *key,
int level, u64 hint,
- u64 empty_size)
+ u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key ins;
@@ -4585,7 +4743,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (btrfs_is_testing(fs_info)) {
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
- level, root_objectid);
+ level, root_objectid, nest);
if (!IS_ERR(buf))
root->alloc_bytenr += blocksize;
return buf;
@@ -4602,7 +4760,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
goto out_unuse;
buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
- root_objectid);
+ root_objectid, nest);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_free_reserved;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6def411b2eba..60f5f68d892d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -160,19 +160,20 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits,
return ret;
}
-static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
- unsigned long bio_flags)
+int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
{
blk_status_t ret = 0;
struct extent_io_tree *tree = bio->bi_private;
bio->bi_private = NULL;
- if (tree->ops)
- ret = tree->ops->submit_bio_hook(tree->private_data, bio,
- mirror_num, bio_flags);
+ if (is_data_inode(tree->private_data))
+ ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
+ bio_flags);
else
- btrfsic_submit_bio(bio);
+ ret = btrfs_submit_metadata_bio(tree->private_data, bio,
+ mirror_num, bio_flags);
return blk_status_to_errno(ret);
}
@@ -280,7 +281,6 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,
{
tree->fs_info = fs_info;
tree->state = RB_ROOT;
- tree->ops = NULL;
tree->dirty_bytes = 0;
spin_lock_init(&tree->lock);
tree->private_data = private_data;
@@ -2819,8 +2819,6 @@ static void end_bio_extent_readpage(struct bio *bio)
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- bool data_inode = btrfs_ino(BTRFS_I(inode))
- != BTRFS_BTREE_INODE_OBJECTID;
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
@@ -2851,9 +2849,12 @@ static void end_bio_extent_readpage(struct bio *bio)
mirror = io_bio->mirror_num;
if (likely(uptodate)) {
- ret = tree->ops->readpage_end_io_hook(io_bio, offset,
- page, start, end,
- mirror);
+ if (is_data_inode(inode))
+ ret = btrfs_verify_data_csum(io_bio, offset, page,
+ start, end, mirror);
+ else
+ ret = btrfs_validate_metadata_buffer(io_bio,
+ offset, page, start, end, mirror);
if (ret)
uptodate = 0;
else
@@ -2866,7 +2867,7 @@ static void end_bio_extent_readpage(struct bio *bio)
if (likely(uptodate))
goto readpage_ok;
- if (data_inode) {
+ if (is_data_inode(inode)) {
/*
* The generic bio_readpage_error handles errors the
@@ -2881,7 +2882,7 @@ static void end_bio_extent_readpage(struct bio *bio)
if (!btrfs_submit_read_repair(inode, bio, offset, page,
start - page_offset(page),
start, end, mirror,
- tree->ops->submit_bio_hook)) {
+ btrfs_submit_data_bio)) {
uptodate = !bio->bi_status;
offset += len;
continue;
@@ -3053,7 +3054,6 @@ static int submit_extent_page(unsigned int opf,
else
contig = bio_end_sector(bio) == sector;
- ASSERT(tree->ops);
if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
can_merge = false;
@@ -3110,8 +3110,7 @@ void set_page_extent_mapped(struct page *page)
static struct extent_map *
__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
- u64 start, u64 len, get_extent_t *get_extent,
- struct extent_map **em_cached)
+ u64 start, u64 len, struct extent_map **em_cached)
{
struct extent_map *em;
@@ -3127,7 +3126,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
*em_cached = NULL;
}
- em = get_extent(BTRFS_I(inode), page, pg_offset, start, len);
+ em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
if (em_cached && !IS_ERR_OR_NULL(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
@@ -3142,12 +3141,9 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
* XXX JDM: This needs looking at to ensure proper page locking
* return 0 on success, otherwise return error
*/
-static int __do_readpage(struct page *page,
- get_extent_t *get_extent,
- struct extent_map **em_cached,
- struct bio **bio, int mirror_num,
- unsigned long *bio_flags, unsigned int read_flags,
- u64 *prev_em_start)
+int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+ struct bio **bio, unsigned long *bio_flags,
+ unsigned int read_flags, u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
u64 start = page_offset(page);
@@ -3209,7 +3205,7 @@ static int __do_readpage(struct page *page,
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
- end - cur + 1, get_extent, em_cached);
+ end - cur + 1, em_cached);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
unlock_extent(tree, cur, end);
@@ -3241,7 +3237,7 @@ static int __do_readpage(struct page *page,
/*
* If we have a file range that points to a compressed extent
- * and it's followed by a consecutive file range that points to
+ * and it's followed by a consecutive file range that points
* to the same compressed extent (possibly with a different
* offset and/or length, so it either points to the whole extent
* or only part of it), we must make sure we do not submit a
@@ -3325,7 +3321,7 @@ static int __do_readpage(struct page *page,
ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
page, offset, disk_io_size,
pg_offset, bio,
- end_bio_extent_readpage, mirror_num,
+ end_bio_extent_readpage, 0,
*bio_flags,
this_bio_flag,
force_bio_submit);
@@ -3362,44 +3358,12 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
for (index = 0; index < nr_pages; index++) {
- __do_readpage(pages[index], btrfs_get_extent, em_cached,
- bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
+ btrfs_do_readpage(pages[index], em_cached, bio, bio_flags,
+ REQ_RAHEAD, prev_em_start);
put_page(pages[index]);
}
}
-static int __extent_read_full_page(struct page *page,
- get_extent_t *get_extent,
- struct bio **bio, int mirror_num,
- unsigned long *bio_flags,
- unsigned int read_flags)
-{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
- int ret;
-
- btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
-
- ret = __do_readpage(page, get_extent, NULL, bio, mirror_num,
- bio_flags, read_flags, NULL);
- return ret;
-}
-
-int extent_read_full_page(struct page *page, get_extent_t *get_extent,
- int mirror_num)
-{
- struct bio *bio = NULL;
- unsigned long bio_flags = 0;
- int ret;
-
- ret = __extent_read_full_page(page, get_extent, &bio, mirror_num,
- &bio_flags, 0);
- if (bio)
- ret = submit_one_bio(bio, mirror_num, bio_flags);
- return ret;
-}
-
static void update_nr_written(struct writeback_control *wbc,
unsigned long nr_written)
{
@@ -4552,7 +4516,7 @@ next:
* helper function for fiemap, which doesn't want to see any holes.
* This maps until we find something past 'last'
*/
-static struct extent_map *get_extent_skip_holes(struct inode *inode,
+static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
u64 offset, u64 last)
{
u64 sectorsize = btrfs_inode_sectorsize(inode);
@@ -4567,7 +4531,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
if (len == 0)
break;
len = ALIGN(len, sectorsize);
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
+ em = btrfs_get_extent_fiemap(inode, offset, len);
if (IS_ERR_OR_NULL(em))
return em;
@@ -4696,7 +4660,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
return ret;
}
-int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
int ret = 0;
@@ -4707,12 +4671,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 last;
u64 last_for_get_extent = 0;
u64 disko = 0;
- u64 isize = i_size_read(inode);
+ u64 isize = i_size_read(&inode->vfs_inode);
struct btrfs_key found_key;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct fiemap_cache cache = { 0 };
struct ulist *roots;
struct ulist *tmp_ulist;
@@ -4743,8 +4707,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* lookup the last file extent. We're not using i_size here
* because there might be preallocation past i_size
*/
- ret = btrfs_lookup_file_extent(NULL, root, path,
- btrfs_ino(BTRFS_I(inode)), -1, 0);
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
+ 0);
if (ret < 0) {
goto out_free_ulist;
} else {
@@ -4758,7 +4722,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
found_type = found_key.type;
/* No extents, but there might be delalloc bits */
- if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) ||
+ if (found_key.objectid != btrfs_ino(inode) ||
found_type != BTRFS_EXTENT_DATA_KEY) {
/* have to trust i_size as the end */
last = (u64)-1;
@@ -4784,7 +4748,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
last_for_get_extent = isize;
}
- lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ lock_extent_bits(&inode->io_tree, start, start + len - 1,
&cached_state);
em = get_extent_skip_holes(inode, start, last_for_get_extent);
@@ -4853,8 +4817,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* then we're just getting a count and we can skip the
* lookup stuff.
*/
- ret = btrfs_check_shared(root,
- btrfs_ino(BTRFS_I(inode)),
+ ret = btrfs_check_shared(root, btrfs_ino(inode),
bytenr, roots, tmp_ulist);
if (ret < 0)
goto out_free;
@@ -4898,7 +4861,7 @@ out_free:
ret = emit_last_fiemap_cache(fieinfo, &cache);
free_extent_map(em);
out:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ unlock_extent_cached(&inode->io_tree, start, start + len - 1,
&cached_state);
out_free_ulist:
@@ -4990,7 +4953,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
rwlock_init(&eb->lock);
atomic_set(&eb->blocking_readers, 0);
eb->blocking_writers = 0;
- eb->lock_nested = false;
+ eb->lock_recursed = false;
init_waitqueue_head(&eb->write_lock_wq);
init_waitqueue_head(&eb->read_lock_wq);
@@ -5574,20 +5537,19 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
ClearPageError(page);
- err = __extent_read_full_page(page,
- btree_get_extent, &bio,
- mirror_num, &bio_flags,
- REQ_META);
+ err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
+ page, page_offset(page), PAGE_SIZE, 0,
+ &bio, end_bio_extent_readpage,
+ mirror_num, 0, 0, false);
if (err) {
- ret = err;
/*
- * We use &bio in above __extent_read_full_page,
- * so we ensure that if it returns error, the
- * current page fails to add itself to bio and
- * it's been unlocked.
- *
- * We must dec io_pages by ourselves.
+ * We failed to submit the bio so it's the
+ * caller's responsibility to perform cleanup
+ * i.e unlock page/set error bit.
*/
+ ret = err;
+ SetPageError(page);
+ unlock_page(page);
atomic_dec(&eb->io_pages);
}
} else {
@@ -5622,6 +5584,36 @@ unlock_exit:
return ret;
}
+static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
+ unsigned long len)
+{
+ btrfs_warn(eb->fs_info,
+ "access to eb bytenr %llu len %lu out of range start %lu len %lu",
+ eb->start, eb->len, start, len);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+
+ return true;
+}
+
+/*
+ * Check if the [start, start + len) range is valid before reading/writing
+ * the eb.
+ * NOTE: @start and @len are offset inside the eb, not logical address.
+ *
+ * Caller should not touch the dst/src memory if this function returns error.
+ */
+static inline int check_eb_range(const struct extent_buffer *eb,
+ unsigned long start, unsigned long len)
+{
+ unsigned long offset;
+
+ /* start, start + len should not go beyond eb->len nor overflow */
+ if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
+ return report_eb_range(eb, start, len);
+
+ return false;
+}
+
void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
unsigned long start, unsigned long len)
{
@@ -5632,12 +5624,8 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
char *dst = (char *)dstv;
unsigned long i = start >> PAGE_SHIFT;
- if (start + len > eb->len) {
- WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
- eb->start, eb->len, start, len);
- memset(dst, 0, len);
+ if (check_eb_range(eb, start, len))
return;
- }
offset = offset_in_page(start);
@@ -5655,9 +5643,9 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
}
}
-int read_extent_buffer_to_user(const struct extent_buffer *eb,
- void __user *dstv,
- unsigned long start, unsigned long len)
+int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
+ void __user *dstv,
+ unsigned long start, unsigned long len)
{
size_t cur;
size_t offset;
@@ -5677,7 +5665,7 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb,
cur = min(len, (PAGE_SIZE - offset));
kaddr = page_address(page);
- if (copy_to_user(dst, kaddr + offset, cur)) {
+ if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
ret = -EFAULT;
break;
}
@@ -5702,8 +5690,8 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long i = start >> PAGE_SHIFT;
int ret = 0;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return -EINVAL;
offset = offset_in_page(start);
@@ -5756,8 +5744,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
char *src = (char *)srcv;
unsigned long i = start >> PAGE_SHIFT;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return;
offset = offset_in_page(start);
@@ -5785,8 +5773,8 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
char *kaddr;
unsigned long i = start >> PAGE_SHIFT;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return;
offset = offset_in_page(start);
@@ -5830,6 +5818,10 @@ void copy_extent_buffer(const struct extent_buffer *dst,
char *kaddr;
unsigned long i = dst_offset >> PAGE_SHIFT;
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(src, src_offset, len))
+ return;
+
WARN_ON(src->len != dst_len);
offset = offset_in_page(dst_offset);
@@ -6019,25 +6011,15 @@ void memcpy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
- struct btrfs_fs_info *fs_info = dst->fs_info;
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
unsigned long dst_i;
unsigned long src_i;
- if (src_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus src_offset %lu move len %lu dst len %lu",
- src_offset, len, dst->len);
- BUG();
- }
- if (dst_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus dst_offset %lu move len %lu dst len %lu",
- dst_offset, len, dst->len);
- BUG();
- }
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(dst, src_offset, len))
+ return;
while (len > 0) {
dst_off_in_page = offset_in_page(dst_offset);
@@ -6064,7 +6046,6 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
- struct btrfs_fs_info *fs_info = dst->fs_info;
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
@@ -6073,18 +6054,9 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_i;
unsigned long src_i;
- if (src_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus src_offset %lu move len %lu len %lu",
- src_offset, len, dst->len);
- BUG();
- }
- if (dst_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus dst_offset %lu move len %lu len %lu",
- dst_offset, len, dst->len);
- BUG();
- }
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(dst, src_offset, len))
+ return;
if (dst_offset < src_offset) {
memcpy_extent_buffer(dst, dst_offset, src_offset, len);
return;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 00a88f2eb5ab..f39d02e7f7ef 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -74,18 +74,6 @@ typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
struct bio *bio, u64 bio_offset);
-struct extent_io_ops {
- /*
- * The following callbacks must be always defined, the function
- * pointer will be called unconditionally.
- */
- submit_bio_hook_t *submit_bio_hook;
- int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
- struct page *page, u64 start, u64 end,
- int mirror);
-};
-
-
#define INLINE_EXTENT_BUFFER_PAGES 16
#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE)
struct extent_buffer {
@@ -102,7 +90,7 @@ struct extent_buffer {
int blocking_writers;
atomic_t blocking_readers;
- bool lock_nested;
+ bool lock_recursed;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
short log_index;
@@ -193,8 +181,11 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
-int extent_read_full_page(struct page *page, get_extent_t *get_extent,
- int mirror_num);
+int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+ unsigned long bio_flags);
+int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+ struct bio **bio, unsigned long *bio_flags,
+ unsigned int read_flags, u64 *prev_em_start);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode);
@@ -203,7 +194,7 @@ int extent_writepages(struct address_space *mapping,
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
void extent_readahead(struct readahead_control *rac);
-int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
void set_page_extent_mapped(struct page *page);
@@ -241,9 +232,9 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
void read_extent_buffer(const struct extent_buffer *eb, void *dst,
unsigned long start,
unsigned long len);
-int read_extent_buffer_to_user(const struct extent_buffer *eb,
- void __user *dst, unsigned long start,
- unsigned long len);
+int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
+ void __user *dst, unsigned long start,
+ unsigned long len);
void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src);
void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
const void *src);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 7d5ec71615b8..8f4f2bd6d9b9 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -318,8 +318,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
if (page_offsets)
offset = page_offset(bvec.bv_page) + bvec.bv_offset;
- count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
- csum, nblocks);
+ count = btrfs_find_ordered_sum(BTRFS_I(inode), offset,
+ disk_bytenr, csum, nblocks);
if (count)
goto found;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index bb824c7cb7c7..0ff659455b1e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1057,11 +1057,7 @@ delete_extent_item:
if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
path->slots[0]++;
}
- setup_items_for_insert(root, path, &key,
- &extent_item_size,
- extent_item_size,
- sizeof(struct btrfs_item) +
- extent_item_size, 1);
+ setup_items_for_insert(root, path, &key, &extent_item_size, 1);
*key_inserted = 1;
}
@@ -1477,9 +1473,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
int ret = 0;
start_pos = round_down(pos, fs_info->sectorsize);
- last_pos = start_pos
- + round_up(pos + write_bytes - start_pos,
- fs_info->sectorsize) - 1;
+ last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
if (start_pos < inode->vfs_inode.i_size) {
struct btrfs_ordered_extent *ordered;
@@ -1497,8 +1491,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_start_ordered_extent(&inode->vfs_inode,
- ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
return -EAGAIN;
}
@@ -1571,7 +1564,7 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
}
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, false);
if (ret <= 0) {
ret = 0;
if (!nowait)
@@ -1872,7 +1865,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
loff_t endbyte;
int err;
- written = generic_file_direct_write(iocb, from);
+ written = btrfs_direct_IO(iocb, from);
if (written < 0 || !iov_iter_count(from))
return written;
@@ -2025,7 +2018,40 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
atomic_inc(&BTRFS_I(inode)->sync_writers);
if (iocb->ki_flags & IOCB_DIRECT) {
+ /*
+ * 1. We must always clear IOCB_DSYNC in order to not deadlock
+ * in iomap, as it calls generic_write_sync() in this case.
+ * 2. If we are async, we can call iomap_dio_complete() either
+ * in
+ *
+ * 2.1. A worker thread from the last bio completed. In this
+ * case we need to mark the btrfs_dio_data that it is
+ * async in order to call generic_write_sync() properly.
+ * This is handled by setting BTRFS_DIO_SYNC_STUB in the
+ * current->journal_info.
+ * 2.2 The submitter context, because all IO completed
+ * before we exited iomap_dio_rw(). In this case we can
+ * just re-set the IOCB_DSYNC on the iocb and we'll do
+ * the sync below. If our ->end_io() gets called and
+ * current->journal_info is set, then we know we're in
+ * our current context and we will clear
+ * current->journal_info to indicate that we need to
+ * sync below.
+ */
+ if (sync) {
+ ASSERT(current->journal_info == NULL);
+ iocb->ki_flags &= ~IOCB_DSYNC;
+ current->journal_info = BTRFS_DIO_SYNC_STUB;
+ }
num_written = __btrfs_direct_write(iocb, from);
+
+ /*
+ * As stated above, we cleared journal_info, so we need to do
+ * the sync ourselves.
+ */
+ if (sync && current->journal_info == NULL)
+ iocb->ki_flags |= IOCB_DSYNC;
+ current->journal_info = NULL;
} else {
num_written = btrfs_buffered_write(iocb, from);
if (num_written > 0)
@@ -2065,12 +2091,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
filp->private_data = NULL;
/*
- * ordered_data_close is set by setattr when we are about to truncate
- * a file from a non-zero size to a zero size. This tries to
- * flush down new bytes that may have been written if the
- * application were using truncate to replace a file in place.
+ * Set by setattr when we are about to truncate a file from a non-zero
+ * size to a zero size. This tries to flush down new bytes that may
+ * have been written if the application were using truncate to replace
+ * a file in place.
*/
- if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
&BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping);
return 0;
@@ -2116,20 +2142,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
int ret = 0, err;
+ u64 len;
+ bool full_sync;
trace_btrfs_sync_file(file, datasync);
btrfs_init_log_ctx(&ctx, inode);
/*
- * Set the range to full if the NO_HOLES feature is not enabled.
- * This is to avoid missing file extent items representing holes after
- * replaying the log.
+ * Always set the range to a full range, otherwise we can get into
+ * several problems, from missing file extent items to represent holes
+ * when not using the NO_HOLES feature, to log tree corruption due to
+ * races between hole detection during logging and completion of ordered
+ * extents outside the range, to missing checksums due to ordered extents
+ * for which we flushed only a subset of their pages.
*/
- if (!btrfs_fs_incompat(fs_info, NO_HOLES)) {
- start = 0;
- end = LLONG_MAX;
- }
+ start = 0;
+ end = LLONG_MAX;
+ len = (u64)LLONG_MAX + 1;
/*
* We write the dirty pages in the range and wait until they complete
@@ -2153,19 +2183,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
/*
- * If the inode needs a full sync, make sure we use a full range to
- * avoid log tree corruption, due to hole detection racing with ordered
- * extent completion for adjacent ranges and races between logging and
- * completion of ordered extents for adjancent ranges - both races
- * could lead to file extent items in the log with overlapping ranges.
- * Do this while holding the inode lock, to avoid races with other
- * tasks.
+ * Always check for the full sync flag while holding the inode's lock,
+ * to avoid races with other tasks. The flag must be either set all the
+ * time during logging or always off all the time while logging.
*/
- if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags)) {
- start = 0;
- end = LLONG_MAX;
- }
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
/*
* Before we acquired the inode's lock, someone may have dirtied more
@@ -2196,20 +2219,42 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* We have to do this here to avoid the priority inversion of waiting on
* IO of a lower priority task while holding a transaction open.
*
- * Also, the range length can be represented by u64, we have to do the
- * typecasts to avoid signed overflow if it's [0, LLONG_MAX].
+ * For a full fsync we wait for the ordered extents to complete while
+ * for a fast fsync we wait just for writeback to complete, and then
+ * attach the ordered extents to the transaction so that a transaction
+ * commit waits for their completion, to avoid data loss if we fsync,
+ * the current transaction commits before the ordered extents complete
+ * and a power failure happens right after that.
*/
- ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1);
- if (ret) {
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ if (full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ } else {
+ /*
+ * Get our ordered extents as soon as possible to avoid doing
+ * checksum lookups in the csum tree, and use instead the
+ * checksums attached to the ordered extents.
+ */
+ btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
+ &ctx.ordered_extents);
+ ret = filemap_fdatawait_range(inode->i_mapping, start, end);
}
+
+ if (ret)
+ goto out_release_extents;
+
atomic_inc(&root->log_batch);
+ /*
+ * If we are doing a fast fsync we can not bail out if the inode's
+ * last_trans is <= then the last committed transaction, because we only
+ * update the last_trans of the inode during ordered extent completion,
+ * and for a fast fsync we don't wait for that, we only wait for the
+ * writeback to complete.
+ */
smp_mb();
if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) ||
- BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed) {
+ (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed &&
+ (full_sync || list_empty(&ctx.ordered_extents)))) {
/*
* We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
@@ -2225,9 +2270,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* checked called fsync.
*/
ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ goto out_release_extents;
}
/*
@@ -2244,12 +2287,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ goto out_release_extents;
}
- ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx);
+ ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
+ btrfs_release_log_ctx_extents(&ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
@@ -2276,6 +2318,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
}
}
+ if (!full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ }
ret = btrfs_commit_transaction(trans);
} else {
ret = btrfs_end_transaction(trans);
@@ -2286,6 +2335,12 @@ out:
if (!ret)
ret = err;
return ret > 0 ? -EIO : ret;
+
+out_release_extents:
+ btrfs_release_log_ctx_extents(&ctx);
+ up_write(&BTRFS_I(inode)->dio_sem);
+ inode_unlock(inode);
+ goto out;
}
static const struct vm_operations_struct btrfs_file_vm_ops = {
@@ -2481,7 +2536,8 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
+ lockend);
/*
* We need to make sure we have no ordered extents in this range
@@ -2509,11 +2565,11 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
return 0;
}
-static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
+static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_path *path,
- struct btrfs_clone_extent_info *clone_info,
- const u64 clone_len)
+ struct btrfs_replace_extent_info *extent_info,
+ const u64 replace_len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2522,51 +2578,69 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int slot;
struct btrfs_ref ref = { 0 };
- u64 ref_offset;
int ret;
- if (clone_len == 0)
+ if (replace_len == 0)
return 0;
- if (clone_info->disk_offset == 0 &&
+ if (extent_info->disk_offset == 0 &&
btrfs_fs_incompat(fs_info, NO_HOLES))
return 0;
key.objectid = btrfs_ino(BTRFS_I(inode));
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = clone_info->file_offset;
+ key.offset = extent_info->file_offset;
ret = btrfs_insert_empty_item(trans, root, path, &key,
- clone_info->item_size);
+ sizeof(struct btrfs_file_extent_item));
if (ret)
return ret;
leaf = path->nodes[0];
slot = path->slots[0];
- write_extent_buffer(leaf, clone_info->extent_buf,
+ write_extent_buffer(leaf, extent_info->extent_buf,
btrfs_item_ptr_offset(leaf, slot),
- clone_info->item_size);
+ sizeof(struct btrfs_file_extent_item));
extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset);
- btrfs_set_file_extent_num_bytes(leaf, extent, clone_len);
+ ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
+ btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
+ btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
+ if (extent_info->is_new_extent)
+ btrfs_set_file_extent_generation(leaf, extent, trans->transid);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
- clone_info->file_offset, clone_len);
+ extent_info->file_offset, replace_len);
if (ret)
return ret;
/* If it's a hole, nothing more needs to be done. */
- if (clone_info->disk_offset == 0)
+ if (extent_info->disk_offset == 0)
return 0;
- inode_add_bytes(inode, clone_len);
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
- clone_info->disk_offset,
- clone_info->disk_len, 0);
- ref_offset = clone_info->file_offset - clone_info->data_offset;
- btrfs_init_data_ref(&ref, root->root_key.objectid,
- btrfs_ino(BTRFS_I(inode)), ref_offset);
- ret = btrfs_inc_extent_ref(trans, &ref);
+ inode_add_bytes(inode, replace_len);
+
+ if (extent_info->is_new_extent && extent_info->insertions == 0) {
+ key.objectid = extent_info->disk_offset;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = extent_info->disk_len;
+ ret = btrfs_alloc_reserved_file_extent(trans, root,
+ btrfs_ino(BTRFS_I(inode)),
+ extent_info->file_offset,
+ extent_info->qgroup_reserved,
+ &key);
+ } else {
+ u64 ref_offset;
+
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
+ extent_info->disk_offset,
+ extent_info->disk_len, 0);
+ ref_offset = extent_info->file_offset - extent_info->data_offset;
+ btrfs_init_data_ref(&ref, root->root_key.objectid,
+ btrfs_ino(BTRFS_I(inode)), ref_offset);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ }
+
+ extent_info->insertions++;
return ret;
}
@@ -2574,15 +2648,15 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
/*
* The respective range must have been previously locked, as well as the inode.
* The end offset is inclusive (last byte of the range).
- * @clone_info is NULL for fallocate's hole punching and non-NULL for extent
- * cloning.
- * When cloning, we don't want to end up in a state where we dropped extents
- * without inserting a new one, so we must abort the transaction to avoid a
- * corruption.
+ * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
+ * the file range with an extent.
+ * When not punching a hole, we don't want to end up in a state where we dropped
+ * extents without inserting a new one, so we must abort the transaction to avoid
+ * a corruption.
*/
-int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
+int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
const u64 start, const u64 end,
- struct btrfs_clone_extent_info *clone_info,
+ struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2611,10 +2685,10 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
/*
* 1 - update the inode
* 1 - removing the extents in the range
- * 1 - adding the hole extent if no_holes isn't set or if we are cloning
- * an extent
+ * 1 - adding the hole extent if no_holes isn't set or if we are
+ * replacing the range with a new extent
*/
- if (!btrfs_fs_incompat(fs_info, NO_HOLES) || clone_info)
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
rsv_count = 3;
else
rsv_count = 2;
@@ -2644,14 +2718,15 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
* returned by __btrfs_drop_extents() without having
* changed anything in the file.
*/
- if (clone_info && ret && ret != -EOPNOTSUPP)
+ if (extent_info && !extent_info->is_new_extent &&
+ ret && ret != -EOPNOTSUPP)
btrfs_abort_transaction(trans, ret);
break;
}
trans->block_rsv = &fs_info->trans_block_rsv;
- if (!clone_info && cur_offset < drop_end &&
+ if (!extent_info && cur_offset < drop_end &&
cur_offset < ino_size) {
ret = fill_holes(trans, BTRFS_I(inode), path,
cur_offset, drop_end);
@@ -2665,7 +2740,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
btrfs_abort_transaction(trans, ret);
break;
}
- } else if (!clone_info && cur_offset < drop_end) {
+ } else if (!extent_info && cur_offset < drop_end) {
/*
* We are past the i_size here, but since we didn't
* insert holes we need to clear the mapped area so we
@@ -2685,18 +2760,18 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
}
}
- if (clone_info && drop_end > clone_info->file_offset) {
- u64 clone_len = drop_end - clone_info->file_offset;
+ if (extent_info && drop_end > extent_info->file_offset) {
+ u64 replace_len = drop_end - extent_info->file_offset;
- ret = btrfs_insert_clone_extent(trans, inode, path,
- clone_info, clone_len);
+ ret = btrfs_insert_replace_extent(trans, inode, path,
+ extent_info, replace_len);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
- clone_info->data_len -= clone_len;
- clone_info->data_offset += clone_len;
- clone_info->file_offset += clone_len;
+ extent_info->data_len -= replace_len;
+ extent_info->data_offset += replace_len;
+ extent_info->file_offset += replace_len;
}
cur_offset = drop_end;
@@ -2720,7 +2795,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
- if (!clone_info) {
+ if (!extent_info) {
ret = find_first_non_hole(inode, &cur_offset, &len);
if (unlikely(ret < 0))
break;
@@ -2739,7 +2814,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
* than 16Mb would force the full fsync any way (when
* try_release_extent_mapping() is invoked during page cache truncation.
*/
- if (clone_info)
+ if (extent_info && !extent_info->is_new_extent)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
@@ -2765,7 +2840,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
* (because it's useless) or if it represents a 0 bytes range (when
* cur_offset == drop_end).
*/
- if (!clone_info && cur_offset < ino_size && cur_offset < drop_end) {
+ if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) {
ret = fill_holes(trans, BTRFS_I(inode), path,
cur_offset, drop_end);
if (ret) {
@@ -2773,7 +2848,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
- } else if (!clone_info && cur_offset < drop_end) {
+ } else if (!extent_info && cur_offset < drop_end) {
/* See the comment in the loop above for the reasoning here. */
ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
cur_offset, drop_end - cur_offset);
@@ -2783,9 +2858,9 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
}
}
- if (clone_info) {
- ret = btrfs_insert_clone_extent(trans, inode, path, clone_info,
- clone_info->data_len);
+ if (extent_info) {
+ ret = btrfs_insert_replace_extent(trans, inode, path, extent_info,
+ extent_info->data_len);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
@@ -2840,9 +2915,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out_only_mutex;
}
- lockstart = round_up(offset, btrfs_inode_sectorsize(inode));
+ lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
lockend = round_down(offset + len,
- btrfs_inode_sectorsize(inode)) - 1;
+ btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
/*
@@ -2927,7 +3002,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out;
}
- ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, NULL,
+ ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL,
&trans);
btrfs_free_path(path);
if (ret)
@@ -3044,7 +3119,7 @@ enum {
RANGE_BOUNDARY_HOLE,
};
-static int btrfs_zero_range_check_range_boundary(struct inode *inode,
+static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
u64 offset)
{
const u64 sectorsize = btrfs_inode_sectorsize(inode);
@@ -3052,7 +3127,7 @@ static int btrfs_zero_range_check_range_boundary(struct inode *inode,
int ret;
offset = round_down(offset, sectorsize);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -3077,7 +3152,7 @@ static int btrfs_zero_range(struct inode *inode,
struct extent_changeset *data_reserved = NULL;
int ret;
u64 alloc_hint = 0;
- const u64 sectorsize = btrfs_inode_sectorsize(inode);
+ const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
u64 alloc_start = round_down(offset, sectorsize);
u64 alloc_end = round_up(offset + len, sectorsize);
u64 bytes_to_reserve = 0;
@@ -3167,7 +3242,8 @@ static int btrfs_zero_range(struct inode *inode,
* to cover them.
*/
if (!IS_ALIGNED(offset, sectorsize)) {
- ret = btrfs_zero_range_check_range_boundary(inode, offset);
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
+ offset);
if (ret < 0)
goto out;
if (ret == RANGE_BOUNDARY_HOLE) {
@@ -3183,7 +3259,7 @@ static int btrfs_zero_range(struct inode *inode,
}
if (!IS_ALIGNED(offset + len, sectorsize)) {
- ret = btrfs_zero_range_check_range_boundary(inode,
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
offset + len);
if (ret < 0)
goto out;
@@ -3258,7 +3334,7 @@ static long btrfs_fallocate(struct file *file, int mode,
u64 locked_end;
u64 actual_end = 0;
struct extent_map *em;
- int blocksize = btrfs_inode_sectorsize(inode);
+ int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
int ret;
alloc_start = round_down(offset, blocksize);
@@ -3340,7 +3416,8 @@ static long btrfs_fallocate(struct file *file, int mode,
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
locked_end, &cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, locked_end);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
+ locked_end);
if (ordered &&
ordered->file_offset + ordered->num_bytes > alloc_start &&
@@ -3541,9 +3618,26 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
return generic_file_open(inode, filp);
}
+static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ ssize_t ret = 0;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ inode_lock_shared(inode);
+ ret = btrfs_direct_IO(iocb, to);
+ inode_unlock_shared(inode);
+ if (ret < 0)
+ return ret;
+ }
+
+ return generic_file_buffered_read(iocb, to, ret);
+}
+
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
- .read_iter = generic_file_read_iter,
+ .read_iter = btrfs_file_read_iter,
.splice_read = generic_file_splice_read,
.write_iter = btrfs_file_write_iter,
.splice_write = iter_file_splice_write,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ef0fd7afb0b1..af0013d3df63 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -413,8 +413,6 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
- __le64 *val;
-
io_ctl_map_page(io_ctl, 1);
/*
@@ -429,14 +427,13 @@ static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
io_ctl->size -= sizeof(u64) * 2;
}
- val = io_ctl->cur;
- *val = cpu_to_le64(generation);
+ put_unaligned_le64(generation, io_ctl->cur);
io_ctl->cur += sizeof(u64);
}
static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
- __le64 *gen;
+ u64 cache_gen;
/*
* Skip the crc area. If we don't check crcs then we just have a 64bit
@@ -451,11 +448,11 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
io_ctl->size -= sizeof(u64) * 2;
}
- gen = io_ctl->cur;
- if (le64_to_cpu(*gen) != generation) {
+ cache_gen = get_unaligned_le64(io_ctl->cur);
+ if (cache_gen != generation) {
btrfs_err_rl(io_ctl->fs_info,
"space cache generation (%llu) does not match inode (%llu)",
- *gen, generation);
+ cache_gen, generation);
io_ctl_unmap_page(io_ctl);
return -EIO;
}
@@ -525,8 +522,8 @@ static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
return -ENOSPC;
entry = io_ctl->cur;
- entry->offset = cpu_to_le64(offset);
- entry->bytes = cpu_to_le64(bytes);
+ put_unaligned_le64(offset, &entry->offset);
+ put_unaligned_le64(bytes, &entry->bytes);
entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
BTRFS_FREE_SPACE_EXTENT;
io_ctl->cur += sizeof(struct btrfs_free_space_entry);
@@ -599,8 +596,8 @@ static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
}
e = io_ctl->cur;
- entry->offset = le64_to_cpu(e->offset);
- entry->bytes = le64_to_cpu(e->bytes);
+ entry->offset = get_unaligned_le64(&e->offset);
+ entry->bytes = get_unaligned_le64(&e->bytes);
*type = e->type;
io_ctl->cur += sizeof(struct btrfs_free_space_entry);
io_ctl->size -= sizeof(struct btrfs_free_space_entry);
@@ -1186,7 +1183,6 @@ static int __btrfs_wait_cache_io(struct btrfs_root *root,
ret = update_cache_item(trans, root, inode, path, offset,
io_ctl->entries, io_ctl->bitmaps);
out:
- io_ctl_free(io_ctl);
if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
@@ -1347,13 +1343,14 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* them out later
*/
io_ctl_drop_pages(io_ctl);
+ io_ctl_free(io_ctl);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
i_size_read(inode) - 1, &cached_state);
/*
* at this point the pages are under IO and we're happy,
- * The caller is responsible for waiting on them and updating the
+ * The caller is responsible for waiting on them and updating
* the cache and the inode
*/
io_ctl->entries = entries;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 8b1f5c8897b7..6b9faf3b0e96 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -22,6 +22,10 @@ void set_free_space_tree_thresholds(struct btrfs_block_group *cache)
size_t bitmap_size;
u64 num_bitmaps, total_bitmap_size;
+ if (WARN_ON(cache->length == 0))
+ btrfs_warn(cache->fs_info, "block group %llu length is zero",
+ cache->start);
+
/*
* We convert to bitmaps when the disk space required for using extents
* exceeds that required for using bitmaps.
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 51fcd82d41c0..936c3137c646 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6,7 +6,6 @@
#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
-#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -31,6 +30,7 @@
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
+#include <linux/iomap.h>
#include <asm/unaligned.h>
#include "misc.h"
#include "ctree.h"
@@ -59,9 +59,10 @@ struct btrfs_iget_args {
struct btrfs_dio_data {
u64 reserve;
- u64 unsubmitted_oe_range_start;
- u64 unsubmitted_oe_range_end;
- int overwrite;
+ loff_t length;
+ ssize_t submitted;
+ struct extent_changeset *data_reserved;
+ bool sync;
};
static const struct inode_operations btrfs_dir_inode_operations;
@@ -70,7 +71,6 @@ static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;
-static const struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
@@ -140,13 +140,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
static int btrfs_dirty_inode(struct inode *inode);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode)
-{
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-}
-#endif
-
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
const struct qstr *qstr)
@@ -1610,7 +1603,7 @@ next_slot:
goto out_check;
ret = btrfs_cross_ref_exist(root, ino,
found_key.offset -
- extent_offset, disk_bytenr);
+ extent_offset, disk_bytenr, false);
if (ret) {
/*
* ret could be -EIO if the above fails to read
@@ -2161,11 +2154,8 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
u64 bio_offset)
{
struct inode *inode = private_data;
- blk_status_t ret = 0;
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
- BUG_ON(ret); /* -ENOMEM */
- return 0;
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
}
/*
@@ -2186,9 +2176,8 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
*
* c-3) otherwise: async submit
*/
-static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
- int mirror_num,
- unsigned long bio_flags)
+blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2248,16 +2237,15 @@ out:
* given a list of ordered sums record them in the inode. This happens
* at IO completion time based on sums calculated at bio submission time.
*/
-static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
- struct inode *inode, struct list_head *list)
+static int add_pending_csums(struct btrfs_trans_handle *trans,
+ struct list_head *list)
{
struct btrfs_ordered_sum *sum;
int ret;
list_for_each_entry(sum, list, list) {
trans->adding_csums = true;
- ret = btrfs_csum_file_blocks(trans,
- BTRFS_I(inode)->root->fs_info->csum_root, sum);
+ ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum);
trans->adding_csums = false;
if (ret)
return ret;
@@ -2360,7 +2348,7 @@ again:
unlock_extent_cached(&inode->io_tree, page_start, page_end,
&cached_state);
unlock_page(page);
- btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -2551,7 +2539,6 @@ static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
}
static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
- struct inode *inode,
struct btrfs_ordered_extent *oe)
{
struct btrfs_file_extent_item stack_fi;
@@ -2571,8 +2558,9 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
/* Encryption and other encoding is reserved and all 0 */
- return insert_reserved_file_extent(trans, BTRFS_I(inode), oe->file_offset,
- &stack_fi, oe->qgroup_rsv);
+ return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
+ oe->file_offset, &stack_fi,
+ oe->qgroup_rsv);
}
/*
@@ -2669,8 +2657,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
logical_len);
} else {
BUG_ON(root == fs_info->tree_root);
- ret = insert_ordered_extent_file_extent(trans, inode,
- ordered_extent);
+ ret = insert_ordered_extent_file_extent(trans, ordered_extent);
if (!ret) {
clear_reserved_extent = false;
btrfs_release_delalloc_bytes(fs_info,
@@ -2686,7 +2673,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- ret = add_pending_csums(trans, inode, &ordered_extent->list);
+ ret = add_pending_csums(trans, &ordered_extent->list);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -2755,7 +2742,7 @@ out:
* This needs to be done to make sure anybody waiting knows we are done
* updating everything for this ordered extent.
*/
- btrfs_remove_ordered_extent(inode, ordered_extent);
+ btrfs_remove_ordered_extent(BTRFS_I(inode), ordered_extent);
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
@@ -2775,8 +2762,8 @@ static void finish_ordered_fn(struct btrfs_work *work)
void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
u64 end, int uptodate)
{
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_extent *ordered_extent = NULL;
struct btrfs_workqueue *wq;
@@ -2787,7 +2774,7 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
end - start + 1, uptodate))
return;
- if (btrfs_is_free_space_inode(BTRFS_I(inode)))
+ if (btrfs_is_free_space_inode(inode))
wq = fs_info->endio_freespace_worker;
else
wq = fs_info->endio_write_workers;
@@ -2836,9 +2823,8 @@ zeroit:
* if there's a match, we allow the bio to finish. If not, the code in
* extent_io.c will try to find good copies for us.
*/
-static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
- u64 phy_offset, struct page *page,
- u64 start, u64 end, int mirror)
+int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end, int mirror)
{
size_t offset = start - page_offset(page);
struct inode *inode = page->mapping->host;
@@ -3058,7 +3044,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
if (ret == -ENOENT && root == fs_info->tree_root) {
struct btrfs_root *dead_root;
- struct btrfs_fs_info *fs_info = root->fs_info;
int is_dead_root = 0;
/*
@@ -3398,7 +3383,6 @@ cache_acl:
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
break;
@@ -4054,7 +4038,7 @@ out_end_trans:
err = ret;
inode->i_flags |= S_DEAD;
out_release:
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
out_up_write:
up_write(&fs_info->subvol_sem);
if (err) {
@@ -4586,7 +4570,7 @@ again:
&cached_state);
unlock_page(page);
put_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -4851,19 +4835,16 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
/*
* We're truncating a file that used to have good data down to
- * zero. Make sure it gets into the ordered flush list so that
- * any new writes get down to disk quickly.
+ * zero. Make sure any new writes to the file get on disk
+ * on close.
*/
if (newsize == 0)
- set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
&BTRFS_I(inode)->runtime_flags);
truncate_setsize(inode, newsize);
- /* Disable nonlocked read DIO to avoid the endless truncate */
- btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
inode_dio_wait(inode);
- btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
ret = btrfs_truncate(inode, newsize == oldsize);
if (ret && inode->i_nlink) {
@@ -5308,15 +5289,15 @@ static void inode_tree_add(struct inode *inode)
spin_unlock(&root->inode_lock);
}
-static void inode_tree_del(struct inode *inode)
+static void inode_tree_del(struct btrfs_inode *inode)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
int empty = 0;
spin_lock(&root->inode_lock);
- if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
- rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
- RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+ if (!RB_EMPTY_NODE(&inode->rb_node)) {
+ rb_erase(&inode->rb_node, &root->inode_tree);
+ RB_CLEAR_NODE(&inode->rb_node);
empty = RB_EMPTY_ROOT(&root->inode_tree);
}
spin_unlock(&root->inode_lock);
@@ -6314,7 +6295,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
if (err)
goto out_unlock;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
d_instantiate_new(dentry, inode);
out_unlock:
@@ -6377,7 +6357,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
drop_inode = 1;
} else {
struct dentry *parent = dentry->d_parent;
- int ret;
err = btrfs_update_inode(trans, root, inode);
if (err)
@@ -6392,12 +6371,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
goto fail;
}
d_instantiate(dentry, inode);
- ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
- true, NULL);
- if (ret == BTRFS_NEED_TRANS_COMMIT) {
- err = btrfs_commit_transaction(trans);
- trans = NULL;
- }
+ btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
}
fail:
@@ -6543,8 +6517,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- int ret;
- int err = 0;
+ int ret = 0;
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode);
@@ -6572,7 +6545,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
}
em = alloc_extent_map();
if (!em) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
em->start = EXTENT_MAP_HOLE;
@@ -6582,7 +6555,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
@@ -6595,14 +6568,16 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
*/
path->leave_spinning = 1;
+ path->recurse = btrfs_is_free_space_inode(inode);
+
ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
if (ret < 0) {
- err = ret;
goto out;
} else if (ret > 0) {
if (path->slots[0] == 0)
goto not_found;
path->slots[0]--;
+ ret = 0;
}
leaf = path->nodes[0];
@@ -6628,7 +6603,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
/* Only regular file could have regular/prealloc extent */
if (!S_ISREG(inode->vfs_inode.i_mode)) {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
btrfs_crit(fs_info,
"regular/prealloc extent found for non-regular inode %llu",
btrfs_ino(inode));
@@ -6646,12 +6621,11 @@ next:
path->slots[0]++;
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- } else if (ret > 0) {
+ else if (ret > 0)
goto not_found;
- }
+
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -6702,10 +6676,8 @@ next:
BTRFS_COMPRESS_NONE) {
ret = uncompress_inline(path, page, pg_offset,
extent_offset, item);
- if (ret) {
- err = ret;
+ if (ret)
goto out;
- }
} else {
map = kmap(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
@@ -6729,29 +6701,28 @@ not_found:
em->len = len;
em->block_start = EXTENT_MAP_HOLE;
insert:
+ ret = 0;
btrfs_release_path(path);
if (em->start > start || extent_map_end(em) <= start) {
btrfs_err(fs_info,
"bad extent! em: [%llu %llu] passed [%llu %llu]",
em->start, em->len, start, len);
- err = -EIO;
+ ret = -EIO;
goto out;
}
- err = 0;
write_lock(&em_tree->lock);
- err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
out:
btrfs_free_path(path);
trace_btrfs_get_extent(root, inode, em);
- if (err) {
+ if (ret) {
free_extent_map(em);
- return ERR_PTR(err);
+ return ERR_PTR(ret);
}
- BUG_ON(!em); /* Error is always set */
return em;
}
@@ -6953,6 +6924,8 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
* @orig_start: (optional) Return the original file offset of the file extent
* @orig_len: (optional) Return the original on-disk length of the file extent
* @ram_bytes: (optional) Return the ram_bytes of the file extent
+ * @strict: if true, omit optimizations that might force us into unnecessary
+ * cow. e.g., don't trust generation number.
*
* This function will flush ordered extents in the range to ensure proper
* nocow checks for (nowait == false) case.
@@ -6967,7 +6940,7 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
*/
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes)
+ u64 *ram_bytes, bool strict)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_path *path;
@@ -7045,8 +7018,9 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
* Do the same check as in btrfs_cross_ref_exist but without the
* unnecessary search.
*/
- if (btrfs_file_extent_generation(leaf, fi) <=
- btrfs_root_last_snapshot(&root->root_item))
+ if (!strict &&
+ (btrfs_file_extent_generation(leaf, fi) <=
+ btrfs_root_last_snapshot(&root->root_item)))
goto out;
backref_offset = btrfs_file_extent_offset(leaf, fi);
@@ -7082,7 +7056,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
*/
ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
- key.offset - backref_offset, disk_bytenr);
+ key.offset - backref_offset, disk_bytenr,
+ strict);
if (ret) {
ret = 0;
goto out;
@@ -7110,7 +7085,7 @@ out:
}
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
- struct extent_state **cached_state, int writing)
+ struct extent_state **cached_state, bool writing)
{
struct btrfs_ordered_extent *ordered;
int ret = 0;
@@ -7159,7 +7134,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
*/
if (writing ||
test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
else
ret = -ENOTBLK;
btrfs_put_ordered_extent(ordered);
@@ -7248,30 +7223,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
}
-static int btrfs_get_blocks_direct_read(struct extent_map *em,
- struct buffer_head *bh_result,
- struct inode *inode,
- u64 start, u64 len)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-
- if (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- return -ENOENT;
-
- len = min(len, em->len - (start - em->start));
-
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- set_buffer_mapped(bh_result);
-
- return 0;
-}
-
static int btrfs_get_blocks_direct_write(struct extent_map **map,
- struct buffer_head *bh_result,
struct inode *inode,
struct btrfs_dio_data *dio_data,
u64 start, u64 len)
@@ -7303,7 +7255,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
block_start = em->block_start + (start - em->start);
if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes) == 1 &&
+ &orig_block_len, &ram_bytes, false) == 1 &&
btrfs_inc_nocow_writers(fs_info, block_start)) {
struct extent_map *em2;
@@ -7332,7 +7284,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
}
/* this will cow the extent */
- len = bh_result->b_size;
free_extent_map(em);
*map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
if (IS_ERR(em)) {
@@ -7343,64 +7294,88 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
len = min(len, em->len - (start - em->start));
skip_cow:
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- set_buffer_mapped(bh_result);
-
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- set_buffer_new(bh_result);
-
/*
* Need to update the i_size under the extent lock so buffered
* readers will get the updated i_size when we unlock.
*/
- if (!dio_data->overwrite && start + len > i_size_read(inode))
+ if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
- WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len;
- dio_data->unsubmitted_oe_range_end = start + len;
- current->journal_info = dio_data;
out:
return ret;
}
-static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+ loff_t length, unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em;
struct extent_state *cached_state = NULL;
struct btrfs_dio_data *dio_data = NULL;
- u64 start = iblock << inode->i_blkbits;
u64 lockstart, lockend;
- u64 len = bh_result->b_size;
+ const bool write = !!(flags & IOMAP_WRITE);
int ret = 0;
+ u64 len = length;
+ bool unlock_extents = false;
+ bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB);
+
+ /*
+ * We used current->journal_info here to see if we were sync, but
+ * there's a lot of tests in the enospc machinery to not do flushing if
+ * we have a journal_info set, so we need to clear this out and re-set
+ * it in iomap_end.
+ */
+ ASSERT(current->journal_info == NULL ||
+ current->journal_info == BTRFS_DIO_SYNC_STUB);
+ current->journal_info = NULL;
- if (!create)
+ if (!write)
len = min_t(u64, len, fs_info->sectorsize);
lockstart = start;
lockend = start + len - 1;
- if (current->journal_info) {
- /*
- * Need to pull our outstanding extents and set journal_info to NULL so
- * that anything that needs to check if there's a transaction doesn't get
- * confused.
- */
- dio_data = current->journal_info;
- current->journal_info = NULL;
+ /*
+ * The generic stuff only does filemap_write_and_wait_range, which
+ * isn't enough if we've written compressed pages to this area, so we
+ * need to flush the dirty pages again to make absolutely sure that any
+ * outstanding dirty pages are on disk.
+ */
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags)) {
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ start + length - 1);
+ if (ret)
+ return ret;
}
+ dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
+ if (!dio_data)
+ return -ENOMEM;
+
+ dio_data->sync = sync;
+ dio_data->length = length;
+ if (write) {
+ dio_data->reserve = round_up(length, fs_info->sectorsize);
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
+ &dio_data->data_reserved,
+ start, dio_data->reserve);
+ if (ret) {
+ extent_changeset_free(dio_data->data_reserved);
+ kfree(dio_data);
+ return ret;
+ }
+ }
+ iomap->private = dio_data;
+
+
/*
* If this errors out it's because we couldn't invalidate pagecache for
* this range and we need to fallback to buffered.
*/
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
- create)) {
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
ret = -ENOTBLK;
goto err;
}
@@ -7432,35 +7407,47 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
goto unlock_err;
}
- if (create) {
- ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
- dio_data, start, len);
+ len = min(len, em->len - (start - em->start));
+ if (write) {
+ ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+ start, len);
if (ret < 0)
goto unlock_err;
-
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
+ unlock_extents = true;
+ /* Recalc len in case the new em is smaller than requested */
+ len = min(len, em->len - (start - em->start));
} else {
- ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
- start, len);
- /* Can be negative only if we read from a hole */
- if (ret < 0) {
- ret = 0;
- free_extent_map(em);
- goto unlock_err;
- }
/*
* We need to unlock only the end area that we aren't using.
* The rest is going to be unlocked by the endio routine.
*/
- lockstart = start + bh_result->b_size;
- if (lockstart < lockend) {
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, &cached_state);
- } else {
- free_extent_state(cached_state);
- }
+ lockstart = start + len;
+ if (lockstart < lockend)
+ unlock_extents = true;
+ }
+
+ if (unlock_extents)
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ lockstart, lockend, &cached_state);
+ else
+ free_extent_state(cached_state);
+
+ /*
+ * Translate extent map information to iomap.
+ * We trim the extents (and move the addr) even though iomap code does
+ * that, since we have locked only the parts we are performing I/O in.
+ */
+ if ((em->block_start == EXTENT_MAP_HOLE) ||
+ (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
+ } else {
+ iomap->addr = em->block_start + (start - em->start);
+ iomap->type = IOMAP_MAPPED;
}
+ iomap->offset = start;
+ iomap->bdev = fs_info->fs_devices->latest_bdev;
+ iomap->length = len;
free_extent_map(em);
@@ -7470,8 +7457,63 @@ unlock_err:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
err:
- if (dio_data)
- current->journal_info = dio_data;
+ if (dio_data) {
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ dio_data->data_reserved, start,
+ dio_data->reserve, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
+ extent_changeset_free(dio_data->data_reserved);
+ kfree(dio_data);
+ }
+ return ret;
+}
+
+static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+ int ret = 0;
+ struct btrfs_dio_data *dio_data = iomap->private;
+ size_t submitted = dio_data->submitted;
+ const bool write = !!(flags & IOMAP_WRITE);
+
+ if (!write && (iomap->type == IOMAP_HOLE)) {
+ /* If reading from a hole, unlock and return */
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
+ goto out;
+ }
+
+ if (submitted < length) {
+ pos += submitted;
+ length -= submitted;
+ if (write)
+ __endio_write_update_ordered(BTRFS_I(inode), pos,
+ length, false);
+ else
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1);
+ ret = -ENOTBLK;
+ }
+
+ if (write) {
+ if (dio_data->reserve)
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ dio_data->data_reserved, pos,
+ dio_data->reserve, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
+ extent_changeset_free(dio_data->data_reserved);
+ }
+out:
+ /*
+ * We're all done, we can re-set the current->journal_info now safely
+ * for our endio.
+ */
+ if (dio_data->sync) {
+ ASSERT(current->journal_info == NULL);
+ current->journal_info = BTRFS_DIO_SYNC_STUB;
+ }
+ kfree(dio_data);
+ iomap->private = NULL;
+
return ret;
}
@@ -7495,7 +7537,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
dip->logical_offset + dip->bytes - 1);
}
- dio_end_io(dip->dio_bio);
+ bio_endio(dip->dio_bio);
kfree(dip);
}
@@ -7619,10 +7661,8 @@ static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
struct bio *bio, u64 offset)
{
struct inode *inode = private_data;
- blk_status_t ret;
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1);
- BUG_ON(ret); /* -ENOMEM */
- return 0;
+
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1);
}
static void btrfs_end_dio_bio(struct bio *bio)
@@ -7731,24 +7771,11 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
dip->dio_bio = dio_bio;
refcount_set(&dip->refs, 1);
-
- if (write) {
- struct btrfs_dio_data *dio_data = current->journal_info;
-
- /*
- * Setting range start and end to the same value means that
- * no cleanup will happen in btrfs_direct_IO
- */
- dio_data->unsubmitted_oe_range_end = dip->logical_offset +
- dip->bytes;
- dio_data->unsubmitted_oe_range_start =
- dio_data->unsubmitted_oe_range_end;
- }
return dip;
}
-static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
- loff_t file_offset)
+static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
+ struct bio *dio_bio, loff_t file_offset)
{
const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
@@ -7765,6 +7792,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
+ struct btrfs_dio_data *dio_data = iomap->private;
dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
if (!dip) {
@@ -7773,8 +7801,8 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
file_offset + dio_bio->bi_iter.bi_size - 1);
}
dio_bio->bi_status = BLK_STS_RESOURCE;
- dio_end_io(dio_bio);
- return;
+ bio_endio(dio_bio);
+ return BLK_QC_T_NONE;
}
if (!write && csum) {
@@ -7845,15 +7873,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
goto out_err;
}
+ dio_data->submitted += clone_len;
clone_offset += clone_len;
start_sector += clone_len >> 9;
file_offset += clone_len;
} while (submit_len > 0);
- return;
+ return BLK_QC_T_NONE;
out_err:
dip->dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
+ return BLK_QC_T_NONE;
}
static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
@@ -7889,37 +7919,59 @@ out:
return retval;
}
-static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size,
+ int error, unsigned flags)
+{
+ /*
+ * Now if we're still in the context of our submitter we know we can't
+ * safely run generic_write_sync(), so clear our flag here so that the
+ * caller knows to follow up with a sync.
+ */
+ if (current->journal_info == BTRFS_DIO_SYNC_STUB) {
+ current->journal_info = NULL;
+ return error;
+ }
+
+ if (error)
+ return error;
+
+ if (size) {
+ iocb->ki_flags |= IOCB_DSYNC;
+ return generic_write_sync(iocb, size);
+ }
+
+ return 0;
+}
+
+static const struct iomap_ops btrfs_dio_iomap_ops = {
+ .iomap_begin = btrfs_dio_iomap_begin,
+ .iomap_end = btrfs_dio_iomap_end,
+};
+
+static const struct iomap_dio_ops btrfs_dio_ops = {
+ .submit_io = btrfs_submit_direct,
+};
+
+static const struct iomap_dio_ops btrfs_sync_dops = {
+ .submit_io = btrfs_submit_direct,
+ .end_io = btrfs_maybe_fsync_end_io,
+};
+
+ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_dio_data dio_data = { 0 };
struct extent_changeset *data_reserved = NULL;
loff_t offset = iocb->ki_pos;
size_t count = 0;
- int flags = 0;
- bool wakeup = true;
bool relock = false;
ssize_t ret;
if (check_direct_IO(fs_info, iter, offset))
return 0;
- inode_dio_begin(inode);
-
- /*
- * The generic stuff only does filemap_write_and_wait_range, which
- * isn't enough if we've written compressed pages to this area, so
- * we need to flush the dirty pages again to make absolutely sure
- * that any outstanding dirty pages are on disk.
- */
count = iov_iter_count(iter);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_fdatawrite_range(inode->i_mapping, offset,
- offset + count - 1);
-
if (iov_iter_rw(iter) == WRITE) {
/*
* If the write DIO is beyond the EOF, we need update
@@ -7927,66 +7979,29 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
* not unlock the i_mutex at this case.
*/
if (offset + count <= inode->i_size) {
- dio_data.overwrite = 1;
inode_unlock(inode);
relock = true;
}
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- offset, count);
- if (ret)
- goto out;
-
- /*
- * We need to know how many extents we reserved so that we can
- * do the accounting properly if we go over the number we
- * originally calculated. Abuse current->journal_info for this.
- */
- dio_data.reserve = round_up(count,
- fs_info->sectorsize);
- dio_data.unsubmitted_oe_range_start = (u64)offset;
- dio_data.unsubmitted_oe_range_end = (u64)offset;
- current->journal_info = &dio_data;
down_read(&BTRFS_I(inode)->dio_sem);
- } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
- &BTRFS_I(inode)->runtime_flags)) {
- inode_dio_end(inode);
- flags = DIO_LOCKING | DIO_SKIP_HOLES;
- wakeup = false;
}
- ret = __blockdev_direct_IO(iocb, inode,
- fs_info->fs_devices->latest_bdev,
- iter, btrfs_get_blocks_direct, NULL,
- btrfs_submit_direct, flags);
- if (iov_iter_rw(iter) == WRITE) {
+ /*
+ * We have are actually a sync iocb, so we need our fancy endio to know
+ * if we need to sync.
+ */
+ if (current->journal_info)
+ ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
+ &btrfs_sync_dops, is_sync_kiocb(iocb));
+ else
+ ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
+ &btrfs_dio_ops, is_sync_kiocb(iocb));
+
+ if (ret == -ENOTBLK)
+ ret = 0;
+
+ if (iov_iter_rw(iter) == WRITE)
up_read(&BTRFS_I(inode)->dio_sem);
- current->journal_info = NULL;
- if (ret < 0 && ret != -EIOCBQUEUED) {
- if (dio_data.reserve)
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, offset, dio_data.reserve,
- true);
- /*
- * On error we might have left some ordered extents
- * without submitting corresponding bios for them, so
- * cleanup them up to avoid other tasks getting them
- * and waiting for them to complete forever.
- */
- if (dio_data.unsubmitted_oe_range_start <
- dio_data.unsubmitted_oe_range_end)
- __endio_write_update_ordered(BTRFS_I(inode),
- dio_data.unsubmitted_oe_range_start,
- dio_data.unsubmitted_oe_range_end -
- dio_data.unsubmitted_oe_range_start,
- false);
- } else if (ret >= 0 && (size_t)ret < count)
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- offset, count - (size_t)ret, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), count);
- }
-out:
- if (wakeup)
- inode_dio_end(inode);
+
if (relock)
inode_lock(inode);
@@ -8003,12 +8018,24 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret)
return ret;
- return extent_fiemap(inode, fieinfo, start, len);
+ return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
}
int btrfs_readpage(struct file *file, struct page *page)
{
- return extent_read_full_page(page, btrfs_get_extent, 0);
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ u64 start = page_offset(page);
+ u64 end = start + PAGE_SIZE - 1;
+ unsigned long bio_flags = 0;
+ struct bio *bio = NULL;
+ int ret;
+
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
+
+ ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL);
+ if (bio)
+ ret = submit_one_bio(bio, 0, bio_flags);
+ return ret;
}
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -8092,15 +8119,15 @@ static int btrfs_migratepage(struct address_space *mapping,
static void btrfs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
- struct inode *inode = page->mapping->host;
- struct extent_io_tree *tree;
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct extent_io_tree *tree = &inode->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_SIZE - 1;
u64 start;
u64 end;
- int inode_evicting = inode->i_state & I_FREEING;
+ int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
/*
* we have the page locked, so new writeback can't start,
@@ -8111,7 +8138,6 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
*/
wait_on_page_writeback(page);
- tree = &BTRFS_I(inode)->io_tree;
if (offset) {
btrfs_releasepage(page, GFP_NOFS);
return;
@@ -8121,8 +8147,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
lock_extent_bits(tree, page_start, page_end, &cached_state);
again:
start = page_start;
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
- page_end - start + 1);
+ ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1);
if (ordered) {
end = min(page_end,
ordered->file_offset + ordered->num_bytes - 1);
@@ -8143,7 +8168,7 @@ again:
struct btrfs_ordered_inode_tree *tree;
u64 new_len;
- tree = &BTRFS_I(inode)->ordered_tree;
+ tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
@@ -8182,7 +8207,7 @@ again:
* bit of its io_tree, and free the qgroup reserved data space.
* Since the IO will never happen for this page.
*/
- btrfs_qgroup_free_data(BTRFS_I(inode), NULL, page_start, PAGE_SIZE);
+ btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
@@ -8284,7 +8309,7 @@ again:
unlock_extent_cached(io_tree, page_start, page_end,
&cached_state);
unlock_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -8615,21 +8640,21 @@ void btrfs_free_inode(struct inode *inode)
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
-void btrfs_destroy_inode(struct inode *inode)
+void btrfs_destroy_inode(struct inode *vfs_inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+ struct btrfs_root *root = inode->root;
- WARN_ON(!hlist_empty(&inode->i_dentry));
- WARN_ON(inode->i_data.nrpages);
- WARN_ON(BTRFS_I(inode)->block_rsv.reserved);
- WARN_ON(BTRFS_I(inode)->block_rsv.size);
- WARN_ON(BTRFS_I(inode)->outstanding_extents);
- WARN_ON(BTRFS_I(inode)->delalloc_bytes);
- WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
- WARN_ON(BTRFS_I(inode)->csum_bytes);
- WARN_ON(BTRFS_I(inode)->defrag_bytes);
+ WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
+ WARN_ON(vfs_inode->i_data.nrpages);
+ WARN_ON(inode->block_rsv.reserved);
+ WARN_ON(inode->block_rsv.size);
+ WARN_ON(inode->outstanding_extents);
+ WARN_ON(inode->delalloc_bytes);
+ WARN_ON(inode->new_delalloc_bytes);
+ WARN_ON(inode->csum_bytes);
+ WARN_ON(inode->defrag_bytes);
/*
* This can happen where we create an inode, but somebody else also
@@ -8644,7 +8669,7 @@ void btrfs_destroy_inode(struct inode *inode)
if (!ordered)
break;
else {
- btrfs_err(fs_info,
+ btrfs_err(root->fs_info,
"found ordered extent %llu %llu on inode cleanup",
ordered->file_offset, ordered->num_bytes);
btrfs_remove_ordered_extent(inode, ordered);
@@ -8652,11 +8677,11 @@ void btrfs_destroy_inode(struct inode *inode)
btrfs_put_ordered_extent(ordered);
}
}
- btrfs_qgroup_check_reserved_leak(BTRFS_I(inode));
+ btrfs_qgroup_check_reserved_leak(inode);
inode_tree_del(inode);
- btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
- btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1);
- btrfs_put_root(BTRFS_I(inode)->root);
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+ btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
+ btrfs_put_root(inode->root);
}
int btrfs_drop_inode(struct inode *inode)
@@ -8781,27 +8806,19 @@ static int btrfs_rename_exchange(struct inode *old_dir,
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
struct timespec64 ctime = current_time(old_inode);
- struct dentry *parent;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
u64 old_idx = 0;
u64 new_idx = 0;
int ret;
+ int ret2;
bool root_log_pinned = false;
bool dest_log_pinned = false;
- struct btrfs_log_ctx ctx_root;
- struct btrfs_log_ctx ctx_dest;
- bool sync_log_root = false;
- bool sync_log_dest = false;
- bool commit_transaction = false;
/* we only allow rename subvolume link between subvolumes */
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
return -EXDEV;
- btrfs_init_log_ctx(&ctx_root, old_inode);
- btrfs_init_log_ctx(&ctx_dest, new_inode);
-
/* close the race window with snapshot create/destroy ioctl */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
new_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -8943,30 +8960,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode)->dir_index = new_idx;
if (root_log_pinned) {
- parent = new_dentry->d_parent;
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
- BTRFS_I(old_dir), parent,
- false, &ctx_root);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log_root = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
+ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
+ new_dentry->d_parent);
btrfs_end_log_trans(root);
root_log_pinned = false;
}
if (dest_log_pinned) {
- if (!commit_transaction) {
- parent = old_dentry->d_parent;
- ret = btrfs_log_new_name(trans, BTRFS_I(new_inode),
- BTRFS_I(new_dir), parent,
- false, &ctx_dest);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log_dest = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
- }
+ btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
+ old_dentry->d_parent);
btrfs_end_log_trans(dest);
dest_log_pinned = false;
}
@@ -8999,46 +9000,13 @@ out_fail:
dest_log_pinned = false;
}
}
- if (!ret && sync_log_root && !commit_transaction) {
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root,
- &ctx_root);
- if (ret)
- commit_transaction = true;
- }
- if (!ret && sync_log_dest && !commit_transaction) {
- ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root,
- &ctx_dest);
- if (ret)
- commit_transaction = true;
- }
- if (commit_transaction) {
- /*
- * We may have set commit_transaction when logging the new name
- * in the destination root, in which case we left the source
- * root context in the list of log contextes. So make sure we
- * remove it to avoid invalid memory accesses, since the context
- * was allocated in our stack frame.
- */
- if (sync_log_root) {
- mutex_lock(&root->log_mutex);
- list_del_init(&ctx_root.list);
- mutex_unlock(&root->log_mutex);
- }
- ret = btrfs_commit_transaction(trans);
- } else {
- int ret2;
-
- ret2 = btrfs_end_transaction(trans);
- ret = ret ? ret : ret2;
- }
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
out_notrans:
if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
- ASSERT(list_empty(&ctx_root.list));
- ASSERT(list_empty(&ctx_dest.list));
-
return ret;
}
@@ -9106,11 +9074,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode = d_inode(old_dentry);
u64 index = 0;
int ret;
+ int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
bool log_pinned = false;
- struct btrfs_log_ctx ctx;
- bool sync_log = false;
- bool commit_transaction = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -9260,17 +9226,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
BTRFS_I(old_inode)->dir_index = index;
if (log_pinned) {
- struct dentry *parent = new_dentry->d_parent;
-
- btrfs_init_log_ctx(&ctx, old_inode);
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
- BTRFS_I(old_dir), parent,
- false, &ctx);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
+ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
+ new_dentry->d_parent);
btrfs_end_log_trans(root);
log_pinned = false;
}
@@ -9307,23 +9264,8 @@ out_fail:
btrfs_end_log_trans(root);
log_pinned = false;
}
- if (!ret && sync_log) {
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
- if (ret)
- commit_transaction = true;
- } else if (sync_log) {
- mutex_lock(&root->log_mutex);
- list_del(&ctx.list);
- mutex_unlock(&root->log_mutex);
- }
- if (commit_transaction) {
- ret = btrfs_commit_transaction(trans);
- } else {
- int ret2;
-
- ret2 = btrfs_end_transaction(trans);
- ret = ret ? ret : ret2;
- }
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
@@ -9389,7 +9331,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
+static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot)
{
struct btrfs_inode *binode;
struct inode *inode;
@@ -9429,9 +9371,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
list_add_tail(&work->list, &works);
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
- ret++;
- if (nr != -1 && ret >= nr)
- goto out;
+ if (*nr != U64_MAX) {
+ (*nr)--;
+ if (*nr == 0)
+ goto out;
+ }
cond_resched();
spin_lock(&root->delalloc_lock);
}
@@ -9456,18 +9400,15 @@ out:
int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
+ u64 nr = U64_MAX;
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;
- ret = start_delalloc_inodes(root, -1, true);
- if (ret > 0)
- ret = 0;
- return ret;
+ return start_delalloc_inodes(root, &nr, true);
}
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr)
{
struct btrfs_root *root;
struct list_head splice;
@@ -9490,15 +9431,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = start_delalloc_inodes(root, nr, false);
+ ret = start_delalloc_inodes(root, &nr, false);
btrfs_put_root(root);
if (ret < 0)
goto out;
-
- if (nr != -1) {
- nr -= ret;
- WARN_ON(nr < 0);
- }
spin_lock(&fs_info->delalloc_root_lock);
}
spin_unlock(&fs_info->delalloc_root_lock);
@@ -9569,7 +9505,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err)
@@ -9634,11 +9569,15 @@ out_unlock:
return err;
}
-static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans,
+static struct btrfs_trans_handle *insert_prealloc_file_extent(
+ struct btrfs_trans_handle *trans_in,
struct inode *inode, struct btrfs_key *ins,
u64 file_offset)
{
struct btrfs_file_extent_item stack_fi;
+ struct btrfs_replace_extent_info extent_info;
+ struct btrfs_trans_handle *trans = trans_in;
+ struct btrfs_path *path;
u64 start = ins->objectid;
u64 len = ins->offset;
int ret;
@@ -9655,10 +9594,40 @@ static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans,
ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len);
if (ret < 0)
- return ret;
- return insert_reserved_file_extent(trans, BTRFS_I(inode), file_offset,
- &stack_fi, ret);
+ return ERR_PTR(ret);
+
+ if (trans) {
+ ret = insert_reserved_file_extent(trans, BTRFS_I(inode),
+ file_offset, &stack_fi, ret);
+ if (ret)
+ return ERR_PTR(ret);
+ return trans;
+ }
+
+ extent_info.disk_offset = start;
+ extent_info.disk_len = len;
+ extent_info.data_offset = 0;
+ extent_info.data_len = len;
+ extent_info.file_offset = file_offset;
+ extent_info.extent_buf = (char *)&stack_fi;
+ extent_info.is_new_extent = true;
+ extent_info.qgroup_reserved = ret;
+ extent_info.insertions = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+
+ ret = btrfs_replace_file_extents(inode, path, file_offset,
+ file_offset + len - 1, &extent_info,
+ &trans);
+ btrfs_free_path(path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return trans;
}
+
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint,
@@ -9681,14 +9650,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
if (trans)
own_trans = false;
while (num_bytes > 0) {
- if (own_trans) {
- trans = btrfs_start_transaction(root, 3);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
- }
-
cur_bytes = min_t(u64, num_bytes, SZ_256M);
cur_bytes = max(cur_bytes, min_size);
/*
@@ -9700,11 +9661,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
cur_bytes = min(cur_bytes, last_alloc);
ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
min_size, 0, *alloc_hint, &ins, 1, 0);
- if (ret) {
- if (own_trans)
- btrfs_end_transaction(trans);
+ if (ret)
break;
- }
/*
* We've reserved this space, and thus converted it from
@@ -9717,13 +9675,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
last_alloc = ins.offset;
- ret = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
- if (ret) {
+ trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
btrfs_free_reserved_extent(fs_info, ins.objectid,
ins.offset, 0);
- btrfs_abort_transaction(trans, ret);
- if (own_trans)
- btrfs_end_transaction(trans);
break;
}
@@ -9786,8 +9742,10 @@ next:
break;
}
- if (own_trans)
+ if (own_trans) {
btrfs_end_transaction(trans);
+ trans = NULL;
+ }
}
if (clear_offset < end)
btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
@@ -9866,7 +9824,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
ret = btrfs_init_inode_security(trans, inode, dir, NULL);
if (ret)
@@ -10073,14 +10030,14 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
/*
* Balance or device remove/replace/resize can move stuff around from
- * under us. The EXCL_OP flag makes sure they aren't running/won't run
- * concurrently while we are mapping the swap extents, and
- * fs_info->swapfile_pins prevents them from running while the swap file
- * is active and moving the extents. Note that this also prevents a
- * concurrent device add which isn't actually necessary, but it's not
+ * under us. The exclop protection makes sure they aren't running/won't
+ * run concurrently while we are mapping the swap extents, and
+ * fs_info->swapfile_pins prevents them from running while the swap
+ * file is active and moving the extents. Note that this also prevents
+ * a concurrent device add which isn't actually necessary, but it's not
* really worth the trouble to allow it.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
btrfs_warn(fs_info,
"cannot activate swapfile while exclusive operation is running");
return -EBUSY;
@@ -10136,7 +10093,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
free_extent_map(em);
em = NULL;
- ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL);
+ ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true);
if (ret < 0) {
goto out;
} else if (ret) {
@@ -10226,7 +10183,7 @@ out:
if (ret)
btrfs_swap_deactivate(file);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
if (ret)
return ret;
@@ -10284,12 +10241,6 @@ static const struct file_operations btrfs_dir_file_operations = {
.fsync = btrfs_sync_file,
};
-static const struct extent_io_ops btrfs_extent_io_ops = {
- /* mandatory callbacks */
- .submit_bio_hook = btrfs_submit_bio_hook,
- .readpage_end_io_hook = btrfs_readpage_end_io_hook,
-};
-
/*
* btrfs doesn't support the bmap operation because swapfiles
* use bmap to make a mapping of extents in the file. They assume
@@ -10307,7 +10258,7 @@ static const struct address_space_operations btrfs_aops = {
.writepage = btrfs_writepage,
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
- .direct_IO = btrfs_direct_IO,
+ .direct_IO = noop_direct_IO,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
#ifdef CONFIG_MIGRATION
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bd3511c5ca81..ab408a23ba32 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -378,6 +378,18 @@ static int check_xflags(unsigned int flags)
return 0;
}
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type)
+{
+ return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type);
+}
+
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
+{
+ WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
+ sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
+}
+
/*
* Set the xflags from the internal inode flags. The remaining items of fsxattr
* are zeroed.
@@ -618,7 +630,7 @@ static noinline int create_subvol(struct inode *dir,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
goto fail_free;
}
trans->block_rsv = &block_rsv;
@@ -628,7 +640,8 @@ static noinline int create_subvol(struct inode *dir,
if (ret)
goto fail;
- leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
+ BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
goto fail;
@@ -742,7 +755,7 @@ fail:
kfree(root_item);
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
err = btrfs_commit_transaction(trans);
if (err && !ret)
@@ -856,7 +869,7 @@ fail:
if (ret && pending_snapshot->snap)
pending_snapshot->snap->anon_dev = 0;
btrfs_put_root(pending_snapshot->snap);
- btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
+ btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv);
free_pending:
if (pending_snapshot->anon_dev)
free_anon_bdev(pending_snapshot->anon_dev);
@@ -1306,7 +1319,7 @@ again:
break;
unlock_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
lock_page(page);
/*
@@ -1638,7 +1651,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (ret)
return ret;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) {
mnt_drop_write_file(file);
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
}
@@ -1752,7 +1765,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
out_free:
kfree(vol_args);
out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
mnt_drop_write_file(file);
return ret;
}
@@ -2086,9 +2099,14 @@ static noinline int copy_to_sk(struct btrfs_path *path,
sh.len = item_len;
sh.transid = found_transid;
- /* copy search result header */
- if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
- ret = -EFAULT;
+ /*
+ * Copy search result header. If we fault then loop again so we
+ * can fault in the pages and -EFAULT there if there's a
+ * problem. Otherwise we'll fault and then copy the buffer in
+ * properly this next time through
+ */
+ if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) {
+ ret = 0;
goto out;
}
@@ -2096,10 +2114,14 @@ static noinline int copy_to_sk(struct btrfs_path *path,
if (item_len) {
char __user *up = ubuf + *sk_offset;
- /* copy the item */
- if (read_extent_buffer_to_user(leaf, up,
- item_off, item_len)) {
- ret = -EFAULT;
+ /*
+ * Copy the item, same behavior as above, but reset the
+ * * sk_offset so we copy the full thing again.
+ */
+ if (read_extent_buffer_to_user_nofault(leaf, up,
+ item_off, item_len)) {
+ ret = 0;
+ *sk_offset -= sizeof(sh);
goto out;
}
@@ -2184,6 +2206,11 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = sk->min_offset;
while (1) {
+ ret = fault_in_pages_writeable(ubuf + sk_offset,
+ *buf_size - sk_offset);
+ if (ret)
+ break;
+
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
if (ret != 0) {
if (ret > 0)
@@ -3112,7 +3139,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
vol_args = memdup_user(arg, sizeof(*vol_args));
@@ -3129,7 +3156,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
kfree(vol_args);
out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return ret;
}
@@ -3158,7 +3185,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto out;
}
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out;
}
@@ -3169,7 +3196,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
ret = btrfs_rm_device(fs_info, vol_args->name, 0);
}
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
if (!ret) {
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
@@ -3200,7 +3227,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (ret)
return ret;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out_drop_write;
}
@@ -3218,7 +3245,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
kfree(vol_args);
out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
out_drop_write:
mnt_drop_write_file(file);
@@ -3448,15 +3475,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *tmp;
info = NULL;
- rcu_read_lock();
- list_for_each_entry_rcu(tmp, &fs_info->space_info,
- list) {
+ list_for_each_entry(tmp, &fs_info->space_info, list) {
if (tmp->flags == types[i]) {
info = tmp;
break;
}
}
- rcu_read_unlock();
if (!info)
continue;
@@ -3504,15 +3528,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
break;
info = NULL;
- rcu_read_lock();
- list_for_each_entry_rcu(tmp, &fs_info->space_info,
- list) {
+ list_for_each_entry(tmp, &fs_info->space_info, list) {
if (tmp->flags == types[i]) {
info = tmp;
break;
}
}
- rcu_read_unlock();
if (!info)
continue;
@@ -3722,11 +3743,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
ret = -EROFS;
goto out;
}
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
} else {
ret = btrfs_dev_replace_by_ioctl(fs_info, p);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
}
break;
case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
@@ -3937,7 +3958,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
return ret;
again:
- if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
mutex_lock(&fs_info->balance_mutex);
need_unlock = true;
goto locked;
@@ -3983,7 +4004,6 @@ again:
}
locked:
- BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
if (arg) {
bargs = memdup_user(arg, sizeof(*bargs));
@@ -4038,10 +4058,10 @@ locked:
do_balance:
/*
- * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to
- * btrfs_balance. bctl is freed in reset_balance_state, or, if
- * restriper was paused all the way until unmount, in free_fs_info.
- * The flag should be cleared after reset_balance_state.
+ * Ownership of bctl and exclusive operation goes to btrfs_balance.
+ * bctl is freed in reset_balance_state, or, if restriper was paused
+ * all the way until unmount, in free_fs_info. The flag should be
+ * cleared after reset_balance_state.
*/
need_unlock = false;
@@ -4060,7 +4080,7 @@ out_bargs:
out_unlock:
mutex_unlock(&fs_info->balance_mutex);
if (need_unlock)
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
out:
mnt_drop_write_file(file);
return ret;
@@ -4883,7 +4903,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SYNC: {
int ret;
- ret = btrfs_start_delalloc_roots(fs_info, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, U64_MAX);
if (ret)
return ret;
ret = btrfs_sync_fs(inode->i_sb, 1);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index f75612e18a82..66e02ebdd340 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -57,8 +57,8 @@
* performance reasons.
*
*
- * Lock nesting
- * ------------
+ * Lock recursion
+ * --------------
*
* A write operation on a tree might indirectly start a look up on the same
* tree. This can happen when btrfs_cow_block locks the tree and needs to
@@ -201,7 +201,7 @@ void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
* lock, but it won't change to or away from us. If we have the write
* lock, we are the owner and it'll never change.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner)
+ if (eb->lock_recursed && current->pid == eb->lock_owner)
return;
btrfs_assert_tree_read_locked(eb);
atomic_inc(&eb->blocking_readers);
@@ -225,7 +225,7 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
* lock, but it won't change to or away from us. If we have the write
* lock, we are the owner and it'll never change.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner)
+ if (eb->lock_recursed && current->pid == eb->lock_owner)
return;
if (eb->blocking_writers == 0) {
btrfs_assert_spinning_writers_put(eb);
@@ -244,7 +244,8 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
*
* The rwlock is held upon exit.
*/
-void btrfs_tree_read_lock(struct extent_buffer *eb)
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest,
+ bool recurse)
{
u64 start_ns = 0;
@@ -263,8 +264,9 @@ again:
* depends on this as it may be called on a partly
* (write-)locked tree.
*/
- BUG_ON(eb->lock_nested);
- eb->lock_nested = true;
+ WARN_ON(!recurse);
+ BUG_ON(eb->lock_recursed);
+ eb->lock_recursed = true;
read_unlock(&eb->lock);
trace_btrfs_tree_read_lock(eb, start_ns);
return;
@@ -279,6 +281,11 @@ again:
trace_btrfs_tree_read_lock(eb, start_ns);
}
+void btrfs_tree_read_lock(struct extent_buffer *eb)
+{
+ __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false);
+}
+
/*
* Lock extent buffer for read, optimistically expecting that there are no
* contending blocking writers. If there are, don't wait.
@@ -362,11 +369,11 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
/*
* if we're nested, we have the write lock. No new locking
* is needed as long as we are the lock owner.
- * The write unlock will do a barrier for us, and the lock_nested
+ * The write unlock will do a barrier for us, and the lock_recursed
* field only matters to the lock owner.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = false;
+ if (eb->lock_recursed && current->pid == eb->lock_owner) {
+ eb->lock_recursed = false;
return;
}
btrfs_assert_tree_read_locked(eb);
@@ -388,11 +395,11 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
/*
* if we're nested, we have the write lock. No new locking
* is needed as long as we are the lock owner.
- * The write unlock will do a barrier for us, and the lock_nested
+ * The write unlock will do a barrier for us, and the lock_recursed
* field only matters to the lock owner.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = false;
+ if (eb->lock_recursed && current->pid == eb->lock_owner) {
+ eb->lock_recursed = false;
return;
}
btrfs_assert_tree_read_locked(eb);
@@ -409,7 +416,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
*
* The rwlock is held for write upon exit.
*/
-void btrfs_tree_lock(struct extent_buffer *eb)
+void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
__acquires(&eb->lock)
{
u64 start_ns = 0;
@@ -434,6 +441,11 @@ again:
trace_btrfs_tree_lock(eb, start_ns);
}
+void btrfs_tree_lock(struct extent_buffer *eb)
+{
+ __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
+}
+
/*
* Release the write lock, either blocking or spinning (ie. there's no need
* for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking).
@@ -552,13 +564,14 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
*
* Return: root extent buffer with read lock held
*/
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root,
+ bool recurse)
{
struct extent_buffer *eb;
while (1) {
eb = btrfs_root_node(root);
- btrfs_tree_read_lock(eb);
+ __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, recurse);
if (eb == root->node)
break;
btrfs_tree_read_unlock(eb);
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index d715846c10b8..3ea81ed3320b 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -16,11 +16,81 @@
#define BTRFS_WRITE_LOCK_BLOCKING 3
#define BTRFS_READ_LOCK_BLOCKING 4
+/*
+ * We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at
+ * the time of this patch is 8, which is how many we use. Keep this in mind if
+ * you decide you want to add another subclass.
+ */
+enum btrfs_lock_nesting {
+ BTRFS_NESTING_NORMAL,
+
+ /*
+ * When we COW a block we are holding the lock on the original block,
+ * and since our lockdep maps are rootid+level, this confuses lockdep
+ * when we lock the newly allocated COW'd block. Handle this by having
+ * a subclass for COW'ed blocks so that lockdep doesn't complain.
+ */
+ BTRFS_NESTING_COW,
+
+ /*
+ * Oftentimes we need to lock adjacent nodes on the same level while
+ * still holding the lock on the original node we searched to, such as
+ * for searching forward or for split/balance.
+ *
+ * Because of this we need to indicate to lockdep that this is
+ * acceptable by having a different subclass for each of these
+ * operations.
+ */
+ BTRFS_NESTING_LEFT,
+ BTRFS_NESTING_RIGHT,
+
+ /*
+ * When splitting we will be holding a lock on the left/right node when
+ * we need to cow that node, thus we need a new set of subclasses for
+ * these two operations.
+ */
+ BTRFS_NESTING_LEFT_COW,
+ BTRFS_NESTING_RIGHT_COW,
+
+ /*
+ * When splitting we may push nodes to the left or right, but still use
+ * the subsequent nodes in our path, keeping our locks on those adjacent
+ * blocks. Thus when we go to allocate a new split block we've already
+ * used up all of our available subclasses, so this subclass exists to
+ * handle this case where we need to allocate a new split block.
+ */
+ BTRFS_NESTING_SPLIT,
+
+ /*
+ * When promoting a new block to a root we need to have a special
+ * subclass so we don't confuse lockdep, as it will appear that we are
+ * locking a higher level node before a lower level one. Copying also
+ * has this problem as it appears we're locking the same block again
+ * when we make a snapshot of an existing root.
+ */
+ BTRFS_NESTING_NEW_ROOT,
+
+ /*
+ * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so
+ * add this in here and add a static_assert to keep us from going over
+ * the limit. As of this writing we're limited to 8, and we're
+ * definitely using 8, hence this check to keep us from messing up in
+ * the future.
+ */
+ BTRFS_NESTING_MAX,
+};
+
+static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES,
+ "too many lock subclasses defined");
+
struct btrfs_path;
+void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
void btrfs_tree_lock(struct extent_buffer *eb);
void btrfs_tree_unlock(struct extent_buffer *eb);
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest,
+ bool recurse);
void btrfs_tree_read_lock(struct extent_buffer *eb);
void btrfs_tree_read_unlock(struct extent_buffer *eb);
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
@@ -29,6 +99,14 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
int btrfs_tree_read_lock_atomic(struct extent_buffer *eb);
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
+struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root,
+ bool recurse);
+
+static inline struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+{
+ return __btrfs_read_lock_root_node(root, false);
+}
#ifdef CONFIG_BTRFS_DEBUG
static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) {
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ebac13389e7e..87bac9ecdf4c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -212,11 +212,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
refcount_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
INIT_LIST_HEAD(&entry->list);
+ INIT_LIST_HEAD(&entry->log_list);
INIT_LIST_HEAD(&entry->root_extent_list);
INIT_LIST_HEAD(&entry->work_list);
init_completion(&entry->completion);
- trace_btrfs_ordered_extent_add(&inode->vfs_inode, entry);
+ trace_btrfs_ordered_extent_add(inode, entry);
spin_lock_irq(&tree->lock);
node = tree_insert(&tree->tree, file_offset,
@@ -377,17 +378,16 @@ out:
* test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
* to make sure this function only returns 1 once for a given ordered extent.
*/
-int btrfs_dec_test_ordered_pending(struct inode *inode,
+int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size, int uptodate)
{
- struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
int ret;
- tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irqsave(&tree->lock, flags);
if (cached && *cached) {
entry = *cached;
@@ -408,7 +408,7 @@ have_entry:
}
if (io_size > entry->bytes_left) {
- btrfs_crit(BTRFS_I(inode)->root->fs_info,
+ btrfs_crit(inode->root->fs_info,
"bad ordered accounting left %llu size %llu",
entry->bytes_left, io_size);
}
@@ -441,10 +441,11 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
struct list_head *cur;
struct btrfs_ordered_sum *sum;
- trace_btrfs_ordered_extent_put(entry->inode, entry);
+ trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
if (refcount_dec_and_test(&entry->refs)) {
ASSERT(list_empty(&entry->root_extent_list));
+ ASSERT(list_empty(&entry->log_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
btrfs_add_delayed_iput(entry->inode);
@@ -462,14 +463,14 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
* remove an ordered extent from the tree. No references are dropped
* and waiters are woken up.
*/
-void btrfs_remove_ordered_extent(struct inode *inode,
+void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_inode_tree *tree;
- struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
struct btrfs_root *root = btrfs_inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
+ bool pending;
/* This is paired with btrfs_add_ordered_extent. */
spin_lock(&btrfs_inode->lock);
@@ -491,13 +492,41 @@ void btrfs_remove_ordered_extent(struct inode *inode,
if (tree->last == node)
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
spin_unlock_irq(&tree->lock);
+ /*
+ * The current running transaction is waiting on us, we need to let it
+ * know that we're complete and wake it up.
+ */
+ if (pending) {
+ struct btrfs_transaction *trans;
+
+ /*
+ * The checks for trans are just a formality, it should be set,
+ * but if it isn't we don't want to deref/assert under the spin
+ * lock, so be nice and check if trans is set, but ASSERT() so
+ * if it isn't set a developer will notice.
+ */
+ spin_lock(&fs_info->trans_lock);
+ trans = fs_info->running_transaction;
+ if (trans)
+ refcount_inc(&trans->use_count);
+ spin_unlock(&fs_info->trans_lock);
+
+ ASSERT(trans);
+ if (trans) {
+ if (atomic_dec_and_test(&trans->pending_ordered))
+ wake_up(&trans->pending_wait);
+ btrfs_put_transaction(trans);
+ }
+ }
+
spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
root->nr_ordered_extents--;
- trace_btrfs_ordered_extent_remove(inode, entry);
+ trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
if (!root->nr_ordered_extents) {
spin_lock(&fs_info->ordered_root_lock);
@@ -514,7 +543,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
struct btrfs_ordered_extent *ordered;
ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
- btrfs_start_ordered_extent(ordered->inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
complete(&ordered->completion);
}
@@ -620,12 +649,11 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
* in the extent, and it waits on the io completion code to insert
* metadata into the btree corresponding to the extent
*/
-void btrfs_start_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry,
- int wait)
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
{
u64 start = entry->file_offset;
u64 end = start + entry->num_bytes - 1;
+ struct btrfs_inode *inode = BTRFS_I(entry->inode);
trace_btrfs_ordered_extent_start(inode, entry);
@@ -635,7 +663,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
* for the flusher thread to find them
*/
if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
- filemap_fdatawrite_range(inode->i_mapping, start, end);
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
if (wait) {
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
@@ -679,7 +707,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
end = orig_end;
while (1) {
- ordered = btrfs_lookup_first_ordered_extent(inode, end);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
if (!ordered)
break;
if (ordered->file_offset > orig_end) {
@@ -690,7 +718,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
btrfs_put_ordered_extent(ordered);
break;
}
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
end = ordered->file_offset;
/*
* If the ordered extent had an error save the error but don't
@@ -775,17 +803,45 @@ out:
}
/*
+ * Adds all ordered extents to the given list. The list ends up sorted by the
+ * file_offset of the ordered extents.
+ */
+void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
+ struct list_head *list)
+{
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct rb_node *n;
+
+ ASSERT(inode_is_locked(&inode->vfs_inode));
+
+ spin_lock_irq(&tree->lock);
+ for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+
+ if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+ continue;
+
+ ASSERT(list_empty(&ordered->log_list));
+ list_add_tail(&ordered->log_list, list);
+ refcount_inc(&ordered->refs);
+ }
+ spin_unlock_irq(&tree->lock);
+}
+
+/*
* lookup and return any extent before 'file_offset'. NULL is returned
* if none is found
*/
struct btrfs_ordered_extent *
-btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
+btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- tree = &BTRFS_I(inode)->ordered_tree;
+ tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
@@ -803,20 +859,21 @@ out:
* try to find a checksum. This is used because we allow pages to
* be reclaimed before their checksum is actually put into the btree
*/
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
- u8 *sum, int len)
+int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset,
+ u64 disk_bytenr, u8 *sum, int len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_sum *ordered_sum;
struct btrfs_ordered_extent *ordered;
- struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
unsigned long num_sectors;
unsigned long i;
u32 sectorsize = btrfs_inode_sectorsize(inode);
+ const u8 blocksize_bits = inode->vfs_inode.i_sb->s_blocksize_bits;
const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int index = 0;
- ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), offset);
+ ordered = btrfs_lookup_ordered_extent(inode, offset);
if (!ordered)
return 0;
@@ -824,10 +881,8 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
if (disk_bytenr >= ordered_sum->bytenr &&
disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
- i = (disk_bytenr - ordered_sum->bytenr) >>
- inode->i_sb->s_blocksize_bits;
- num_sectors = ordered_sum->len >>
- inode->i_sb->s_blocksize_bits;
+ i = (disk_bytenr - ordered_sum->bytenr) >> blocksize_bits;
+ num_sectors = ordered_sum->len >> blocksize_bits;
num_sectors = min_t(int, len - index, num_sectors - i);
memcpy(sum + index, ordered_sum->sums + i * csum_size,
num_sectors * csum_size);
@@ -883,7 +938,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
break;
}
unlock_extent_cached(&inode->io_tree, start, end, cachedp);
- btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
}
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index d61ea9c880a3..c3a2325e64a4 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -56,6 +56,12 @@ enum {
BTRFS_ORDERED_TRUNCATED,
/* Regular IO for COW */
BTRFS_ORDERED_REGULAR,
+ /* Used during fsync to track already logged extents */
+ BTRFS_ORDERED_LOGGED,
+ /* We have already logged all the csums of the ordered extent */
+ BTRFS_ORDERED_LOGGED_CSUM,
+ /* We wait for this extent to complete in the current transaction */
+ BTRFS_ORDERED_PENDING,
};
struct btrfs_ordered_extent {
@@ -104,6 +110,9 @@ struct btrfs_ordered_extent {
/* list of checksums for insertion when the extent io is done */
struct list_head list;
+ /* used for fast fsyncs */
+ struct list_head log_list;
+
/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
wait_queue_head_t wait;
@@ -142,9 +151,9 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
}
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
-void btrfs_remove_ordered_extent(struct inode *inode,
+void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
-int btrfs_dec_test_ordered_pending(struct inode *inode,
+int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size, int uptodate);
int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
@@ -165,17 +174,18 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset);
-void btrfs_start_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry, int wait);
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait);
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
-btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
struct btrfs_inode *inode,
u64 file_offset,
u64 len);
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
- u8 *sum, int len);
+void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
+ struct list_head *list);
+int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset,
+ u64 disk_bytenr, u8 *sum, int len);
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
const u64 range_start, const u64 range_len);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 61f44e78e3c9..7695c4783d33 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -7,6 +7,44 @@
#include "disk-io.h"
#include "print-tree.h"
+struct root_name_map {
+ u64 id;
+ char name[16];
+};
+
+static const struct root_name_map root_map[] = {
+ { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" },
+ { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" },
+ { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" },
+ { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" },
+ { BTRFS_FS_TREE_OBJECTID, "FS_TREE" },
+ { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" },
+ { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" },
+ { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" },
+ { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" },
+ { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },
+ { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" },
+};
+
+const char *btrfs_root_name(u64 objectid, char *buf)
+{
+ int i;
+
+ if (objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN,
+ "TREE_RELOC offset=%llu", objectid);
+ return buf;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(root_map); i++) {
+ if (root_map[i].id == objectid)
+ return root_map[i].name;
+ }
+
+ snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, "%llu", objectid);
+ return buf;
+}
+
static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
{
int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
@@ -95,9 +133,10 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
* offset is supposed to be a tree block which
* must be aligned to nodesize.
*/
- if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
- pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n",
- offset, (unsigned long long)eb->fs_info->nodesize);
+ if (!IS_ALIGNED(offset, eb->fs_info->sectorsize))
+ pr_info(
+ "\t\t\t(parent %llu not aligned to sectorsize %u)\n",
+ offset, eb->fs_info->sectorsize);
break;
case BTRFS_EXTENT_DATA_REF_KEY:
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
@@ -112,8 +151,9 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
* must be aligned to nodesize.
*/
if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
- pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n",
- offset, (unsigned long long)eb->fs_info->nodesize);
+ pr_info(
+ "\t\t\t(parent %llu not aligned to sectorsize %u)\n",
+ offset, eb->fs_info->sectorsize);
break;
default:
pr_cont("(extent %llu has INVALID ref type %d)\n",
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index e6bb38fd75ad..78b99385a503 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -6,7 +6,11 @@
#ifndef BTRFS_PRINT_TREE_H
#define BTRFS_PRINT_TREE_H
+/* Buffer size to contain tree name and possibly additional data (offset) */
+#define BTRFS_ROOT_NAME_BUF_LEN 48
+
void btrfs_print_leaf(struct extent_buffer *l);
void btrfs_print_tree(struct extent_buffer *c, bool follow);
+const char *btrfs_root_name(u64 objectid, char *buf);
#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index c0f350c3a0cf..580899bdb991 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2315,7 +2315,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
* Update qgroup rfer/excl counters.
* Rfer update is easy, codes can explain themselves.
*
- * Excl update is tricky, the update is split into 2 part.
+ * Excl update is tricky, the update is split into 2 parts.
* Part 1: Possible exclusive <-> sharing detect:
* | A | !A |
* -------------------------------------
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 243a2e44526e..9d4f5316a7e8 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -767,31 +767,39 @@ static void reada_start_machine_worker(struct btrfs_work *work)
kfree(rmw);
}
-static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+/* Try to start up to 10k READA requests for a group of devices */
+static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_device *device;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 enqueued;
u64 total = 0;
- int i;
+ struct btrfs_device *device;
-again:
do {
enqueued = 0;
- mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (atomic_read(&device->reada_in_flight) <
MAX_IN_FLIGHT)
enqueued += reada_start_machine_dev(device);
}
- mutex_unlock(&fs_devices->device_list_mutex);
total += enqueued;
} while (enqueued && total < 10000);
- if (fs_devices->seed) {
- fs_devices = fs_devices->seed;
- goto again;
- }
+ return total;
+}
+
+static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
+ int i;
+ u64 enqueued = 0;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+
+ enqueued += reada_start_for_fsdevs(fs_devices);
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
+ enqueued += reada_start_for_fsdevs(seed_devs);
+
+ mutex_unlock(&fs_devices->device_list_mutex);
if (enqueued == 0)
return;
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 5cd02514cf4d..99aa87c08912 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -45,7 +45,7 @@ out:
return ret;
}
-static int copy_inline_to_page(struct inode *inode,
+static int copy_inline_to_page(struct btrfs_inode *inode,
const u64 file_offset,
char *inline_data,
const u64 size,
@@ -58,6 +58,7 @@ static int copy_inline_to_page(struct inode *inode,
char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
struct extent_changeset *data_reserved = NULL;
struct page *page = NULL;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
int ret;
ASSERT(IS_ALIGNED(file_offset, block_size));
@@ -68,24 +69,23 @@ static int copy_inline_to_page(struct inode *inode,
* reservation here. Also we must not do the reservation while holding
* a transaction open, otherwise we would deadlock.
*/
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- file_offset, block_size);
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
+ block_size);
if (ret)
goto out;
- page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT,
- btrfs_alloc_write_mask(inode->i_mapping));
+ page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT,
+ btrfs_alloc_write_mask(mapping));
if (!page) {
ret = -ENOMEM;
goto out_unlock;
}
set_page_extent_mapped(page);
- clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end,
+ clear_extent_bit(&inode->io_tree, file_offset, range_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, NULL);
- ret = btrfs_set_extent_delalloc(BTRFS_I(inode), file_offset, range_end,
- 0, NULL);
+ ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
if (ret)
goto out_unlock;
@@ -134,9 +134,9 @@ out_unlock:
put_page(page);
}
if (ret)
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- file_offset, block_size, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), block_size);
+ btrfs_delalloc_release_space(inode, data_reserved, file_offset,
+ block_size, true);
+ btrfs_delalloc_release_extents(inode, block_size);
out:
extent_changeset_free(data_reserved);
@@ -167,8 +167,8 @@ static int clone_copy_inline_extent(struct inode *dst,
struct btrfs_key key;
if (new_key->offset > 0) {
- ret = copy_inline_to_page(dst, new_key->offset, inline_data,
- size, datal, comp_type);
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
goto out;
}
@@ -194,7 +194,7 @@ static int clone_copy_inline_extent(struct inode *dst,
* inline extent's data to the page.
*/
ASSERT(key.offset > 0);
- ret = copy_inline_to_page(dst, new_key->offset,
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
inline_data, size, datal,
comp_type);
goto out;
@@ -213,8 +213,8 @@ static int clone_copy_inline_extent(struct inode *dst,
BTRFS_FILE_EXTENT_INLINE)
goto copy_inline_extent;
- ret = copy_inline_to_page(dst, new_key->offset, inline_data,
- size, datal, comp_type);
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
goto out;
}
@@ -231,8 +231,8 @@ copy_inline_extent:
* clone. Deal with all these cases by copying the inline extent
* data into the respective page at the destination inode.
*/
- ret = copy_inline_to_page(dst, new_key->offset, inline_data,
- size, datal, comp_type);
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
goto out;
}
@@ -439,7 +439,7 @@ process_slot:
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
- struct btrfs_clone_extent_info clone_info;
+ struct btrfs_replace_extent_info clone_info;
/*
* a | --- range to clone ---| b
@@ -462,8 +462,8 @@ process_slot:
clone_info.data_len = datal;
clone_info.file_offset = new_key.offset;
clone_info.extent_buf = buf;
- clone_info.item_size = size;
- ret = btrfs_punch_hole_range(inode, path, drop_start,
+ clone_info.is_new_extent = false;
+ ret = btrfs_replace_file_extents(inode, path, drop_start,
new_key.offset + datal - 1, &clone_info,
&trans);
if (ret)
@@ -520,6 +520,8 @@ process_slot:
ret = -EINTR;
goto out;
}
+
+ cond_resched();
}
ret = 0;
@@ -533,7 +535,7 @@ process_slot:
btrfs_release_path(path);
path->leave_spinning = 0;
- ret = btrfs_punch_hole_range(inode, path, last_dest_end,
+ ret = btrfs_replace_file_extents(inode, path, last_dest_end,
destoff + len - 1, NULL, &trans);
if (ret)
goto out;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4ba1ab9cc76d..3602806d71bd 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1206,7 +1206,8 @@ again:
}
if (cow) {
- ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
+ ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
+ BTRFS_NESTING_COW);
BUG_ON(ret);
}
btrfs_set_lock_blocking_write(eb);
@@ -1274,7 +1275,8 @@ again:
btrfs_tree_lock(eb);
if (cow) {
ret = btrfs_cow_block(trans, dest, eb, parent,
- slot, &eb);
+ slot, &eb,
+ BTRFS_NESTING_COW);
BUG_ON(ret);
}
btrfs_set_lock_blocking_write(eb);
@@ -1781,7 +1783,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
* relocated and the block is tree root.
*/
leaf = btrfs_lock_root_node(root);
- ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
+ ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
+ BTRFS_NESTING_COW);
btrfs_tree_unlock(leaf);
free_extent_buffer(leaf);
if (ret < 0)
@@ -2308,7 +2311,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!node->eb) {
ret = btrfs_cow_block(trans, root, eb, upper->eb,
- slot, &eb);
+ slot, &eb, BTRFS_NESTING_COW);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
if (ret < 0) {
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index c89697486366..702dc5441f03 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -512,11 +512,20 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (ret && qgroup_num_bytes)
btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+ if (!ret) {
+ spin_lock(&rsv->lock);
+ rsv->qgroup_rsv_reserved += qgroup_num_bytes;
+ spin_unlock(&rsv->lock);
+ }
return ret;
}
-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv)
{
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 qgroup_to_release;
+
+ btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release);
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release);
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 5a6cb9db512e..cf63f1e27a27 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -835,7 +835,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
int success;
bool full_stripe_locked;
unsigned int nofs_flag;
- static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
BUG_ON(sblock_to_check->page_count < 1);
@@ -969,14 +969,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("i/o error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
} else if (sblock_bad->checksum_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.csum_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("checksum error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
@@ -984,7 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
spin_lock(&sctx->stat_lock);
sctx->stat.verify_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("checksum/header error",
sblock_to_check);
if (sblock_bad->generation_error)
@@ -3716,50 +3716,84 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
return 0;
}
+static void scrub_workers_put(struct btrfs_fs_info *fs_info)
+{
+ if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
+ &fs_info->scrub_lock)) {
+ struct btrfs_workqueue *scrub_workers = NULL;
+ struct btrfs_workqueue *scrub_wr_comp = NULL;
+ struct btrfs_workqueue *scrub_parity = NULL;
+
+ scrub_workers = fs_info->scrub_workers;
+ scrub_wr_comp = fs_info->scrub_wr_completion_workers;
+ scrub_parity = fs_info->scrub_parity_workers;
+
+ fs_info->scrub_workers = NULL;
+ fs_info->scrub_wr_completion_workers = NULL;
+ fs_info->scrub_parity_workers = NULL;
+ mutex_unlock(&fs_info->scrub_lock);
+
+ btrfs_destroy_workqueue(scrub_workers);
+ btrfs_destroy_workqueue(scrub_wr_comp);
+ btrfs_destroy_workqueue(scrub_parity);
+ }
+}
+
/*
* get a reference count on fs_info->scrub_workers. start worker if necessary
*/
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
int is_dev_replace)
{
+ struct btrfs_workqueue *scrub_workers = NULL;
+ struct btrfs_workqueue *scrub_wr_comp = NULL;
+ struct btrfs_workqueue *scrub_parity = NULL;
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
+ int ret = -ENOMEM;
- lockdep_assert_held(&fs_info->scrub_lock);
+ if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
+ return 0;
- if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
- ASSERT(fs_info->scrub_workers == NULL);
- fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
- flags, is_dev_replace ? 1 : max_active, 4);
- if (!fs_info->scrub_workers)
- goto fail_scrub_workers;
-
- ASSERT(fs_info->scrub_wr_completion_workers == NULL);
- fs_info->scrub_wr_completion_workers =
- btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
- max_active, 2);
- if (!fs_info->scrub_wr_completion_workers)
- goto fail_scrub_wr_completion_workers;
+ scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
+ is_dev_replace ? 1 : max_active, 4);
+ if (!scrub_workers)
+ goto fail_scrub_workers;
- ASSERT(fs_info->scrub_parity_workers == NULL);
- fs_info->scrub_parity_workers =
- btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
+ scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
max_active, 2);
- if (!fs_info->scrub_parity_workers)
- goto fail_scrub_parity_workers;
+ if (!scrub_wr_comp)
+ goto fail_scrub_wr_completion_workers;
+ scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
+ max_active, 2);
+ if (!scrub_parity)
+ goto fail_scrub_parity_workers;
+
+ mutex_lock(&fs_info->scrub_lock);
+ if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
+ ASSERT(fs_info->scrub_workers == NULL &&
+ fs_info->scrub_wr_completion_workers == NULL &&
+ fs_info->scrub_parity_workers == NULL);
+ fs_info->scrub_workers = scrub_workers;
+ fs_info->scrub_wr_completion_workers = scrub_wr_comp;
+ fs_info->scrub_parity_workers = scrub_parity;
refcount_set(&fs_info->scrub_workers_refcnt, 1);
- } else {
- refcount_inc(&fs_info->scrub_workers_refcnt);
+ mutex_unlock(&fs_info->scrub_lock);
+ return 0;
}
- return 0;
+ /* Other thread raced in and created the workers for us */
+ refcount_inc(&fs_info->scrub_workers_refcnt);
+ mutex_unlock(&fs_info->scrub_lock);
+ ret = 0;
+ btrfs_destroy_workqueue(scrub_parity);
fail_scrub_parity_workers:
- btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+ btrfs_destroy_workqueue(scrub_wr_comp);
fail_scrub_wr_completion_workers:
- btrfs_destroy_workqueue(fs_info->scrub_workers);
+ btrfs_destroy_workqueue(scrub_workers);
fail_scrub_workers:
- return -ENOMEM;
+ return ret;
}
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
@@ -3770,9 +3804,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
int ret;
struct btrfs_device *dev;
unsigned int nofs_flag;
- struct btrfs_workqueue *scrub_workers = NULL;
- struct btrfs_workqueue *scrub_wr_comp = NULL;
- struct btrfs_workqueue *scrub_parity = NULL;
if (btrfs_fs_closing(fs_info))
return -EAGAIN;
@@ -3819,13 +3850,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (IS_ERR(sctx))
return PTR_ERR(sctx);
+ ret = scrub_workers_get(fs_info, is_dev_replace);
+ if (ret)
+ goto out_free_ctx;
+
mutex_lock(&fs_info->fs_devices->device_list_mutex);
dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -ENODEV;
- goto out_free_ctx;
+ goto out;
}
if (!is_dev_replace && !readonly &&
@@ -3834,7 +3869,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
rcu_str_deref(dev->name));
ret = -EROFS;
- goto out_free_ctx;
+ goto out;
}
mutex_lock(&fs_info->scrub_lock);
@@ -3843,7 +3878,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -EIO;
- goto out_free_ctx;
+ goto out;
}
down_read(&fs_info->dev_replace.rwsem);
@@ -3854,17 +3889,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -EINPROGRESS;
- goto out_free_ctx;
+ goto out;
}
up_read(&fs_info->dev_replace.rwsem);
- ret = scrub_workers_get(fs_info, is_dev_replace);
- if (ret) {
- mutex_unlock(&fs_info->scrub_lock);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- goto out_free_ctx;
- }
-
sctx->readonly = readonly;
dev->scrub_ctx = sctx;
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -3917,24 +3945,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_lock(&fs_info->scrub_lock);
dev->scrub_ctx = NULL;
- if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) {
- scrub_workers = fs_info->scrub_workers;
- scrub_wr_comp = fs_info->scrub_wr_completion_workers;
- scrub_parity = fs_info->scrub_parity_workers;
-
- fs_info->scrub_workers = NULL;
- fs_info->scrub_wr_completion_workers = NULL;
- fs_info->scrub_parity_workers = NULL;
- }
mutex_unlock(&fs_info->scrub_lock);
- btrfs_destroy_workqueue(scrub_workers);
- btrfs_destroy_workqueue(scrub_wr_comp);
- btrfs_destroy_workqueue(scrub_parity);
+ scrub_workers_put(fs_info);
scrub_put_ctx(sctx);
return ret;
-
+out:
+ scrub_workers_put(fs_info);
out_free_ctx:
scrub_free_ctx(sctx);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d9813a5b075a..340c76a12ce1 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -122,8 +122,6 @@ struct send_ctx {
struct file_ra_state ra;
- char *read_buf;
-
/*
* We process inodes by their increasing order, so if before an
* incremental send we reverse the parent/child relationship of
@@ -278,11 +276,6 @@ enum btrfs_compare_tree_result {
BTRFS_COMPARE_TREE_CHANGED,
BTRFS_COMPARE_TREE_SAME,
};
-typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path,
- struct btrfs_path *right_path,
- struct btrfs_key *key,
- enum btrfs_compare_tree_result result,
- void *ctx);
__cold
static void inconsistent_snapshot_error(struct send_ctx *sctx,
@@ -584,8 +577,8 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
return -EOVERFLOW;
hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
- hdr->tlv_type = cpu_to_le16(attr);
- hdr->tlv_len = cpu_to_le16(len);
+ put_unaligned_le16(attr, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
memcpy(hdr + 1, data, len);
sctx->send_size += total_len;
@@ -695,7 +688,7 @@ static int begin_cmd(struct send_ctx *sctx, int cmd)
sctx->send_size += sizeof(*hdr);
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
- hdr->cmd = cpu_to_le16(cmd);
+ put_unaligned_le16(cmd, &hdr->cmd);
return 0;
}
@@ -707,17 +700,17 @@ static int send_cmd(struct send_ctx *sctx)
u32 crc;
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
- hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
- hdr->crc = 0;
+ put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
+ put_unaligned_le32(0, &hdr->crc);
crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
- hdr->crc = cpu_to_le32(crc);
+ put_unaligned_le32(crc, &hdr->crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
&sctx->send_off);
sctx->total_send_size += sctx->send_size;
- sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
+ sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size;
sctx->send_size = 0;
return ret;
@@ -3813,6 +3806,72 @@ static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
}
/*
+ * When processing the new references for an inode we may orphanize an existing
+ * directory inode because its old name conflicts with one of the new references
+ * of the current inode. Later, when processing another new reference of our
+ * inode, we might need to orphanize another inode, but the path we have in the
+ * reference reflects the pre-orphanization name of the directory we previously
+ * orphanized. For example:
+ *
+ * parent snapshot looks like:
+ *
+ * . (ino 256)
+ * |----- f1 (ino 257)
+ * |----- f2 (ino 258)
+ * |----- d1/ (ino 259)
+ * |----- d2/ (ino 260)
+ *
+ * send snapshot looks like:
+ *
+ * . (ino 256)
+ * |----- d1 (ino 258)
+ * |----- f2/ (ino 259)
+ * |----- f2_link/ (ino 260)
+ * | |----- f1 (ino 257)
+ * |
+ * |----- d2 (ino 258)
+ *
+ * When processing inode 257 we compute the name for inode 259 as "d1", and we
+ * cache it in the name cache. Later when we start processing inode 258, when
+ * collecting all its new references we set a full path of "d1/d2" for its new
+ * reference with name "d2". When we start processing the new references we
+ * start by processing the new reference with name "d1", and this results in
+ * orphanizing inode 259, since its old reference causes a conflict. Then we
+ * move on the next new reference, with name "d2", and we find out we must
+ * orphanize inode 260, as its old reference conflicts with ours - but for the
+ * orphanization we use a source path corresponding to the path we stored in the
+ * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
+ * receiver fail since the path component "d1/" no longer exists, it was renamed
+ * to "o259-6-0/" when processing the previous new reference. So in this case we
+ * must recompute the path in the new reference and use it for the new
+ * orphanization operation.
+ */
+static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
+{
+ char *name;
+ int ret;
+
+ name = kmemdup(ref->name, ref->name_len, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ fs_path_reset(ref->full_path);
+ ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path);
+ if (ret < 0)
+ goto out;
+
+ ret = fs_path_add(ref->full_path, name, ref->name_len);
+ if (ret < 0)
+ goto out;
+
+ /* Update the reference's base name pointer. */
+ set_ref_path(ref, ref->full_path);
+out:
+ kfree(name);
+ return ret;
+}
+
+/*
* This does all the move/link/unlink/rmdir magic.
*/
static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
@@ -3880,52 +3939,56 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
goto out;
}
+ /*
+ * Before doing any rename and link operations, do a first pass on the
+ * new references to orphanize any unprocessed inodes that may have a
+ * reference that conflicts with one of the new references of the current
+ * inode. This needs to happen first because a new reference may conflict
+ * with the old reference of a parent directory, so we must make sure
+ * that the path used for link and rename commands don't use an
+ * orphanized name when an ancestor was not yet orphanized.
+ *
+ * Example:
+ *
+ * Parent snapshot:
+ *
+ * . (ino 256)
+ * |----- testdir/ (ino 259)
+ * | |----- a (ino 257)
+ * |
+ * |----- b (ino 258)
+ *
+ * Send snapshot:
+ *
+ * . (ino 256)
+ * |----- testdir_2/ (ino 259)
+ * | |----- a (ino 260)
+ * |
+ * |----- testdir (ino 257)
+ * |----- b (ino 257)
+ * |----- b2 (ino 258)
+ *
+ * Processing the new reference for inode 257 with name "b" may happen
+ * before processing the new reference with name "testdir". If so, we
+ * must make sure that by the time we send a link command to create the
+ * hard link "b", inode 259 was already orphanized, since the generated
+ * path in "valid_path" already contains the orphanized name for 259.
+ * We are processing inode 257, so only later when processing 259 we do
+ * the rename operation to change its temporary (orphanized) name to
+ * "testdir_2".
+ */
list_for_each_entry(cur, &sctx->new_refs, list) {
- /*
- * We may have refs where the parent directory does not exist
- * yet. This happens if the parent directories inum is higher
- * than the current inum. To handle this case, we create the
- * parent directory out of order. But we need to check if this
- * did already happen before due to other refs in the same dir.
- */
ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
- if (ret == inode_state_will_create) {
- ret = 0;
- /*
- * First check if any of the current inodes refs did
- * already create the dir.
- */
- list_for_each_entry(cur2, &sctx->new_refs, list) {
- if (cur == cur2)
- break;
- if (cur2->dir == cur->dir) {
- ret = 1;
- break;
- }
- }
-
- /*
- * If that did not happen, check if a previous inode
- * did already create the dir.
- */
- if (!ret)
- ret = did_create_dir(sctx, cur->dir);
- if (ret < 0)
- goto out;
- if (!ret) {
- ret = send_create_inode(sctx, cur->dir);
- if (ret < 0)
- goto out;
- }
- }
+ if (ret == inode_state_will_create)
+ continue;
/*
- * Check if this new ref would overwrite the first ref of
- * another unprocessed inode. If yes, orphanize the
- * overwritten inode. If we find an overwritten ref that is
- * not the first ref, simply unlink it.
+ * Check if this new ref would overwrite the first ref of another
+ * unprocessed inode. If yes, orphanize the overwritten inode.
+ * If we find an overwritten ref that is not the first ref,
+ * simply unlink it.
*/
ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
cur->name, cur->name_len,
@@ -3942,6 +4005,12 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
struct name_cache_entry *nce;
struct waiting_dir_move *wdm;
+ if (orphanized_dir) {
+ ret = refresh_ref_path(sctx, cur);
+ if (ret < 0)
+ goto out;
+ }
+
ret = orphanize_inode(sctx, ow_inode, ow_gen,
cur->full_path);
if (ret < 0)
@@ -4004,6 +4073,49 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
}
}
+ }
+
+ list_for_each_entry(cur, &sctx->new_refs, list) {
+ /*
+ * We may have refs where the parent directory does not exist
+ * yet. This happens if the parent directories inum is higher
+ * than the current inum. To handle this case, we create the
+ * parent directory out of order. But we need to check if this
+ * did already happen before due to other refs in the same dir.
+ */
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+ if (ret == inode_state_will_create) {
+ ret = 0;
+ /*
+ * First check if any of the current inodes refs did
+ * already create the dir.
+ */
+ list_for_each_entry(cur2, &sctx->new_refs, list) {
+ if (cur == cur2)
+ break;
+ if (cur2->dir == cur->dir) {
+ ret = 1;
+ break;
+ }
+ }
+
+ /*
+ * If that did not happen, check if a previous inode
+ * did already create the dir.
+ */
+ if (!ret)
+ ret = did_create_dir(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ if (!ret) {
+ ret = send_create_inode(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
if (ret < 0)
@@ -4799,7 +4911,25 @@ out:
return ret;
}
-static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
+static inline u64 max_send_read_size(const struct send_ctx *sctx)
+{
+ return sctx->send_max_size - SZ_16K;
+}
+
+static int put_data_header(struct send_ctx *sctx, u32 len)
+{
+ struct btrfs_tlv_header *hdr;
+
+ if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
+ return -EOVERFLOW;
+ hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
+ put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
+ sctx->send_size += sizeof(*hdr);
+ return 0;
+}
+
+static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4809,21 +4939,16 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
unsigned pg_offset = offset_in_page(offset);
- ssize_t ret = 0;
+ int ret;
+
+ ret = put_data_header(sctx, len);
+ if (ret)
+ return ret;
inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (offset + len > i_size_read(inode)) {
- if (offset > i_size_read(inode))
- len = 0;
- else
- len = offset - i_size_read(inode);
- }
- if (len == 0)
- goto out;
-
last_index = (offset + len - 1) >> PAGE_SHIFT;
/* initial readahead */
@@ -4864,16 +4989,16 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
}
addr = kmap(page);
- memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
+ memcpy(sctx->send_buf + sctx->send_size, addr + pg_offset,
+ cur_len);
kunmap(page);
unlock_page(page);
put_page(page);
index++;
pg_offset = 0;
len -= cur_len;
- ret += cur_len;
+ sctx->send_size += cur_len;
}
-out:
iput(inode);
return ret;
}
@@ -4887,7 +5012,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- ssize_t num_read = 0;
p = fs_path_alloc();
if (!p)
@@ -4895,13 +5019,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
- num_read = fill_read_buf(sctx, offset, len);
- if (num_read <= 0) {
- if (num_read < 0)
- ret = num_read;
- goto out;
- }
-
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
goto out;
@@ -4912,16 +5029,16 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
- TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
+ ret = put_file_data(sctx, offset, len);
+ if (ret < 0)
+ goto out;
ret = send_cmd(sctx);
tlv_put_failure:
out:
fs_path_free(p);
- if (ret < 0)
- return ret;
- return num_read;
+ return ret;
}
/*
@@ -5033,8 +5150,8 @@ out:
static int send_hole(struct send_ctx *sctx, u64 end)
{
struct fs_path *p = NULL;
+ u64 read_size = max_send_read_size(sctx);
u64 offset = sctx->cur_inode_last_extent;
- u64 len;
int ret = 0;
/*
@@ -5061,16 +5178,19 @@ static int send_hole(struct send_ctx *sctx, u64 end)
ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
if (ret < 0)
goto tlv_put_failure;
- memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
while (offset < end) {
- len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
+ u64 len = min(end - offset, read_size);
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
break;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
- TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
+ ret = put_data_header(sctx, len);
+ if (ret < 0)
+ break;
+ memset(sctx->send_buf + sctx->send_size, 0, len);
+ sctx->send_size += len;
ret = send_cmd(sctx);
if (ret < 0)
break;
@@ -5086,23 +5206,20 @@ static int send_extent_data(struct send_ctx *sctx,
const u64 offset,
const u64 len)
{
+ u64 read_size = max_send_read_size(sctx);
u64 sent = 0;
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, len);
while (sent < len) {
- u64 size = len - sent;
+ u64 size = min(len - sent, read_size);
int ret;
- if (size > BTRFS_SEND_READ_SIZE)
- size = BTRFS_SEND_READ_SIZE;
ret = send_write(sctx, offset + sent, size);
if (ret < 0)
return ret;
- if (!ret)
- break;
- sent += ret;
+ sent += size;
}
return 0;
}
@@ -5402,51 +5519,29 @@ static int send_write_or_clone(struct send_ctx *sctx,
struct clone_root *clone_root)
{
int ret = 0;
- struct btrfs_file_extent_item *ei;
u64 offset = key->offset;
- u64 len;
- u8 type;
+ u64 end;
u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
- ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- type = btrfs_file_extent_type(path->nodes[0], ei);
- if (type == BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
- /*
- * it is possible the inline item won't cover the whole page,
- * but there may be items after this page. Make
- * sure to send the whole thing
- */
- len = PAGE_ALIGN(len);
- } else {
- len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
- }
-
- if (offset >= sctx->cur_inode_size) {
- ret = 0;
- goto out;
- }
- if (offset + len > sctx->cur_inode_size)
- len = sctx->cur_inode_size - offset;
- if (len == 0) {
- ret = 0;
- goto out;
- }
+ end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
+ if (offset >= end)
+ return 0;
- if (clone_root && IS_ALIGNED(offset + len, bs)) {
+ if (clone_root && IS_ALIGNED(end, bs)) {
+ struct btrfs_file_extent_item *ei;
u64 disk_byte;
u64 data_offset;
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
ret = clone_range(sctx, clone_root, disk_byte, data_offset,
- offset, len);
+ offset, end - offset);
} else {
- ret = send_extent_data(sctx, offset, len);
+ ret = send_extent_data(sctx, offset, end - offset);
}
- sctx->cur_inode_next_write_offset = offset + len;
-out:
+ sctx->cur_inode_next_write_offset = end;
return ret;
}
@@ -6692,8 +6787,7 @@ static int tree_compare_item(struct btrfs_path *left_path,
* If it detects a change, it aborts immediately.
*/
static int btrfs_compare_trees(struct btrfs_root *left_root,
- struct btrfs_root *right_root,
- btrfs_changed_cb_t changed_cb, void *ctx)
+ struct btrfs_root *right_root, void *ctx)
{
struct btrfs_fs_info *fs_info = left_root->fs_info;
int ret;
@@ -6960,8 +7054,7 @@ static int send_subvol(struct send_ctx *sctx)
goto out;
if (sctx->parent_root) {
- ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
- changed_cb, sctx);
+ ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
if (ret < 0)
goto out;
ret = finish_inode_if_needed(sctx, 1);
@@ -7087,7 +7180,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
u32 i;
u64 *clone_sources_tmp = NULL;
int clone_sources_to_rollback = 0;
- unsigned alloc_size;
+ size_t alloc_size;
int sort_clone_roots = 0;
if (!capable(CAP_SYS_ADMIN))
@@ -7169,25 +7262,20 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
goto out;
}
- sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL);
- if (!sctx->read_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
sctx->pending_dir_moves = RB_ROOT;
sctx->waiting_dir_moves = RB_ROOT;
sctx->orphan_dirs = RB_ROOT;
- alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
-
- sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL);
+ sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
+ arg->clone_sources_count + 1,
+ GFP_KERNEL);
if (!sctx->clone_roots) {
ret = -ENOMEM;
goto out;
}
- alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
+ alloc_size = array_size(sizeof(*arg->clone_sources),
+ arg->clone_sources_count);
if (arg->clone_sources_count) {
clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
@@ -7378,7 +7466,6 @@ out:
kvfree(sctx->clone_roots);
kvfree(sctx->send_buf);
- kvfree(sctx->read_buf);
name_cache_free(sctx);
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index ead397f7034f..de91488b7cd0 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -13,7 +13,6 @@
#define BTRFS_SEND_STREAM_VERSION 1
#define BTRFS_SEND_BUF_SIZE SZ_64K
-#define BTRFS_SEND_READ_SIZE (48 * SZ_1K)
enum btrfs_tlv_type {
BTRFS_TLV_U8,
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 475968ccbd1d..64099565ab8f 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -175,10 +175,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list)
+ list_for_each_entry(found, head, list)
found->full = 0;
- rcu_read_unlock();
}
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
@@ -213,7 +211,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
if (ret)
return ret;
- list_add_rcu(&space_info->list, &info->space_info);
+ list_add(&space_info->list, &info->space_info);
if (flags & BTRFS_BLOCK_GROUP_DATA)
info->data_sinfo = space_info;
@@ -290,22 +288,13 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
- if (found->flags & flags) {
- rcu_read_unlock();
+ list_for_each_entry(found, head, list) {
+ if (found->flags & flags)
return found;
- }
}
- rcu_read_unlock();
return NULL;
}
-static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
-{
- return (global->size << 1);
-}
-
static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
enum btrfs_reserve_flush_enum flush)
@@ -476,28 +465,6 @@ again:
up_read(&info->groups_sem);
}
-static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
- unsigned long nr_pages, int nr_items)
-{
- struct super_block *sb = fs_info->sb;
-
- if (down_read_trylock(&sb->s_umount)) {
- writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
- up_read(&sb->s_umount);
- } else {
- /*
- * We needn't worry the filesystem going from r/w to r/o though
- * we don't acquire ->s_umount mutex, because the filesystem
- * should guarantee the delalloc inodes list be empty after
- * the filesystem is readonly(all dirty pages are written to
- * the disk).
- */
- btrfs_start_delalloc_roots(fs_info, nr_items);
- if (!current->journal_info)
- btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
- }
-}
-
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
u64 to_reclaim)
{
@@ -516,25 +483,33 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
/*
* shrink metadata reservation for delalloc
*/
-static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
- u64 orig, bool wait_ordered)
+static void shrink_delalloc(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 to_reclaim, bool wait_ordered)
{
- struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
u64 dio_bytes;
- u64 async_pages;
u64 items;
long time_left;
- unsigned long nr_pages;
int loops;
/* Calc the number of the pages we need flush for space reservation */
- items = calc_reclaim_items_nr(fs_info, to_reclaim);
- to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+ if (to_reclaim == U64_MAX) {
+ items = U64_MAX;
+ } else {
+ /*
+ * to_reclaim is set to however much metadata we need to
+ * reclaim, but reclaiming that much data doesn't really track
+ * exactly, so increase the amount to reclaim by 2x in order to
+ * make sure we're flushing enough delalloc to hopefully reclaim
+ * some metadata reservations.
+ */
+ items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
+ to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+ }
trans = (struct btrfs_trans_handle *)current->journal_info;
- space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes);
@@ -557,37 +532,17 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
loops = 0;
while ((delalloc_bytes || dio_bytes) && loops < 3) {
- nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
-
- /*
- * Triggers inode writeback for up to nr_pages. This will invoke
- * ->writepages callback and trigger delalloc filling
- * (btrfs_run_delalloc_range()).
- */
- btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
+ btrfs_start_delalloc_roots(fs_info, items);
- /*
- * We need to wait for the compressed pages to start before
- * we continue.
- */
- async_pages = atomic_read(&fs_info->async_delalloc_pages);
- if (!async_pages)
- goto skip_async;
-
- /*
- * Calculate how many compressed pages we want to be written
- * before we continue. I.e if there are more async pages than we
- * require wait_event will wait until nr_pages are written.
- */
- if (async_pages <= nr_pages)
- async_pages = 0;
- else
- async_pages -= nr_pages;
+ loops++;
+ if (wait_ordered && !trans) {
+ btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
+ } else {
+ time_left = schedule_timeout_killable(1);
+ if (time_left)
+ break;
+ }
- wait_event(fs_info->async_submit_wait,
- atomic_read(&fs_info->async_delalloc_pages) <=
- (int)async_pages);
-skip_async:
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets) &&
list_empty(&space_info->priority_tickets)) {
@@ -596,14 +551,6 @@ skip_async:
}
spin_unlock(&space_info->lock);
- loops++;
- if (wait_ordered && !trans) {
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
- } else {
- time_left = schedule_timeout_killable(1);
- if (time_left)
- break;
- }
delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes);
dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
@@ -628,8 +575,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
struct btrfs_trans_handle *trans;
- u64 bytes_needed;
u64 reclaim_bytes = 0;
+ u64 bytes_needed = 0;
u64 cur_free_bytes = 0;
trans = (struct btrfs_trans_handle *)current->journal_info;
@@ -649,7 +596,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
else if (!list_empty(&space_info->tickets))
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
- bytes_needed = (ticket) ? ticket->bytes : 0;
+ if (ticket)
+ bytes_needed = ticket->bytes;
if (bytes_needed > cur_free_bytes)
bytes_needed -= cur_free_bytes;
@@ -676,8 +624,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
goto commit;
/*
- * See if there is some space in the delayed insertion reservation for
- * this reservation.
+ * See if there is some space in the delayed insertion reserve for this
+ * reservation. If the space_info's don't match (like for DATA or
+ * SYSTEM) then just go enospc, reclaiming this space won't recover any
+ * space to satisfy those reservations.
*/
if (space_info != delayed_rsv->space_info)
goto enospc;
@@ -742,7 +692,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
- shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
+ shrink_delalloc(fs_info, space_info, num_bytes,
state == FLUSH_DELALLOC_WAIT);
break;
case FLUSH_DELAYED_REFS_NR:
@@ -767,7 +717,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
}
ret = btrfs_chunk_alloc(trans,
- btrfs_metadata_alloc_profile(fs_info),
+ btrfs_get_alloc_profile(fs_info, space_info->flags),
(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
@@ -1037,9 +987,132 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
} while (flush_state <= COMMIT_TRANS);
}
-void btrfs_init_async_reclaim_work(struct work_struct *work)
+/*
+ * FLUSH_DELALLOC_WAIT:
+ * Space is freed from flushing delalloc in one of two ways.
+ *
+ * 1) compression is on and we allocate less space than we reserved
+ * 2) we are overwriting existing space
+ *
+ * For #1 that extra space is reclaimed as soon as the delalloc pages are
+ * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
+ * length to ->bytes_reserved, and subtracts the reserved space from
+ * ->bytes_may_use.
+ *
+ * For #2 this is trickier. Once the ordered extent runs we will drop the
+ * extent in the range we are overwriting, which creates a delayed ref for
+ * that freed extent. This however is not reclaimed until the transaction
+ * commits, thus the next stages.
+ *
+ * RUN_DELAYED_IPUTS
+ * If we are freeing inodes, we want to make sure all delayed iputs have
+ * completed, because they could have been on an inode with i_nlink == 0, and
+ * thus have been truncated and freed up space. But again this space is not
+ * immediately re-usable, it comes in the form of a delayed ref, which must be
+ * run and then the transaction must be committed.
+ *
+ * FLUSH_DELAYED_REFS
+ * The above two cases generate delayed refs that will affect
+ * ->total_bytes_pinned. However this counter can be inconsistent with
+ * reality if there are outstanding delayed refs. This is because we adjust
+ * the counter based solely on the current set of delayed refs and disregard
+ * any on-disk state which might include more refs. So for example, if we
+ * have an extent with 2 references, but we only drop 1, we'll see that there
+ * is a negative delayed ref count for the extent and assume that the space
+ * will be freed, and thus increase ->total_bytes_pinned.
+ *
+ * Running the delayed refs gives us the actual real view of what will be
+ * freed at the transaction commit time. This stage will not actually free
+ * space for us, it just makes sure that may_commit_transaction() has all of
+ * the information it needs to make the right decision.
+ *
+ * COMMIT_TRANS
+ * This is where we reclaim all of the pinned space generated by the previous
+ * two stages. We will not commit the transaction if we don't think we're
+ * likely to satisfy our request, which means if our current free space +
+ * total_bytes_pinned < reservation we will not commit. This is why the
+ * previous states are actually important, to make sure we know for sure
+ * whether committing the transaction will allow us to make progress.
+ *
+ * ALLOC_CHUNK_FORCE
+ * For data we start with alloc chunk force, however we could have been full
+ * before, and then the transaction commit could have freed new block groups,
+ * so if we now have space to allocate do the force chunk allocation.
+ */
+static const enum btrfs_flush_state data_flush_states[] = {
+ FLUSH_DELALLOC_WAIT,
+ RUN_DELAYED_IPUTS,
+ FLUSH_DELAYED_REFS,
+ COMMIT_TRANS,
+ ALLOC_CHUNK_FORCE,
+};
+
+static void btrfs_async_reclaim_data_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+ u64 last_tickets_id;
+ int flush_state = 0;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
+ space_info = fs_info->data_sinfo;
+
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ last_tickets_id = space_info->tickets_id;
+ spin_unlock(&space_info->lock);
+
+ while (!space_info->full) {
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ last_tickets_id = space_info->tickets_id;
+ spin_unlock(&space_info->lock);
+ }
+
+ while (flush_state < ARRAY_SIZE(data_flush_states)) {
+ flush_space(fs_info, space_info, U64_MAX,
+ data_flush_states[flush_state]);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+
+ if (last_tickets_id == space_info->tickets_id) {
+ flush_state++;
+ } else {
+ last_tickets_id = space_info->tickets_id;
+ flush_state = 0;
+ }
+
+ if (flush_state >= ARRAY_SIZE(data_flush_states)) {
+ if (space_info->full) {
+ if (maybe_fail_all_tickets(fs_info, space_info))
+ flush_state = 0;
+ else
+ space_info->flush = 0;
+ } else {
+ flush_state = 0;
+ }
+ }
+ spin_unlock(&space_info->lock);
+ }
+}
+
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{
- INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+ INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
+ INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
}
static const enum btrfs_flush_state priority_flush_states[] = {
@@ -1089,6 +1162,21 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
} while (flush_state < states_nr);
}
+static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+{
+ while (!space_info->full) {
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
+ spin_lock(&space_info->lock);
+ if (ticket->bytes == 0) {
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ spin_unlock(&space_info->lock);
+ }
+}
+
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
@@ -1141,6 +1229,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
int ret;
switch (flush) {
+ case BTRFS_RESERVE_FLUSH_DATA:
case BTRFS_RESERVE_FLUSH_ALL:
case BTRFS_RESERVE_FLUSH_ALL_STEAL:
wait_reserve_ticket(fs_info, space_info, ticket);
@@ -1155,6 +1244,9 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
evict_flush_states,
ARRAY_SIZE(evict_flush_states));
break;
+ case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
+ priority_reclaim_data_space(fs_info, space_info, ticket);
+ break;
default:
ASSERT(0);
break;
@@ -1214,11 +1306,11 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush)
+static int __reserve_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
+ struct work_struct *async_work;
struct reserve_ticket ticket;
u64 used;
int ret = 0;
@@ -1227,6 +1319,11 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
ASSERT(orig_bytes);
ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
+ if (flush == BTRFS_RESERVE_FLUSH_DATA)
+ async_work = &fs_info->async_data_reclaim_work;
+ else
+ async_work = &fs_info->async_reclaim_work;
+
spin_lock(&space_info->lock);
ret = -ENOSPC;
used = btrfs_space_info_used(space_info, true);
@@ -1268,7 +1365,8 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
init_waitqueue_head(&ticket.wait);
ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
if (flush == BTRFS_RESERVE_FLUSH_ALL ||
- flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+ flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
+ flush == BTRFS_RESERVE_FLUSH_DATA) {
list_add_tail(&ticket.list, &space_info->tickets);
if (!space_info->flush) {
space_info->flush = 1;
@@ -1276,8 +1374,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
space_info->flags,
orig_bytes, flush,
"enospc");
- queue_work(system_unbound_wq,
- &fs_info->async_reclaim_work);
+ queue_work(system_unbound_wq, async_work);
}
} else {
list_add_tail(&ticket.list,
@@ -1329,8 +1426,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
- ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
- orig_bytes, flush);
+ ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
if (ret == -ENOSPC &&
unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
if (block_rsv != global_rsv &&
@@ -1348,3 +1444,32 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
}
return ret;
}
+
+/**
+ * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation
+ * @fs_info - the filesystem
+ * @bytes - the number of bytes we need
+ * @flush - how we are allowed to flush
+ *
+ * This will reserve bytes from the data space info. If there is not enough
+ * space then we will attempt to flush space as specified by flush.
+ */
+int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
+ int ret;
+
+ ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
+ flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE);
+ ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
+
+ ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
+ if (ret == -ENOSPC) {
+ trace_btrfs_space_reservation(fs_info, "space_info:enospc",
+ data_sinfo->flags, bytes, 1);
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
+ }
+ return ret;
+}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index c3c64019950a..5646393b928c 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -149,5 +149,7 @@ static inline void btrfs_space_info_free_bytes_may_use(
btrfs_try_granting_tickets(fs_info, space_info);
spin_unlock(&space_info->lock);
}
+int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush);
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index 079b059818e9..c46be27be700 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -7,16 +7,6 @@
#include "ctree.h"
-static inline u8 get_unaligned_le8(const void *p)
-{
- return *(u8 *)p;
-}
-
-static inline void put_unaligned_le8(u8 val, void *p)
-{
- *(u8 *)p = val;
-}
-
static bool check_setget_bounds(const struct extent_buffer *eb,
const void *ptr, unsigned off, int size)
{
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index e529ddb35b87..8840a4fa81eb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -625,6 +625,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
} else if (strncmp(args[0].from, "lzo", 3) == 0) {
compress_type = "lzo";
info->compress_type = BTRFS_COMPRESS_LZO;
+ info->compress_level = 0;
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -1870,6 +1871,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
* the filesystem is busy.
*/
cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
btrfs_discard_cleanup(fs_info);
@@ -2162,8 +2164,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 thresh = 0;
int mixed = 0;
- rcu_read_lock();
- list_for_each_entry_rcu(found, &fs_info->space_info, list) {
+ list_for_each_entry(found, &fs_info->space_info, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
int i;
@@ -2192,8 +2193,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
total_used += found->disk_used;
}
- rcu_read_unlock();
-
buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
buf->f_blocks >>= bits;
buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index c8df2edafd85..279d9262b676 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -14,6 +14,7 @@
#include "ctree.h"
#include "discard.h"
#include "disk-io.h"
+#include "send.h"
#include "transaction.h"
#include "sysfs.h"
#include "volumes.h"
@@ -321,9 +322,17 @@ static ssize_t supported_checksums_show(struct kobject *kobj,
}
BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
+static ssize_t send_stream_version_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION);
+}
+BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
+
static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
BTRFS_ATTR_PTR(static_feature, supported_checksums),
+ BTRFS_ATTR_PTR(static_feature, send_stream_version),
NULL
};
@@ -809,6 +818,42 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj,
BTRFS_ATTR(, checksum, btrfs_checksum_show);
+static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ const char *str;
+
+ switch (READ_ONCE(fs_info->exclusive_operation)) {
+ case BTRFS_EXCLOP_NONE:
+ str = "none\n";
+ break;
+ case BTRFS_EXCLOP_BALANCE:
+ str = "balance\n";
+ break;
+ case BTRFS_EXCLOP_DEV_ADD:
+ str = "device add\n";
+ break;
+ case BTRFS_EXCLOP_DEV_REMOVE:
+ str = "device remove\n";
+ break;
+ case BTRFS_EXCLOP_DEV_REPLACE:
+ str = "device replace\n";
+ break;
+ case BTRFS_EXCLOP_RESIZE:
+ str = "resize\n";
+ break;
+ case BTRFS_EXCLOP_SWAP_ACTIVATE:
+ str = "swap activate\n";
+ break;
+ default:
+ str = "UNKNOWN\n";
+ break;
+ }
+ return scnprintf(buf, PAGE_SIZE, "%s", str);
+}
+BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
+
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, label),
BTRFS_ATTR_PTR(, nodesize),
@@ -817,6 +862,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, quota_override),
BTRFS_ATTR_PTR(, metadata_uuid),
BTRFS_ATTR_PTR(, checksum),
+ BTRFS_ATTR_PTR(, exclusive_operation),
NULL,
};
@@ -935,12 +981,24 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
}
}
+static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *seed;
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ btrfs_sysfs_remove_device(device);
+
+ list_for_each_entry(seed, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed->devices, dev_list)
+ btrfs_sysfs_remove_device(device);
+ }
+}
+
void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
{
struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj;
- btrfs_reset_fs_info_ptr(fs_info);
-
sysfs_remove_link(fsid_kobj, "bdi");
if (fs_info->space_info_kobj) {
@@ -964,7 +1022,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
addrm_unknown_feature_attrs(fs_info, false);
sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
sysfs_remove_files(fsid_kobj, btrfs_attrs);
- btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL);
+ btrfs_sysfs_remove_fs_devices(fs_info->fs_devices);
}
static const char * const btrfs_feature_set_names[FEAT_MAX] = {
@@ -973,7 +1031,7 @@ static const char * const btrfs_feature_set_names[FEAT_MAX] = {
[FEAT_INCOMPAT] = "incompat",
};
-const char * const btrfs_feature_set_name(enum btrfs_feature_set set)
+const char *btrfs_feature_set_name(enum btrfs_feature_set set)
{
return btrfs_feature_set_names[set];
}
@@ -1079,17 +1137,38 @@ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache)
rkobj->flags = cache->flags;
kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+
+ /*
+ * We call this either on mount, or if we've created a block group for a
+ * new index type while running (i.e. when restriping). The running
+ * case is tricky because we could race with other threads, so we need
+ * to have this check to make sure we didn't already init the kobject.
+ *
+ * We don't have to protect on the free side because it only happens on
+ * unmount.
+ */
+ spin_lock(&space_info->lock);
+ if (space_info->block_group_kobjs[index]) {
+ spin_unlock(&space_info->lock);
+ kobject_put(&rkobj->kobj);
+ return;
+ } else {
+ space_info->block_group_kobjs[index] = &rkobj->kobj;
+ }
+ spin_unlock(&space_info->lock);
+
ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s",
btrfs_bg_type_to_raid_name(rkobj->flags));
memalloc_nofs_restore(nofs_flag);
if (ret) {
+ spin_lock(&space_info->lock);
+ space_info->block_group_kobjs[index] = NULL;
+ spin_unlock(&space_info->lock);
kobject_put(&rkobj->kobj);
btrfs_warn(fs_info,
"failed to add kobject for block cache, ignoring");
return;
}
-
- space_info->block_group_kobjs[index] = &rkobj->kobj;
}
/*
@@ -1151,48 +1230,30 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
return 0;
}
-/* when one_device is NULL, it removes all device links */
-
-int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device)
+void btrfs_sysfs_remove_device(struct btrfs_device *device)
{
struct hd_struct *disk;
struct kobject *disk_kobj;
+ struct kobject *devices_kobj;
- if (!fs_devices->devices_kobj)
- return -EINVAL;
-
- if (one_device) {
- if (one_device->bdev) {
- disk = one_device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
- sysfs_remove_link(fs_devices->devices_kobj,
- disk_kobj->name);
- }
-
- kobject_del(&one_device->devid_kobj);
- kobject_put(&one_device->devid_kobj);
-
- wait_for_completion(&one_device->kobj_unregister);
+ /*
+ * Seed fs_devices devices_kobj aren't used, fetch kobject from the
+ * fs_info::fs_devices.
+ */
+ devices_kobj = device->fs_info->fs_devices->devices_kobj;
+ ASSERT(devices_kobj);
- return 0;
+ if (device->bdev) {
+ disk = device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+ sysfs_remove_link(devices_kobj, disk_kobj->name);
}
- list_for_each_entry(one_device, &fs_devices->devices, dev_list) {
-
- if (one_device->bdev) {
- disk = one_device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
- sysfs_remove_link(fs_devices->devices_kobj,
- disk_kobj->name);
- }
- kobject_del(&one_device->devid_kobj);
- kobject_put(&one_device->devid_kobj);
-
- wait_for_completion(&one_device->kobj_unregister);
+ if (device->devid_kobj.state_initialized) {
+ kobject_del(&device->devid_kobj);
+ kobject_put(&device->devid_kobj);
+ wait_for_completion(&device->kobj_unregister);
}
-
- return 0;
}
static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj,
@@ -1273,44 +1334,80 @@ static struct kobj_type devid_ktype = {
.release = btrfs_release_devid_kobj,
};
-int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device)
+int btrfs_sysfs_add_device(struct btrfs_device *device)
{
- int error = 0;
- struct btrfs_device *dev;
+ int ret;
unsigned int nofs_flag;
+ struct kobject *devices_kobj;
+ struct kobject *devinfo_kobj;
- nofs_flag = memalloc_nofs_save();
- list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+ /*
+ * Make sure we use the fs_info::fs_devices to fetch the kobjects even
+ * for the seed fs_devices
+ */
+ devices_kobj = device->fs_info->fs_devices->devices_kobj;
+ devinfo_kobj = device->fs_info->fs_devices->devinfo_kobj;
+ ASSERT(devices_kobj);
+ ASSERT(devinfo_kobj);
- if (one_device && one_device != dev)
- continue;
+ nofs_flag = memalloc_nofs_save();
- if (dev->bdev) {
- struct hd_struct *disk;
- struct kobject *disk_kobj;
+ if (device->bdev) {
+ struct hd_struct *disk;
+ struct kobject *disk_kobj;
- disk = dev->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
+ disk = device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
- error = sysfs_create_link(fs_devices->devices_kobj,
- disk_kobj, disk_kobj->name);
- if (error)
- break;
+ ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name);
+ if (ret) {
+ btrfs_warn(device->fs_info,
+ "creating sysfs device link for devid %llu failed: %d",
+ device->devid, ret);
+ goto out;
}
+ }
- init_completion(&dev->kobj_unregister);
- error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype,
- fs_devices->devinfo_kobj, "%llu",
- dev->devid);
- if (error) {
- kobject_put(&dev->devid_kobj);
- break;
- }
+ init_completion(&device->kobj_unregister);
+ ret = kobject_init_and_add(&device->devid_kobj, &devid_ktype,
+ devinfo_kobj, "%llu", device->devid);
+ if (ret) {
+ kobject_put(&device->devid_kobj);
+ btrfs_warn(device->fs_info,
+ "devinfo init for devid %llu failed: %d",
+ device->devid, ret);
}
+
+out:
memalloc_nofs_restore(nofs_flag);
+ return ret;
+}
- return error;
+static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ int ret;
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *seed;
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ ret = btrfs_sysfs_add_device(device);
+ if (ret)
+ goto fail;
+ }
+
+ list_for_each_entry(seed, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed->devices, dev_list) {
+ ret = btrfs_sysfs_add_device(device);
+ if (ret)
+ goto fail;
+ }
+ }
+
+ return 0;
+
+fail:
+ btrfs_sysfs_remove_fs_devices(fs_devices);
+ return ret;
}
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
@@ -1324,8 +1421,8 @@ void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
&disk_to_dev(bdev->bd_disk)->kobj);
}
-void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
- const u8 *fsid)
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices)
+
{
char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
@@ -1333,7 +1430,7 @@ void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
* Sprouting changes fsid of the mounted filesystem, rename the fsid
* directory
*/
- snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fsid);
+ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fs_devices->fsid);
if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
btrfs_warn(fs_devices->fs_info,
"sysfs: failed to create fsid for sprout");
@@ -1400,15 +1497,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
struct kobject *fsid_kobj = &fs_devs->fsid_kobj;
- btrfs_set_fs_info_ptr(fs_info);
-
- error = btrfs_sysfs_add_devices_dir(fs_devs, NULL);
+ error = btrfs_sysfs_add_fs_devices(fs_devs);
if (error)
return error;
error = sysfs_create_files(fsid_kobj, btrfs_attrs);
if (error) {
- btrfs_sysfs_remove_devices_dir(fs_devs, NULL);
+ btrfs_sysfs_remove_fs_devices(fs_devs);
return error;
}
@@ -1626,12 +1721,16 @@ void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devs;
struct kobject *fsid_kobj;
- u64 features;
- int ret;
+ u64 __maybe_unused features;
+ int __maybe_unused ret;
if (!fs_info)
return;
+ /*
+ * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not
+ * safe when called from some contexts (eg. balance)
+ */
features = get_features(fs_info, set);
ASSERT(bit & supported_feature_masks[set]);
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index cf839c46a131..bacef43f7267 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -13,15 +13,12 @@ enum btrfs_feature_set {
};
char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
-const char * const btrfs_feature_set_name(enum btrfs_feature_set set);
-int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device);
-int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device);
+const char *btrfs_feature_set_name(enum btrfs_feature_set set);
+int btrfs_sysfs_add_device(struct btrfs_device *device);
+void btrfs_sysfs_remove_device(struct btrfs_device *device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
-void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
- const u8 *fsid);
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices);
void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
u64 bit, enum btrfs_feature_set set);
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action);
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index a1b9f9b5978e..df54cdfdc250 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -60,8 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = 0;
- setup_items_for_insert(root, path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, path, &key, &value_len, 1);
item = btrfs_item_nr(0);
write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
value_len);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 894a63a92236..e6719f7db386 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -33,8 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = start;
- setup_items_for_insert(root, &path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, &path, &key, &value_len, 1);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi, 1);
btrfs_set_file_extent_type(leaf, fi, type);
@@ -64,8 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- setup_items_for_insert(root, &path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, &path, &key, &value_len, 1);
}
/*
@@ -951,7 +949,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
BTRFS_I(inode)->root = root;
- btrfs_test_inode_set_ops(inode);
/* [BTRFS_MAX_EXTENT_SIZE] */
ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 20c6ac1a5de7..52ada47aff50 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -292,6 +292,8 @@ loop:
}
cur_trans->fs_info = fs_info;
+ atomic_set(&cur_trans->pending_ordered, 0);
+ init_waitqueue_head(&cur_trans->pending_wait);
atomic_set(&cur_trans->num_writers, 1);
extwriter_counter_init(cur_trans, type);
init_waitqueue_head(&cur_trans->writer_wait);
@@ -1182,7 +1184,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
eb = btrfs_lock_root_node(fs_info->tree_root);
ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
- 0, &eb);
+ 0, &eb, BTRFS_NESTING_COW);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
@@ -1587,7 +1589,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_set_root_otransid(new_root_item, trans->transid);
old = btrfs_lock_root_node(root);
- ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
+ ret = btrfs_cow_block(trans, root, old, NULL, 0, &old,
+ BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(old);
free_extent_buffer(old);
@@ -1636,6 +1639,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
+ pending->snap = NULL;
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -2164,6 +2168,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_wait_delalloc_flush(trans);
+ /*
+ * Wait for all ordered extents started by a fast fsync that joined this
+ * transaction. Otherwise if this transaction commits before the ordered
+ * extents complete we lose logged data after a power failure.
+ */
+ wait_event(cur_trans->pending_wait,
+ atomic_read(&cur_trans->pending_ordered) == 0);
+
btrfs_scrub_pause(fs_info);
/*
* Ok now we need to make sure to block out any other joins while we
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d60b055b8695..858d9153a1cd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -85,6 +85,13 @@ struct btrfs_transaction {
spinlock_t dropped_roots_lock;
struct btrfs_delayed_ref_root delayed_refs;
struct btrfs_fs_info *fs_info;
+
+ /*
+ * Number of ordered extents the transaction must wait for before
+ * committing. These are ordered extents started by a fast fsync.
+ */
+ atomic_t pending_ordered;
+ wait_queue_head_t pending_wait;
};
#define __TRANS_FREEZABLE (1U << 0)
@@ -105,6 +112,7 @@ struct btrfs_transaction {
#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH)
#define BTRFS_SEND_TRANS_STUB ((void *)1)
+#define BTRFS_DIO_SYNC_STUB ((void *)2)
struct btrfs_trans_handle {
u64 transid;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 517b44300a05..f0ffd5ee77bd 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -984,7 +984,7 @@ static int check_inode_item(struct extent_buffer *leaf,
/* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */
if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) {
inode_item_err(leaf, slot,
- "invalid inode generation: has %llu expect [0, %llu]",
+ "invalid inode transid: has %llu expect [0, %llu]",
btrfs_inode_transid(leaf, iitem), super_gen + 1);
return -EUCLEAN;
}
@@ -1035,7 +1035,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
int slot)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
- struct btrfs_root_item ri;
+ struct btrfs_root_item ri = { 0 };
const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY |
BTRFS_ROOT_SUBVOL_DEAD;
int ret;
@@ -1044,14 +1044,21 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
if (ret < 0)
return ret;
- if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) {
+ if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) &&
+ btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) {
generic_err(leaf, slot,
- "invalid root item size, have %u expect %zu",
- btrfs_item_size_nr(leaf, slot), sizeof(ri));
+ "invalid root item size, have %u expect %zu or %u",
+ btrfs_item_size_nr(leaf, slot), sizeof(ri),
+ btrfs_legacy_root_item_size());
}
+ /*
+ * For legacy root item, the members starting at generation_v2 will be
+ * all filled with 0.
+ * And since we allow geneartion_v2 as 0, it will still pass the check.
+ */
read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
- sizeof(ri));
+ btrfs_item_size_nr(leaf, slot));
/* Generation related */
if (btrfs_root_generation(&ri) >
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 696dd861cc3c..56cbc1706b6f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -96,8 +96,6 @@ enum {
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -176,7 +174,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
- if (ctx) {
+ if (ctx && !ctx->logging_new_name) {
int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
@@ -215,9 +213,7 @@ static int join_running_log_trans(struct btrfs_root *root)
*/
void btrfs_pin_log_trans(struct btrfs_root *root)
{
- mutex_lock(&root->log_mutex);
atomic_inc(&root->log_writers);
- mutex_unlock(&root->log_mutex);
}
/*
@@ -3449,11 +3445,13 @@ fail:
btrfs_free_path(path);
out_unlock:
mutex_unlock(&dir->log_mutex);
- if (ret == -ENOSPC) {
+ if (err == -ENOSPC) {
btrfs_set_log_full_commit(trans);
- ret = 0;
- } else if (ret < 0)
- btrfs_abort_transaction(trans, ret);
+ err = 0;
+ } else if (err < 0 && err != -ENOENT) {
+ /* ENOENT can be returned if the entry hasn't been fsynced yet */
+ btrfs_abort_transaction(trans, err);
+ }
btrfs_end_log_trans(root);
@@ -3613,6 +3611,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
* search and this search we'll not find the key again and can just
* bail.
*/
+search:
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
if (ret != 0)
goto done;
@@ -3632,6 +3631,13 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
if (min_key.objectid != ino || min_key.type != key_type)
goto done;
+
+ if (need_resched()) {
+ btrfs_release_path(path);
+ cond_resched();
+ goto search;
+ }
+
ret = overwrite_item(trans, log, dst_path, src, i,
&min_key);
if (ret) {
@@ -4080,10 +4086,14 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
static int log_extent_csums(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_root *log_root,
- const struct extent_map *em)
+ const struct extent_map *em,
+ struct btrfs_log_ctx *ctx)
{
+ struct btrfs_ordered_extent *ordered;
u64 csum_offset;
u64 csum_len;
+ u64 mod_start = em->mod_start;
+ u64 mod_len = em->mod_len;
LIST_HEAD(ordered_sums);
int ret = 0;
@@ -4092,13 +4102,71 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
em->block_start == EXTENT_MAP_HOLE)
return 0;
+ list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
+ const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
+ const u64 mod_end = mod_start + mod_len;
+ struct btrfs_ordered_sum *sums;
+
+ if (mod_len == 0)
+ break;
+
+ if (ordered_end <= mod_start)
+ continue;
+ if (mod_end <= ordered->file_offset)
+ break;
+
+ /*
+ * We are going to copy all the csums on this ordered extent, so
+ * go ahead and adjust mod_start and mod_len in case this ordered
+ * extent has already been logged.
+ */
+ if (ordered->file_offset > mod_start) {
+ if (ordered_end >= mod_end)
+ mod_len = ordered->file_offset - mod_start;
+ /*
+ * If we have this case
+ *
+ * |--------- logged extent ---------|
+ * |----- ordered extent ----|
+ *
+ * Just don't mess with mod_start and mod_len, we'll
+ * just end up logging more csums than we need and it
+ * will be ok.
+ */
+ } else {
+ if (ordered_end < mod_end) {
+ mod_len = mod_end - ordered_end;
+ mod_start = ordered_end;
+ } else {
+ mod_len = 0;
+ }
+ }
+
+ /*
+ * To keep us from looping for the above case of an ordered
+ * extent that falls inside of the logged extent.
+ */
+ if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
+ continue;
+
+ list_for_each_entry(sums, &ordered->list, list) {
+ ret = log_csums(trans, inode, log_root, sums);
+ if (ret)
+ return ret;
+ }
+ }
+
+ /* We're done, found all csums in the ordered extents. */
+ if (mod_len == 0)
+ return 0;
+
/* If we're compressed we have to save the entire range of csums. */
if (em->compress_type) {
csum_offset = 0;
csum_len = max(em->block_len, em->orig_block_len);
} else {
- csum_offset = em->mod_start - em->start;
- csum_len = em->mod_len;
+ csum_offset = mod_start - em->start;
+ csum_len = mod_len;
}
/* block start is already adjusted for the file extent offset. */
@@ -4138,7 +4206,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
int ret;
int extent_inserted = 0;
- ret = log_extent_csums(trans, inode, log, em);
+ ret = log_extent_csums(trans, inode, log, em, ctx);
if (ret)
return ret;
@@ -4340,10 +4408,10 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_log_ctx *ctx,
- const u64 start,
- const u64 end)
+ struct btrfs_log_ctx *ctx)
{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
struct extent_map *em, *n;
struct list_head extents;
struct extent_map_tree *tree = &inode->extent_tree;
@@ -4357,23 +4425,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
test_gen = root->fs_info->last_trans_committed;
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
- /*
- * Skip extents outside our logging range. It's important to do
- * it for correctness because if we don't ignore them, we may
- * log them before their ordered extent completes, and therefore
- * we could log them without logging their respective checksums
- * (the checksum items are added to the csum tree at the very
- * end of btrfs_finish_ordered_io()). Also leave such extents
- * outside of our range in the list, since we may have another
- * ranged fsync in the near future that needs them. If an extent
- * outside our range corresponds to a hole, log it to avoid
- * leaving gaps between extents (fsck will complain when we are
- * not using the NO_HOLES feature).
- */
- if ((em->start > end || em->start + em->len <= start) &&
- em->block_start != EXTENT_MAP_HOLE)
- continue;
-
list_del_init(&em->list);
/*
* Just an arbitrary number, this can be really CPU intensive
@@ -4432,8 +4483,32 @@ process:
btrfs_release_path(path);
if (!ret)
ret = btrfs_log_prealloc_extents(trans, inode, path);
+ if (ret)
+ return ret;
- return ret;
+ /*
+ * We have logged all extents successfully, now make sure the commit of
+ * the current transaction waits for the ordered extents to complete
+ * before it commits and wipes out the log trees, otherwise we would
+ * lose data if an ordered extents completes after the transaction
+ * commits and a power failure happens after the transaction commit.
+ */
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
+
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ spin_lock_irq(&inode->ordered_tree.lock);
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
+ atomic_inc(&trans->transaction->pending_ordered);
+ }
+ spin_unlock_irq(&inode->ordered_tree.lock);
+ }
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ return 0;
}
static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
@@ -4839,7 +4914,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
ret = btrfs_log_inode(trans, root,
BTRFS_I(inode),
LOG_OTHER_INODE_ALL,
- 0, LLONG_MAX, ctx);
+ ctx);
btrfs_add_delayed_iput(inode);
}
}
@@ -4881,7 +4956,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* Check the inode's logged_trans only instead of
* btrfs_inode_in_log(). This is because the last_log_commit of
* the inode is not updated when we only log that it exists and
- * and it has the full sync bit set (see btrfs_log_inode()).
+ * it has the full sync bit set (see btrfs_log_inode()).
*/
if (BTRFS_I(inode)->logged_trans == trans->transid) {
spin_unlock(&BTRFS_I(inode)->lock);
@@ -4897,7 +4972,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* log with the new name before we unpin it.
*/
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
+ LOG_OTHER_INODE, ctx);
if (ret) {
btrfs_add_delayed_iput(inode);
continue;
@@ -5110,8 +5185,6 @@ next_key:
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx)
{
struct btrfs_path *path;
@@ -5290,7 +5363,7 @@ log_extents:
}
if (fast_search) {
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- ctx, start, end);
+ ctx);
if (ret) {
err = ret;
goto out_unlock;
@@ -5299,31 +5372,8 @@ log_extents:
struct extent_map *em, *n;
write_lock(&em_tree->lock);
- /*
- * We can't just remove every em if we're called for a ranged
- * fsync - that is, one that doesn't cover the whole possible
- * file range (0 to LLONG_MAX). This is because we can have
- * em's that fall outside the range we're logging and therefore
- * their ordered operations haven't completed yet
- * (btrfs_finish_ordered_io() not invoked yet). This means we
- * didn't get their respective file extent item in the fs/subvol
- * tree yet, and need to let the next fast fsync (one which
- * consults the list of modified extent maps) find the em so
- * that it logs a matching file extent item and waits for the
- * respective ordered operation to complete (if it's still
- * running).
- *
- * Removing every em outside the range we're logging would make
- * the next fast fsync not log their matching file extent items,
- * therefore making us lose data after a log replay.
- */
- list_for_each_entry_safe(em, n, &em_tree->modified_extents,
- list) {
- const u64 mod_end = em->mod_start + em->mod_len - 1;
-
- if (em->mod_start >= start && mod_end <= end)
- list_del_init(&em->list);
- }
+ list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
+ list_del_init(&em->list);
write_unlock(&em_tree->lock);
}
@@ -5337,19 +5387,34 @@ log_extents:
}
/*
- * Don't update last_log_commit if we logged that an inode exists after
- * it was loaded to memory (full_sync bit set).
- * This is to prevent data loss when we do a write to the inode, then
- * the inode gets evicted after all delalloc was flushed, then we log
- * it exists (due to a rename for example) and then fsync it. This last
- * fsync would do nothing (not logging the extents previously written).
+ * If we are logging that an ancestor inode exists as part of logging a
+ * new name from a link or rename operation, don't mark the inode as
+ * logged - otherwise if an explicit fsync is made against an ancestor,
+ * the fsync considers the inode in the log and doesn't sync the log,
+ * resulting in the ancestor missing after a power failure unless the
+ * log was synced as part of an fsync against any other unrelated inode.
+ * So keep it simple for this case and just don't flag the ancestors as
+ * logged.
*/
- spin_lock(&inode->lock);
- inode->logged_trans = trans->transid;
- if (inode_only != LOG_INODE_EXISTS ||
- !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
- inode->last_log_commit = inode->last_sub_trans;
- spin_unlock(&inode->lock);
+ if (!ctx ||
+ !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
+ &inode->vfs_inode != ctx->inode)) {
+ spin_lock(&inode->lock);
+ inode->logged_trans = trans->transid;
+ /*
+ * Don't update last_log_commit if we logged that an inode exists
+ * after it was loaded to memory (full_sync bit set).
+ * This is to prevent data loss when we do a write to the inode,
+ * then the inode gets evicted after all delalloc was flushed,
+ * then we log it exists (due to a rename for example) and then
+ * fsync it. This last fsync would do nothing (not logging the
+ * extents previously written).
+ */
+ if (inode_only != LOG_INODE_EXISTS ||
+ !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+ inode->last_log_commit = inode->last_sub_trans;
+ spin_unlock(&inode->lock);
+ }
out_unlock:
mutex_unlock(&inode->log_mutex);
@@ -5589,7 +5654,7 @@ process_leaf:
if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
log_mode = LOG_INODE_ALL;
ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
- log_mode, 0, LLONG_MAX, ctx);
+ log_mode, ctx);
if (!ret &&
btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
ret = 1;
@@ -5733,7 +5798,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (ctx)
ctx->log_new_dentries = false;
ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
- LOG_INODE_ALL, 0, LLONG_MAX, ctx);
+ LOG_INODE_ALL, ctx);
if (!ret &&
btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
ret = 1;
@@ -5784,8 +5849,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
if (BTRFS_I(inode)->generation > last_committed)
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_INODE_EXISTS,
- 0, LLONG_MAX, ctx);
+ LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
if (ret)
return ret;
@@ -5840,7 +5904,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
if (inode->generation > fs_info->last_trans_committed) {
ret = btrfs_log_inode(trans, root, inode,
- LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
+ LOG_INODE_EXISTS, ctx);
if (ret)
break;
}
@@ -5948,8 +6012,6 @@ out:
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct dentry *parent,
- const loff_t start,
- const loff_t end,
int inode_only,
struct btrfs_log_ctx *ctx)
{
@@ -6002,7 +6064,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
+ ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
if (ret)
goto end_trans;
@@ -6098,15 +6160,13 @@ end_no_trans:
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx)
{
struct dentry *parent = dget_parent(dentry);
int ret;
ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
- start, end, LOG_INODE_ALL, ctx);
+ LOG_INODE_ALL, ctx);
dput(parent);
return ret;
@@ -6369,26 +6429,13 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
/*
* Call this after adding a new name for a file and it will properly
* update the log to reflect the new name.
- *
- * @ctx can not be NULL when @sync_log is false, and should be NULL when it's
- * true (because it's not used).
- *
- * Return value depends on whether @sync_log is true or false.
- * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT
- * otherwise.
- * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to
- * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log,
- * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed (without attempting to sync the log).
*/
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent,
- bool sync_log, struct btrfs_log_ctx *ctx)
+ struct dentry *parent)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- int ret;
+ struct btrfs_log_ctx ctx;
/*
* this will force the logging code to walk the dentry chain
@@ -6403,34 +6450,17 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
*/
if (inode->logged_trans <= fs_info->last_trans_committed &&
(!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
- return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT :
- BTRFS_DONT_NEED_LOG_SYNC;
-
- if (sync_log) {
- struct btrfs_log_ctx ctx2;
-
- btrfs_init_log_ctx(&ctx2, &inode->vfs_inode);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, &ctx2);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_TRANS_COMMIT;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
-
- ret = btrfs_sync_log(trans, inode->root, &ctx2);
- if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
- return BTRFS_DONT_NEED_TRANS_COMMIT;
- }
-
- ASSERT(ctx);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, ctx);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_LOG_SYNC;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
+ return;
- return BTRFS_NEED_LOG_SYNC;
+ btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
+ ctx.logging_new_name = true;
+ /*
+ * We don't care about the return value. If we fail to log the new name
+ * then we know the next attempt to sync the log will fallback to a full
+ * transaction commit (due to a call to btrfs_set_log_full_commit()), so
+ * we don't need to worry about getting a log committed that has an
+ * inconsistent state after a rename operation.
+ */
+ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 132e43d29034..731bd9c029f5 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -16,8 +16,11 @@ struct btrfs_log_ctx {
int log_ret;
int log_transid;
bool log_new_dentries;
+ bool logging_new_name;
struct inode *inode;
struct list_head list;
+ /* Only used for fast fsyncs. */
+ struct list_head ordered_extents;
};
static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
@@ -26,8 +29,23 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
ctx->log_ret = 0;
ctx->log_transid = 0;
ctx->log_new_dentries = false;
+ ctx->logging_new_name = false;
ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
+ INIT_LIST_HEAD(&ctx->ordered_extents);
+}
+
+static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
+
+ ASSERT(inode_is_locked(ctx->inode));
+
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
}
static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans)
@@ -49,8 +67,6 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -67,16 +83,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
int for_rename);
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir);
-/* Return values for btrfs_log_new_name() */
-enum {
- BTRFS_DONT_NEED_TRANS_COMMIT,
- BTRFS_NEED_TRANS_COMMIT,
- BTRFS_DONT_NEED_LOG_SYNC,
- BTRFS_NEED_LOG_SYNC,
-};
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent,
- bool sync_log, struct btrfs_log_ctx *ctx);
+ struct dentry *parent);
#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ee96c5869f57..58b9c419a2b6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4,6 +4,7 @@
*/
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
@@ -290,8 +291,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* balance_mutex
*
*
- * Exclusive operations, BTRFS_FS_EXCL_OP
- * ======================================
+ * Exclusive operations
+ * ====================
*
* Maintains the exclusivity of the following operations that apply to the
* whole filesystem and cannot run in parallel.
@@ -317,11 +318,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* - system power-cycle and filesystem mounted as read-only
* - filesystem or device errors leading to forced read-only
*
- * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
- * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
+ * The status of exclusive operation is set and cleared atomically.
+ * During the course of Paused state, fs_info::exclusive_operation remains set.
* A device operation in Paused or Running state can be canceled or resumed
* either by ioctl (Balance only) or when remounted as read-write.
- * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
+ * The exclusive status is cleared when the device operation is canceled or
* completed.
*/
@@ -355,6 +356,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
INIT_LIST_HEAD(&fs_devs->devices);
INIT_LIST_HEAD(&fs_devs->alloc_list);
INIT_LIST_HEAD(&fs_devs->fs_list);
+ INIT_LIST_HEAD(&fs_devs->seed_list);
if (fsid)
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
@@ -405,7 +407,7 @@ void __exit btrfs_cleanup_fs_uuids(void)
* Returned struct is not linked onto any lists and must be destroyed using
* btrfs_free_device.
*/
-static struct btrfs_device *__alloc_device(void)
+static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
{
struct btrfs_device *dev;
@@ -432,7 +434,8 @@ static struct btrfs_device *__alloc_device(void)
btrfs_device_data_ordered_init(dev);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
+ extent_io_tree_init(fs_info, &dev->alloc_state,
+ IO_TREE_DEVICE_ALLOC_STATE, NULL);
return dev;
}
@@ -592,8 +595,6 @@ static int btrfs_free_stale_devices(const char *path,
btrfs_free_device(device);
ret = 0;
- if (fs_devices->num_devices == 0)
- break;
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -940,16 +941,18 @@ static noinline struct btrfs_device *device_list_add(const char *path,
bdput(path_bdev);
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_warn_in_rcu(device->fs_info,
- "duplicate device fsid:devid for %pU:%llu old:%s new:%s",
- disk_super->fsid, devid,
- rcu_str_deref(device->name), path);
+ "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
+ path, devid, found_transid,
+ current->comm,
+ task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
bdput(path_bdev);
btrfs_info_in_rcu(device->fs_info,
- "device fsid %pU devid %llu moved old:%s new:%s",
- disk_super->fsid, devid,
- rcu_str_deref(device->name), path);
+ "devid %llu device path %s changed to %s scanned by %s (%d)",
+ devid, rcu_str_deref(device->name),
+ path, current->comm,
+ task_pid_nr(current));
}
name = rcu_string_strdup(path, GFP_NOFS);
@@ -1034,28 +1037,21 @@ error:
return ERR_PTR(ret);
}
-/*
- * After we have read the system tree and know devids belonging to
- * this filesystem, remove the device which does not belong there.
- */
-void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
+ int step, struct btrfs_device **latest_dev)
{
struct btrfs_device *device, *next;
- struct btrfs_device *latest_dev = NULL;
- mutex_lock(&uuid_mutex);
-again:
/* This is the initialized path, it is safe to release the devices. */
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
- &device->dev_state)) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
- &device->dev_state) &&
+ &device->dev_state) &&
!test_bit(BTRFS_DEV_STATE_MISSING,
&device->dev_state) &&
- (!latest_dev ||
- device->generation > latest_dev->generation)) {
- latest_dev = device;
+ (!*latest_dev ||
+ device->generation > (*latest_dev)->generation)) {
+ *latest_dev = device;
}
continue;
}
@@ -1093,10 +1089,22 @@ again:
btrfs_free_device(device);
}
- if (fs_devices->seed) {
- fs_devices = fs_devices->seed;
- goto again;
- }
+}
+
+/*
+ * After we have read the system tree and know devids belonging to this
+ * filesystem, remove the device which does not belong there.
+ */
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+{
+ struct btrfs_device *latest_dev = NULL;
+ struct btrfs_fs_devices *seed_dev;
+
+ mutex_lock(&uuid_mutex);
+ __btrfs_free_extra_devids(fs_devices, step, &latest_dev);
+
+ list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
+ __btrfs_free_extra_devids(seed_dev, step, &latest_dev);
fs_devices->latest_bdev = latest_dev->bdev;
@@ -1148,47 +1156,41 @@ static void btrfs_close_one_device(struct btrfs_device *device)
ASSERT(atomic_read(&device->reada_in_flight) == 0);
}
-static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
+static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device, *tmp;
+ lockdep_assert_held(&uuid_mutex);
+
if (--fs_devices->opened > 0)
- return 0;
+ return;
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
+ list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
btrfs_close_one_device(device);
- }
- mutex_unlock(&fs_devices->device_list_mutex);
WARN_ON(fs_devices->open_devices);
WARN_ON(fs_devices->rw_devices);
fs_devices->opened = 0;
fs_devices->seeding = false;
-
- return 0;
+ fs_devices->fs_info = NULL;
}
-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_fs_devices *seed_devices = NULL;
- int ret;
+ LIST_HEAD(list);
+ struct btrfs_fs_devices *tmp;
mutex_lock(&uuid_mutex);
- ret = close_fs_devices(fs_devices);
- if (!fs_devices->opened) {
- seed_devices = fs_devices->seed;
- fs_devices->seed = NULL;
- }
- mutex_unlock(&uuid_mutex);
+ close_fs_devices(fs_devices);
+ if (!fs_devices->opened)
+ list_splice_init(&fs_devices->seed_list, &list);
- while (seed_devices) {
- fs_devices = seed_devices;
- seed_devices = fs_devices->seed;
+ list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
close_fs_devices(fs_devices);
+ list_del(&fs_devices->seed_list);
free_fs_devices(fs_devices);
}
- return ret;
+ mutex_unlock(&uuid_mutex);
}
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
@@ -1196,17 +1198,23 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
{
struct btrfs_device *device;
struct btrfs_device *latest_dev = NULL;
+ struct btrfs_device *tmp_device;
flags |= FMODE_EXCL;
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
- /* Just open everything we can; ignore failures here */
- if (btrfs_open_one_device(fs_devices, device, flags, holder))
- continue;
+ list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
+ dev_list) {
+ int ret;
- if (!latest_dev ||
- device->generation > latest_dev->generation)
+ ret = btrfs_open_one_device(fs_devices, device, flags, holder);
+ if (ret == 0 &&
+ (!latest_dev || device->generation > latest_dev->generation)) {
latest_dev = device;
+ } else if (ret == -ENODATA) {
+ fs_devices->num_devices--;
+ list_del(&device->dev_list);
+ btrfs_free_device(device);
+ }
}
if (fs_devices->open_devices == 0)
return -EINVAL;
@@ -1960,16 +1968,13 @@ static struct btrfs_device * btrfs_find_next_active_device(
* this_dev) which is active.
*/
void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
- struct btrfs_device *this_dev)
+ struct btrfs_device *next_device)
{
struct btrfs_fs_info *fs_info = device->fs_info;
- struct btrfs_device *next_device;
- if (this_dev)
- next_device = this_dev;
- else
+ if (!next_device)
next_device = btrfs_find_next_active_device(fs_info->fs_devices,
- device);
+ device);
ASSERT(next_device);
if (fs_info->sb->s_bdev &&
@@ -1998,9 +2003,9 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
return num_devices;
}
-static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
- struct block_device *bdev,
- const char *device_path)
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
+ struct block_device *bdev,
+ const char *device_path)
{
struct btrfs_super_block *disk_super;
int copy_num;
@@ -2039,7 +2044,7 @@ static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
}
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
- u64 devid)
+ u64 devid)
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
@@ -2143,7 +2148,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
if (device->bdev) {
cur_devices->open_devices--;
/* remove sysfs entry */
- btrfs_sysfs_remove_devices_dir(fs_devices, device);
+ btrfs_sysfs_remove_device(device);
}
num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
@@ -2164,14 +2169,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
btrfs_free_device(device);
if (cur_devices->open_devices == 0) {
- while (fs_devices) {
- if (fs_devices->seed == cur_devices) {
- fs_devices->seed = cur_devices->seed;
- break;
- }
- fs_devices = fs_devices->seed;
- }
- cur_devices->seed = NULL;
+ list_del_init(&cur_devices->seed_list);
close_fs_devices(cur_devices);
free_fs_devices(cur_devices);
}
@@ -2220,14 +2218,9 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
{
- struct btrfs_fs_info *fs_info = srcdev->fs_info;
struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
- if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
- /* zero out the old super if it is writable */
- btrfs_scratch_superblocks(fs_info, srcdev->bdev,
- srcdev->name->str);
- }
+ mutex_lock(&uuid_mutex);
btrfs_close_bdev(srcdev);
synchronize_rcu();
@@ -2235,8 +2228,6 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
/* if this is no devs we rather delete the fs_devices */
if (!fs_devices->num_devices) {
- struct btrfs_fs_devices *tmp_fs_devices;
-
/*
* On a mounted FS, num_devices can't be zero unless it's a
* seed. In case of a seed device being replaced, the replace
@@ -2245,18 +2236,11 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
*/
ASSERT(fs_devices->seeding);
- tmp_fs_devices = fs_info->fs_devices;
- while (tmp_fs_devices) {
- if (tmp_fs_devices->seed == fs_devices) {
- tmp_fs_devices->seed = fs_devices->seed;
- break;
- }
- tmp_fs_devices = tmp_fs_devices->seed;
- }
- fs_devices->seed = NULL;
+ list_del_init(&fs_devices->seed_list);
close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
}
+ mutex_unlock(&uuid_mutex);
}
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
@@ -2265,7 +2249,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
mutex_lock(&fs_devices->device_list_mutex);
- btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev);
+ btrfs_sysfs_remove_device(tgtdev);
if (tgtdev->bdev)
fs_devices->open_devices--;
@@ -2374,10 +2358,20 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
if (!fs_devices->seeding)
return -EINVAL;
+ /*
+ * Private copy of the seed devices, anchored at
+ * fs_info->fs_devices->seed_list
+ */
seed_devices = alloc_fs_devices(NULL, NULL);
if (IS_ERR(seed_devices))
return PTR_ERR(seed_devices);
+ /*
+ * It's necessary to retain a copy of the original seed fs_devices in
+ * fs_uuids so that filesystems which have been seeded can successfully
+ * reference the seed device from open_seed_devices. This also supports
+ * multiple fs seed.
+ */
old_devices = clone_fs_devices(fs_devices);
if (IS_ERR(old_devices)) {
kfree(seed_devices);
@@ -2398,16 +2392,12 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
list_for_each_entry(device, &seed_devices->devices, dev_list)
device->fs_devices = seed_devices;
- mutex_lock(&fs_info->chunk_mutex);
- list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
- mutex_unlock(&fs_info->chunk_mutex);
-
fs_devices->seeding = false;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
fs_devices->missing_devices = 0;
fs_devices->rotating = false;
- fs_devices->seed = seed_devices;
+ list_add(&seed_devices->seed_list, &fs_devices->seed_list);
generate_random_uuid(fs_devices->fsid);
memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
@@ -2510,7 +2500,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
u64 orig_super_num_devices;
int seeding_dev = 0;
int ret = 0;
- bool unlocked = false;
+ bool locked = false;
if (sb_rdonly(sb) && !fs_devices->seeding)
return -EROFS;
@@ -2524,20 +2514,20 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
seeding_dev = 1;
down_write(&sb->s_umount);
mutex_lock(&uuid_mutex);
+ locked = true;
}
- filemap_write_and_wait(bdev->bd_inode->i_mapping);
+ sync_blockdev(bdev);
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
ret = -EEXIST;
- mutex_unlock(
- &fs_devices->device_list_mutex);
+ rcu_read_unlock();
goto error;
}
}
- mutex_unlock(&fs_devices->device_list_mutex);
+ rcu_read_unlock();
device = btrfs_alloc_device(fs_info, NULL, NULL);
if (IS_ERR(device)) {
@@ -2612,9 +2602,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
btrfs_set_super_num_devices(fs_info->super_copy,
orig_super_num_devices + 1);
- /* add sysfs device entry */
- btrfs_sysfs_add_devices_dir(fs_devices, device);
-
/*
* we've got more storage, clear any full flags on the space
* infos
@@ -2622,6 +2609,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
btrfs_clear_space_info_full(fs_info);
mutex_unlock(&fs_info->chunk_mutex);
+
+ /* Add sysfs device entry */
+ btrfs_sysfs_add_device(device);
+
mutex_unlock(&fs_devices->device_list_mutex);
if (seeding_dev) {
@@ -2647,8 +2638,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
goto error_sysfs;
}
- btrfs_sysfs_update_sprout_fsid(fs_devices,
- fs_info->fs_devices->fsid);
+ /*
+ * fs_devices now represents the newly sprouted filesystem and
+ * its fsid has been changed by btrfs_prepare_sprout
+ */
+ btrfs_sysfs_update_sprout_fsid(fs_devices);
}
ret = btrfs_commit_transaction(trans);
@@ -2656,7 +2650,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
- unlocked = true;
+ locked = false;
if (ret) /* transaction commit */
return ret;
@@ -2691,7 +2685,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
return ret;
error_sysfs:
- btrfs_sysfs_remove_devices_dir(fs_devices, device);
+ btrfs_sysfs_remove_device(device);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
list_del_rcu(&device->dev_list);
@@ -2717,7 +2711,7 @@ error_free_device:
btrfs_free_device(device);
error:
blkdev_put(bdev, FMODE_EXCL);
- if (seeding_dev && !unlocked) {
+ if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
}
@@ -4044,7 +4038,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
/*
* rw_devices will not change at the moment, device add/delete/replace
- * are excluded by EXCL_OP
+ * are exclusive
*/
num_devices = fs_info->fs_devices->rw_devices;
@@ -4180,7 +4174,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
balance_need_close(fs_info)) {
reset_balance_state(fs_info);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
}
wake_up(&fs_info->balance_wait_q);
@@ -4191,7 +4185,7 @@ out:
reset_balance_state(fs_info);
else
kfree(bctl);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return ret;
}
@@ -4293,7 +4287,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
* is in a paused state and must have fs_info::balance_ctl properly
* set up.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
btrfs_warn(fs_info,
"balance: cannot set exclusive op status, resume manually");
@@ -4375,7 +4369,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
if (fs_info->balance_ctl) {
reset_balance_state(fs_info);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
btrfs_info(fs_info, "balance: canceled");
}
}
@@ -4462,6 +4456,7 @@ int btrfs_uuid_scan_kthread(void *data)
goto skip;
}
update_tree:
+ btrfs_release_path(path);
if (!btrfs_is_empty_uuid(root_item.uuid)) {
ret = btrfs_uuid_tree_add(trans, root_item.uuid,
BTRFS_UUID_KEY_SUBVOL,
@@ -4486,6 +4481,7 @@ update_tree:
}
skip:
+ btrfs_release_path(path);
if (trans) {
ret = btrfs_end_transaction(trans);
trans = NULL;
@@ -4493,7 +4489,6 @@ skip:
break;
}
- btrfs_release_path(path);
if (key.offset < (u64)-1) {
key.offset++;
} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
@@ -6459,11 +6454,21 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
bool seed)
{
struct btrfs_device *device;
+ struct btrfs_fs_devices *seed_devs;
+
+ if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (device->devid == devid &&
+ (!uuid || memcmp(device->uuid, uuid,
+ BTRFS_UUID_SIZE) == 0))
+ return device;
+ }
+ }
- while (fs_devices) {
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
if (!fsid ||
- !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
- list_for_each_entry(device, &fs_devices->devices,
+ !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ list_for_each_entry(device, &seed_devs->devices,
dev_list) {
if (device->devid == devid &&
(!uuid || memcmp(device->uuid, uuid,
@@ -6471,11 +6476,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
return device;
}
}
- if (seed)
- fs_devices = fs_devices->seed;
- else
- return NULL;
}
+
return NULL;
}
@@ -6483,8 +6485,17 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *dev_uuid)
{
struct btrfs_device *device;
+ unsigned int nofs_flag;
+ /*
+ * We call this under the chunk_mutex, so we want to use NOFS for this
+ * allocation, however we don't want to change btrfs_alloc_device() to
+ * always do NOFS because we use it in a lot of other GFP_KERNEL safe
+ * places.
+ */
+ nofs_flag = memalloc_nofs_save();
device = btrfs_alloc_device(NULL, &devid, dev_uuid);
+ memalloc_nofs_restore(nofs_flag);
if (IS_ERR(device))
return device;
@@ -6521,7 +6532,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
if (WARN_ON(!devid && !fs_info))
return ERR_PTR(-EINVAL);
- dev = __alloc_device();
+ dev = __alloc_device(fs_info);
if (IS_ERR(dev))
return dev;
@@ -6717,13 +6728,11 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
lockdep_assert_held(&uuid_mutex);
ASSERT(fsid);
- fs_devices = fs_info->fs_devices->seed;
- while (fs_devices) {
+ /* This will match only for multi-device seed fs */
+ list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
return fs_devices;
- fs_devices = fs_devices->seed;
- }
fs_devices = find_fsid(fsid, NULL);
if (!fs_devices) {
@@ -6739,6 +6748,10 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
return fs_devices;
}
+ /*
+ * Upon first call for a seed fs fsid, just create a private copy of the
+ * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
+ */
fs_devices = clone_fs_devices(fs_devices);
if (IS_ERR(fs_devices))
return fs_devices;
@@ -6746,20 +6759,17 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
if (ret) {
free_fs_devices(fs_devices);
- fs_devices = ERR_PTR(ret);
- goto out;
+ return ERR_PTR(ret);
}
if (!fs_devices->seeding) {
close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
- fs_devices = ERR_PTR(-EINVAL);
- goto out;
+ return ERR_PTR(-EINVAL);
}
- fs_devices->seed = fs_info->fs_devices->seed;
- fs_info->fs_devices->seed = fs_devices;
-out:
+ list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
+
return fs_devices;
}
@@ -7178,17 +7188,22 @@ error:
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
- while (fs_devices) {
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list)
+ fs_devices->fs_info = fs_info;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ device->fs_info = fs_info;
+
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed_devs->devices, dev_list)
device->fs_info = fs_info;
- mutex_unlock(&fs_devices->device_list_mutex);
- fs_devices = fs_devices->seed;
+ seed_devs->fs_info = fs_info;
}
+ mutex_unlock(&fs_devices->device_list_mutex);
}
static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
@@ -7214,17 +7229,53 @@ static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
sizeof(val));
}
-int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+static int btrfs_device_init_dev_stats(struct btrfs_device *device,
+ struct btrfs_path *path)
{
- struct btrfs_key key;
- struct btrfs_root *dev_root = fs_info->dev_root;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_dev_stats_item *ptr;
struct extent_buffer *eb;
- int slot;
- int ret = 0;
+ struct btrfs_key key;
+ int item_size;
+ int i, ret, slot;
+
+ key.objectid = BTRFS_DEV_STATS_OBJECTID;
+ key.type = BTRFS_PERSISTENT_ITEM_KEY;
+ key.offset = device->devid;
+ ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
+ if (ret) {
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ btrfs_dev_stat_set(device, i, 0);
+ device->dev_stats_valid = 1;
+ btrfs_release_path(path);
+ return ret < 0 ? ret : 0;
+ }
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ item_size = btrfs_item_size_nr(eb, slot);
+
+ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
+
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+ if (item_size >= (1 + i) * sizeof(__le64))
+ btrfs_dev_stat_set(device, i,
+ btrfs_dev_stats_value(eb, ptr, i));
+ else
+ btrfs_dev_stat_set(device, i, 0);
+ }
+
+ device->dev_stats_valid = 1;
+ btrfs_dev_stat_print_on_load(device);
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
struct btrfs_path *path = NULL;
- int i;
+ int ret = 0;
path = btrfs_alloc_path();
if (!path)
@@ -7232,43 +7283,22 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- int item_size;
- struct btrfs_dev_stats_item *ptr;
-
- key.objectid = BTRFS_DEV_STATS_OBJECTID;
- key.type = BTRFS_PERSISTENT_ITEM_KEY;
- key.offset = device->devid;
- ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
- if (ret) {
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
- btrfs_dev_stat_set(device, i, 0);
- device->dev_stats_valid = 1;
- btrfs_release_path(path);
- continue;
- }
- slot = path->slots[0];
- eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, slot);
-
- ptr = btrfs_item_ptr(eb, slot,
- struct btrfs_dev_stats_item);
-
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
- if (item_size >= (1 + i) * sizeof(__le64))
- btrfs_dev_stat_set(device, i,
- btrfs_dev_stats_value(eb, ptr, i));
- else
- btrfs_dev_stat_set(device, i, 0);
+ ret = btrfs_device_init_dev_stats(device, path);
+ if (ret)
+ goto out;
+ }
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
+ ret = btrfs_device_init_dev_stats(device, path);
+ if (ret)
+ goto out;
}
-
- device->dev_stats_valid = 1;
- btrfs_dev_stat_print_on_load(device);
- btrfs_release_path(path);
}
+out:
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_free_path(path);
- return ret < 0 ? ret : 0;
+ return ret;
}
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
@@ -7485,24 +7515,6 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
mutex_unlock(&trans->fs_info->chunk_mutex);
}
-void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- while (fs_devices) {
- fs_devices->fs_info = fs_info;
- fs_devices = fs_devices->seed;
- }
-}
-
-void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- while (fs_devices) {
- fs_devices->fs_info = NULL;
- fs_devices = fs_devices->seed;
- }
-}
-
/*
* Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
*/
@@ -7583,8 +7595,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
/* It's possible this device is a dummy for seed device */
if (dev->disk_total_bytes == 0) {
- dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
- NULL, false);
+ struct btrfs_fs_devices *devs;
+
+ devs = list_first_entry(&fs_info->fs_devices->seed_list,
+ struct btrfs_fs_devices, seed_list);
+ dev = btrfs_find_device(devs, devid, NULL, NULL, false);
if (!dev) {
btrfs_err(fs_info, "failed to find seed devid %llu",
devid);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5eea93916fbf..bf27ac07d315 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -58,7 +58,7 @@ struct btrfs_device {
struct btrfs_fs_devices *fs_devices;
struct btrfs_fs_info *fs_info;
- struct rcu_string *name;
+ struct rcu_string __rcu *name;
u64 generation;
@@ -246,7 +246,7 @@ struct btrfs_fs_devices {
*/
struct list_head alloc_list;
- struct btrfs_fs_devices *seed;
+ struct list_head seed_list;
bool seeding;
int opened;
@@ -435,7 +435,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *btrfs_scan_one_device(const char *path,
fmode_t flags, void *holder);
int btrfs_forget_devices(const char *path);
-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
void btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *this_dev);
@@ -569,10 +569,11 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
-void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
-void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev);
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
+ struct block_device *bdev,
+ const char *device_path);
int btrfs_bg_type_to_factor(u64 flags);
const char *btrfs_bg_type_to_raid_name(u64 flags);
diff --git a/fs/buffer.c b/fs/buffer.c
index 061dd202979d..23f645657488 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -842,13 +842,13 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
struct buffer_head *bh, *head;
gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
long offset;
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg, *old_memcg;
if (retry)
gfp |= __GFP_NOFAIL;
memcg = get_mem_cgroup_from_page(page);
- memalloc_use_memcg(memcg);
+ old_memcg = set_active_memcg(memcg);
head = NULL;
offset = PAGE_SIZE;
@@ -867,7 +867,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
set_bh_page(bh, page, offset);
}
out:
- memalloc_unuse_memcg();
+ set_active_memcg(old_memcg);
mem_cgroup_put(memcg);
return head;
/*
@@ -1958,7 +1958,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
*/
set_buffer_new(bh);
set_buffer_unwritten(bh);
- /* FALLTHRU */
+ fallthrough;
case IOMAP_MAPPED:
if ((iomap->flags & IOMAP_F_NEW) ||
offset >= i_size_read(inode))
@@ -2771,16 +2771,6 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
/* Is the page fully outside i_size? (truncate in progress) */
offset = i_size & (PAGE_SIZE-1);
if (page->index >= end_index+1 || !offset) {
- /*
- * The page may have dirty, unmapped buffers. For example,
- * they may have been added in ext3_writepage(). Make them
- * freeable here, so the page does not leak.
- */
-#if 0
- /* Not really sure about this - do we need this ? */
- if (page->mapping->a_ops->invalidatepage)
- page->mapping->a_ops->invalidatepage(page, offset);
-#endif
unlock_page(page);
return 0; /* don't care */
}
@@ -2975,12 +2965,6 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
/* Is the page fully outside i_size? (truncate in progress) */
offset = i_size & (PAGE_SIZE-1);
if (page->index >= end_index+1 || !offset) {
- /*
- * The page may have dirty, unmapped buffers. For example,
- * they may have been added in ext3_writepage(). Make them
- * freeable here, so the page does not leak.
- */
- do_invalidatepage(page, 0, PAGE_SIZE);
unlock_page(page);
return 0; /* don't care */
}
@@ -3157,6 +3141,15 @@ int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
WARN_ON(atomic_read(&bh->b_count) < 1);
lock_buffer(bh);
if (test_clear_buffer_dirty(bh)) {
+ /*
+ * The bh should be mapped, but it might not be if the
+ * device was hot-removed. Not much we can do but fail the I/O.
+ */
+ if (!buffer_mapped(bh)) {
+ unlock_buffer(bh);
+ return -EIO;
+ }
+
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
ret = submit_bh(REQ_OP_WRITE, op_flags, bh);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6ea761c84494..35c83f65475b 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -182,58 +182,15 @@ static int ceph_releasepage(struct page *page, gfp_t g)
return !PagePrivate(page);
}
-/*
- * Read some contiguous pages. If we cross a stripe boundary, shorten
- * *plen. Return number of bytes read, or error.
- */
-static int ceph_sync_readpages(struct ceph_fs_client *fsc,
- struct ceph_vino vino,
- struct ceph_file_layout *layout,
- u64 off, u64 *plen,
- u32 truncate_seq, u64 truncate_size,
- struct page **pages, int num_pages,
- int page_align)
-{
- struct ceph_osd_client *osdc = &fsc->client->osdc;
- struct ceph_osd_request *req;
- int rc = 0;
-
- dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
- vino.snap, off, *plen);
- req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
- CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
- NULL, truncate_seq, truncate_size,
- false);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- /* it may be a short read due to an object boundary */
- osd_req_op_extent_osd_data_pages(req, 0,
- pages, *plen, page_align, false, false);
-
- dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
- off, *plen, *plen, page_align);
-
- rc = ceph_osdc_start_request(osdc, req, false);
- if (!rc)
- rc = ceph_osdc_wait_request(osdc, req);
-
- ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
- req->r_end_latency, rc);
-
- ceph_osdc_put_request(req);
- dout("readpages result %d\n", rc);
- return rc;
-}
-
-/*
- * read a single page, without unlocking it.
- */
+/* read a single page, without unlocking it. */
static int ceph_do_readpage(struct file *filp, struct page *page)
{
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ struct ceph_osd_request *req;
+ struct ceph_vino vino = ceph_vino(inode);
int err = 0;
u64 off = page_offset(page);
u64 len = PAGE_SIZE;
@@ -260,19 +217,33 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
if (err == 0)
return -EINPROGRESS;
- dout("readpage inode %p file %p page %p index %lu\n",
- inode, filp, page, page->index);
- err = ceph_sync_readpages(fsc, ceph_vino(inode),
- &ci->i_layout, off, &len,
- ci->i_truncate_seq, ci->i_truncate_size,
- &page, 1, 0);
+ dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n",
+ vino.ino, vino.snap, filp, off, len, page, page->index);
+ req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 0, 1,
+ CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ false);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
+
+ err = ceph_osdc_start_request(osdc, req, false);
+ if (!err)
+ err = ceph_osdc_wait_request(osdc, req);
+
+ ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
+ req->r_end_latency, err);
+
+ ceph_osdc_put_request(req);
+ dout("readpage result %d\n", err);
+
if (err == -ENOENT)
err = 0;
if (err < 0) {
- SetPageError(page);
ceph_fscache_readpage_cancel(inode, page);
- if (err == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (err == -EBLOCKLISTED)
+ fsc->blocklisted = true;
goto out;
}
if (err < PAGE_SIZE)
@@ -312,8 +283,8 @@ static void finish_read(struct ceph_osd_request *req)
int i;
dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
- if (rc == -EBLACKLISTED)
- ceph_inode_to_client(inode)->blacklisted = true;
+ if (rc == -EBLOCKLISTED)
+ ceph_inode_to_client(inode)->blocklisted = true;
/* unlock all pages, zeroing any data we didn't read */
osd_data = osd_req_op_extent_osd_data(req, 0);
@@ -620,50 +591,6 @@ static u64 get_writepages_data_length(struct inode *inode,
}
/*
- * do a synchronous write on N pages
- */
-static int ceph_sync_writepages(struct ceph_fs_client *fsc,
- struct ceph_vino vino,
- struct ceph_file_layout *layout,
- struct ceph_snap_context *snapc,
- u64 off, u64 len,
- u32 truncate_seq, u64 truncate_size,
- struct timespec64 *mtime,
- struct page **pages, int num_pages)
-{
- struct ceph_osd_client *osdc = &fsc->client->osdc;
- struct ceph_osd_request *req;
- int rc = 0;
- int page_align = off & ~PAGE_MASK;
-
- req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
- CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
- snapc, truncate_seq, truncate_size,
- true);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- /* it may be a short write due to an object boundary */
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
- false, false);
- dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
-
- req->r_mtime = *mtime;
- rc = ceph_osdc_start_request(osdc, req, true);
- if (!rc)
- rc = ceph_osdc_wait_request(osdc, req);
-
- ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
- req->r_end_latency, rc);
-
- ceph_osdc_put_request(req);
- if (rc == 0)
- rc = len;
- dout("writepages result %d\n", rc);
- return rc;
-}
-
-/*
* Write a single page, but leave the page locked.
*
* If we get a write error, mark the mapping for error, but still adjust the
@@ -671,20 +598,19 @@ static int ceph_sync_writepages(struct ceph_fs_client *fsc,
*/
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
{
- struct inode *inode;
- struct ceph_inode_info *ci;
- struct ceph_fs_client *fsc;
+ struct inode *inode = page->mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc, *oldest;
loff_t page_off = page_offset(page);
- int err, len = PAGE_SIZE;
+ int err;
+ loff_t len = PAGE_SIZE;
struct ceph_writeback_ctl ceph_wbc;
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ struct ceph_osd_request *req;
dout("writepage %p idx %lu\n", page, page->index);
- inode = page->mapping->host;
- ci = ceph_inode(inode);
- fsc = ceph_inode_to_client(inode);
-
/* verify this is a writeable snap context */
snapc = page_snap_context(page);
if (!snapc) {
@@ -713,7 +639,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
if (ceph_wbc.i_size < page_off + len)
len = ceph_wbc.i_size - page_off;
- dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n",
+ dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n",
inode, page, page->index, page_off, len, snapc, snapc->seq);
if (atomic_long_inc_return(&fsc->writeback_count) >
@@ -721,11 +647,33 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
set_page_writeback(page);
- err = ceph_sync_writepages(fsc, ceph_vino(inode),
- &ci->i_layout, snapc, page_off, len,
- ceph_wbc.truncate_seq,
- ceph_wbc.truncate_size,
- &inode->i_mtime, &page, 1);
+ req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1,
+ CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
+ ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
+ true);
+ if (IS_ERR(req)) {
+ redirty_page_for_writepage(wbc, page);
+ end_page_writeback(page);
+ return PTR_ERR(req);
+ }
+
+ /* it may be a short write due to an object boundary */
+ WARN_ON_ONCE(len > PAGE_SIZE);
+ osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
+ dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);
+
+ req->r_mtime = inode->i_mtime;
+ err = ceph_osdc_start_request(osdc, req, true);
+ if (!err)
+ err = ceph_osdc_wait_request(osdc, req);
+
+ ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
+ req->r_end_latency, err);
+
+ ceph_osdc_put_request(req);
+ if (err == 0)
+ err = len;
+
if (err < 0) {
struct writeback_control tmp_wbc;
if (!wbc)
@@ -737,8 +685,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
end_page_writeback(page);
return err;
}
- if (err == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (err == -EBLOCKLISTED)
+ fsc->blocklisted = true;
dout("writepage setting page/mapping error %d %p\n",
err, page);
mapping_set_error(&inode->i_data, err);
@@ -801,8 +749,8 @@ static void writepages_finish(struct ceph_osd_request *req)
if (rc < 0) {
mapping_set_error(mapping, rc);
ceph_set_error_write(ci);
- if (rc == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (rc == -EBLOCKLISTED)
+ fsc->blocklisted = true;
} else {
ceph_clear_error_write(ci);
}
@@ -962,9 +910,8 @@ retry:
max_pages = wsize >> PAGE_SHIFT;
get_more_pages:
- pvec_pages = pagevec_lookup_range_nr_tag(&pvec, mapping, &index,
- end, PAGECACHE_TAG_DIRTY,
- max_pages - locked_pages);
+ pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
+ end, PAGECACHE_TAG_DIRTY);
dout("pagevec_lookup_range_tag got %d\n", pvec_pages);
if (!pvec_pages && !locked_pages)
break;
@@ -1299,110 +1246,60 @@ static int context_is_writeable_or_written(struct inode *inode,
return ret;
}
-/*
- * We are only allowed to write into/dirty the page if the page is
- * clean, or already dirty within the same snap context.
+/**
+ * ceph_find_incompatible - find an incompatible context and return it
+ * @page: page being dirtied
+ *
+ * We are only allowed to write into/dirty a page if the page is
+ * clean, or already dirty within the same snap context. Returns a
+ * conflicting context if there is one, NULL if there isn't, or a
+ * negative error code on other errors.
*
- * called with page locked.
- * return success with page locked,
- * or any failure (incl -EAGAIN) with page unlocked.
+ * Must be called with page lock held.
*/
-static int ceph_update_writeable_page(struct file *file,
- loff_t pos, unsigned len,
- struct page *page)
+static struct ceph_snap_context *
+ceph_find_incompatible(struct page *page)
{
- struct inode *inode = file_inode(file);
+ struct inode *inode = page->mapping->host;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- loff_t page_off = pos & PAGE_MASK;
- int pos_in_page = pos & ~PAGE_MASK;
- int end_in_page = pos_in_page + len;
- loff_t i_size;
- int r;
- struct ceph_snap_context *snapc, *oldest;
if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
dout(" page %p forced umount\n", page);
- unlock_page(page);
- return -EIO;
+ return ERR_PTR(-EIO);
}
-retry_locked:
- /* writepages currently holds page lock, but if we change that later, */
- wait_on_page_writeback(page);
+ for (;;) {
+ struct ceph_snap_context *snapc, *oldest;
+
+ wait_on_page_writeback(page);
+
+ snapc = page_snap_context(page);
+ if (!snapc || snapc == ci->i_head_snapc)
+ break;
- snapc = page_snap_context(page);
- if (snapc && snapc != ci->i_head_snapc) {
/*
* this page is already dirty in another (older) snap
* context! is it writeable now?
*/
oldest = get_oldest_context(inode, NULL, NULL);
if (snapc->seq > oldest->seq) {
+ /* not writeable -- return it for the caller to deal with */
ceph_put_snap_context(oldest);
- dout(" page %p snapc %p not current or oldest\n",
- page, snapc);
- /*
- * queue for writeback, and wait for snapc to
- * be writeable or written
- */
- snapc = ceph_get_snap_context(snapc);
- unlock_page(page);
- ceph_queue_writeback(inode);
- r = wait_event_killable(ci->i_cap_wq,
- context_is_writeable_or_written(inode, snapc));
- ceph_put_snap_context(snapc);
- if (r == -ERESTARTSYS)
- return r;
- return -EAGAIN;
+ dout(" page %p snapc %p not current or oldest\n", page, snapc);
+ return ceph_get_snap_context(snapc);
}
ceph_put_snap_context(oldest);
/* yay, writeable, do it now (without dropping page lock) */
- dout(" page %p snapc %p not current, but oldest\n",
- page, snapc);
- if (!clear_page_dirty_for_io(page))
- goto retry_locked;
- r = writepage_nounlock(page, NULL);
- if (r < 0)
- goto fail_unlock;
- goto retry_locked;
- }
-
- if (PageUptodate(page)) {
- dout(" page %p already uptodate\n", page);
- return 0;
- }
-
- /* full page? */
- if (pos_in_page == 0 && len == PAGE_SIZE)
- return 0;
-
- /* past end of file? */
- i_size = i_size_read(inode);
-
- if (page_off >= i_size ||
- (pos_in_page == 0 && (pos+len) >= i_size &&
- end_in_page - pos_in_page != PAGE_SIZE)) {
- dout(" zeroing %p 0 - %d and %d - %d\n",
- page, pos_in_page, end_in_page, (int)PAGE_SIZE);
- zero_user_segments(page,
- 0, pos_in_page,
- end_in_page, PAGE_SIZE);
- return 0;
- }
-
- /* we need to read it. */
- r = ceph_do_readpage(file, page);
- if (r < 0) {
- if (r == -EINPROGRESS)
- return -EAGAIN;
- goto fail_unlock;
+ dout(" page %p snapc %p not current, but oldest\n", page, snapc);
+ if (clear_page_dirty_for_io(page)) {
+ int r = writepage_nounlock(page, NULL);
+ if (r < 0)
+ return ERR_PTR(r);
+ }
}
- goto retry_locked;
-fail_unlock:
- unlock_page(page);
- return r;
+ return NULL;
}
/*
@@ -1414,26 +1311,78 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = file_inode(file);
- struct page *page;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_snap_context *snapc;
+ struct page *page = NULL;
pgoff_t index = pos >> PAGE_SHIFT;
- int r;
+ int pos_in_page = pos & ~PAGE_MASK;
+ int r = 0;
- do {
- /* get a page */
- page = grab_cache_page_write_begin(mapping, index, 0);
- if (!page)
- return -ENOMEM;
+ dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len);
- dout("write_begin file %p inode %p page %p %d~%d\n", file,
- inode, page, (int)pos, (int)len);
+ for (;;) {
+ page = grab_cache_page_write_begin(mapping, index, 0);
+ if (!page) {
+ r = -ENOMEM;
+ break;
+ }
- r = ceph_update_writeable_page(file, pos, len, page);
- if (r < 0)
+ snapc = ceph_find_incompatible(page);
+ if (snapc) {
+ if (IS_ERR(snapc)) {
+ r = PTR_ERR(snapc);
+ break;
+ }
+ unlock_page(page);
put_page(page);
- else
- *pagep = page;
- } while (r == -EAGAIN);
+ page = NULL;
+ ceph_queue_writeback(inode);
+ r = wait_event_killable(ci->i_cap_wq,
+ context_is_writeable_or_written(inode, snapc));
+ ceph_put_snap_context(snapc);
+ if (r != 0)
+ break;
+ continue;
+ }
+
+ if (PageUptodate(page)) {
+ dout(" page %p already uptodate\n", page);
+ break;
+ }
+
+ /*
+ * In some cases we don't need to read at all:
+ * - full page write
+ * - write that lies completely beyond EOF
+ * - write that covers the the page from start to EOF or beyond it
+ */
+ if ((pos_in_page == 0 && len == PAGE_SIZE) ||
+ (pos >= i_size_read(inode)) ||
+ (pos_in_page == 0 && (pos + len) >= i_size_read(inode))) {
+ zero_user_segments(page, 0, pos_in_page,
+ pos_in_page + len, PAGE_SIZE);
+ break;
+ }
+ /*
+ * We need to read it. If we get back -EINPROGRESS, then the page was
+ * handed off to fscache and it will be unlocked when the read completes.
+ * Refind the page in that case so we can reacquire the page lock. Otherwise
+ * we got a hard error or the read was completed synchronously.
+ */
+ r = ceph_do_readpage(file, page);
+ if (r != -EINPROGRESS)
+ break;
+ }
+
+ if (r < 0) {
+ if (page) {
+ unlock_page(page);
+ put_page(page);
+ }
+ } else {
+ *pagep = page;
+ }
return r;
}
@@ -1522,7 +1471,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
struct page *pinned_page = NULL;
- loff_t off = vmf->pgoff << PAGE_SHIFT;
+ loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
int want, got, err;
sigset_t oldset;
vm_fault_t ret = VM_FAULT_SIGBUS;
@@ -1668,6 +1617,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
inode_inc_iversion_raw(inode);
do {
+ struct ceph_snap_context *snapc;
+
lock_page(page);
if (page_mkwrite_check_truncate(page, inode) < 0) {
@@ -1676,13 +1627,26 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
break;
}
- err = ceph_update_writeable_page(vma->vm_file, off, len, page);
- if (err >= 0) {
+ snapc = ceph_find_incompatible(page);
+ if (!snapc) {
/* success. we'll keep the page locked. */
set_page_dirty(page);
ret = VM_FAULT_LOCKED;
+ break;
}
- } while (err == -EAGAIN);
+
+ unlock_page(page);
+
+ if (IS_ERR(snapc)) {
+ ret = VM_FAULT_SIGBUS;
+ break;
+ }
+
+ ceph_queue_writeback(inode);
+ err = wait_event_killable(ci->i_cap_wq,
+ context_is_writeable_or_written(inode, snapc));
+ ceph_put_snap_context(snapc);
+ } while (err == 0);
if (ret == VM_FAULT_LOCKED ||
ci->i_inline_version != CEPH_INLINE_NONE) {
@@ -2039,16 +2003,16 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
if (err >= 0 || err == -ENOENT)
have |= POOL_READ;
else if (err != -EPERM) {
- if (err == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (err == -EBLOCKLISTED)
+ fsc->blocklisted = true;
goto out_unlock;
}
if (err2 == 0 || err2 == -EEXIST)
have |= POOL_WRITE;
else if (err2 != -EPERM) {
- if (err2 == -EBLACKLISTED)
- fsc->blacklisted = true;
+ if (err2 == -EBLOCKLISTED)
+ fsc->blocklisted = true;
err = err2;
goto out_unlock;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 55ccccf77cea..5027bbdca419 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -887,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
int have = ci->i_snap_caps;
if ((have & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
- " (mask %s)\n", ci->vfs_inode.i_ino,
+ dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
+ " (mask %s)\n", ceph_ino(&ci->vfs_inode),
ceph_cap_string(have),
ceph_cap_string(mask));
return 1;
@@ -899,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
if (!__cap_is_valid(cap))
continue;
if ((cap->issued & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
- " (mask %s)\n", ci->vfs_inode.i_ino, cap,
+ dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
+ " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap,
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch)
@@ -911,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
/* does a combination of caps satisfy mask? */
have |= cap->issued;
if ((have & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
- " (mask %s)\n", ci->vfs_inode.i_ino,
+ dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
+ " (mask %s)\n", ceph_ino(&ci->vfs_inode),
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch) {
@@ -1222,36 +1222,27 @@ struct cap_msg_args {
};
/*
- * Build and send a cap message to the given MDS.
- *
- * Caller should be holding s_mutex.
+ * cap struct size + flock buffer size + inline version + inline data size +
+ * osd_epoch_barrier + oldest_flush_tid
*/
-static int send_cap_msg(struct cap_msg_args *arg)
+#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \
+ 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4)
+
+/* Marshal up the cap msg to the MDS */
+static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
{
struct ceph_mds_caps *fc;
- struct ceph_msg *msg;
void *p;
- size_t extra_len;
struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
- dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
- " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
- " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op),
- arg->cid, arg->ino, ceph_cap_string(arg->caps),
- ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty),
- arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid,
- arg->mseq, arg->follows, arg->size, arg->max_size,
- arg->xattr_version,
+ dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n",
+ __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino,
+ ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
+ ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
+ arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
+ arg->size, arg->max_size, arg->xattr_version,
arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
- /* flock buffer size + inline version + inline data size +
- * osd_epoch_barrier + oldest_flush_tid */
- extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
- GFP_NOFS, false);
- if (!msg)
- return -ENOMEM;
-
msg->hdr.version = cpu_to_le16(10);
msg->hdr.tid = cpu_to_le64(arg->flush_tid);
@@ -1323,9 +1314,6 @@ static int send_cap_msg(struct cap_msg_args *arg)
/* Advisory flags (version 10) */
ceph_encode_32(&p, arg->flags);
-
- ceph_con_send(&arg->session->s_con, msg);
- return 0;
}
/*
@@ -1454,25 +1442,25 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
*
* Caller should hold snap_rwsem (read), s_mutex.
*/
-static void __send_cap(struct ceph_mds_client *mdsc, struct cap_msg_args *arg,
- struct ceph_inode_info *ci)
+static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
{
+ struct ceph_msg *msg;
struct inode *inode = &ci->vfs_inode;
- int ret;
- ret = send_cap_msg(arg);
- if (ret < 0) {
- pr_err("error sending cap msg, ino (%llx.%llx) "
- "flushing %s tid %llu, requeue\n",
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
+ if (!msg) {
+ pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n",
ceph_vinop(inode), ceph_cap_string(arg->dirty),
arg->flush_tid);
spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
+ __cap_delay_requeue(arg->session->s_mdsc, ci);
spin_unlock(&ci->i_ceph_lock);
+ return;
}
+ encode_cap_msg(msg, arg);
+ ceph_con_send(&arg->session->s_con, msg);
ceph_buffer_put(arg->old_xattr_buf);
-
if (arg->wake)
wake_up_all(&ci->i_cap_wq);
}
@@ -1483,6 +1471,11 @@ static inline int __send_flush_snap(struct inode *inode,
u32 mseq, u64 oldest_flush_tid)
{
struct cap_msg_args arg;
+ struct ceph_msg *msg;
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
+ if (!msg)
+ return -ENOMEM;
arg.session = session;
arg.ino = ceph_vino(inode).ino;
@@ -1521,7 +1514,9 @@ static inline int __send_flush_snap(struct inode *inode,
arg.flags = 0;
arg.wake = false;
- return send_cap_msg(&arg);
+ encode_cap_msg(msg, &arg);
+ ceph_con_send(&arg.session->s_con, msg);
+ return 0;
}
/*
@@ -1906,9 +1901,8 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
- struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = &ci->vfs_inode;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_cap *cap;
u64 flush_tid, oldest_flush_tid;
int file_wanted, used, cap_used;
@@ -1928,12 +1922,24 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
retry:
spin_lock(&ci->i_ceph_lock);
retry_locked:
+ /* Caps wanted by virtue of active open files. */
file_wanted = __ceph_caps_file_wanted(ci);
+
+ /* Caps which have active references against them */
used = __ceph_caps_used(ci);
+
+ /*
+ * "issued" represents the current caps that the MDS wants us to have.
+ * "implemented" is the set that we have been granted, and includes the
+ * ones that have not yet been returned to the MDS (the "revoking" set,
+ * usually because they have outstanding references).
+ */
issued = __ceph_caps_issued(ci, &implemented);
revoking = implemented & ~issued;
want = file_wanted;
+
+ /* The ones we currently want to retain (may be adjusted below) */
retain = file_wanted | used | CEPH_CAP_PIN;
if (!mdsc->stopping && inode->i_nlink > 0) {
if (file_wanted) {
@@ -2011,6 +2017,10 @@ retry_locked:
/* NOTE: no side-effects allowed, until we take s_mutex */
+ /*
+ * If we have an auth cap, we don't need to consider any
+ * overlapping caps as used.
+ */
cap_used = used;
if (ci->i_auth_cap && cap != ci->i_auth_cap)
cap_used &= ~ci->i_auth_cap->issued;
@@ -2148,7 +2158,7 @@ ack:
want, retain, flushing, flush_tid, oldest_flush_tid);
spin_unlock(&ci->i_ceph_lock);
- __send_cap(mdsc, &arg, ci);
+ __send_cap(&arg, ci);
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
@@ -2222,7 +2232,7 @@ retry_locked:
flushing, flush_tid, oldest_flush_tid);
spin_unlock(&ci->i_ceph_lock);
- __send_cap(mdsc, &arg, ci);
+ __send_cap(&arg, ci);
} else {
if (!list_empty(&ci->i_cap_flush_list)) {
struct ceph_cap_flush *cf =
@@ -2436,7 +2446,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
(cap->issued | cap->implemented),
cf->caps, cf->tid, oldest_flush_tid);
spin_unlock(&ci->i_ceph_lock);
- __send_cap(mdsc, &arg, ci);
+ __send_cap(&arg, ci);
} else {
struct ceph_cap_snap *capsnap =
container_of(cf, struct ceph_cap_snap,
@@ -2872,7 +2882,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
struct cap_wait cw;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
- cw.ino = inode->i_ino;
+ cw.ino = ceph_ino(inode);
cw.tgid = current->tgid;
cw.need = need;
cw.want = want;
@@ -4284,13 +4294,30 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci,
void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
- int i;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
int bits = (fmode << 1) | 1;
+ bool is_opened = false;
+ int i;
+
+ if (count == 1)
+ atomic64_inc(&mdsc->metric.opened_files);
+
spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
if (bits & (1 << i))
ci->i_nr_by_mode[i] += count;
+
+ /*
+ * If any of the mode ref is larger than 1,
+ * that means it has been already opened by
+ * others. Just skip checking the PIN ref.
+ */
+ if (i && ci->i_nr_by_mode[i] > 1)
+ is_opened = true;
}
+
+ if (!is_opened)
+ percpu_counter_inc(&mdsc->metric.opened_inodes);
spin_unlock(&ci->i_ceph_lock);
}
@@ -4301,15 +4328,32 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
*/
void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
- int i;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
int bits = (fmode << 1) | 1;
+ bool is_closed = true;
+ int i;
+
+ if (count == 1)
+ atomic64_dec(&mdsc->metric.opened_files);
+
spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
if (bits & (1 << i)) {
BUG_ON(ci->i_nr_by_mode[i] < count);