Diffstat (limited to 'fs')
-rw-r--r--fs/9p/fid.c61
-rw-r--r--fs/9p/fid.h6
-rw-r--r--fs/9p/vfs_addr.c4
-rw-r--r--fs/9p/vfs_dentry.c4
-rw-r--r--fs/9p/vfs_dir.c2
-rw-r--r--fs/9p/vfs_file.c9
-rw-r--r--fs/9p/vfs_inode.c89
-rw-r--r--fs/9p/vfs_inode_dotl.c82
-rw-r--r--fs/9p/vfs_super.c8
-rw-r--r--fs/9p/xattr.c8
-rw-r--r--fs/Kconfig13
-rw-r--r--fs/Kconfig.binfmt35
-rw-r--r--fs/Makefile3
-rw-r--r--fs/affs/file.c6
-rw-r--r--fs/affs/super.c2
-rw-r--r--fs/afs/cell.c61
-rw-r--r--fs/afs/cmservice.c4
-rw-r--r--fs/afs/dir.c23
-rw-r--r--fs/afs/file.c2
-rw-r--r--fs/afs/flock.c2
-rw-r--r--fs/afs/fsclient.c2
-rw-r--r--fs/afs/inode.c2
-rw-r--r--fs/afs/internal.h19
-rw-r--r--fs/afs/misc.c1
-rw-r--r--fs/afs/mntpt.c6
-rw-r--r--fs/afs/proc.c6
-rw-r--r--fs/afs/rxrpc.c38
-rw-r--r--fs/afs/server.c46
-rw-r--r--fs/afs/vl_list.c19
-rw-r--r--fs/afs/volume.c21
-rw-r--r--fs/afs/write.c2
-rw-r--r--fs/afs/yfsclient.c3
-rw-r--r--fs/aio.c47
-rw-r--r--fs/anon_inodes.c2
-rw-r--r--fs/attr.c76
-rw-r--r--fs/autofs/autofs_i.h7
-rw-r--r--fs/autofs/expire.c2
-rw-r--r--fs/autofs/inode.c1
-rw-r--r--fs/autofs/root.c108
-rw-r--r--fs/bad_inode.c2
-rw-r--r--fs/befs/linuxvfs.c16
-rw-r--r--fs/binfmt_aout.c342
-rw-r--r--fs/binfmt_elf.c3
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/async-thread.h1
-rw-r--r--fs/btrfs/backref.c381
-rw-r--r--fs/btrfs/backref.h24
-rw-r--r--fs/btrfs/block-group.c253
-rw-r--r--fs/btrfs/block-group.h42
-rw-r--r--fs/btrfs/block-rsv.c24
-rw-r--r--fs/btrfs/block-rsv.h24
-rw-r--r--fs/btrfs/btrfs_inode.h46
-rw-r--r--fs/btrfs/check-integrity.c4
-rw-r--r--fs/btrfs/compression.c436
-rw-r--r--fs/btrfs/compression.h20
-rw-r--r--fs/btrfs/ctree.c62
-rw-r--r--fs/btrfs/ctree.h468
-rw-r--r--fs/btrfs/delalloc-space.c19
-rw-r--r--fs/btrfs/delalloc-space.h3
-rw-r--r--fs/btrfs/delayed-inode.c733
-rw-r--r--fs/btrfs/delayed-inode.h45
-rw-r--r--fs/btrfs/delayed-ref.c4
-rw-r--r--fs/btrfs/dev-replace.c24
-rw-r--r--fs/btrfs/dev-replace.h4
-rw-r--r--fs/btrfs/disk-io.c869
-rw-r--r--fs/btrfs/disk-io.h36
-rw-r--r--fs/btrfs/export.c2
-rw-r--r--fs/btrfs/export.h2
-rw-r--r--fs/btrfs/extent-io-tree.c1674
-rw-r--r--fs/btrfs/extent-io-tree.h126
-rw-r--r--fs/btrfs/extent-tree.c273
-rw-r--r--fs/btrfs/extent_io.c3576
-rw-r--r--fs/btrfs/extent_io.h32
-rw-r--r--fs/btrfs/extent_map.c347
-rw-r--r--fs/btrfs/extent_map.h8
-rw-r--r--fs/btrfs/file-item.c38
-rw-r--r--fs/btrfs/file.c970
-rw-r--r--fs/btrfs/free-space-cache.c118
-rw-r--r--fs/btrfs/free-space-cache.h1
-rw-r--r--fs/btrfs/free-space-tree.c8
-rw-r--r--fs/btrfs/inode.c1342
-rw-r--r--fs/btrfs/ioctl.c174
-rw-r--r--fs/btrfs/locking.c119
-rw-r--r--fs/btrfs/locking.h15
-rw-r--r--fs/btrfs/lzo.c28
-rw-r--r--fs/btrfs/misc.h35
-rw-r--r--fs/btrfs/ordered-data.c90
-rw-r--r--fs/btrfs/ordered-data.h18
-rw-r--r--fs/btrfs/props.c5
-rw-r--r--fs/btrfs/qgroup.c96
-rw-r--r--fs/btrfs/qgroup.h3
-rw-r--r--fs/btrfs/raid56.c837
-rw-r--r--fs/btrfs/raid56.h166
-rw-r--r--fs/btrfs/reflink.c45
-rw-r--r--fs/btrfs/relocation.c49
-rw-r--r--fs/btrfs/root-tree.c19
-rw-r--r--fs/btrfs/scrub.c728
-rw-r--r--fs/btrfs/send.c1227
-rw-r--r--fs/btrfs/send.h185
-rw-r--r--fs/btrfs/space-info.c200
-rw-r--r--fs/btrfs/space-info.h15
-rw-r--r--fs/btrfs/struct-funcs.c11
-rw-r--r--fs/btrfs/subpage.c6
-rw-r--r--fs/btrfs/super.c166
-rw-r--r--fs/btrfs/sysfs.c358
-rw-r--r--fs/btrfs/tests/btrfs-tests.c29
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c3
-rw-r--r--fs/btrfs/tests/extent-io-tests.c39
-rw-r--r--fs/btrfs/tests/free-space-tests.c22
-rw-r--r--fs/btrfs/tests/inode-tests.c10
-rw-r--r--fs/btrfs/tests/qgroup-tests.c36
-rw-r--r--fs/btrfs/transaction.c300
-rw-r--r--fs/btrfs/tree-checker.c25
-rw-r--r--fs/btrfs/tree-log.c1576
-rw-r--r--fs/btrfs/tree-log.h11
-rw-r--r--fs/btrfs/verity.c3
-rw-r--r--fs/btrfs/volumes.c638
-rw-r--r--fs/btrfs/volumes.h100
-rw-r--r--fs/btrfs/xattr.c3
-rw-r--r--fs/btrfs/zlib.c42
-rw-r--r--fs/btrfs/zoned.c483
-rw-r--r--fs/btrfs/zoned.h34
-rw-r--r--fs/btrfs/zstd.c33
-rw-r--r--fs/buffer.c581
-rw-r--r--fs/cachefiles/internal.h1
-rw-r--r--fs/cachefiles/namei.c122
-rw-r--r--fs/cachefiles/ondemand.c25
-rw-r--r--fs/ceph/addr.c72
-rw-r--r--fs/ceph/caps.c53
-rw-r--r--fs/ceph/dir.c79
-rw-r--r--fs/ceph/export.c3
-rw-r--r--fs/ceph/file.c132
-rw-r--r--fs/ceph/inode.c44
-rw-r--r--fs/ceph/mds_client.c176
-rw-r--r--fs/ceph/mds_client.h17
-rw-r--r--fs/ceph/mdsmap.c24
-rw-r--r--fs/ceph/super.c19
-rw-r--r--fs/ceph/super.h31
-rw-r--r--fs/ceph/xattr.c12
-rw-r--r--fs/cifs/Makefile6
-rw-r--r--fs/cifs/cached_dir.c558
-rw-r--r--fs/cifs/cached_dir.h80
-rw-r--r--fs/cifs/cifs_debug.c90
-rw-r--r--fs/cifs/cifs_debug.h6
-rw-r--r--fs/cifs/cifs_ioctl.h8
-rw-r--r--fs/cifs/cifs_swn.c12
-rw-r--r--fs/cifs/cifsacl.c2
-rw-r--r--fs/cifs/cifsencrypt.c109
-rw-r--r--fs/cifs/cifsfs.c118
-rw-r--r--fs/cifs/cifsfs.h4
-rw-r--r--fs/cifs/cifsglob.h320
-rw-r--r--fs/cifs/cifspdu.h7
-rw-r--r--fs/cifs/cifsproto.h38
-rw-r--r--fs/cifs/cifsroot.c2
-rw-r--r--fs/cifs/cifssmb.c481
-rw-r--r--fs/cifs/connect.c419
-rw-r--r--fs/cifs/dfs_cache.c10
-rw-r--r--fs/cifs/dir.c54
-rw-r--r--fs/cifs/file.c391
-rw-r--r--fs/cifs/fs_context.c21
-rw-r--r--fs/cifs/fs_context.h8
-rw-r--r--fs/cifs/fscache.c2
-rw-r--r--fs/cifs/fscache.h16
-rw-r--r--fs/cifs/inode.c244
-rw-r--r--fs/cifs/ioctl.c27
-rw-r--r--fs/cifs/link.c128
-rw-r--r--fs/cifs/misc.c134
-rw-r--r--fs/cifs/netlink.c1
-rw-r--r--fs/cifs/netmisc.c4
-rw-r--r--fs/cifs/readdir.c42
-rw-r--r--fs/cifs/sess.c231
-rw-r--r--fs/cifs/smb1ops.c66
-rw-r--r--fs/cifs/smb2file.c128
-rw-r--r--fs/cifs/smb2inode.c179
-rw-r--r--fs/cifs/smb2misc.c152
-rw-r--r--fs/cifs/smb2ops.c923
-rw-r--r--fs/cifs/smb2pdu.c222
-rw-r--r--fs/cifs/smb2pdu.h3
-rw-r--r--fs/cifs/smb2proto.h41
-rw-r--r--fs/cifs/smb2transport.c155
-rw-r--r--fs/cifs/smbdirect.c227
-rw-r--r--fs/cifs/smbdirect.h14
-rw-r--r--fs/cifs/trace.h3
-rw-r--r--fs/cifs/transport.c352
-rw-r--r--fs/cifs/xattr.c5
-rw-r--r--fs/coda/symlink.c11
-rw-r--r--fs/coredump.c86
-rw-r--r--fs/cramfs/inode.c17
-rw-r--r--fs/crypto/bio.c16
-rw-r--r--fs/crypto/fname.c36
-rw-r--r--fs/crypto/fscrypt_private.h93
-rw-r--r--fs/crypto/hooks.c16
-rw-r--r--fs/crypto/inline_crypt.c196
-rw-r--r--fs/crypto/keyring.c500
-rw-r--r--fs/crypto/keysetup.c96
-rw-r--r--fs/crypto/keysetup_v1.c4
-rw-r--r--fs/crypto/policy.c70
-rw-r--r--fs/d_path.c5
-rw-r--r--fs/dax.c408
-rw-r--r--fs/dcache.c136
-rw-r--r--fs/debugfs/file.c16
-rw-r--r--fs/debugfs/inode.c59
-rw-r--r--fs/direct-io.c49
-rw-r--r--fs/dlm/Kconfig9
-rw-r--r--fs/dlm/Makefile2
-rw-r--r--fs/dlm/ast.c19
-rw-r--r--fs/dlm/ast.h1
-rw-r--r--fs/dlm/config.c21
-rw-r--r--fs/dlm/config.h3
-rw-r--r--fs/dlm/dlm_internal.h34
-rw-r--r--fs/dlm/lock.c308
-rw-r--r--fs/dlm/lock.h19
-rw-r--r--fs/dlm/lockspace.c63
-rw-r--r--fs/dlm/lockspace.h13
-rw-r--r--fs/dlm/lowcomms.c8
-rw-r--r--fs/dlm/member.c30
-rw-r--r--fs/dlm/netlink.c1
-rw-r--r--fs/dlm/plock.c51
-rw-r--r--fs/dlm/recoverd.c35
-rw-r--r--fs/dlm/user.c38
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/ecryptfs/file.c40
-rw-r--r--fs/ecryptfs/inode.c2
-rw-r--r--fs/ecryptfs/main.c2
-rw-r--r--fs/efivarfs/Makefile2
-rw-r--r--fs/efivarfs/internal.h40
-rw-r--r--fs/efivarfs/super.c15
-rw-r--r--fs/efivarfs/vars.c722
-rw-r--r--fs/erofs/compress.h2
-rw-r--r--fs/erofs/data.c39
-rw-r--r--fs/erofs/decompressor.c65
-rw-r--r--fs/erofs/decompressor_lzma.c4
-rw-r--r--fs/erofs/dir.c20
-rw-r--r--fs/erofs/erofs_fs.h40
-rw-r--r--fs/erofs/fscache.c480
-rw-r--r--fs/erofs/inode.c26
-rw-r--r--fs/erofs/internal.h86
-rw-r--r--fs/erofs/namei.c13
-rw-r--r--fs/erofs/super.c106
-rw-r--r--fs/erofs/sysfs.c23
-rw-r--r--fs/erofs/utils.c2
-rw-r--r--fs/erofs/xattr.h2
-rw-r--r--fs/erofs/zdata.c860
-rw-r--r--fs/erofs/zdata.h119
-rw-r--r--fs/erofs/zmap.c138
-rw-r--r--fs/erofs/zpvec.h159
-rw-r--r--fs/eventfd.c10
-rw-r--r--fs/eventpoll.c24
-rw-r--r--fs/exec.c53
-rw-r--r--fs/exfat/dir.c6
-rw-r--r--fs/exfat/exfat_fs.h19
-rw-r--r--fs/exfat/fatent.c5
-rw-r--r--fs/exfat/file.c82
-rw-r--r--fs/exfat/inode.c43
-rw-r--r--fs/exfat/misc.c17
-rw-r--r--fs/exfat/namei.c26
-rw-r--r--fs/exfat/nls.c4
-rw-r--r--fs/exfat/super.c4
-rw-r--r--fs/exportfs/expfs.c7
-rw-r--r--fs/ext2/balloc.c7
-rw-r--r--fs/ext2/dir.c20
-rw-r--r--fs/ext2/ext2.h1
-rw-r--r--fs/ext2/ialloc.c3
-rw-r--r--fs/ext2/inode.c61
-rw-r--r--fs/ext2/namei.c16
-rw-r--r--fs/ext2/super.c53
-rw-r--r--fs/ext2/xattr.c170
-rw-r--r--fs/ext4/balloc.c2
-rw-r--r--fs/ext4/ext4.h40
-rw-r--r--fs/ext4/ext4_jbd2.c3
-rw-r--r--fs/ext4/extents.c111
-rw-r--r--fs/ext4/extents_status.c6
-rw-r--r--fs/ext4/fast_commit.c261
-rw-r--r--fs/ext4/fast_commit.h3
-rw-r--r--fs/ext4/file.c43
-rw-r--r--fs/ext4/ialloc.c9
-rw-r--r--fs/ext4/indirect.c4
-rw-r--r--fs/ext4/inline.c33
-rw-r--r--fs/ext4/inode.c138
-rw-r--r--fs/ext4/ioctl.c119
-rw-r--r--fs/ext4/mballoc.c348
-rw-r--r--fs/ext4/mballoc.h1
-rw-r--r--fs/ext4/migrate.c10
-rw-r--r--fs/ext4/mmp.c13
-rw-r--r--fs/ext4/move_extent.c26
-rw-r--r--fs/ext4/namei.c61
-rw-r--r--fs/ext4/orphan.c24
-rw-r--r--fs/ext4/readpage.c10
-rw-r--r--fs/ext4/resize.c46
-rw-r--r--fs/ext4/super.c1286
-rw-r--r--fs/ext4/symlink.c15
-rw-r--r--fs/ext4/verity.c9
-rw-r--r--fs/ext4/xattr.c169
-rw-r--r--fs/ext4/xattr.h16
-rw-r--r--fs/f2fs/acl.c2
-rw-r--r--fs/f2fs/checkpoint.c69
-rw-r--r--fs/f2fs/compress.c284
-rw-r--r--fs/f2fs/data.c200
-rw-r--r--fs/f2fs/debug.c11
-rw-r--r--fs/f2fs/dir.c1
-rw-r--r--fs/f2fs/extent_cache.c9
-rw-r--r--fs/f2fs/f2fs.h198
-rw-r--r--fs/f2fs/file.c199
-rw-r--r--fs/f2fs/gc.c53
-rw-r--r--fs/f2fs/gc.h21
-rw-r--r--fs/f2fs/inline.c17
-rw-r--r--fs/f2fs/inode.c54
-rw-r--r--fs/f2fs/iostat.c105
-rw-r--r--fs/f2fs/iostat.h4
-rw-r--r--fs/f2fs/namei.c32
-rw-r--r--fs/f2fs/node.c33
-rw-r--r--fs/f2fs/recovery.c39
-rw-r--r--fs/f2fs/segment.c118
-rw-r--r--fs/f2fs/segment.h13
-rw-r--r--fs/f2fs/super.c212
-rw-r--r--fs/f2fs/sysfs.c65
-rw-r--r--fs/f2fs/verity.c15
-rw-r--r--fs/f2fs/xattr.c8
-rw-r--r--fs/fat/dir.c8
-rw-r--r--fs/fat/file.c14
-rw-r--r--fs/fat/inode.c2
-rw-r--r--fs/fat/namei_vfat.c231
-rw-r--r--fs/fcntl.c1
-rw-r--r--fs/fhandle.c2
-rw-r--r--fs/file.c1
-rw-r--r--fs/file_table.c26
-rw-r--r--fs/freevxfs/vxfs_immed.c43
-rw-r--r--fs/freevxfs/vxfs_subr.c6
-rw-r--r--fs/fs-writeback.c49
-rw-r--r--fs/fscache/cookie.c31
-rw-r--r--fs/fscache/volume.c4
-rw-r--r--fs/fuse/control.c4
-rw-r--r--fs/fuse/dax.c2
-rw-r--r--fs/fuse/dev.c10
-rw-r--r--fs/fuse/dir.c40
-rw-r--r--fs/fuse/file.c50
-rw-r--r--fs/fuse/fuse_i.h3
-rw-r--r--fs/fuse/inode.c16
-rw-r--r--fs/fuse/ioctl.c15
-rw-r--r--fs/fuse/readdir.c10
-rw-r--r--fs/fuse/virtio_fs.c9
-rw-r--r--fs/gfs2/aops.c28
-rw-r--r--fs/gfs2/bmap.c5
-rw-r--r--fs/gfs2/dir.c7
-rw-r--r--fs/gfs2/export.c6
-rw-r--r--fs/gfs2/file.c34
-rw-r--r--fs/gfs2/glock.c459
-rw-r--r--fs/gfs2/glock.h4
-rw-r--r--fs/gfs2/glops.c31
-rw-r--r--fs/gfs2/incore.h6
-rw-r--r--fs/gfs2/inode.c13
-rw-r--r--fs/gfs2/lock_dlm.c4
-rw-r--r--fs/gfs2/log.c9
-rw-r--r--fs/gfs2/log.h2
-rw-r--r--fs/gfs2/lops.c32
-rw-r--r--fs/gfs2/lops.h2
-rw-r--r--fs/gfs2/main.c27
-rw-r--r--fs/gfs2/meta_io.c19
-rw-r--r--fs/gfs2/ops_fstype.c33
-rw-r--r--fs/gfs2/quota.c36
-rw-r--r--fs/gfs2/rgrp.c12
-rw-r--r--fs/gfs2/rgrp.h5
-rw-r--r--fs/gfs2/super.c5
-rw-r--r--fs/gfs2/util.c12
-rw-r--r--fs/gfs2/xattr.c2
-rw-r--r--fs/hfs/bnode.c36
-rw-r--r--fs/hfs/btree.c29
-rw-r--r--fs/hfsplus/bitmap.c20
-rw-r--r--fs/hfsplus/bnode.c109
-rw-r--r--fs/hfsplus/btree.c27
-rw-r--r--fs/hfsplus/hfsplus_fs.h2
-rw-r--r--fs/hfsplus/part_tbl.c5
-rw-r--r--fs/hfsplus/super.c4
-rw-r--r--fs/hfsplus/wrapper.c12
-rw-r--r--fs/hostfs/hostfs_kern.c8
-rw-r--r--fs/hugetlbfs/inode.c516
-rw-r--r--fs/inode.c221
-rw-r--r--fs/internal.h15
-rw-r--r--fs/io-wq.c1424
-rw-r--r--fs/io-wq.h228
-rw-r--r--fs/io_uring.c13253
-rw-r--r--fs/iomap/buffered-io.c128
-rw-r--r--fs/iomap/direct-io.c33
-rw-r--r--fs/iomap/trace.h1
-rw-r--r--fs/isofs/compress.c22
-rw-r--r--fs/isofs/inode.c9
-rw-r--r--fs/jbd2/checkpoint.c6
-rw-r--r--fs/jbd2/commit.c46
-rw-r--r--fs/jbd2/journal.c99
-rw-r--r--fs/jbd2/recovery.c47
-rw-r--r--fs/jbd2/revoke.c8
-rw-r--r--fs/jbd2/transaction.c46
-rw-r--r--fs/jffs2/wbuf.c6
-rw-r--r--fs/jfs/file.c4
-rw-r--r--fs/jfs/inode.c18
-rw-r--r--fs/jfs/jfs_metapage.c2
-rw-r--r--fs/kernel_read_file.c38
-rw-r--r--fs/kernfs/dir.c114
-rw-r--r--fs/kernfs/file.c236
-rw-r--r--fs/kernfs/kernfs-internal.h5
-rw-r--r--fs/kernfs/mount.c19
-rw-r--r--fs/ksmbd/auth.c71
-rw-r--r--fs/ksmbd/auth.h14
-rw-r--r--fs/ksmbd/connection.c17
-rw-r--r--fs/ksmbd/connection.h12
-rw-r--r--fs/ksmbd/ksmbd_netlink.h5
-rw-r--r--fs/ksmbd/mgmt/share_config.c56
-rw-r--r--fs/ksmbd/mgmt/share_config.h7
-rw-r--r--fs/ksmbd/mgmt/tree_connect.c23
-rw-r--r--fs/ksmbd/mgmt/tree_connect.h4
-rw-r--r--fs/ksmbd/mgmt/user_session.c95
-rw-r--r--fs/ksmbd/mgmt/user_session.h13
-rw-r--r--fs/ksmbd/misc.c46
-rw-r--r--fs/ksmbd/misc.h5
-rw-r--r--fs/ksmbd/ndr.c8
-rw-r--r--fs/ksmbd/oplock.c73
-rw-r--r--fs/ksmbd/server.c12
-rw-r--r--fs/ksmbd/smb2misc.c12
-rw-r--r--fs/ksmbd/smb2pdu.c337
-rw-r--r--fs/ksmbd/smb2pdu.h7
-rw-r--r--fs/ksmbd/smb_common.c6
-rw-r--r--fs/ksmbd/smb_common.h2
-rw-r--r--fs/ksmbd/smbacl.c142
-rw-r--r--fs/ksmbd/smbacl.h20
-rw-r--r--fs/ksmbd/transport_ipc.c1
-rw-r--r--fs/ksmbd/transport_rdma.c18
-rw-r--r--fs/ksmbd/transport_tcp.c5
-rw-r--r--fs/ksmbd/unicode.h3
-rw-r--r--fs/ksmbd/vfs.c72
-rw-r--r--fs/ksmbd/vfs.h6
-rw-r--r--fs/ksmbd/vfs_cache.c2
-rw-r--r--fs/libfs.c46
-rw-r--r--fs/lockd/host.c2
-rw-r--r--fs/lockd/svc4proc.c36
-rw-r--r--fs/lockd/svclock.c10
-rw-r--r--fs/lockd/svcproc.c29
-rw-r--r--fs/lockd/svcsubs.c14
-rw-r--r--fs/lockd/xdr4.c19
-rw-r--r--fs/locks.c78
-rw-r--r--fs/mbcache.c128
-rw-r--r--fs/minix/namei.c6
-rw-r--r--fs/mount.h1
-rw-r--r--fs/mpage.c131
-rw-r--r--fs/namei.c365
-rw-r--r--fs/namespace.c9
-rw-r--r--fs/netfs/buffered_read.c17
-rw-r--r--fs/nfs/blocklayout/blocklayout.c13
-rw-r--r--fs/nfs/blocklayout/dev.c42
-rw-r--r--fs/nfs/callback_xdr.c1
-rw-r--r--fs/nfs/client.c17
-rw-r--r--fs/nfs/delegation.c36
-rw-r--r--fs/nfs/dir.c84
-rw-r--r--fs/nfs/direct.c58
-rw-r--r--fs/nfs/dns_resolve.c7
-rw-r--r--fs/nfs/dns_resolve.h2
-rw-r--r--fs/nfs/file.c30
-rw-r--r--fs/nfs/filelayout/filelayout.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c113
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c6
-rw-r--r--fs/nfs/fs_context.c40
-rw-r--r--fs/nfs/inode.c19
-rw-r--r--fs/nfs/internal.h100
-rw-r--r--fs/nfs/mount_clnt.c4
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs3client.c5
-rw-r--r--fs/nfs/nfs3proc.c3
-rw-r--r--fs/nfs/nfs42proc.c18
-rw-r--r--fs/nfs/nfs42xattr.c9
-rw-r--r--fs/nfs/nfs42xdr.c178
-rw-r--r--fs/nfs/nfs4_fs.h3
-rw-r--r--fs/nfs/nfs4client.c25
-rw-r--r--fs/nfs/nfs4file.c6
-rw-r--r--fs/nfs/nfs4idmap.c48
-rw-r--r--fs/nfs/nfs4namespace.c16
-rw-r--r--fs/nfs/nfs4proc.c79
-rw-r--r--fs/nfs/nfs4state.c16
-rw-r--r--fs/nfs/nfs4trace.h50
-rw-r--r--fs/nfs/nfsroot.c2
-rw-r--r--fs/nfs/nfstrace.h215
-rw-r--r--fs/nfs/pnfs.c14
-rw-r--r--fs/nfs/pnfs.h9
-rw-r--r--fs/nfs/pnfs_nfs.c10
-rw-r--r--fs/nfs/read.c4
-rw-r--r--fs/nfs/super.c34
-rw-r--r--fs/nfs/write.c105
-rw-r--r--fs/nfsd/acl.h6
-rw-r--r--fs/nfsd/cache.h2
-rw-r--r--fs/nfsd/filecache.c790
-rw-r--r--fs/nfsd/filecache.h13
-rw-r--r--fs/nfsd/netns.h7
-rw-r--r--fs/nfsd/nfs2acl.c11
-rw-r--r--fs/nfsd/nfs3acl.c7
-rw-r--r--fs/nfsd/nfs3proc.c78
-rw-r--r--fs/nfsd/nfs3xdr.c18
-rw-r--r--fs/nfsd/nfs4acl.c46
-rw-r--r--fs/nfsd/nfs4callback.c51
-rw-r--r--fs/nfsd/nfs4idmap.c8
-rw-r--r--fs/nfsd/nfs4layouts.c2
-rw-r--r--fs/nfsd/nfs4proc.c552
-rw-r--r--fs/nfsd/nfs4recover.c22
-rw-r--r--fs/nfsd/nfs4state.c344
-rw-r--r--fs/nfsd/nfs4xdr.c234
-rw-r--r--fs/nfsd/nfscache.c16
-rw-r--r--fs/nfsd/nfsctl.c63
-rw-r--r--fs/nfsd/nfsd.h18
-rw-r--r--fs/nfsd/nfsfh.c35
-rw-r--r--fs/nfsd/nfsfh.h58
-rw-r--r--fs/nfsd/nfsproc.c64
-rw-r--r--fs/nfsd/nfssvc.c2
-rw-r--r--fs/nfsd/nfsxdr.c4
-rw-r--r--fs/nfsd/state.h12
-rw-r--r--fs/nfsd/stats.c14
-rw-r--r--fs/nfsd/trace.h452
-rw-r--r--fs/nfsd/vfs.c414
-rw-r--r--fs/nfsd/vfs.h35
-rw-r--r--fs/nfsd/xdr4.h69
-rw-r--r--fs/nilfs2/btnode.c8
-rw-r--r--fs/nilfs2/btnode.h4
-rw-r--r--fs/nilfs2/btree.c12
-rw-r--r--fs/nilfs2/dir.c2
-rw-r--r--fs/nilfs2/gcinode.c7
-rw-r--r--fs/nilfs2/inode.c19
-rw-r--r--fs/nilfs2/mdt.c19
-rw-r--r--fs/nilfs2/nilfs.h3
-rw-r--r--fs/nilfs2/page.c105
-rw-r--r--fs/nilfs2/segment.c40
-rw-r--r--fs/nilfs2/super.c2
-rw-r--r--fs/nilfs2/the_nilfs.c2
-rw-r--r--fs/notify/fanotify/fanotify.c21
-rw-r--r--fs/notify/fanotify/fanotify.h10
-rw-r--r--fs/notify/fanotify/fanotify_user.c142
-rw-r--r--fs/notify/fdinfo.c6
-rw-r--r--fs/notify/fsnotify.c23
-rw-r--r--fs/notify/fsnotify.h4
-rw-r--r--fs/notify/inotify/inotify_user.c2
-rw-r--r--fs/nsfs.c2
-rw-r--r--fs/ntfs/aops.c12
-rw-r--r--fs/ntfs/aops.h7
-rw-r--r--fs/ntfs/attrib.c28
-rw-r--r--fs/ntfs/compress.c2
-rw-r--r--fs/ntfs/file.c9
-rw-r--r--fs/ntfs/inode.c7
-rw-r--r--fs/ntfs/logfile.c2
-rw-r--r--fs/ntfs/mft.c4
-rw-r--r--fs/ntfs/super.c3
-rw-r--r--fs/ntfs3/attrib.c557
-rw-r--r--fs/ntfs3/bitmap.c16
-rw-r--r--fs/ntfs3/file.c112
-rw-r--r--fs/ntfs3/frecord.c128
-rw-r--r--fs/ntfs3/fslog.c10
-rw-r--r--fs/ntfs3/fsntfs.c94
-rw-r--r--fs/ntfs3/index.c33
-rw-r--r--fs/ntfs3/inode.c36
-rw-r--r--fs/ntfs3/namei.c6
-rw-r--r--fs/ntfs3/ntfs_fs.h25
-rw-r--r--fs/ntfs3/record.c27
-rw-r--r--fs/ntfs3/run.c108
-rw-r--r--fs/ntfs3/super.c17
-rw-r--r--fs/ntfs3/xattr.c147
-rw-r--r--fs/ocfs2/aops.c30
-rw-r--r--fs/ocfs2/buffer_head_io.c8
-rw-r--r--fs/ocfs2/cluster/heartbeat.c75
-rw-r--r--fs/ocfs2/dir.c10
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c14
-rw-r--r--fs/ocfs2/dlmglue.c8
-rw-r--r--fs/ocfs2/file.c2
-rw-r--r--fs/ocfs2/heartbeat.c27
-rw-r--r--fs/ocfs2/journal.c14
-rw-r--r--fs/ocfs2/namei.c24
-rw-r--r--fs/ocfs2/ocfs2.h4
-rw-r--r--fs/ocfs2/ocfs2_fs.h8
-rw-r--r--fs/ocfs2/quota_global.c2
-rw-r--r--fs/ocfs2/refcounttree.c44
-rw-r--r--fs/ocfs2/slot_map.c46
-rw-r--r--fs/ocfs2/stack_user.c2
-rw-r--r--fs/ocfs2/stackglue.c4
-rw-r--r--fs/ocfs2/suballoc.h2
-rw-r--r--fs/ocfs2/super.c30
-rw-r--r--fs/open.c76
-rw-r--r--fs/orangefs/dir.c2
-rw-r--r--fs/orangefs/file.c4
-rw-r--r--fs/orangefs/inode.c4
-rw-r--r--fs/overlayfs/copy_up.c123
-rw-r--r--fs/overlayfs/export.c2
-rw-r--r--fs/overlayfs/file.c2
-rw-r--r--fs/overlayfs/inode.c100
-rw-r--r--fs/overlayfs/namei.c8
-rw-r--r--fs/overlayfs/overlayfs.h55
-rw-r--r--fs/overlayfs/readdir.c44
-rw-r--r--fs/overlayfs/super.c46
-rw-r--r--fs/overlayfs/util.c10
-rw-r--r--fs/pipe.c2
-rw-r--r--fs/posix_acl.c376
-rw-r--r--fs/proc/Kconfig1
-rw-r--r--fs/proc/array.c7
-rw-r--r--fs/proc/base.c70
-rw-r--r--fs/proc/devices.c6
-rw-r--r--fs/proc/inode.c22
-rw-r--r--fs/proc/internal.h7
-rw-r--r--fs/proc/kmsg.c3
-rw-r--r--fs/proc/loadavg.c6
-rw-r--r--fs/proc/meminfo.c7
-rw-r--r--fs/proc/nommu.c1
-rw-r--r--fs/proc/page.c3
-rw-r--r--fs/proc/proc_net.c9
-rw-r--r--fs/proc/proc_sysctl.c9
-rw-r--r--fs/proc/proc_tty.c2
-rw-r--r--fs/proc/root.c8
-rw-r--r--fs/proc/softirqs.c6
-rw-r--r--fs/proc/task_mmu.c108
-rw-r--r--fs/proc/task_nommu.c45
-rw-r--r--fs/proc/uptime.c6
-rw-r--r--fs/proc/version.c6
-rw-r--r--fs/proc/vmcore.c1
-rw-r--r--fs/proc_namespace.c2
-rw-r--r--fs/pstore/inode.c1
-rw-r--r--fs/pstore/platform.c1
-rw-r--r--fs/pstore/zone.c12
-rw-r--r--fs/qnx6/inode.c6
-rw-r--r--fs/quota/dquot.c19
-rw-r--r--fs/quota/quota_tree.c73
-rw-r--r--fs/ramfs/file-nommu.c50
-rw-r--r--fs/ramfs/inode.c6
-rw-r--r--fs/read_write.c129
-rw-r--r--fs/readdir.c68
-rw-r--r--fs/reiserfs/inode.c20
-rw-r--r--fs/reiserfs/journal.c15
-rw-r--r--fs/reiserfs/prints.c2
-rw-r--r--fs/reiserfs/procfs.c4
-rw-r--r--fs/reiserfs/resize.c2
-rw-r--r--fs/reiserfs/stree.c4
-rw-r--r--fs/reiserfs/super.c11
-rw-r--r--fs/reiserfs/xattr.c29
-rw-r--r--fs/remap_range.c45
-rw-r--r--fs/smbfs_common/smb2pdu.h6
-rw-r--r--fs/splice.c64
-rw-r--r--fs/squashfs/Makefile4
-rw-r--r--fs/squashfs/block.c10
-rw-r--r--fs/squashfs/decompressor.h1
-rw-r--r--fs/squashfs/file.c153
-rw-r--r--fs/squashfs/file_direct.c90
-rw-r--r--fs/squashfs/lz4_wrapper.c7
-rw-r--r--fs/squashfs/lzo_wrapper.c7
-rw-r--r--fs/squashfs/page_actor.c56
-rw-r--r--fs/squashfs/page_actor.h66
-rw-r--r--fs/squashfs/super.c33
-rw-r--r--fs/squashfs/xz_wrapper.c11
-rw-r--r--fs/squashfs/zlib_wrapper.c12
-rw-r--r--fs/squashfs/zstd_wrapper.c12
-rw-r--r--fs/stat.c14
-rw-r--r--fs/super.c42
-rw-r--r--fs/tracefs/inode.c31
-rw-r--r--fs/ubifs/crypto.c11
-rw-r--r--fs/ubifs/debug.c10
-rw-r--r--fs/ubifs/dir.c34
-rw-r--r--fs/ubifs/file.c29
-rw-r--r--fs/ubifs/journal.c30
-rw-r--r--fs/ubifs/lpt_commit.c14
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/ubifs/tnc_commit.c2
-rw-r--r--fs/ubifs/ubifs.h2
-rw-r--r--fs/ubifs/xattr.c2
-rw-r--r--fs/udf/dir.c2
-rw-r--r--fs/udf/directory.c2
-rw-r--r--fs/udf/file.c1
-rw-r--r--fs/udf/inode.c8
-rw-r--r--fs/udf/namei.c8
-rw-r--r--fs/ufs/balloc.c12
-rw-r--r--fs/ufs/dir.c2
-rw-r--r--fs/ufs/util.c11
-rw-r--r--fs/userfaultfd.c162
-rw-r--r--fs/verity/Kconfig10
-rw-r--r--fs/verity/fsverity_private.h2
-rw-r--r--fs/verity/read_metadata.c6
-rw-r--r--fs/verity/verify.c14
-rw-r--r--fs/xattr.c24
-rw-r--r--fs/xfs/Makefile6
-rw-r--r--fs/xfs/libxfs/xfs_ag.c171
-rw-r--r--fs/xfs/libxfs/xfs_ag.h90
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c149
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h58
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c9
-rw-r--r--fs/xfs/libxfs/xfs_attr.c60
-rw-r--r--fs/xfs/libxfs/xfs_attr.h15
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c63
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h3
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c15
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c86
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c10
-rw-r--r--fs/xfs/libxfs/xfs_btree.c29
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c52
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h4
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c6
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c9
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c12
-rw-r--r--fs/xfs/libxfs/xfs_format.h24
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c90
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h25
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c20
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c15
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c69
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h27
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h60
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c301
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h40
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c20
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c13
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c9
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c2
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c6
-rw-r--r--fs/xfs/libxfs/xfs_types.c73
-rw-r--r--fs/xfs/libxfs/xfs_types.h39
-rw-r--r--fs/xfs/scrub/agheader.c25
-rw-r--r--fs/xfs/scrub/agheader_repair.c21
-rw-r--r--fs/xfs/scrub/alloc.c7
-rw-r--r--fs/xfs/scrub/bmap.c16
-rw-r--r--fs/xfs/scrub/btree.c2
-rw-r--r--fs/xfs/scrub/common.c6
-rw-r--r--fs/xfs/scrub/dabtree.c2
-rw-r--r--fs/xfs/scrub/dir.c12
-rw-r--r--fs/xfs/scrub/fscounters.c4
-rw-r--r--fs/xfs/scrub/health.c2
-rw-r--r--fs/xfs/scrub/ialloc.c13
-rw-r--r--fs/xfs/scrub/parent.c4
-rw-r--r--fs/xfs/scrub/quota.c2
-rw-r--r--fs/xfs/scrub/refcount.c77
-rw-r--r--fs/xfs/scrub/repair.c49
-rw-r--r--fs/xfs/scrub/rmap.c6
-rw-r--r--fs/xfs/scrub/symlink.c6
-rw-r--r--fs/xfs/xfs_aops.c2
-rw-r--r--fs/xfs/xfs_attr_inactive.c23
-rw-r--r--fs/xfs/xfs_attr_item.c100
-rw-r--r--fs/xfs/xfs_attr_list.c9
-rw-r--r--fs/xfs/xfs_bio_io.c2
-rw-r--r--fs/xfs/xfs_bmap_item.c54
-rw-r--r--fs/xfs/xfs_bmap_util.c39
-rw-r--r--fs/xfs/xfs_buf.c305
-rw-r--r--fs/xfs/xfs_buf.h27
-rw-r--r--fs/xfs/xfs_dir2_readdir.c4
-rw-r--r--fs/xfs/xfs_discard.c2
-rw-r--r--fs/xfs/xfs_dquot.c2
-rw-r--r--fs/xfs/xfs_error.c11
-rw-r--r--fs/xfs/xfs_extfree_item.c100
-rw-r--r--fs/xfs/xfs_extfree_item.h16
-rw-r--r--fs/xfs/xfs_file.c69
-rw-r--r--fs/xfs/xfs_filestream.c4
-rw-r--r--fs/xfs/xfs_fsmap.c3
-rw-r--r--fs/xfs/xfs_fsops.c16
-rw-r--r--fs/xfs/xfs_icache.c74
-rw-r--r--fs/xfs/xfs_icache.h1
-rw-r--r--fs/xfs/xfs_inode.c772
-rw-r--r--fs/xfs/xfs_inode.h70
-rw-r--r--fs/xfs/xfs_inode_item.c60
-rw-r--r--fs/xfs/xfs_inode_item_recover.c4
-rw-r--r--fs/xfs/xfs_ioctl.c10
-rw-r--r--fs/xfs/xfs_iomap.c49
-rw-r--r--fs/xfs/xfs_iomap.h1
-rw-r--r--fs/xfs/xfs_iops.c59
-rw-r--r--fs/xfs/xfs_iops.h4
-rw-r--r--fs/xfs/xfs_itable.c12
-rw-r--r--fs/xfs/xfs_iunlink_item.c180
-rw-r--r--fs/xfs/xfs_iunlink_item.h27
-rw-r--r--fs/xfs/xfs_linux.h2
-rw-r--r--fs/xfs/xfs_log.c90
-rw-r--r--fs/xfs/xfs_log.h3
-rw-r--r--fs/xfs/xfs_log_cil.c472
-rw-r--r--fs/xfs/xfs_log_priv.h58
-rw-r--r--fs/xfs/xfs_log_recover.c208
-rw-r--r--fs/xfs/xfs_mount.c41
-rw-r--r--fs/xfs/xfs_mount.h3
-rw-r--r--fs/xfs/xfs_notify_failure.c234
-rw-r--r--fs/xfs/xfs_ondisk.h23
-rw-r--r--fs/xfs/xfs_qm.c17
-rw-r--r--fs/xfs/xfs_qm_syscalls.c9
-rw-r--r--fs/xfs/xfs_refcount_item.c57
-rw-r--r--fs/xfs/xfs_reflink.c278
-rw-r--r--fs/xfs/xfs_reflink.h3
-rw-r--r--fs/xfs/xfs_rmap_item.c70
-rw-r--r--fs/xfs/xfs_stats.c4
-rw-r--r--fs/xfs/xfs_super.c70
-rw-r--r--fs/xfs/xfs_super.h1
-rw-r--r--fs/xfs/xfs_symlink.c2
-rw-r--r--fs/xfs/xfs_sysfs.h7
-rw-r--r--fs/xfs/xfs_trace.h56
-rw-r--r--fs/xfs/xfs_trans.c95
-rw-r--r--fs/xfs/xfs_trans.h7
-rw-r--r--fs/xfs/xfs_trans_ail.c11
-rw-r--r--fs/xfs/xfs_trans_priv.h3
-rw-r--r--fs/zonefs/super.c52
-rw-r--r--fs/zonefs/trace.h4
792 files changed, 35577 insertions, 39586 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index baf2b152229e..23cf9b2fbfe4 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -28,14 +28,18 @@ static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid)
/**
* v9fs_fid_add - add a fid to a dentry
* @dentry: dentry that the fid is being added to
- * @fid: fid to add
+ * @pfid: fid to add, NULLed out
*
*/
-void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
+void v9fs_fid_add(struct dentry *dentry, struct p9_fid **pfid)
{
+ struct p9_fid *fid = *pfid;
+
spin_lock(&dentry->d_lock);
__add_fid(dentry, fid);
spin_unlock(&dentry->d_lock);
+
+ *pfid = NULL;
}
/**
@@ -56,7 +60,7 @@ static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid)
h = (struct hlist_head *)&inode->i_private;
hlist_for_each_entry(fid, h, ilist) {
if (uid_eq(fid->uid, uid)) {
- refcount_inc(&fid->count);
+ p9_fid_get(fid);
ret = fid;
break;
}
@@ -68,15 +72,19 @@ static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid)
/**
* v9fs_open_fid_add - add an open fid to an inode
* @inode: inode that the fid is being added to
- * @fid: fid to add
+ * @pfid: fid to add, NULLed out
*
*/
-void v9fs_open_fid_add(struct inode *inode, struct p9_fid *fid)
+void v9fs_open_fid_add(struct inode *inode, struct p9_fid **pfid)
{
+ struct p9_fid *fid = *pfid;
+
spin_lock(&inode->i_lock);
hlist_add_head(&fid->ilist, (struct hlist_head *)&inode->i_private);
spin_unlock(&inode->i_lock);
+
+ *pfid = NULL;
}
@@ -104,7 +112,7 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
hlist_for_each_entry(fid, h, dlist) {
if (any || uid_eq(fid->uid, uid)) {
ret = fid;
- refcount_inc(&ret->count);
+ p9_fid_get(ret);
break;
}
}
@@ -150,9 +158,9 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
{
struct dentry *ds;
const unsigned char **wnames, *uname;
- int i, n, l, clone, access;
+ int i, n, l, access;
struct v9fs_session_info *v9ses;
- struct p9_fid *fid, *old_fid;
+ struct p9_fid *fid, *root_fid, *old_fid;
v9ses = v9fs_dentry2v9ses(dentry);
access = v9ses->flags & V9FS_ACCESS_MASK;
@@ -169,17 +177,17 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
fid = v9fs_fid_find(ds, uid, any);
if (fid) {
/* Found the parent fid do a lookup with that */
- struct p9_fid *ofid = fid;
+ old_fid = fid;
- fid = p9_client_walk(ofid, 1, &dentry->d_name.name, 1);
- p9_client_clunk(ofid);
+ fid = p9_client_walk(old_fid, 1, &dentry->d_name.name, 1);
+ p9_fid_put(old_fid);
goto fid_out;
}
up_read(&v9ses->rename_sem);
/* start from the root and try to do a lookup */
- fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any);
- if (!fid) {
+ root_fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any);
+ if (!root_fid) {
/* the user is not attached to the fs yet */
if (access == V9FS_ACCESS_SINGLE)
return ERR_PTR(-EPERM);
@@ -194,12 +202,13 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
if (IS_ERR(fid))
return fid;
- refcount_inc(&fid->count);
- v9fs_fid_add(dentry->d_sb->s_root, fid);
+ root_fid = p9_fid_get(fid);
+ v9fs_fid_add(dentry->d_sb->s_root, &fid);
}
/* If we are root ourself just return that */
if (dentry->d_sb->s_root == dentry)
- return fid;
+ return root_fid;
+
/*
* Do a multipath walk with attached root.
* When walking parent we need to make sure we
@@ -211,19 +220,20 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
fid = ERR_PTR(n);
goto err_out;
}
- old_fid = fid;
- clone = 1;
+ fid = root_fid;
+ old_fid = root_fid;
i = 0;
while (i < n) {
l = min(n - i, P9_MAXWELEM);
/*
* We need to hold rename lock when doing a multipath
- * walk to ensure none of the patch component change
+ * walk to ensure none of the path components change
*/
- fid = p9_client_walk(fid, l, &wnames[i], clone);
+ fid = p9_client_walk(old_fid, l, &wnames[i],
+ old_fid == root_fid /* clone */);
/* non-cloning walk will return the same fid */
if (fid != old_fid) {
- p9_client_clunk(old_fid);
+ p9_fid_put(old_fid);
old_fid = fid;
}
if (IS_ERR(fid)) {
@@ -231,7 +241,6 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
goto err_out;
}
i += l;
- clone = 0;
}
kfree(wnames);
fid_out:
@@ -239,11 +248,11 @@ fid_out:
spin_lock(&dentry->d_lock);
if (d_unhashed(dentry)) {
spin_unlock(&dentry->d_lock);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
fid = ERR_PTR(-ENOENT);
} else {
__add_fid(dentry, fid);
- refcount_inc(&fid->count);
+ p9_fid_get(fid);
spin_unlock(&dentry->d_lock);
}
}
@@ -300,7 +309,7 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
fid = clone_fid(ofid);
if (IS_ERR(fid))
goto error_out;
- p9_client_clunk(ofid);
+ p9_fid_put(ofid);
/*
* writeback fid will only be used to write back the
* dirty pages. We always request for the open fid in read-write
@@ -309,7 +318,7 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
*/
err = p9_client_open(fid, O_RDWR);
if (err < 0) {
- p9_client_clunk(fid);
+ p9_fid_put(fid);
fid = ERR_PTR(err);
goto error_out;
}
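
Note: the hunks above replace direct refcount_inc()/p9_client_clunk() calls with p9_fid_get()/p9_fid_put(). Those helpers are defined in the 9p client headers rather than in this diff; roughly, and as an assumption inferred from how the converted error paths use them (unconditional puts on pointers that may be NULL or ERR_PTR), they look like the sketch below.

        /* Sketch only -- the real helpers live in include/net/9p/client.h and
         * may differ in detail.  p9_fid_put() tolerating NULL and ERR_PTR
         * values is what lets the converted error paths put fids
         * unconditionally. */
        static inline struct p9_fid *p9_fid_get(struct p9_fid *fid)
        {
                refcount_inc(&fid->count);
                return fid;
        }

        static inline int p9_fid_put(struct p9_fid *fid)
        {
                if (!fid || IS_ERR(fid))
                        return 0;
                if (!refcount_dec_and_test(&fid->count))
                        return 0;
                return p9_client_clunk(fid);
        }
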
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index f7f33509e169..8a4e8cd12ca2 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -13,9 +13,9 @@ static inline struct p9_fid *v9fs_parent_fid(struct dentry *dentry)
{
return v9fs_fid_lookup(dentry->d_parent);
}
-void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
+void v9fs_fid_add(struct dentry *dentry, struct p9_fid **fid);
struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
-void v9fs_open_fid_add(struct inode *inode, struct p9_fid *fid);
+void v9fs_open_fid_add(struct inode *inode, struct p9_fid **fid);
static inline struct p9_fid *clone_fid(struct p9_fid *fid)
{
return IS_ERR(fid) ? fid : p9_client_walk(fid, 0, NULL, 1);
@@ -29,7 +29,7 @@ static inline struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
return fid;
nfid = clone_fid(fid);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return nfid;
}
#endif
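
Note: with the new prototypes above, v9fs_fid_add() and v9fs_open_fid_add() take a struct p9_fid ** and NULL out the caller's pointer, so the dentry/inode list consumes the caller's reference. A minimal, hypothetical caller-side sketch of the intended usage (not code from this patch), mirroring the lookup paths converted above:

        /* Hypothetical caller, for illustration only. */
        static int example_attach_fid(struct dentry *dentry, struct p9_fid *dfid)
        {
                const unsigned char *name = dentry->d_name.name;
                struct p9_fid *fid;

                fid = p9_client_walk(dfid, 1, &name, 1);
                if (IS_ERR(fid))
                        return PTR_ERR(fid);

                /* Transfers the reference to the dentry and sets fid to NULL,
                 * so a later unconditional p9_fid_put(fid) cannot drop it twice. */
                v9fs_fid_add(dentry, &fid);
                return 0;
        }
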
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index d0833fa69faf..47b9a1122f34 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -73,7 +73,7 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
BUG_ON(!fid);
}
- refcount_inc(&fid->count);
+ p9_fid_get(fid);
rreq->netfs_priv = fid;
return 0;
}
@@ -86,7 +86,7 @@ static void v9fs_free_request(struct netfs_io_request *rreq)
{
struct p9_fid *fid = rreq->netfs_priv;
- p9_client_clunk(fid);
+ p9_fid_put(fid);
}
/**
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 1c609e99d280..f89f01734587 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -54,7 +54,7 @@ static void v9fs_dentry_release(struct dentry *dentry)
p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n",
dentry, dentry);
hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata)
- p9_client_clunk(hlist_entry(p, struct p9_fid, dlist));
+ p9_fid_put(hlist_entry(p, struct p9_fid, dlist));
dentry->d_fsdata = NULL;
}
@@ -85,7 +85,7 @@ static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
retval = v9fs_refresh_inode_dotl(fid, inode);
else
retval = v9fs_refresh_inode(fid, inode);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
if (retval == -ENOENT)
return 0;
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 958680f7f23e..000fbaae9b18 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -218,7 +218,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
spin_lock(&inode->i_lock);
hlist_del(&fid->ilist);
spin_unlock(&inode->i_lock);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
}
if ((filp->f_mode & FMODE_WRITE)) {
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 2573c08f335c..aec43ba83799 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -63,15 +63,16 @@ int v9fs_file_open(struct inode *inode, struct file *file)
err = p9_client_open(fid, omode);
if (err < 0) {
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return err;
}
if ((file->f_flags & O_APPEND) &&
(!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)))
generic_file_llseek(file, 0, SEEK_END);
+
+ file->private_data = fid;
}
- file->private_data = fid;
mutex_lock(&v9inode->v_mutex);
if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
!v9inode->writeback_fid &&
@@ -95,10 +96,10 @@ int v9fs_file_open(struct inode *inode, struct file *file)
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
fscache_use_cookie(v9fs_inode_cookie(v9inode),
file->f_mode & FMODE_WRITE);
- v9fs_open_fid_add(inode, fid);
+ v9fs_open_fid_add(inode, &fid);
return 0;
out_error:
- p9_client_clunk(file->private_data);
+ p9_fid_put(file->private_data);
file->private_data = NULL;
return err;
}
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 3d8297714772..4d1a4a8d9277 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -399,10 +399,8 @@ void v9fs_evict_inode(struct inode *inode)
fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false);
/* clunk the fid stashed in writeback_fid */
- if (v9inode->writeback_fid) {
- p9_client_clunk(v9inode->writeback_fid);
- v9inode->writeback_fid = NULL;
- }
+ p9_fid_put(v9inode->writeback_fid);
+ v9inode->writeback_fid = NULL;
}
static int v9fs_test_inode(struct inode *inode, void *data)
@@ -569,7 +567,7 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
if (v9fs_proto_dotl(v9ses))
retval = p9_client_unlinkat(dfid, dentry->d_name.name,
v9fs_at_to_dotl_flags(flags));
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
if (retval == -EOPNOTSUPP) {
/* Try the one based on path */
v9fid = v9fs_fid_clone(dentry);
@@ -633,14 +631,12 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
if (IS_ERR(ofid)) {
err = PTR_ERR(ofid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
- p9_client_clunk(dfid);
- return ERR_PTR(err);
+ goto error;
}
err = p9_client_fcreate(ofid, name, perm, mode, extension);
if (err < 0) {
p9_debug(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err);
- p9_client_clunk(dfid);
goto error;
}
@@ -651,8 +647,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
err = PTR_ERR(fid);
p9_debug(P9_DEBUG_VFS,
"p9_client_walk failed %d\n", err);
- fid = NULL;
- p9_client_clunk(dfid);
goto error;
}
/*
@@ -663,21 +657,17 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS,
"inode creation failed %d\n", err);
- p9_client_clunk(dfid);
goto error;
}
- v9fs_fid_add(dentry, fid);
+ v9fs_fid_add(dentry, &fid);
d_instantiate(dentry, inode);
}
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
return ofid;
error:
- if (ofid)
- p9_client_clunk(ofid);
-
- if (fid)
- p9_client_clunk(fid);
-
+ p9_fid_put(dfid);
+ p9_fid_put(ofid);
+ p9_fid_put(fid);
return ERR_PTR(err);
}
@@ -708,7 +698,7 @@ v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
return PTR_ERR(fid);
v9fs_invalidate_inode_attr(dir);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return 0;
}
@@ -744,7 +734,7 @@ static int v9fs_vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
}
if (fid)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return err;
}
@@ -785,7 +775,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
*/
name = dentry->d_name.name;
fid = p9_client_walk(dfid, 1, &name, 1);
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
if (fid == ERR_PTR(-ENOENT))
inode = NULL;
else if (IS_ERR(fid))
@@ -804,11 +794,11 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
res = d_splice_alias(inode, dentry);
if (!IS_ERR(fid)) {
if (!res)
- v9fs_fid_add(dentry, fid);
+ v9fs_fid_add(dentry, &fid);
else if (!IS_ERR(res))
- v9fs_fid_add(res, fid);
+ v9fs_fid_add(res, &fid);
else
- p9_client_clunk(fid);
+ p9_fid_put(fid);
}
return res;
}
@@ -847,7 +837,6 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
v9fs_proto_dotu(v9ses)));
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
- fid = NULL;
goto error;
}
@@ -882,7 +871,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
fscache_use_cookie(v9fs_inode_cookie(v9inode),
file->f_mode & FMODE_WRITE);
- v9fs_open_fid_add(inode, fid);
+ v9fs_open_fid_add(inode, &fid);
file->f_mode |= FMODE_CREATED;
out:
@@ -890,8 +879,7 @@ out:
return err;
error:
- if (fid)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
goto out;
}
@@ -939,9 +927,9 @@ v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
struct inode *old_inode;
struct inode *new_inode;
struct v9fs_session_info *v9ses;
- struct p9_fid *oldfid, *dfid;
- struct p9_fid *olddirfid;
- struct p9_fid *newdirfid;
+ struct p9_fid *oldfid = NULL, *dfid = NULL;
+ struct p9_fid *olddirfid = NULL;
+ struct p9_fid *newdirfid = NULL;
struct p9_wstat wstat;
if (flags)
@@ -958,21 +946,22 @@ v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
dfid = v9fs_parent_fid(old_dentry);
olddirfid = clone_fid(dfid);
- if (dfid && !IS_ERR(dfid))
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
+ dfid = NULL;
if (IS_ERR(olddirfid)) {
retval = PTR_ERR(olddirfid);
- goto done;
+ goto error;
}
dfid = v9fs_parent_fid(new_dentry);
newdirfid = clone_fid(dfid);
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
+ dfid = NULL;
if (IS_ERR(newdirfid)) {
retval = PTR_ERR(newdirfid);
- goto clunk_olddir;
+ goto error;
}
down_write(&v9ses->rename_sem);
@@ -983,7 +972,7 @@ v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
retval = p9_client_rename(oldfid, newdirfid,
new_dentry->d_name.name);
if (retval != -EOPNOTSUPP)
- goto clunk_newdir;
+ goto error_locked;
}
if (old_dentry->d_parent != new_dentry->d_parent) {
/*
@@ -992,14 +981,14 @@ v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
p9_debug(P9_DEBUG_ERROR, "old dir and new dir are different\n");
retval = -EXDEV;
- goto clunk_newdir;
+ goto error_locked;
}
v9fs_blank_wstat(&wstat);
wstat.muid = v9ses->uname;
wstat.name = new_dentry->d_name.name;
retval = p9_client_wstat(oldfid, &wstat);
-clunk_newdir:
+error_locked:
if (!retval) {
if (new_inode) {
if (S_ISDIR(new_inode->i_mode))
@@ -1020,13 +1009,11 @@ clunk_newdir:
d_move(old_dentry, new_dentry);
}
up_write(&v9ses->rename_sem);
- p9_client_clunk(newdirfid);
-
-clunk_olddir:
- p9_client_clunk(olddirfid);
-done:
- p9_client_clunk(oldfid);
+error:
+ p9_fid_put(newdirfid);
+ p9_fid_put(olddirfid);
+ p9_fid_put(oldfid);
return retval;
}
@@ -1060,7 +1047,7 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
return PTR_ERR(fid);
st = p9_client_stat(fid);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
if (IS_ERR(st))
return PTR_ERR(st);
@@ -1136,7 +1123,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns,
retval = p9_client_wstat(fid, &wstat);
if (use_dentry)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
if (retval < 0)
return retval;
@@ -1261,7 +1248,7 @@ static const char *v9fs_vfs_get_link(struct dentry *dentry,
return ERR_CAST(fid);
st = p9_client_stat(fid);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
if (IS_ERR(st))
return ERR_CAST(st);
@@ -1308,7 +1295,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
return PTR_ERR(fid);
v9fs_invalidate_inode_attr(dir);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return 0;
}
@@ -1364,7 +1351,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
v9fs_refresh_inode(oldfid, d_inode(old_dentry));
v9fs_invalidate_inode_attr(dir);
}
- p9_client_clunk(oldfid);
+ p9_fid_put(oldfid);
return retval;
}
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index b6eb1160296c..5cfa4b4f070f 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -238,7 +238,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
struct inode *inode;
struct p9_fid *fid = NULL;
struct v9fs_inode *v9inode;
- struct p9_fid *dfid, *ofid, *inode_fid;
+ struct p9_fid *dfid = NULL, *ofid = NULL, *inode_fid = NULL;
struct v9fs_session_info *v9ses;
struct posix_acl *pacl = NULL, *dacl = NULL;
struct dentry *res = NULL;
@@ -274,7 +274,6 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
if (IS_ERR(ofid)) {
err = PTR_ERR(ofid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
- p9_client_clunk(dfid);
goto out;
}
@@ -286,38 +285,34 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
if (err) {
p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n",
err);
- p9_client_clunk(dfid);
- goto error;
+ goto out;
}
err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags),
mode, gid, &qid);
if (err < 0) {
p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n",
err);
- p9_client_clunk(dfid);
- goto error;
+ goto out;
}
v9fs_invalidate_inode_attr(dir);
/* instantiate inode and assign the unopened fid to the dentry */
fid = p9_client_walk(dfid, 1, &name, 1);
- p9_client_clunk(dfid);
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
- fid = NULL;
- goto error;
+ goto out;
}
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err);
- goto error;
+ goto out;
}
/* Now set the ACL based on the default value */
v9fs_set_create_acl(inode, fid, dacl, pacl);
- v9fs_fid_add(dentry, fid);
+ v9fs_fid_add(dentry, &fid);
d_instantiate(dentry, inode);
v9inode = V9FS_I(inode);
@@ -336,7 +331,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
if (IS_ERR(inode_fid)) {
err = PTR_ERR(inode_fid);
mutex_unlock(&v9inode->v_mutex);
- goto err_clunk_old_fid;
+ goto out;
}
v9inode->writeback_fid = (void *) inode_fid;
}
@@ -344,25 +339,20 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
/* Since we are opening a file, assign the open fid to the file */
err = finish_open(file, dentry, generic_file_open);
if (err)
- goto err_clunk_old_fid;
+ goto out;
file->private_data = ofid;
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
fscache_use_cookie(v9fs_inode_cookie(v9inode),
file->f_mode & FMODE_WRITE);
- v9fs_open_fid_add(inode, ofid);
+ v9fs_open_fid_add(inode, &ofid);
file->f_mode |= FMODE_CREATED;
out:
+ p9_fid_put(dfid);
+ p9_fid_put(ofid);
+ p9_fid_put(fid);
v9fs_put_acl(dacl, pacl);
dput(res);
return err;
-
-error:
- if (fid)
- p9_client_clunk(fid);
-err_clunk_old_fid:
- if (ofid)
- p9_client_clunk(ofid);
- goto out;
}
/**
@@ -400,7 +390,6 @@ static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns,
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
- dfid = NULL;
goto error;
}
@@ -422,7 +411,6 @@ static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns,
err = PTR_ERR(fid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
err);
- fid = NULL;
goto error;
}
@@ -435,10 +423,9 @@ static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns,
err);
goto error;
}
- v9fs_fid_add(dentry, fid);
+ v9fs_fid_add(dentry, &fid);
v9fs_set_create_acl(inode, fid, dacl, pacl);
d_instantiate(dentry, inode);
- fid = NULL;
err = 0;
} else {
/*
@@ -457,10 +444,9 @@ static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns,
inc_nlink(dir);
v9fs_invalidate_inode_attr(dir);
error:
- if (fid)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
v9fs_put_acl(dacl, pacl);
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
return err;
}
@@ -489,7 +475,7 @@ v9fs_vfs_getattr_dotl(struct user_namespace *mnt_userns,
*/
st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
if (IS_ERR(st))
return PTR_ERR(st);
@@ -603,7 +589,7 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns,
retval = p9_client_setattr(fid, &p9attr);
if (retval < 0) {
if (use_dentry)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return retval;
}
@@ -619,12 +605,12 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns,
retval = v9fs_acl_chmod(inode, fid);
if (retval < 0) {
if (use_dentry)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return retval;
}
}
if (use_dentry)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return 0;
}
@@ -743,7 +729,6 @@ v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir,
err = PTR_ERR(fid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
err);
- fid = NULL;
goto error;
}
@@ -755,9 +740,8 @@ v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir,
err);
goto error;
}
- v9fs_fid_add(dentry, fid);
+ v9fs_fid_add(dentry, &fid);
d_instantiate(dentry, inode);
- fid = NULL;
err = 0;
} else {
/* Not in cached mode. No need to populate inode with stat */
@@ -770,10 +754,8 @@ v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir,
}
error:
- if (fid)
- p9_client_clunk(fid);
-
- p9_client_clunk(dfid);
+ p9_fid_put(fid);
+ p9_fid_put(dfid);
return err;
}
@@ -803,14 +785,14 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
oldfid = v9fs_fid_lookup(old_dentry);
if (IS_ERR(oldfid)) {
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
return PTR_ERR(oldfid);
}
err = p9_client_link(dfid, oldfid, dentry->d_name.name);
- p9_client_clunk(dfid);
- p9_client_clunk(oldfid);
+ p9_fid_put(dfid);
+ p9_fid_put(oldfid);
if (err < 0) {
p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
return err;
@@ -826,7 +808,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
return PTR_ERR(fid);
v9fs_refresh_inode_dotl(fid, d_inode(old_dentry));
- p9_client_clunk(fid);
+ p9_fid_put(fid);
}
ihold(d_inode(old_dentry));
d_instantiate(dentry, d_inode(old_dentry));
@@ -866,7 +848,6 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
- dfid = NULL;
goto error;
}
@@ -891,7 +872,6 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
err = PTR_ERR(fid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
err);
- fid = NULL;
goto error;
}
@@ -905,9 +885,8 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
goto error;
}
v9fs_set_create_acl(inode, fid, dacl, pacl);
- v9fs_fid_add(dentry, fid);
+ v9fs_fid_add(dentry, &fid);
d_instantiate(dentry, inode);
- fid = NULL;
err = 0;
} else {
/*
@@ -923,10 +902,9 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
d_instantiate(dentry, inode);
}
error:
- if (fid)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
v9fs_put_acl(dacl, pacl);
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
return err;
}
@@ -956,7 +934,7 @@ v9fs_vfs_get_link_dotl(struct dentry *dentry,
if (IS_ERR(fid))
return ERR_CAST(fid);
retval = p9_client_readlink(fid, &target);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
if (retval)
return ERR_PTR(retval);
set_delayed_call(done, kfree_link, target);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 97e23b4e6982..2d9ee073d12c 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -184,13 +184,13 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
retval = v9fs_get_acl(inode, fid);
if (retval)
goto release_sb;
- v9fs_fid_add(root, fid);
+ v9fs_fid_add(root, &fid);
p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n");
return dget(sb->s_root);
clunk_fid:
- p9_client_clunk(fid);
+ p9_fid_put(fid);
v9fs_session_close(v9ses);
free_session:
kfree(v9ses);
@@ -203,7 +203,7 @@ release_sb:
* attached the fid to dentry so it won't get clunked
* automatically.
*/
- p9_client_clunk(fid);
+ p9_fid_put(fid);
deactivate_locked_super(sb);
return ERR_PTR(retval);
}
@@ -270,7 +270,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
}
res = simple_statfs(dentry, buf);
done:
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return res;
}
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index a824441b95a2..1f9298a4bd42 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -44,7 +44,7 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
if (err)
retval = err;
}
- p9_client_clunk(attr_fid);
+ p9_fid_put(attr_fid);
return retval;
}
@@ -71,7 +71,7 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
if (IS_ERR(fid))
return PTR_ERR(fid);
ret = v9fs_fid_xattr_get(fid, name, buffer, buffer_size);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return ret;
}
@@ -98,7 +98,7 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
if (IS_ERR(fid))
return PTR_ERR(fid);
ret = v9fs_fid_xattr_set(fid, name, value, value_len, flags);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return ret;
}
@@ -128,7 +128,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
retval);
else
p9_client_write(fid, 0, &from, &retval);
- err = p9_client_clunk(fid);
+ err = p9_fid_put(fid);
if (!retval && err)
retval = err;
return retval;
diff --git a/fs/Kconfig b/fs/Kconfig
index 5976eb33535f..2685a4d0d353 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -235,6 +235,7 @@ config ARCH_SUPPORTS_HUGETLBFS
config HUGETLBFS
bool "HugeTLB file system support"
depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
+ depends on (SYSFS || SYSCTL)
help
hugetlbfs is a filesystem backing for HugeTLB pages, based on
ramfs. For architectures that support it, say Y here and read
@@ -247,8 +248,7 @@ config HUGETLB_PAGE
#
# Select this config option from the architecture Kconfig, if it is preferred
-# to enable the feature of minimizing overhead of struct page associated with
-# each HugeTLB page.
+# to enable the feature of HugeTLB Vmemmap Optimization (HVO).
#
config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
bool
@@ -259,14 +259,13 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
depends on SPARSEMEM_VMEMMAP
config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
- bool "Default optimizing vmemmap pages of HugeTLB to on"
+ bool "HugeTLB Vmemmap Optimization (HVO) defaults to on"
default n
depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP
help
- When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap
- pages associated with each HugeTLB page is default off. Say Y here
- to enable optimizing vmemmap pages of HugeTLB by default. It can then
- be disabled on the command line via hugetlb_free_vmemmap=off.
+ The HugeTLB VmemmapvOptimization (HVO) defaults to off. Say Y here to
+ enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off
+ (boot command line) or hugetlb_optimize_vmemmap (sysctl).
config MEMFD_CREATE
def_bool TMPFS || HUGETLBFS
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 21e154516bf2..93539aac0e5b 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -58,7 +58,7 @@ config ARCH_USE_GNU_PROPERTY
config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y if !BINFMT_ELF
- depends on ARM || ((M68K || SUPERH) && !MMU)
+ depends on ARM || ((M68K || SUPERH || XTENSA) && !MMU)
select ELFCORE
help
ELF FDPIC binaries are based on ELF, but allow the individual load
@@ -142,39 +142,6 @@ config BINFMT_ZFLAT
help
Support FLAT format compressed binaries
-config HAVE_AOUT
- def_bool n
-
-config BINFMT_AOUT
- tristate "Kernel support for a.out and ECOFF binaries"
- depends on HAVE_AOUT
- help
- A.out (Assembler.OUTput) is a set of formats for libraries and
- executables used in the earliest versions of UNIX. Linux used
- the a.out formats QMAGIC and ZMAGIC until they were replaced
- with the ELF format.
-
- The conversion to ELF started in 1995. This option is primarily
- provided for historical interest and for the benefit of those
- who need to run binaries from that era.
-
- Most people should answer N here. If you think you may have
- occasional use for this format, enable module support above
- and answer M here to compile this support as a module called
- binfmt_aout.
-
- If any crucial components of your system (such as /sbin/init
- or /lib/ld.so) are still in a.out format, you will have to
- say Y here.
-
-config OSF4_COMPAT
- bool "OSF/1 v4 readv/writev compatibility"
- depends on ALPHA && BINFMT_AOUT
- help
- Say Y if you are using OSF/1 binaries (like Netscape and Acrobat)
- with v4 shared libraries freely available from Compaq. If you're
- going to use shared libraries from Tru64 version 5.0 or later, say N.
-
config BINFMT_MISC
tristate "Kernel support for MISC binaries"
help
diff --git a/fs/Makefile b/fs/Makefile
index 208a74e0b00e..4dea17840761 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -34,13 +34,10 @@ obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
-obj-$(CONFIG_IO_URING) += io_uring.o
-obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/
obj-$(CONFIG_FILE_LOCKING) += locks.o
-obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o
obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o
diff --git a/fs/affs/file.c b/fs/affs/file.c
index cd00a4c68a12..cefa222f7881 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -526,7 +526,6 @@ affs_do_readpage_ofs(struct page *page, unsigned to, int create)
struct inode *inode = page->mapping->host;
struct super_block *sb = inode->i_sb;
struct buffer_head *bh;
- char *data;
unsigned pos = 0;
u32 bidx, boff, bsize;
u32 tmp;
@@ -545,15 +544,12 @@ affs_do_readpage_ofs(struct page *page, unsigned to, int create)
return PTR_ERR(bh);
tmp = min(bsize - boff, to - pos);
BUG_ON(pos + tmp > to || tmp > bsize);
- data = kmap_atomic(page);
- memcpy(data + pos, AFFS_DATA(bh) + boff, tmp);
- kunmap_atomic(data);
+ memcpy_to_page(page, pos, AFFS_DATA(bh) + boff, tmp);
affs_brelse(bh);
bidx++;
pos += tmp;
boff = 0;
}
- flush_dcache_page(page);
return 0;
}
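
Note: the affs hunk above folds an open-coded kmap_atomic()/memcpy()/kunmap_atomic() sequence plus the trailing flush_dcache_page() into memcpy_to_page(). For reference, memcpy_to_page() (from include/linux/highmem.h) behaves roughly like the sketch below, which is why the explicit flush_dcache_page(page) call could be dropped; treat the details as an approximation, not a quote of that header.

        /* Approximate equivalent of memcpy_to_page(); sketch, not the
         * verbatim highmem.h implementation. */
        static inline void memcpy_to_page_sketch(struct page *page, size_t offset,
                                                 const char *from, size_t len)
        {
                char *to = kmap_local_page(page);

                memcpy(to + offset, from, len);
                flush_dcache_page(page);
                kunmap_local(to);
        }
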
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 4c5f30a83336..58b391446ae1 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -276,7 +276,7 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
char *vol = match_strdup(&args[0]);
if (!vol)
return 0;
- strlcpy(volume, vol, 32);
+ strscpy(volume, vol, 32);
kfree(vol);
break;
}
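
Note: the strlcpy() to strscpy() switch above is more than a rename: strscpy() does not read the source beyond the destination size and returns -E2BIG on truncation instead of the would-be source length. The hunk above does not check the return value; a hypothetical caller that wanted to detect truncation could do:

        /* Hypothetical truncation check; 'volume' and 'vol' as in the hunk above. */
        if (strscpy(volume, vol, 32) < 0)
                pr_warn("affs: volume name truncated\n");
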
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 07ad744eef77..988c2ac7cece 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -158,7 +158,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
cell->name[i] = tolower(name[i]);
cell->name[i] = 0;
- atomic_set(&cell->ref, 1);
+ refcount_set(&cell->ref, 1);
atomic_set(&cell->active, 0);
INIT_WORK(&cell->manager, afs_manage_cell_work);
cell->volumes = RB_ROOT;
@@ -287,7 +287,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
cell = candidate;
candidate = NULL;
atomic_set(&cell->active, 2);
- trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), 2, afs_cell_trace_insert);
+ trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 2, afs_cell_trace_insert);
rb_link_node_rcu(&cell->net_node, parent, pp);
rb_insert_color(&cell->net_node, &net->cells);
up_write(&net->cells_lock);
@@ -295,7 +295,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
afs_queue_cell(cell, afs_cell_trace_get_queue_new);
wait_for_cell:
- trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), atomic_read(&cell->active),
+ trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), atomic_read(&cell->active),
afs_cell_trace_wait);
_debug("wait_for_cell");
wait_var_event(&cell->state,
@@ -490,13 +490,13 @@ static void afs_cell_destroy(struct rcu_head *rcu)
{
struct afs_cell *cell = container_of(rcu, struct afs_cell, rcu);
struct afs_net *net = cell->net;
- int u;
+ int r;
_enter("%p{%s}", cell, cell->name);
- u = atomic_read(&cell->ref);
- ASSERTCMP(u, ==, 0);
- trace_afs_cell(cell->debug_id, u, atomic_read(&cell->active), afs_cell_trace_free);
+ r = refcount_read(&cell->ref);
+ ASSERTCMP(r, ==, 0);
+ trace_afs_cell(cell->debug_id, r, atomic_read(&cell->active), afs_cell_trace_free);
afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers));
afs_unuse_cell(net, cell->alias_of, afs_cell_trace_unuse_alias);
@@ -539,13 +539,10 @@ void afs_cells_timer(struct timer_list *timer)
*/
struct afs_cell *afs_get_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
- int u;
+ int r;
- if (atomic_read(&cell->ref) <= 0)
- BUG();
-
- u = atomic_inc_return(&cell->ref);
- trace_afs_cell(cell->debug_id, u, atomic_read(&cell->active), reason);
+ __refcount_inc(&cell->ref, &r);
+ trace_afs_cell(cell->debug_id, r + 1, atomic_read(&cell->active), reason);
return cell;
}
@@ -556,12 +553,14 @@ void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
if (cell) {
unsigned int debug_id = cell->debug_id;
- unsigned int u, a;
+ unsigned int a;
+ bool zero;
+ int r;
a = atomic_read(&cell->active);
- u = atomic_dec_return(&cell->ref);
- trace_afs_cell(debug_id, u, a, reason);
- if (u == 0) {
+ zero = __refcount_dec_and_test(&cell->ref, &r);
+ trace_afs_cell(debug_id, r - 1, a, reason);
+ if (zero) {
a = atomic_read(&cell->active);
WARN(a != 0, "Cell active count %u > 0\n", a);
call_rcu(&cell->rcu, afs_cell_destroy);
@@ -574,14 +573,12 @@ void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason)
*/
struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
- int u, a;
-
- if (atomic_read(&cell->ref) <= 0)
- BUG();
+ int r, a;
- u = atomic_read(&cell->ref);
+ r = refcount_read(&cell->ref);
+ WARN_ON(r == 0);
a = atomic_inc_return(&cell->active);
- trace_afs_cell(cell->debug_id, u, a, reason);
+ trace_afs_cell(cell->debug_id, r, a, reason);
return cell;
}
@@ -593,7 +590,7 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr
{
unsigned int debug_id;
time64_t now, expire_delay;
- int u, a;
+ int r, a;
if (!cell)
return;
@@ -607,9 +604,9 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr
expire_delay = afs_cell_gc_delay;
debug_id = cell->debug_id;
- u = atomic_read(&cell->ref);
+ r = refcount_read(&cell->ref);
a = atomic_dec_return(&cell->active);
- trace_afs_cell(debug_id, u, a, reason);
+ trace_afs_cell(debug_id, r, a, reason);
WARN_ON(a == 0);
if (a == 1)
/* 'cell' may now be garbage collected. */
@@ -621,11 +618,11 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr
*/
void afs_see_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
- int u, a;
+ int r, a;
- u = atomic_read(&cell->ref);
+ r = refcount_read(&cell->ref);
a = atomic_read(&cell->active);
- trace_afs_cell(cell->debug_id, u, a, reason);
+ trace_afs_cell(cell->debug_id, r, a, reason);
}
/*
@@ -739,7 +736,7 @@ again:
active = 1;
if (atomic_try_cmpxchg_relaxed(&cell->active, &active, 0)) {
rb_erase(&cell->net_node, &net->cells);
- trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), 0,
+ trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 0,
afs_cell_trace_unuse_delete);
smp_store_release(&cell->state, AFS_CELL_REMOVED);
}
@@ -866,7 +863,7 @@ void afs_manage_cells(struct work_struct *work)
bool sched_cell = false;
active = atomic_read(&cell->active);
- trace_afs_cell(cell->debug_id, atomic_read(&cell->ref),
+ trace_afs_cell(cell->debug_id, refcount_read(&cell->ref),
active, afs_cell_trace_manage);
ASSERTCMP(active, >=, 1);
@@ -874,7 +871,7 @@ void afs_manage_cells(struct work_struct *work)
if (purging) {
if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) {
active = atomic_dec_return(&cell->active);
- trace_afs_cell(cell->debug_id, atomic_read(&cell->ref),
+ trace_afs_cell(cell->debug_id, refcount_read(&cell->ref),
active, afs_cell_trace_unuse_pin);
}
}
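
The cell conversion above follows one pattern throughout: capture the old count from __refcount_inc()/__refcount_dec_and_test() and derive the new value for the trace point, so nothing re-reads the refcount after the object may already have been freed. A minimal sketch of that pattern on a made-up structure; struct foo and the pr_debug() calls stand in for the real object and tracepoints:

#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/printk.h>

struct foo {
        refcount_t ref;
};

static struct foo *foo_get(struct foo *f)
{
        int r;

        /* The old count is returned through 'r'; the new count is r + 1. */
        __refcount_inc(&f->ref, &r);
        pr_debug("foo get -> %d\n", r + 1);
        return f;
}

static void foo_put(struct foo *f)
{
        bool zero;
        int r;

        /* Capture the old count before the object can go away, then
         * report r - 1 rather than re-reading the refcount. */
        zero = __refcount_dec_and_test(&f->ref, &r);
        pr_debug("foo put -> %d\n", r - 1);
        if (zero)
                kfree(f);
}
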
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index a3f5de28be79..0a090d614e76 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -212,8 +212,8 @@ static void SRXAFSCB_CallBack(struct work_struct *work)
* to maintain cache coherency.
*/
if (call->server) {
- trace_afs_server(call->server,
- atomic_read(&call->server->ref),
+ trace_afs_server(call->server->debug_id,
+ refcount_read(&call->server->ref),
atomic_read(&call->server->active),
afs_server_trace_callback);
afs_break_callbacks(call->server, call->count, call->request);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 56ae5cd5184f..230c2d19116d 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -24,9 +24,9 @@ static int afs_readdir(struct file *file, struct dir_context *ctx);
static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
static int afs_d_delete(const struct dentry *dentry);
static void afs_d_iput(struct dentry *dentry, struct inode *inode);
-static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen,
+static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen,
loff_t fpos, u64 ino, unsigned dtype);
-static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen,
+static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen,
loff_t fpos, u64 ino, unsigned dtype);
static int afs_create(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl);
@@ -568,7 +568,7 @@ static int afs_readdir(struct file *file, struct dir_context *ctx)
* - if afs_dir_iterate_block() spots this function, it'll pass the FID
* uniquifier through dtype
*/
-static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name,
+static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name,
int nlen, loff_t fpos, u64 ino, unsigned dtype)
{
struct afs_lookup_one_cookie *cookie =
@@ -584,16 +584,16 @@ static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name,
if (cookie->name.len != nlen ||
memcmp(cookie->name.name, name, nlen) != 0) {
- _leave(" = 0 [no]");
- return 0;
+ _leave(" = true [keep looking]");
+ return true;
}
cookie->fid.vnode = ino;
cookie->fid.unique = dtype;
cookie->found = 1;
- _leave(" = -1 [found]");
- return -1;
+ _leave(" = false [found]");
+ return false;
}
/*
@@ -636,12 +636,11 @@ static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry,
* - if afs_dir_iterate_block() spots this function, it'll pass the FID
* uniquifier through dtype
*/
-static int afs_lookup_filldir(struct dir_context *ctx, const char *name,
+static bool afs_lookup_filldir(struct dir_context *ctx, const char *name,
int nlen, loff_t fpos, u64 ino, unsigned dtype)
{
struct afs_lookup_cookie *cookie =
container_of(ctx, struct afs_lookup_cookie, ctx);
- int ret;
_enter("{%s,%u},%s,%u,,%llu,%u",
cookie->name.name, cookie->name.len, name, nlen,
@@ -663,12 +662,10 @@ static int afs_lookup_filldir(struct dir_context *ctx, const char *name,
cookie->fids[1].unique = dtype;
cookie->found = 1;
if (cookie->one_only)
- return -1;
+ return false;
}
- ret = cookie->nr_fids >= 50 ? -1 : 0;
- _leave(" = %d", ret);
- return ret;
+ return cookie->nr_fids < 50;
}
/*
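
These actors change along with the filldir_t prototype: an actor now returns bool, where true means "keep iterating" and false means "stop" (previously 0 and non-zero respectively). A short sketch of an actor under the new convention; count_cookie and count_filldir are illustrative names only:

#include <linux/fs.h>

struct count_cookie {
        struct dir_context ctx;         /* embedded context handed to the iterator */
        unsigned int nr;
};

/* Count directory entries and ask the filesystem to stop emitting
 * entries once 50 have been seen. */
static bool count_filldir(struct dir_context *ctx, const char *name,
                          int namelen, loff_t pos, u64 ino,
                          unsigned int d_type)
{
        struct count_cookie *cookie =
                container_of(ctx, struct count_cookie, ctx);

        cookie->nr++;
        return cookie->nr < 50; /* false stops the directory iteration early */
}
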
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 42118a4f3383..d1cfb235c4b9 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -375,7 +375,7 @@ static int afs_begin_cache_operation(struct netfs_io_request *rreq)
}
static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
- struct folio *folio, void **_fsdata)
+ struct folio **foliop, void **_fsdata)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index c4210a3964d8..bbcc5afd1576 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -76,7 +76,7 @@ void afs_lock_op_done(struct afs_call *call)
if (call->error == 0) {
spin_lock(&vnode->lock);
trace_afs_flock_ev(vnode, NULL, afs_flock_timestamp, 0);
- vnode->locked_at = call->reply_time;
+ vnode->locked_at = call->issue_time;
afs_schedule_lock_extension(vnode);
spin_unlock(&vnode->lock);
}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 4943413d9c5f..7d37f63ef0f0 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -131,7 +131,7 @@ bad:
static time64_t xdr_decode_expiry(struct afs_call *call, u32 expiry)
{
- return ktime_divns(call->reply_time, NSEC_PER_SEC) + expiry;
+ return ktime_divns(call->issue_time, NSEC_PER_SEC) + expiry;
}
static void xdr_decode_AFSCallBack(const __be32 **_bp,
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 64dab70d4a4f..6d3a3dbe4928 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -104,12 +104,14 @@ static int afs_inode_init_from_status(struct afs_operation *op,
inode->i_op = &afs_file_inode_operations;
inode->i_fop = &afs_file_operations;
inode->i_mapping->a_ops = &afs_file_aops;
+ mapping_set_large_folios(inode->i_mapping);
break;
case AFS_FTYPE_DIR:
inode->i_mode = S_IFDIR | (status->mode & S_IALLUGO);
inode->i_op = &afs_dir_inode_operations;
inode->i_fop = &afs_dir_file_operations;
inode->i_mapping->a_ops = &afs_dir_aops;
+ mapping_set_large_folios(inode->i_mapping);
break;
case AFS_FTYPE_SYMLINK:
/* Symlinks with a mode of 0644 are actually mountpoints. */
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a6f25d9e75b5..723d162078a3 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -122,7 +122,7 @@ struct afs_call {
};
struct afs_operation *op;
unsigned int server_index;
- atomic_t usage;
+ refcount_t ref;
enum afs_call_state state;
spinlock_t state_lock;
int error; /* error code */
@@ -137,7 +137,6 @@ struct afs_call {
bool need_attention; /* T if RxRPC poked us */
bool async; /* T if asynchronous */
bool upgrade; /* T to request service upgrade */
- bool have_reply_time; /* T if have got reply_time */
bool intr; /* T if interruptible */
bool unmarshalling_error; /* T if an unmarshalling error occurred */
u16 service_id; /* Actual service ID (after upgrade) */
@@ -151,7 +150,7 @@ struct afs_call {
} __attribute__((packed));
__be64 tmp64;
};
- ktime_t reply_time; /* Time of first reply packet */
+ ktime_t issue_time; /* Time of issue of operation */
};
struct afs_call_type {
@@ -365,7 +364,7 @@ struct afs_cell {
struct hlist_node proc_link; /* /proc cell list link */
time64_t dns_expiry; /* Time AFSDB/SRV record expires */
time64_t last_inactive; /* Time of last drop of usage count */
- atomic_t ref; /* Struct refcount */
+ refcount_t ref; /* Struct refcount */
atomic_t active; /* Active usage counter */
unsigned long flags;
#define AFS_CELL_FL_NO_GC 0 /* The cell was added manually, don't auto-gc */
@@ -410,7 +409,7 @@ struct afs_vlserver {
#define AFS_VLSERVER_FL_IS_YFS 2 /* Server is YFS not AFS */
#define AFS_VLSERVER_FL_RESPONDING 3 /* VL server is responding */
rwlock_t lock; /* Lock on addresses */
- atomic_t usage;
+ refcount_t ref;
unsigned int rtt; /* Server's current RTT in uS */
/* Probe state */
@@ -446,7 +445,7 @@ struct afs_vlserver_entry {
struct afs_vlserver_list {
struct rcu_head rcu;
- atomic_t usage;
+ refcount_t ref;
u8 nr_servers;
u8 index; /* Server currently in use */
u8 preferred; /* Preferred server */
@@ -517,7 +516,7 @@ struct afs_server {
#define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */
#define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */
#define AFS_SERVER_FL_HAS_FS64 19 /* Fileserver supports FS.{Fetch,Store}Data64 */
- atomic_t ref; /* Object refcount */
+ refcount_t ref; /* Object refcount */
atomic_t active; /* Active user count */
u32 addr_version; /* Address list version */
unsigned int rtt; /* Server's current RTT in uS */
@@ -571,7 +570,7 @@ struct afs_volume {
struct rcu_head rcu;
afs_volid_t vid; /* volume ID */
};
- atomic_t usage;
+ refcount_t ref;
time64_t update_at; /* Time at which to next update */
struct afs_cell *cell; /* Cell to which belongs (pins ref) */
struct rb_node cell_node; /* Link in cell->volumes */
@@ -1493,14 +1492,14 @@ extern int afs_end_vlserver_operation(struct afs_vl_cursor *);
*/
static inline struct afs_vlserver *afs_get_vlserver(struct afs_vlserver *vlserver)
{
- atomic_inc(&vlserver->usage);
+ refcount_inc(&vlserver->ref);
return vlserver;
}
static inline struct afs_vlserver_list *afs_get_vlserverlist(struct afs_vlserver_list *vllist)
{
if (vllist)
- atomic_inc(&vllist->usage);
+ refcount_inc(&vllist->ref);
return vllist;
}
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 933e67fcdab1..805328ca5428 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -69,6 +69,7 @@ int afs_abort_to_error(u32 abort_code)
/* Unified AFS error table */
case UAEPERM: return -EPERM;
case UAENOENT: return -ENOENT;
+ case UAEAGAIN: return -EAGAIN;
case UAEACCES: return -EACCES;
case UAEBUSY: return -EBUSY;
case UAEEXIST: return -EEXIST;
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index bbb2c210d139..97f50e9fd9eb 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -132,12 +132,6 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
if (IS_ERR(page))
return PTR_ERR(page);
- if (PageError(page)) {
- ret = afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt);
- put_page(page);
- return ret;
- }
-
buf = kmap(page);
ret = -EINVAL;
if (buf[size - 1] == '.')
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index e1b863449296..2a0c83d71565 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -47,7 +47,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v)
/* display one cell per line on subsequent lines */
seq_printf(m, "%3u %3u %6lld %2u %2u %s\n",
- atomic_read(&cell->ref),
+ refcount_read(&cell->ref),
atomic_read(&cell->active),
cell->dns_expiry - ktime_get_real_seconds(),
vllist ? vllist->nr_servers : 0,
@@ -217,7 +217,7 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
}
seq_printf(m, "%3d %08llx %s %s\n",
- atomic_read(&vol->usage), vol->vid,
+ refcount_read(&vol->ref), vol->vid,
afs_vol_types[vol->type],
vol->name);
@@ -388,7 +388,7 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
alist = rcu_dereference(server->addresses);
seq_printf(m, "%pU %3d %3d\n",
&server->uuid,
- atomic_read(&server->ref),
+ refcount_read(&server->ref),
atomic_read(&server->active));
seq_printf(m, " - info: fl=%lx rtt=%u brk=%x\n",
server->flags, server->rtt, server->cb_s_break);
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index a5434f3e57c6..eccc3cd0cb70 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -145,14 +145,14 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
call->type = type;
call->net = net;
call->debug_id = atomic_inc_return(&rxrpc_debug_id);
- atomic_set(&call->usage, 1);
+ refcount_set(&call->ref, 1);
INIT_WORK(&call->async_work, afs_process_async_call);
init_waitqueue_head(&call->waitq);
spin_lock_init(&call->state_lock);
call->iter = &call->def_iter;
o = atomic_inc_return(&net->nr_outstanding_calls);
- trace_afs_call(call, afs_call_trace_alloc, 1, o,
+ trace_afs_call(call->debug_id, afs_call_trace_alloc, 1, o,
__builtin_return_address(0));
return call;
}
@@ -163,14 +163,16 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
void afs_put_call(struct afs_call *call)
{
struct afs_net *net = call->net;
- int n = atomic_dec_return(&call->usage);
- int o = atomic_read(&net->nr_outstanding_calls);
+ unsigned int debug_id = call->debug_id;
+ bool zero;
+ int r, o;
- trace_afs_call(call, afs_call_trace_put, n, o,
+ zero = __refcount_dec_and_test(&call->ref, &r);
+ o = atomic_read(&net->nr_outstanding_calls);
+ trace_afs_call(debug_id, afs_call_trace_put, r - 1, o,
__builtin_return_address(0));
- ASSERTCMP(n, >=, 0);
- if (n == 0) {
+ if (zero) {
ASSERT(!work_pending(&call->async_work));
ASSERT(call->type->name != NULL);
@@ -185,7 +187,7 @@ void afs_put_call(struct afs_call *call)
afs_put_addrlist(call->alist);
kfree(call->request);
- trace_afs_call(call, afs_call_trace_free, 0, o,
+ trace_afs_call(call->debug_id, afs_call_trace_free, 0, o,
__builtin_return_address(0));
kfree(call);
@@ -198,9 +200,11 @@ void afs_put_call(struct afs_call *call)
static struct afs_call *afs_get_call(struct afs_call *call,
enum afs_call_trace why)
{
- int u = atomic_inc_return(&call->usage);
+ int r;
- trace_afs_call(call, why, u,
+ __refcount_inc(&call->ref, &r);
+
+ trace_afs_call(call->debug_id, why, r + 1,
atomic_read(&call->net->nr_outstanding_calls),
__builtin_return_address(0));
return call;
@@ -347,6 +351,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
if (call->max_lifespan)
rxrpc_kernel_set_max_life(call->net->socket, rxcall,
call->max_lifespan);
+ call->issue_time = ktime_get_real();
/* send the request */
iov[0].iov_base = call->request;
@@ -497,12 +502,6 @@ static void afs_deliver_to_call(struct afs_call *call)
return;
}
- if (!call->have_reply_time &&
- rxrpc_kernel_get_reply_time(call->net->socket,
- call->rxcall,
- &call->reply_time))
- call->have_reply_time = true;
-
ret = call->type->deliver(call);
state = READ_ONCE(call->state);
if (ret == 0 && call->unmarshalling_error)
@@ -668,14 +667,13 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
unsigned long call_user_ID)
{
struct afs_call *call = (struct afs_call *)call_user_ID;
- int u;
+ int r;
trace_afs_notify_call(rxcall, call);
call->need_attention = true;
- u = atomic_fetch_add_unless(&call->usage, 1, 0);
- if (u != 0) {
- trace_afs_call(call, afs_call_trace_wake, u + 1,
+ if (__refcount_inc_not_zero(&call->ref, &r)) {
+ trace_afs_call(call->debug_id, afs_call_trace_wake, r + 1,
atomic_read(&call->net->nr_outstanding_calls),
__builtin_return_address(0));
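
The wake-up path above uses the conditional form of the increment: a reference is taken only if the call's count has not already hit zero, replacing the old atomic_fetch_add_unless(..., 1, 0). A minimal hedged sketch of that check; afs_maybe_get_call() is an invented name for illustration and relies on the afs_call definition from fs/afs/internal.h:

#include <linux/refcount.h>

/* Take a reference only if the call is still live.  Returns NULL if
 * the count had already reached zero, meaning teardown is in progress
 * and the caller must not touch the call further.  The new count is
 * r + 1, suitable for passing to the tracepoint. */
static struct afs_call *afs_maybe_get_call(struct afs_call *call)
{
        int r;

        if (!__refcount_inc_not_zero(&call->ref, &r))
                return NULL;
        return call;
}
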
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 6e5b9a19b234..4981baf97835 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -228,7 +228,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
if (!server)
goto enomem;
- atomic_set(&server->ref, 1);
+ refcount_set(&server->ref, 1);
atomic_set(&server->active, 1);
server->debug_id = atomic_inc_return(&afs_server_debug_id);
RCU_INIT_POINTER(server->addresses, alist);
@@ -243,7 +243,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
server->rtt = UINT_MAX;
afs_inc_servers_outstanding(net);
- trace_afs_server(server, 1, 1, afs_server_trace_alloc);
+ trace_afs_server(server->debug_id, 1, 1, afs_server_trace_alloc);
_leave(" = %p", server);
return server;
@@ -352,9 +352,12 @@ void afs_servers_timer(struct timer_list *timer)
struct afs_server *afs_get_server(struct afs_server *server,
enum afs_server_trace reason)
{
- unsigned int u = atomic_inc_return(&server->ref);
+ unsigned int a;
+ int r;
- trace_afs_server(server, u, atomic_read(&server->active), reason);
+ __refcount_inc(&server->ref, &r);
+ a = atomic_read(&server->active);
+ trace_afs_server(server->debug_id, r + 1, a, reason);
return server;
}
@@ -364,14 +367,14 @@ struct afs_server *afs_get_server(struct afs_server *server,
static struct afs_server *afs_maybe_use_server(struct afs_server *server,
enum afs_server_trace reason)
{
- unsigned int r = atomic_fetch_add_unless(&server->ref, 1, 0);
unsigned int a;
+ int r;
- if (r == 0)
+ if (!__refcount_inc_not_zero(&server->ref, &r))
return NULL;
a = atomic_inc_return(&server->active);
- trace_afs_server(server, r, a, reason);
+ trace_afs_server(server->debug_id, r + 1, a, reason);
return server;
}
@@ -380,10 +383,13 @@ static struct afs_server *afs_maybe_use_server(struct afs_server *server,
*/
struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_trace reason)
{
- unsigned int r = atomic_inc_return(&server->ref);
- unsigned int a = atomic_inc_return(&server->active);
+ unsigned int a;
+ int r;
- trace_afs_server(server, r, a, reason);
+ __refcount_inc(&server->ref, &r);
+ a = atomic_inc_return(&server->active);
+
+ trace_afs_server(server->debug_id, r + 1, a, reason);
return server;
}
@@ -393,14 +399,17 @@ struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_tra
void afs_put_server(struct afs_net *net, struct afs_server *server,
enum afs_server_trace reason)
{
- unsigned int usage;
+ unsigned int a, debug_id = server->debug_id;
+ bool zero;
+ int r;
if (!server)
return;
- usage = atomic_dec_return(&server->ref);
- trace_afs_server(server, usage, atomic_read(&server->active), reason);
- if (unlikely(usage == 0))
+ a = atomic_inc_return(&server->active);
+ zero = __refcount_dec_and_test(&server->ref, &r);
+ trace_afs_server(debug_id, r - 1, a, reason);
+ if (unlikely(zero))
__afs_put_server(net, server);
}
@@ -436,7 +445,7 @@ static void afs_server_rcu(struct rcu_head *rcu)
{
struct afs_server *server = container_of(rcu, struct afs_server, rcu);
- trace_afs_server(server, atomic_read(&server->ref),
+ trace_afs_server(server->debug_id, refcount_read(&server->ref),
atomic_read(&server->active), afs_server_trace_free);
afs_put_addrlist(rcu_access_pointer(server->addresses));
kfree(server);
@@ -487,7 +496,7 @@ static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list)
active = atomic_read(&server->active);
if (active == 0) {
- trace_afs_server(server, atomic_read(&server->ref),
+ trace_afs_server(server->debug_id, refcount_read(&server->ref),
active, afs_server_trace_gc);
next = rcu_dereference_protected(
server->uuid_next, lockdep_is_held(&net->fs_lock.lock));
@@ -553,7 +562,7 @@ void afs_manage_servers(struct work_struct *work)
_debug("manage %pU %u", &server->uuid, active);
if (purging) {
- trace_afs_server(server, atomic_read(&server->ref),
+ trace_afs_server(server->debug_id, refcount_read(&server->ref),
active, afs_server_trace_purging);
if (active != 0)
pr_notice("Can't purge s=%08x\n", server->debug_id);
@@ -633,7 +642,8 @@ static noinline bool afs_update_server_record(struct afs_operation *op,
_enter("");
- trace_afs_server(server, atomic_read(&server->ref), atomic_read(&server->active),
+ trace_afs_server(server->debug_id, refcount_read(&server->ref),
+ atomic_read(&server->active),
afs_server_trace_update);
alist = afs_vl_lookup_addrs(op->volume->cell, op->key, &server->uuid);
diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c
index 38b2ba1d9ec0..acc48216136a 100644
--- a/fs/afs/vl_list.c
+++ b/fs/afs/vl_list.c
@@ -17,7 +17,7 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len,
vlserver = kzalloc(struct_size(vlserver, name, name_len + 1),
GFP_KERNEL);
if (vlserver) {
- atomic_set(&vlserver->usage, 1);
+ refcount_set(&vlserver->ref, 1);
rwlock_init(&vlserver->lock);
init_waitqueue_head(&vlserver->probe_wq);
spin_lock_init(&vlserver->probe_lock);
@@ -39,13 +39,9 @@ static void afs_vlserver_rcu(struct rcu_head *rcu)
void afs_put_vlserver(struct afs_net *net, struct afs_vlserver *vlserver)
{
- if (vlserver) {
- unsigned int u = atomic_dec_return(&vlserver->usage);
- //_debug("VL PUT %p{%u}", vlserver, u);
-
- if (u == 0)
- call_rcu(&vlserver->rcu, afs_vlserver_rcu);
- }
+ if (vlserver &&
+ refcount_dec_and_test(&vlserver->ref))
+ call_rcu(&vlserver->rcu, afs_vlserver_rcu);
}
struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int nr_servers)
@@ -54,7 +50,7 @@ struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int nr_servers)
vllist = kzalloc(struct_size(vllist, servers, nr_servers), GFP_KERNEL);
if (vllist) {
- atomic_set(&vllist->usage, 1);
+ refcount_set(&vllist->ref, 1);
rwlock_init(&vllist->lock);
}
@@ -64,10 +60,7 @@ struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int nr_servers)
void afs_put_vlserverlist(struct afs_net *net, struct afs_vlserver_list *vllist)
{
if (vllist) {
- unsigned int u = atomic_dec_return(&vllist->usage);
-
- //_debug("VLLS PUT %p{%u}", vllist, u);
- if (u == 0) {
+ if (refcount_dec_and_test(&vllist->ref)) {
int i;
for (i = 0; i < vllist->nr_servers; i++) {
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index cc665cef0abe..f4937029dcd7 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -52,7 +52,7 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume)
struct afs_cell *cell = volume->cell;
if (!hlist_unhashed(&volume->proc_link)) {
- trace_afs_volume(volume->vid, atomic_read(&volume->usage),
+ trace_afs_volume(volume->vid, refcount_read(&cell->ref),
afs_volume_trace_remove);
write_seqlock(&cell->volume_lock);
hlist_del_rcu(&volume->proc_link);
@@ -87,7 +87,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
volume->type_force = params->force;
volume->name_len = vldb->name_len;
- atomic_set(&volume->usage, 1);
+ refcount_set(&volume->ref, 1);
INIT_HLIST_NODE(&volume->proc_link);
rwlock_init(&volume->servers_lock);
rwlock_init(&volume->cb_v_break_lock);
@@ -228,7 +228,7 @@ static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume)
afs_remove_volume_from_cell(volume);
afs_put_serverlist(net, rcu_access_pointer(volume->servers));
afs_put_cell(volume->cell, afs_cell_trace_put_vol);
- trace_afs_volume(volume->vid, atomic_read(&volume->usage),
+ trace_afs_volume(volume->vid, refcount_read(&volume->ref),
afs_volume_trace_free);
kfree_rcu(volume, rcu);
@@ -242,8 +242,10 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume,
enum afs_volume_trace reason)
{
if (volume) {
- int u = atomic_inc_return(&volume->usage);
- trace_afs_volume(volume->vid, u, reason);
+ int r;
+
+ __refcount_inc(&volume->ref, &r);
+ trace_afs_volume(volume->vid, r + 1, reason);
}
return volume;
}
@@ -257,9 +259,12 @@ void afs_put_volume(struct afs_net *net, struct afs_volume *volume,
{
if (volume) {
afs_volid_t vid = volume->vid;
- int u = atomic_dec_return(&volume->usage);
- trace_afs_volume(vid, u, reason);
- if (u == 0)
+ bool zero;
+ int r;
+
+ zero = __refcount_dec_and_test(&volume->ref, &r);
+ trace_afs_volume(vid, r - 1, reason);
+ if (zero)
afs_destroy_volume(net, volume);
}
}
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 2c885b22de34..9ebdd36eaf2f 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -91,7 +91,7 @@ try_again:
goto flush_conflicting_write;
}
- *_page = &folio->page;
+ *_page = folio_file_page(folio, pos / PAGE_SIZE);
_leave(" = 0");
return 0;
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index fdc7d675b4b0..11571cca86c1 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -232,8 +232,7 @@ static void xdr_decode_YFSCallBack(const __be32 **_bp,
struct afs_callback *cb = &scb->callback;
ktime_t cb_expiry;
- cb_expiry = call->reply_time;
- cb_expiry = ktime_add(cb_expiry, xdr_to_u64(x->expiration_time) * 100);
+ cb_expiry = ktime_add(call->issue_time, xdr_to_u64(x->expiration_time) * 100);
cb->expires_at = ktime_divns(cb_expiry, NSEC_PER_SEC);
scb->have_cb = true;
*_bp += xdr_size(x);
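
The flock and XDR-decode hunks switch the expiry base from the first-reply timestamp to the time the call was issued, which afs_make_call() now records unconditionally. A small sketch of the resulting calculation; expiry_from_issue() is an illustrative helper, not a function from the patch:

#include <linux/ktime.h>
#include <linux/time64.h>

/* Convert a server-supplied lifetime in seconds into an absolute
 * expiry measured from the moment the RPC was issued.  The issue time
 * is always earlier than the first reply, so the computed expiry errs
 * on the early side and cached promises are never trusted too long. */
static time64_t expiry_from_issue(ktime_t issue_time, u32 lifetime_secs)
{
        return ktime_divns(issue_time, NSEC_PER_SEC) + lifetime_secs;
}
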
diff --git a/fs/aio.c b/fs/aio.c
index 3c249b938632..5b2ff20ad322 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -400,8 +400,8 @@ static const struct file_operations aio_ring_fops = {
};
#if IS_ENABLED(CONFIG_MIGRATION)
-static int aio_migratepage(struct address_space *mapping, struct page *new,
- struct page *old, enum migrate_mode mode)
+static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
+ struct folio *src, enum migrate_mode mode)
{
struct kioctx *ctx;
unsigned long flags;
@@ -435,10 +435,10 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
goto out;
}
- idx = old->index;
+ idx = src->index;
if (idx < (pgoff_t)ctx->nr_pages) {
- /* Make sure the old page hasn't already been changed */
- if (ctx->ring_pages[idx] != old)
+ /* Make sure the old folio hasn't already been changed */
+ if (ctx->ring_pages[idx] != &src->page)
rc = -EAGAIN;
} else
rc = -EINVAL;
@@ -447,27 +447,27 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
goto out_unlock;
/* Writeback must be complete */
- BUG_ON(PageWriteback(old));
- get_page(new);
+ BUG_ON(folio_test_writeback(src));
+ folio_get(dst);
- rc = migrate_page_move_mapping(mapping, new, old, 1);
+ rc = folio_migrate_mapping(mapping, dst, src, 1);
if (rc != MIGRATEPAGE_SUCCESS) {
- put_page(new);
+ folio_put(dst);
goto out_unlock;
}
/* Take completion_lock to prevent other writes to the ring buffer
- * while the old page is copied to the new. This prevents new
+ * while the old folio is copied to the new. This prevents new
* events from being lost.
*/
spin_lock_irqsave(&ctx->completion_lock, flags);
- migrate_page_copy(new, old);
- BUG_ON(ctx->ring_pages[idx] != old);
- ctx->ring_pages[idx] = new;
+ folio_migrate_copy(dst, src);
+ BUG_ON(ctx->ring_pages[idx] != &src->page);
+ ctx->ring_pages[idx] = &dst->page;
spin_unlock_irqrestore(&ctx->completion_lock, flags);
- /* The old page is no longer accessible. */
- put_page(old);
+ /* The old folio is no longer accessible. */
+ folio_put(src);
out_unlock:
mutex_unlock(&ctx->ring_lock);
@@ -475,13 +475,13 @@ out:
spin_unlock(&mapping->private_lock);
return rc;
}
+#else
+#define aio_migrate_folio NULL
#endif
static const struct address_space_operations aio_ctx_aops = {
.dirty_folio = noop_dirty_folio,
-#if IS_ENABLED(CONFIG_MIGRATION)
- .migratepage = aio_migratepage,
-#endif
+ .migrate_folio = aio_migrate_folio,
};
static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
@@ -951,16 +951,13 @@ static bool __get_reqs_available(struct kioctx *ctx)
local_irq_save(flags);
kcpu = this_cpu_ptr(ctx->cpu);
if (!kcpu->reqs_available) {
- int old, avail = atomic_read(&ctx->reqs_available);
+ int avail = atomic_read(&ctx->reqs_available);
do {
if (avail < ctx->req_batch)
goto out;
-
- old = avail;
- avail = atomic_cmpxchg(&ctx->reqs_available,
- avail, avail - ctx->req_batch);
- } while (avail != old);
+ } while (!atomic_try_cmpxchg(&ctx->reqs_available,
+ &avail, avail - ctx->req_batch));
kcpu->reqs_available += ctx->req_batch;
}
@@ -1475,7 +1472,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;
- req->ki_flags = iocb_flags(req->ki_filp);
+ req->ki_flags = req->ki_filp->f_iocb_flags;
if (iocb->aio_flags & IOCB_FLAG_RESFD)
req->ki_flags |= IOCB_EVENTFD;
if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
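
The __get_reqs_available() hunk is a straight conversion to the try_cmpxchg idiom: on failure, atomic_try_cmpxchg() writes the value it actually found back into the expected-value variable, so the loop needs no separate re-read or old/new comparison. A standalone sketch of the same loop; take_batch() is an illustrative name:

#include <linux/atomic.h>
#include <linux/types.h>

/* Atomically reserve 'batch' units from 'counter', or return false if
 * fewer than 'batch' remain.  On a failed compare-exchange, 'avail' is
 * refreshed with the current counter value and the loop retries. */
static bool take_batch(atomic_t *counter, int batch)
{
        int avail = atomic_read(counter);

        do {
                if (avail < batch)
                        return false;
        } while (!atomic_try_cmpxchg(counter, &avail, avail - batch));

        return true;
}
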
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e0c3e33c4177..24192a7667ed 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -32,7 +32,7 @@ static struct inode *anon_inode_inode;
*/
static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
- return dynamic_dname(dentry, buffer, buflen, "anon_inode:%s",
+ return dynamic_dname(buffer, buflen, "anon_inode:%s",
dentry->d_name.name);
}
diff --git a/fs/attr.c b/fs/attr.c
index dbe996b0dedf..1552a5f23d6b 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -22,7 +22,7 @@
* chown_ok - verify permissions to chown inode
* @mnt_userns: user namespace of the mount @inode was found from
* @inode: inode to check permissions on
- * @uid: uid to chown @inode to
+ * @ia_vfsuid: uid to chown @inode to
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
@@ -31,15 +31,15 @@
* performed on the raw inode simply pass init_user_ns.
*/
static bool chown_ok(struct user_namespace *mnt_userns,
- const struct inode *inode,
- kuid_t uid)
+ const struct inode *inode, vfsuid_t ia_vfsuid)
{
- kuid_t kuid = i_uid_into_mnt(mnt_userns, inode);
- if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, inode->i_uid))
+ vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ if (vfsuid_eq_kuid(vfsuid, current_fsuid()) &&
+ vfsuid_eq(ia_vfsuid, vfsuid))
return true;
if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
return true;
- if (uid_eq(kuid, INVALID_UID) &&
+ if (!vfsuid_valid(vfsuid) &&
ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
return true;
return false;
@@ -49,7 +49,7 @@ static bool chown_ok(struct user_namespace *mnt_userns,
* chgrp_ok - verify permissions to chgrp inode
* @mnt_userns: user namespace of the mount @inode was found from
* @inode: inode to check permissions on
- * @gid: gid to chown @inode to
+ * @ia_vfsgid: gid to chown @inode to
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
@@ -58,21 +58,19 @@ static bool chown_ok(struct user_namespace *mnt_userns,
* performed on the raw inode simply pass init_user_ns.
*/
static bool chgrp_ok(struct user_namespace *mnt_userns,
- const struct inode *inode, kgid_t gid)
+ const struct inode *inode, vfsgid_t ia_vfsgid)
{
- kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
- if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) {
- kgid_t mapped_gid;
-
- if (gid_eq(gid, inode->i_gid))
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+ vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ if (vfsuid_eq_kuid(vfsuid, current_fsuid())) {
+ if (vfsgid_eq(ia_vfsgid, vfsgid))
return true;
- mapped_gid = mapped_kgid_fs(mnt_userns, i_user_ns(inode), gid);
- if (in_group_p(mapped_gid))
+ if (vfsgid_in_group_p(ia_vfsgid))
return true;
}
if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
return true;
- if (gid_eq(kgid, INVALID_GID) &&
+ if (!vfsgid_valid(vfsgid) &&
ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
return true;
return false;
@@ -120,28 +118,29 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry,
goto kill_priv;
/* Make sure a caller can chown. */
- if ((ia_valid & ATTR_UID) && !chown_ok(mnt_userns, inode, attr->ia_uid))
+ if ((ia_valid & ATTR_UID) &&
+ !chown_ok(mnt_userns, inode, attr->ia_vfsuid))
return -EPERM;
/* Make sure caller can chgrp. */
- if ((ia_valid & ATTR_GID) && !chgrp_ok(mnt_userns, inode, attr->ia_gid))
+ if ((ia_valid & ATTR_GID) &&
+ !chgrp_ok(mnt_userns, inode, attr->ia_vfsgid))
return -EPERM;
/* Make sure a caller can chmod. */
if (ia_valid & ATTR_MODE) {
- kgid_t mapped_gid;
+ vfsgid_t vfsgid;
if (!inode_owner_or_capable(mnt_userns, inode))
return -EPERM;
if (ia_valid & ATTR_GID)
- mapped_gid = mapped_kgid_fs(mnt_userns,
- i_user_ns(inode), attr->ia_gid);
+ vfsgid = attr->ia_vfsgid;
else
- mapped_gid = i_gid_into_mnt(mnt_userns, inode);
+ vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
/* Also check the setgid bit! */
- if (!in_group_p(mapped_gid) &&
+ if (!vfsgid_in_group_p(vfsgid) &&
!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
attr->ia_mode &= ~S_ISGID;
}
@@ -184,6 +183,8 @@ EXPORT_SYMBOL(setattr_prepare);
*/
int inode_newsize_ok(const struct inode *inode, loff_t offset)
{
+ if (offset < 0)
+ return -EINVAL;
if (inode->i_size < offset) {
unsigned long limit;
@@ -219,9 +220,7 @@ EXPORT_SYMBOL(inode_newsize_ok);
* setattr_copy must be called with i_mutex held.
*
* setattr_copy updates the inode's metadata with that specified
- * in attr on idmapped mounts. If file ownership is changed setattr_copy
- * doesn't map ia_uid and ia_gid. It will asssume the caller has already
- * provided the intended values. Necessary permission checks to determine
+ * in attr on idmapped mounts. Necessary permission checks to determine
* whether or not the S_ISGID property needs to be removed are performed with
* the correct idmapped mount permission helpers.
* Noticeably missing is inode size update, which is more complex
@@ -242,10 +241,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
{
unsigned int ia_valid = attr->ia_valid;
- if (ia_valid & ATTR_UID)
- inode->i_uid = attr->ia_uid;
- if (ia_valid & ATTR_GID)
- inode->i_gid = attr->ia_gid;
+ i_uid_update(mnt_userns, attr, inode);
+ i_gid_update(mnt_userns, attr, inode);
if (ia_valid & ATTR_ATIME)
inode->i_atime = attr->ia_atime;
if (ia_valid & ATTR_MTIME)
@@ -254,8 +251,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
inode->i_ctime = attr->ia_ctime;
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
- kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
- if (!in_group_p(kgid) &&
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+ if (!vfsgid_in_group_p(vfsgid) &&
!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
mode &= ~S_ISGID;
inode->i_mode = mode;
@@ -306,9 +303,6 @@ EXPORT_SYMBOL(may_setattr);
* retry. Because breaking a delegation may take a long time, the
* caller should drop the i_mutex before doing so.
*
- * If file ownership is changed notify_change() doesn't map ia_uid and
- * ia_gid. It will asssume the caller has already provided the intended values.
- *
* Alternatively, a caller may pass NULL for delegated_inode. This may
* be appropriate for callers that expect the underlying filesystem not
* to be NFS exported. Also, passing NULL is fine for callers holding
@@ -397,23 +391,25 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
* namespace of the superblock.
*/
if (ia_valid & ATTR_UID &&
- !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid))
+ !vfsuid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns,
+ attr->ia_vfsuid))
return -EOVERFLOW;
if (ia_valid & ATTR_GID &&
- !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid))
+ !vfsgid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns,
+ attr->ia_vfsgid))
return -EOVERFLOW;
/* Don't allow modifications of files with invalid uids or
* gids unless those uids & gids are being made valid.
*/
if (!(ia_valid & ATTR_UID) &&
- !uid_valid(i_uid_into_mnt(mnt_userns, inode)))
+ !vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)))
return -EOVERFLOW;
if (!(ia_valid & ATTR_GID) &&
- !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
+ !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode)))
return -EOVERFLOW;
- error = security_inode_setattr(dentry, attr);
+ error = security_inode_setattr(mnt_userns, dentry, attr);
if (error)
return error;
error = try_break_deleg(inode, delegated_inode);
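
The attr.c changes move the permission logic onto the dedicated vfsuid_t/vfsgid_t types, so a value that has already been mapped through the mount's idmapping cannot be compared against an unmapped kuid_t/kgid_t by accident. A minimal hedged sketch using only helpers that appear in the hunks above; may_touch_inode() itself is invented for illustration:

#include <linux/fs.h>
#include <linux/mnt_idmapping.h>
#include <linux/cred.h>

/* Return true if the caller owns the inode as seen through the given
 * mount's idmapping.  An invalid vfsuid means the inode's owner has no
 * representation in this mount, so ownership checks must fail. */
static bool may_touch_inode(struct user_namespace *mnt_userns,
                            const struct inode *inode)
{
        vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);

        if (!vfsuid_valid(vfsuid))
                return false;
        return vfsuid_eq_kuid(vfsuid, current_fsuid());
}
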
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 918826eaceea..d5a44fa88acf 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -51,8 +51,6 @@ extern struct file_system_type autofs_fs_type;
*/
struct autofs_info {
struct dentry *dentry;
- struct inode *inode;
-
int flags;
struct completion expire_complete;
@@ -148,6 +146,11 @@ static inline int autofs_oz_mode(struct autofs_sb_info *sbi)
task_pgrp(current) == sbi->oz_pgrp);
}
+static inline bool autofs_empty(struct autofs_info *ino)
+{
+ return ino->count < 2;
+}
+
struct inode *autofs_get_inode(struct super_block *, umode_t);
void autofs_free_ino(struct autofs_info *);
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index b3fefd6237c3..038b3d2d9f57 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -371,7 +371,7 @@ static struct dentry *should_expire(struct dentry *dentry,
return NULL;
}
- if (simple_empty(dentry))
+ if (autofs_empty(ino))
return NULL;
/* Case 2: tree mount, expire iff entire tree is not busy */
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 9edf243713eb..affa70360b1f 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -20,6 +20,7 @@ struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi)
INIT_LIST_HEAD(&ino->expiring);
ino->last_used = jiffies;
ino->sbi = sbi;
+ ino->count = 1;
}
return ino;
}
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 91fe4548c256..ca03c1cae2be 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -10,6 +10,7 @@
#include "autofs_i.h"
+static int autofs_dir_permission(struct user_namespace *, struct inode *, int);
static int autofs_dir_symlink(struct user_namespace *, struct inode *,
struct dentry *, const char *);
static int autofs_dir_unlink(struct inode *, struct dentry *);
@@ -50,6 +51,7 @@ const struct file_operations autofs_dir_operations = {
const struct inode_operations autofs_dir_inode_operations = {
.lookup = autofs_lookup,
+ .permission = autofs_dir_permission,
.unlink = autofs_dir_unlink,
.symlink = autofs_dir_symlink,
.mkdir = autofs_dir_mkdir,
@@ -77,6 +79,7 @@ static int autofs_dir_open(struct inode *inode, struct file *file)
{
struct dentry *dentry = file->f_path.dentry;
struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
+ struct autofs_info *ino = autofs_dentry_ino(dentry);
pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry);
@@ -93,7 +96,7 @@ static int autofs_dir_open(struct inode *inode, struct file *file)
* it.
*/
spin_lock(&sbi->lookup_lock);
- if (!path_is_mountpoint(&file->f_path) && simple_empty(dentry)) {
+ if (!path_is_mountpoint(&file->f_path) && autofs_empty(ino)) {
spin_unlock(&sbi->lookup_lock);
return -ENOENT;
}
@@ -288,9 +291,26 @@ static struct dentry *autofs_mountpoint_changed(struct path *path)
struct dentry *dentry = path->dentry;
struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
- /*
- * If this is an indirect mount the dentry could have gone away
- * as a result of an expire and a new one created.
+ /* If this is an indirect mount the dentry could have gone away
+ * and a new one created.
+ *
+ * This is unusual and I can't remember the case for which it
+ * was originally added now. But an example of how this can
+ * happen is an autofs indirect mount that has the "browse"
+ * option set and also has the "symlink" option in the autofs
+ * map entry. In this case the daemon will remove the browse
+ * directory and create a symlink as the mount leaving the
+ * struct path stale.
+ *
+ * Another, less obvious, case is when a mount within an autofs
+ * indirect mount that uses the "nobrowse" option is being
+ * expired at the same time as a path walk. If the mount has
+ * been umounted, but the mount point directory was seen before
+ * it became unhashed (during a lockless path walk), then a stat
+ * family system call made at that point won't re-mount the
+ * mount as it should. In this case the mount point that was
+ * removed (by the daemon) will be stale and a new mount point
+ * dentry will be created.
*/
if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
struct dentry *parent = dentry->d_parent;
@@ -362,7 +382,7 @@ static struct vfsmount *autofs_d_automount(struct path *path)
* the mount never trigger mounts themselves (they have an
* autofs trigger mount mounted on them). But v4 pseudo direct
* mounts do need the leaves to trigger mounts. In this case
- * we have no choice but to use the list_empty() check and
+ * we have no choice but to use the autofs_empty() check and
* require user space behave.
*/
if (sbi->version > 4) {
@@ -371,7 +391,7 @@ static struct vfsmount *autofs_d_automount(struct path *path)
goto done;
}
} else {
- if (!simple_empty(dentry)) {
+ if (!autofs_empty(ino)) {
spin_unlock(&sbi->fs_lock);
goto done;
}
@@ -426,9 +446,8 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk)
if (rcu_walk) {
/* We don't need fs_lock in rcu_walk mode,
- * just testing 'AUTOFS_INFO_NO_RCU' is enough.
- * simple_empty() takes a spinlock, so leave it
- * to last.
+ * just testing 'AUTOFS_INF_WANT_EXPIRE' is enough.
+ *
* We only return -EISDIR when certain this isn't
* a mount-trap.
*/
@@ -441,9 +460,7 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk)
inode = d_inode_rcu(dentry);
if (inode && S_ISLNK(inode->i_mode))
return -EISDIR;
- if (list_empty(&dentry->d_subdirs))
- return 0;
- if (!simple_empty(dentry))
+ if (!autofs_empty(ino))
return -EISDIR;
return 0;
}
@@ -463,7 +480,7 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk)
* we can avoid needless calls ->d_automount() and avoid
* an incorrect ELOOP error return.
*/
- if ((!path_is_mountpoint(path) && !simple_empty(dentry)) ||
+ if ((!path_is_mountpoint(path) && !autofs_empty(ino)) ||
(d_really_is_positive(dentry) && d_is_symlink(dentry)))
status = -EISDIR;
}
@@ -526,11 +543,30 @@ static struct dentry *autofs_lookup(struct inode *dir,
return NULL;
}
+static int autofs_dir_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
+{
+ if (mask & MAY_WRITE) {
+ struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb);
+
+ if (!autofs_oz_mode(sbi))
+ return -EACCES;
+
+ /* autofs_oz_mode() needs to allow path walks when the
+ * autofs mount is catatonic but the state of an autofs
+ * file system needs to be preserved over restarts.
+ */
+ if (sbi->flags & AUTOFS_SBI_CATATONIC)
+ return -EACCES;
+ }
+
+ return generic_permission(mnt_userns, inode, mask);
+}
+
static int autofs_dir_symlink(struct user_namespace *mnt_userns,
struct inode *dir, struct dentry *dentry,
const char *symname)
{
- struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
struct autofs_info *ino = autofs_dentry_ino(dentry);
struct autofs_info *p_ino;
struct inode *inode;
@@ -539,16 +575,6 @@ static int autofs_dir_symlink(struct user_namespace *mnt_userns,
pr_debug("%s <- %pd\n", symname, dentry);
- if (!autofs_oz_mode(sbi))
- return -EACCES;
-
- /* autofs_oz_mode() needs to allow path walks when the
- * autofs mount is catatonic but the state of an autofs
- * file system needs to be preserved over restarts.
- */
- if (sbi->flags & AUTOFS_SBI_CATATONIC)
- return -EACCES;
-
BUG_ON(!ino);
autofs_clean_ino(ino);
@@ -571,7 +597,6 @@ static int autofs_dir_symlink(struct user_namespace *mnt_userns,
d_add(dentry, inode);
dget(dentry);
- ino->count++;
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
@@ -601,17 +626,6 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
struct autofs_info *ino = autofs_dentry_ino(dentry);
struct autofs_info *p_ino;
- if (!autofs_oz_mode(sbi))
- return -EACCES;
-
- /* autofs_oz_mode() needs to allow path walks when the
- * autofs mount is catatonic but the state of an autofs
- * file system needs to be preserved over restarts.
- */
- if (sbi->flags & AUTOFS_SBI_CATATONIC)
- return -EACCES;
-
- ino->count--;
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count--;
dput(ino->dentry);
@@ -683,16 +697,6 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
pr_debug("dentry %p, removing %pd\n", dentry, dentry);
- if (!autofs_oz_mode(sbi))
- return -EACCES;
-
- /* autofs_oz_mode() needs to allow path walks when the
- * autofs mount is catatonic but the state of an autofs
- * file system needs to be preserved over restarts.
- */
- if (sbi->flags & AUTOFS_SBI_CATATONIC)
- return -EACCES;
-
if (ino->count != 1)
return -ENOTEMPTY;
@@ -704,7 +708,6 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
if (sbi->version < 5)
autofs_clear_leaf_automount_flags(dentry);
- ino->count--;
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count--;
dput(ino->dentry);
@@ -726,16 +729,6 @@ static int autofs_dir_mkdir(struct user_namespace *mnt_userns,
struct autofs_info *p_ino;
struct inode *inode;
- if (!autofs_oz_mode(sbi))
- return -EACCES;
-
- /* autofs_oz_mode() needs to allow path walks when the
- * autofs mount is catatonic but the state of an autofs
- * file system needs to be preserved over restarts.
- */
- if (sbi->flags & AUTOFS_SBI_CATATONIC)
- return -EACCES;
-
pr_debug("dentry %p, creating %pd\n", dentry, dentry);
BUG_ON(!ino);
@@ -753,7 +746,6 @@ static int autofs_dir_mkdir(struct user_namespace *mnt_userns,
autofs_set_leaf_automount_flags(dentry);
dget(dentry);
- ino->count++;
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
inc_nlink(dir);
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 12b8fdcc445b..9d1cde8066cf 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -147,7 +147,7 @@ static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry,
}
static int bad_inode_tmpfile(struct user_namespace *mnt_userns,
- struct inode *inode, struct dentry *dentry,
+ struct inode *inode, struct file *file,
umode_t mode)
{
return -EIO;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index be383fa46b12..32749fcee090 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -108,8 +108,7 @@ static const struct export_operations befs_export_operations = {
* passes it the address of befs_get_block, for mapping file
* positions to disk blocks.
*/
-static int
-befs_read_folio(struct file *file, struct folio *folio)
+static int befs_read_folio(struct file *file, struct folio *folio)
{
return block_read_full_folio(folio, befs_get_block);
}
@@ -470,13 +469,12 @@ befs_destroy_inodecache(void)
*/
static int befs_symlink_read_folio(struct file *unused, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct super_block *sb = inode->i_sb;
struct befs_inode_info *befs_ino = BEFS_I(inode);
befs_data_stream *data = &befs_ino->i_data.ds;
befs_off_t len = data->size;
- char *link = page_address(page);
+ char *link = folio_address(folio);
if (len == 0 || len > PAGE_SIZE) {
befs_error(sb, "Long symlink with illegal length");
@@ -489,12 +487,12 @@ static int befs_symlink_read_folio(struct file *unused, struct folio *folio)
goto fail;
}
link[len - 1] = '\0';
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return 0;
fail:
- SetPageError(page);
- unlock_page(page);
+ folio_set_error(folio);
+ folio_unlock(folio);
return -EIO;
}
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
deleted file mode 100644
index 0dcfc691e7e2..000000000000
--- a/fs/binfmt_aout.c
+++ /dev/null
@@ -1,342 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/fs/binfmt_aout.c
- *
- * Copyright (C) 1991, 1992, 1996 Linus Torvalds
- */
-
-#include <linux/module.h>
-
-#include <linux/time.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/a.out.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
-#include <linux/user.h>
-#include <linux/binfmts.h>
-#include <linux/personality.h>
-#include <linux/init.h>
-#include <linux/coredump.h>
-#include <linux/slab.h>
-#include <linux/sched/task_stack.h>
-
-#include <linux/uaccess.h>
-#include <asm/cacheflush.h>
-
-static int load_aout_binary(struct linux_binprm *);
-static int load_aout_library(struct file*);
-
-static struct linux_binfmt aout_format = {
- .module = THIS_MODULE,
- .load_binary = load_aout_binary,
- .load_shlib = load_aout_library,
-};
-
-#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
-
-static int set_brk(unsigned long start, unsigned long end)
-{
- start = PAGE_ALIGN(start);
- end = PAGE_ALIGN(end);
- if (end > start)
- return vm_brk(start, end - start);
- return 0;
-}
-
-/*
- * create_aout_tables() parses the env- and arg-strings in new user
- * memory and creates the pointer tables from them, and puts their
- * addresses on the "stack", returning the new stack pointer value.
- */
-static unsigned long __user *create_aout_tables(char __user *p, struct linux_binprm * bprm)
-{
- char __user * __user *argv;
- char __user * __user *envp;
- unsigned long __user *sp;
- int argc = bprm->argc;
- int envc = bprm->envc;
-
- sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p);
-#ifdef __alpha__
-/* whee.. test-programs are so much fun. */
- put_user(0, --sp);
- put_user(0, --sp);
- if (bprm->loader) {
- put_user(0, --sp);
- put_user(1003, --sp);
- put_user(bprm->loader, --sp);
- put_user(1002, --sp);
- }
- put_user(bprm->exec, --sp);
- put_user(1001, --sp);
-#endif
- sp -= envc+1;
- envp = (char __user * __user *) sp;
- sp -= argc+1;
- argv = (char __user * __user *) sp;
-#ifndef __alpha__
- put_user((unsigned long) envp,--sp);
- put_user((unsigned long) argv,--sp);
-#endif
- put_user(argc,--sp);
- current->mm->arg_start = (unsigned long) p;
- while (argc-->0) {
- char c;
- put_user(p,argv++);
- do {
- get_user(c,p++);
- } while (c);
- }
- put_user(NULL,argv);
- current->mm->arg_end = current->mm->env_start = (unsigned long) p;
- while (envc-->0) {
- char c;
- put_user(p,envp++);
- do {
- get_user(c,p++);
- } while (c);
- }
- put_user(NULL,envp);
- current->mm->env_end = (unsigned long) p;
- return sp;
-}
-
-/*
- * These are the functions used to load a.out style executables and shared
- * libraries. There is no binary dependent code anywhere else.
- */
-
-static int load_aout_binary(struct linux_binprm * bprm)
-{
- struct pt_regs *regs = current_pt_regs();
- struct exec ex;
- unsigned long error;
- unsigned long fd_offset;
- unsigned long rlim;
- int retval;
-
- ex = *((struct exec *) bprm->buf); /* exec-header */
- if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
- N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
- N_TRSIZE(ex) || N_DRSIZE(ex) ||
- i_size_read(file_inode(bprm->file)) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
- return -ENOEXEC;
- }
-
- /*
- * Requires a mmap handler. This prevents people from using a.out
- * as part of an exploit attack against /proc-related vulnerabilities.
- */
- if (!bprm->file->f_op->mmap)
- return -ENOEXEC;
-
- fd_offset = N_TXTOFF(ex);
-
- /* Check initial limits. This avoids letting people circumvent
- * size limits imposed on them by creating programs with large
- * arrays in the data or bss.
- */
- rlim = rlimit(RLIMIT_DATA);
- if (rlim >= RLIM_INFINITY)
- rlim = ~0;
- if (ex.a_data + ex.a_bss > rlim)
- return -ENOMEM;
-
- /* Flush all traces of the currently running executable */
- retval = begin_new_exec(bprm);
- if (retval)
- return retval;
-
- /* OK, This is the point of no return */
-#ifdef __alpha__
- SET_AOUT_PERSONALITY(bprm, ex);
-#else
- set_personality(PER_LINUX);
-#endif
- setup_new_exec(bprm);
-
- current->mm->end_code = ex.a_text +
- (current->mm->start_code = N_TXTADDR(ex));
- current->mm->end_data = ex.a_data +
- (current->mm->start_data = N_DATADDR(ex));
- current->mm->brk = ex.a_bss +
- (current->mm->start_brk = N_BSSADDR(ex));
-
- retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
- if (retval < 0)
- return retval;
-
-
- if (N_MAGIC(ex) == OMAGIC) {
- unsigned long text_addr, map_size;
- loff_t pos;
-
- text_addr = N_TXTADDR(ex);
-
-#ifdef __alpha__
- pos = fd_offset;
- map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1;
-#else
- pos = 32;
- map_size = ex.a_text+ex.a_data;
-#endif
- error = vm_brk(text_addr & PAGE_MASK, map_size);
- if (error)
- return error;
-
- error = read_code(bprm->file, text_addr, pos,
- ex.a_text+ex.a_data);
- if ((signed long)error < 0)
- return error;
- } else {
- if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
- (N_MAGIC(ex) != NMAGIC) && printk_ratelimit())
- {
- printk(KERN_NOTICE "executable not page aligned\n");
- }
-
- if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit())
- {
- printk(KERN_WARNING
- "fd_offset is not page aligned. Please convert program: %pD\n",
- bprm->file);
- }
-
- if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
- error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
- if (error)
- return error;
-
- read_code(bprm->file, N_TXTADDR(ex), fd_offset,
- ex.a_text + ex.a_data);
- goto beyond_if;
- }
-
- error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
- PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE,
- fd_offset);
-
- if (error != N_TXTADDR(ex))
- return error;
-
- error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE,
- fd_offset + ex.a_text);
- if (error != N_DATADDR(ex))
- return error;
- }
-beyond_if:
- set_binfmt(&aout_format);
-
- retval = set_brk(current->mm->start_brk, current->mm->brk);
- if (retval < 0)
- return retval;
-
- current->mm->start_stack =
- (unsigned long) create_aout_tables((char __user *) bprm->p, bprm);
-#ifdef __alpha__
- regs->gp = ex.a_gpvalue;
-#endif
- finalize_exec(bprm);
- start_thread(regs, ex.a_entry, current->mm->start_stack);
- return 0;
-}
-
-static int load_aout_library(struct file *file)
-{
- struct inode * inode;
- unsigned long bss, start_addr, len;
- unsigned long error;
- int retval;
- struct exec ex;
- loff_t pos = 0;
-
- inode = file_inode(file);
-
- retval = -ENOEXEC;
- error = kernel_read(file, &ex, sizeof(ex), &pos);
- if (error != sizeof(ex))
- goto out;
-
- /* We come in here for the regular a.out style of shared libraries */
- if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
- N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
- i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
- goto out;
- }
-
- /*
- * Requires a mmap handler. This prevents people from using a.out
- * as part of an exploit attack against /proc-related vulnerabilities.
- */
- if (!file->f_op->mmap)
- goto out;
-
- if (N_FLAGS(ex))
- goto out;
-
- /* For QMAGIC, the starting address is 0x20 into the page. We mask
- this off to get the starting address for the page */
-
- start_addr = ex.a_entry & 0xfffff000;
-
- if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
- if (printk_ratelimit())
- {
- printk(KERN_WARNING
- "N_TXTOFF is not page aligned. Please convert library: %pD\n",
- file);
- }
- retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
- if (retval)
- goto out;
-
- read_code(file, start_addr, N_TXTOFF(ex),
- ex.a_text + ex.a_data);
- retval = 0;
- goto out;
- }
- /* Now use mmap to map the library into memory. */
- error = vm_mmap(file, start_addr, ex.a_text + ex.a_data,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE,
- N_TXTOFF(ex));
- retval = error;
- if (error != start_addr)
- goto out;
-
- len = PAGE_ALIGN(ex.a_text + ex.a_data);
- bss = ex.a_text + ex.a_data + ex.a_bss;
- if (bss > len) {
- retval = vm_brk(start_addr + len, bss - len);
- if (retval)
- goto out;
- }
- retval = 0;
-out:
- return retval;
-}
-
-static int __init init_aout_binfmt(void)
-{
- register_binfmt(&aout_format);
- return 0;
-}
-
-static void __exit exit_aout_binfmt(void)
-{
- unregister_binfmt(&aout_format);
-}
-
-core_initcall(init_aout_binfmt);
-module_exit(exit_aout_binfmt);
-MODULE_LICENSE("GPL");
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 63c7ebb0da89..6a11025e5850 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -911,7 +911,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL);
if (!interp_elf_ex) {
retval = -ENOMEM;
- goto out_free_ph;
+ goto out_free_file;
}
/* Get the exec headers */
@@ -1354,6 +1354,7 @@ out:
out_free_dentry:
kfree(interp_elf_ex);
kfree(interp_elf_phdata);
+out_free_file:
allow_write_access(interpreter);
if (interpreter)
fput(interpreter);
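
The hunk above reroutes the -ENOMEM path through a new out_free_file label so the reference taken on the interpreter file is still dropped before returning. That is the usual cascading-goto unwind: each label releases one resource and falls through into the labels for everything acquired before it. A minimal standalone sketch of that shape, with purely illustrative names and resources (nothing below is from binfmt_elf.c):

#include <stdio.h>
#include <stdlib.h>

/*
 * Cascading error labels: a failure at step N jumps to the label that frees
 * what step N-1 acquired, and falls through to release everything earlier.
 */
static int do_setup(void)
{
	FILE *interp = NULL;
	char *phdrs = NULL, *hdr = NULL;
	int ret = -1;

	interp = fopen("/etc/hostname", "r");	/* stand-in for the interpreter file */
	if (!interp)
		goto out;

	phdrs = malloc(128);
	if (!phdrs)
		goto out_close_file;		/* nothing else to free yet */

	hdr = malloc(64);
	if (!hdr)
		goto out_free_phdrs;		/* still falls through and closes the file */

	ret = 0;
	free(hdr);
out_free_phdrs:
	free(phdrs);
out_close_file:
	fclose(interp);
out:
	return ret;
}

int main(void)
{
	return do_setup() ? 1 : 0;
}
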
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 99f9995670ea..fa9ddcc9eb0b 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -31,7 +31,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
- subpage.o tree-mod-log.o
+ subpage.o tree-mod-log.o extent-io-tree.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 07960529b360..6e2596ddae10 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -13,7 +13,6 @@ struct btrfs_fs_info;
struct btrfs_workqueue;
struct btrfs_work;
typedef void (*btrfs_func_t)(struct btrfs_work *arg);
-typedef void (*btrfs_work_func_t)(struct work_struct *arg);
struct btrfs_work {
btrfs_func_t func;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ebc392ea1d74..18374a6d05bd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -138,6 +138,7 @@ struct share_check {
u64 root_objectid;
u64 inum;
int share_count;
+ bool have_delayed_delete_refs;
};
static inline int extent_is_shared(struct share_check *sc)
@@ -288,8 +289,10 @@ static void prelim_release(struct preftree *preftree)
struct prelim_ref *ref, *next_ref;
rbtree_postorder_for_each_entry_safe(ref, next_ref,
- &preftree->root.rb_root, rbnode)
+ &preftree->root.rb_root, rbnode) {
+ free_inode_elem_list(ref->inode_list);
free_pref(ref);
+ }
preftree->root = RB_ROOT_CACHED;
preftree->count = 0;
@@ -647,6 +650,18 @@ unode_aux_to_inode_list(struct ulist_node *node)
return (struct extent_inode_elem *)(uintptr_t)node->aux;
}
+static void free_leaf_list(struct ulist *ulist)
+{
+ struct ulist_node *node;
+ struct ulist_iterator uiter;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((node = ulist_next(ulist, &uiter)))
+ free_inode_elem_list(unode_aux_to_inode_list(node));
+
+ ulist_free(ulist);
+}
+
/*
* We maintain three separate rbtrees: one for direct refs, one for
* indirect refs which have a key, and one for indirect refs which do not
@@ -761,7 +776,11 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
cond_resched();
}
out:
- ulist_free(parents);
+ /*
+ * We may have inode lists attached to refs in the parents ulist, so we
+ * must free them before freeing the ulist and its refs.
+ */
+ free_leaf_list(parents);
return ret;
}
@@ -820,16 +839,11 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
struct preftrees *preftrees, struct share_check *sc)
{
struct btrfs_delayed_ref_node *node;
- struct btrfs_delayed_extent_op *extent_op = head->extent_op;
struct btrfs_key key;
- struct btrfs_key tmp_op_key;
struct rb_node *n;
int count;
int ret = 0;
- if (extent_op && extent_op->update_key)
- btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
-
spin_lock(&head->lock);
for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
node = rb_entry(n, struct btrfs_delayed_ref_node,
@@ -855,10 +869,16 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
case BTRFS_TREE_BLOCK_REF_KEY: {
/* NORMAL INDIRECT METADATA backref */
struct btrfs_delayed_tree_ref *ref;
+ struct btrfs_key *key_ptr = NULL;
+
+ if (head->extent_op && head->extent_op->update_key) {
+ btrfs_disk_key_to_cpu(&key, &head->extent_op->key);
+ key_ptr = &key;
+ }
ref = btrfs_delayed_node_to_tree_ref(node);
ret = add_indirect_ref(fs_info, preftrees, ref->root,
- &tmp_op_key, ref->level + 1,
+ key_ptr, ref->level + 1,
node->bytenr, count, sc,
GFP_ATOMIC);
break;
@@ -884,13 +904,22 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
key.offset = ref->offset;
/*
- * Found a inum that doesn't match our known inum, we
- * know it's shared.
+ * If we have a share check context and a reference for
+ * another inode, we can't exit immediately. This is
+ * because even if this is a BTRFS_ADD_DELAYED_REF
+ * reference we may find next a BTRFS_DROP_DELAYED_REF
+ * which cancels out this ADD reference.
+ *
+ * If this is a DROP reference and there was no previous
+ * ADD reference, then we need to signal that when we
+ * process references from the extent tree (through
+ * add_inline_refs() and add_keyed_refs()), we should
+ * not exit early if we find a reference for another
+ * inode, because one of the delayed DROP references
+ * may cancel that reference in the extent tree.
*/
- if (sc && sc->inum && ref->objectid != sc->inum) {
- ret = BACKREF_FOUND_SHARED;
- goto out;
- }
+ if (sc && count < 0)
+ sc->have_delayed_delete_refs = true;
ret = add_indirect_ref(fs_info, preftrees, ref->root,
&key, 0, node->bytenr, count, sc,
@@ -920,7 +949,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
}
if (!ret)
ret = extent_is_shared(sc);
-out:
+
spin_unlock(&head->lock);
return ret;
}
@@ -1023,7 +1052,8 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (sc && sc->inum && key.objectid != sc->inum) {
+ if (sc && sc->inum && key.objectid != sc->inum &&
+ !sc->have_delayed_delete_refs) {
ret = BACKREF_FOUND_SHARED;
break;
}
@@ -1033,6 +1063,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
ret = add_indirect_ref(fs_info, preftrees, root,
&key, 0, bytenr, count,
sc, GFP_NOFS);
+
break;
}
default:
@@ -1122,7 +1153,8 @@ static int add_keyed_refs(struct btrfs_root *extent_root,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (sc && sc->inum && key.objectid != sc->inum) {
+ if (sc && sc->inum && key.objectid != sc->inum &&
+ !sc->have_delayed_delete_refs) {
ret = BACKREF_FOUND_SHARED;
break;
}
@@ -1354,6 +1386,12 @@ again:
if (ret < 0)
goto out;
ref->inode_list = eie;
+ /*
+ * We transferred the list ownership to the ref,
+ * so set to NULL to avoid a double free in case
+ * an error happens after this.
+ */
+ eie = NULL;
}
ret = ulist_add_merge_ptr(refs, ref->parent,
ref->inode_list,
@@ -1379,6 +1417,14 @@ again:
eie->next = ref->inode_list;
}
eie = NULL;
+ /*
+ * We have transferred the inode list ownership from
+ * this ref to the ref we added to the 'refs' ulist.
+ * So set this ref's inode list to NULL to avoid
+ * use-after-free when our caller uses it or double
+ * frees in case an error happens before we return.
+ */
+ ref->inode_list = NULL;
}
cond_resched();
}
@@ -1395,24 +1441,6 @@ out:
return ret;
}
-static void free_leaf_list(struct ulist *blocks)
-{
- struct ulist_node *node = NULL;
- struct extent_inode_elem *eie;
- struct ulist_iterator uiter;
-
- ULIST_ITER_INIT(&uiter);
- while ((node = ulist_next(blocks, &uiter))) {
- if (!node->aux)
- continue;
- eie = unode_aux_to_inode_list(node);
- free_inode_elem_list(eie);
- node->aux = 0;
- }
-
- ulist_free(blocks);
-}
-
/*
* Finds all leafs with a reference to the specified combination of bytenr and
* offset. key_list_head will point to a list of corresponding keys (caller must
@@ -1511,16 +1539,137 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
return ret;
}
-/**
- * Check if an extent is shared or not
+/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */
+static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
+ struct btrfs_root *root,
+ u64 bytenr, int level, bool *is_shared)
+{
+ struct btrfs_backref_shared_cache_entry *entry;
+
+ if (!cache->use_cache)
+ return false;
+
+ if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
+ return false;
+
+ /*
+ * Level -1 is used for the data extent, which is not reliable to cache
+ * because its reference count can increase or decrease without us
+ * realizing. We cache results only for extent buffers that lead from
+ * the root node down to the leaf with the file extent item.
+ */
+ ASSERT(level >= 0);
+
+ entry = &cache->entries[level];
+
+ /* Unused cache entry or being used for some other extent buffer. */
+ if (entry->bytenr != bytenr)
+ return false;
+
+ /*
+ * We cached a false result, but the last snapshot generation of the
+ * root changed, so we now have a snapshot. Don't trust the result.
+ */
+ if (!entry->is_shared &&
+ entry->gen != btrfs_root_last_snapshot(&root->root_item))
+ return false;
+
+ /*
+ * If we cached a true result and the last generation used for dropping
+ * a root changed, we can not trust the result, because the dropped root
+ * could be a snapshot sharing this extent buffer.
+ */
+ if (entry->is_shared &&
+ entry->gen != btrfs_get_last_root_drop_gen(root->fs_info))
+ return false;
+
+ *is_shared = entry->is_shared;
+ /*
+	 * If the node at this level is shared, then all nodes below it are also
+	 * shared. Currently some of the nodes below may be marked as not shared
+	 * because we have just switched from one leaf to another, which also
+	 * switched other nodes above the leaf and below the current level, so
+	 * mark them as shared.
+ */
+ if (*is_shared) {
+ for (int i = 0; i < level; i++) {
+ cache->entries[i].is_shared = true;
+ cache->entries[i].gen = entry->gen;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */
+static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
+ struct btrfs_root *root,
+ u64 bytenr, int level, bool is_shared)
+{
+ struct btrfs_backref_shared_cache_entry *entry;
+ u64 gen;
+
+ if (!cache->use_cache)
+ return;
+
+ if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
+ return;
+
+ /*
+ * Level -1 is used for the data extent, which is not reliable to cache
+ * because its reference count can increase or decrease without us
+ * realizing. We cache results only for extent buffers that lead from
+ * the root node down to the leaf with the file extent item.
+ */
+ ASSERT(level >= 0);
+
+ if (is_shared)
+ gen = btrfs_get_last_root_drop_gen(root->fs_info);
+ else
+ gen = btrfs_root_last_snapshot(&root->root_item);
+
+ entry = &cache->entries[level];
+ entry->bytenr = bytenr;
+ entry->is_shared = is_shared;
+ entry->gen = gen;
+
+ /*
+	 * If we found that an extent buffer is shared, set the cache result for
+	 * all extent buffers below it to true. As nodes in the path are COWed,
+	 * their sharedness is moved to their children, and if a leaf is COWed,
+	 * the sharedness of a data extent becomes direct: the refcount of the
+	 * data extent is increased in its extent item in the extent tree.
+ */
+ if (is_shared) {
+ for (int i = 0; i < level; i++) {
+ entry = &cache->entries[i];
+ entry->is_shared = is_shared;
+ entry->gen = gen;
+ }
+ }
+}
+
+/*
+ * Check if a data extent is shared or not.
*
- * @root: root inode belongs to
- * @inum: inode number of the inode whose extent we are checking
- * @bytenr: logical bytenr of the extent we are checking
- * @roots: list of roots this extent is shared among
- * @tmp: temporary list used for iteration
+ * @root: The root the inode belongs to.
+ * @inum: Number of the inode whose extent we are checking.
+ * @bytenr: Logical bytenr of the extent we are checking.
+ * @extent_gen: Generation of the extent (file extent item) or 0 if it is
+ * not known.
+ * @roots: List of roots this extent is shared among.
+ * @tmp: Temporary list used for iteration.
+ * @cache: A backref lookup result cache.
*
- * btrfs_check_shared uses the backref walking code but will short
+ * btrfs_is_data_extent_shared uses the backref walking code but will short
* circuit as soon as it finds a root or inode that doesn't match the
* one passed in. This provides a significant performance benefit for
* callers (such as fiemap) which want to know whether the extent is
@@ -1531,8 +1680,10 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
*
* Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
*/
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
- struct ulist *roots, struct ulist *tmp)
+int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+ u64 extent_gen,
+ struct ulist *roots, struct ulist *tmp,
+ struct btrfs_backref_shared_cache *cache)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
@@ -1544,7 +1695,9 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
.root_objectid = root->root_key.objectid,
.inum = inum,
.share_count = 0,
+ .have_delayed_delete_refs = false,
};
+ int level;
ulist_init(roots);
ulist_init(tmp);
@@ -1561,23 +1714,73 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
btrfs_get_tree_mod_seq(fs_info, &elem);
}
+ /* -1 means we are in the bytenr of the data extent. */
+ level = -1;
ULIST_ITER_INIT(&uiter);
+ cache->use_cache = true;
while (1) {
+ bool is_shared;
+ bool cached;
+
ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
roots, NULL, &shared, false);
if (ret == BACKREF_FOUND_SHARED) {
/* this is the only condition under which we return 1 */
ret = 1;
+ if (level >= 0)
+ store_backref_shared_cache(cache, root, bytenr,
+ level, true);
break;
}
if (ret < 0 && ret != -ENOENT)
break;
ret = 0;
+ /*
+ * If our data extent is not shared through reflinks and it was
+ * created in a generation after the last one used to create a
+ * snapshot of the inode's root, then it can not be shared
+ * indirectly through subtrees, as that can only happen with
+ * snapshots. In this case bail out, no need to check for the
+ * sharedness of extent buffers.
+ */
+ if (level == -1 &&
+ extent_gen > btrfs_root_last_snapshot(&root->root_item))
+ break;
+
+ /*
+ * If our data extent was not directly shared (without multiple
+	 * reference items), then it might have a single reference item
+ * with a count > 1 for the same offset, which means there are 2
+ * (or more) file extent items that point to the data extent -
+ * this happens when a file extent item needs to be split and
+ * then one item gets moved to another leaf due to a b+tree leaf
+ * split when inserting some item. In this case the file extent
+ * items may be located in different leaves and therefore some
+ * of the leaves may be referenced through shared subtrees while
+ * others are not. Since our extent buffer cache only works for
+ * a single path (by far the most common case and simpler to
+ * deal with), we can not use it if we have multiple leaves
+ * (which implies multiple paths).
+ */
+ if (level == -1 && tmp->nnodes > 1)
+ cache->use_cache = false;
+
+ if (level >= 0)
+ store_backref_shared_cache(cache, root, bytenr,
+ level, false);
node = ulist_next(tmp, &uiter);
if (!node)
break;
bytenr = node->val;
+ level++;
+ cached = lookup_backref_shared_cache(cache, root, bytenr, level,
+ &is_shared);
+ if (cached) {
+ ret = (is_shared ? 1 : 0);
+ break;
+ }
shared.share_count = 0;
+ shared.have_delayed_delete_refs = false;
cond_resched();
}
@@ -2028,10 +2231,29 @@ out:
return ret;
}
+static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
+{
+ struct btrfs_data_container *inodes = ctx;
+ const size_t c = 3 * sizeof(u64);
+
+ if (inodes->bytes_left >= c) {
+ inodes->bytes_left -= c;
+ inodes->val[inodes->elem_cnt] = inum;
+ inodes->val[inodes->elem_cnt + 1] = offset;
+ inodes->val[inodes->elem_cnt + 2] = root;
+ inodes->elem_cnt += 3;
+ } else {
+ inodes->bytes_missing += c - inodes->bytes_left;
+ inodes->bytes_left = 0;
+ inodes->elem_missed += 3;
+ }
+
+ return 0;
+}
+
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
- iterate_extent_inodes_t *iterate, void *ctx,
- bool ignore_offset)
+ void *ctx, bool ignore_offset)
{
int ret;
u64 extent_item_pos;
@@ -2049,17 +2271,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
extent_item_pos = logical - found_key.objectid;
ret = iterate_extent_inodes(fs_info, found_key.objectid,
extent_item_pos, search_commit_root,
- iterate, ctx, ignore_offset);
+ build_ino_list, ctx, ignore_offset);
return ret;
}
-typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
- struct extent_buffer *eb, void *ctx);
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb, struct inode_fs_paths *ipath);
-static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path,
- iterate_irefs_t *iterate, void *ctx)
+static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath)
{
int ret = 0;
int slot;
@@ -2068,6 +2288,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
u32 name_len;
u64 parent = 0;
int found = 0;
+ struct btrfs_root *fs_root = ipath->fs_root;
+ struct btrfs_path *path = ipath->btrfs_path;
struct extent_buffer *eb;
struct btrfs_inode_ref *iref;
struct btrfs_key found_key;
@@ -2103,8 +2325,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
"following ref at offset %u for inode %llu in tree %llu",
cur, found_key.objectid,
fs_root->root_key.objectid);
- ret = iterate(parent, name_len,
- (unsigned long)(iref + 1), eb, ctx);
+ ret = inode_to_path(parent, name_len,
+ (unsigned long)(iref + 1), eb, ipath);
if (ret)
break;
len = sizeof(*iref) + name_len;
@@ -2118,15 +2340,15 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
return ret;
}
-static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path,
- iterate_irefs_t *iterate, void *ctx)
+static int iterate_inode_extrefs(u64 inum, struct inode_fs_paths *ipath)
{
int ret;
int slot;
u64 offset = 0;
u64 parent;
int found = 0;
+ struct btrfs_root *fs_root = ipath->fs_root;
+ struct btrfs_path *path = ipath->btrfs_path;
struct extent_buffer *eb;
struct btrfs_inode_extref *extref;
u32 item_size;
@@ -2162,8 +2384,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
parent = btrfs_inode_extref_parent(eb, extref);
name_len = btrfs_inode_extref_name_len(eb, extref);
- ret = iterate(parent, name_len,
- (unsigned long)&extref->name, eb, ctx);
+ ret = inode_to_path(parent, name_len,
+ (unsigned long)&extref->name, eb, ipath);
if (ret)
break;
@@ -2180,34 +2402,13 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
return ret;
}
-static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path, iterate_irefs_t *iterate,
- void *ctx)
-{
- int ret;
- int found_refs = 0;
-
- ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
- if (!ret)
- ++found_refs;
- else if (ret != -ENOENT)
- return ret;
-
- ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
- if (ret == -ENOENT && found_refs)
- return 0;
-
- return ret;
-}
-
/*
* returns 0 if the path could be dumped (probably truncated)
* returns <0 in case of an error
*/
static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
- struct extent_buffer *eb, void *ctx)
+ struct extent_buffer *eb, struct inode_fs_paths *ipath)
{
- struct inode_fs_paths *ipath = ctx;
char *fspath;
char *fspath_min;
int i = ipath->fspath->elem_cnt;
@@ -2248,8 +2449,20 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
*/
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
{
- return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
- inode_to_path, ipath);
+ int ret;
+ int found_refs = 0;
+
+ ret = iterate_inode_refs(inum, ipath);
+ if (!ret)
+ ++found_refs;
+ else if (ret != -ENOENT)
+ return ret;
+
+ ret = iterate_inode_extrefs(inum, ipath);
+ if (ret == -ENOENT && found_refs)
+ return 0;
+
+ return ret;
}
struct btrfs_data_container *init_data_container(u32 total_bytes)
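
A standalone, deliberately simplified sketch of the caching scheme behind lookup_backref_shared_cache()/store_backref_shared_cache(): one entry per tree level, keyed by the extent buffer bytenr and stamped with a generation that invalidates stale answers, with a shared result propagated down to the lower levels of the path. This is plain userspace C with illustrative names, and it collapses the two generation sources the kernel code distinguishes into a single value:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_LEVEL 8	/* stands in for BTRFS_MAX_LEVEL */

struct cache_entry { uint64_t bytenr; uint64_t gen; bool is_shared; };
struct shared_cache { struct cache_entry entries[MAX_LEVEL]; bool use_cache; };

/* A hit requires the same extent buffer at this level and an unchanged generation. */
static bool cache_lookup(struct shared_cache *c, int level, uint64_t bytenr,
			 uint64_t cur_gen, bool *is_shared)
{
	if (!c->use_cache || level < 0 || level >= MAX_LEVEL)
		return false;
	if (c->entries[level].bytenr != bytenr || c->entries[level].gen != cur_gen)
		return false;
	*is_shared = c->entries[level].is_shared;
	return true;
}

static void cache_store(struct shared_cache *c, int level, uint64_t bytenr,
			uint64_t cur_gen, bool is_shared)
{
	if (!c->use_cache || level < 0 || level >= MAX_LEVEL)
		return;
	c->entries[level] = (struct cache_entry){ bytenr, cur_gen, is_shared };
	/* A shared node implies every node below it on the path is shared too. */
	if (is_shared) {
		for (int i = 0; i < level; i++) {
			c->entries[i].is_shared = true;
			c->entries[i].gen = cur_gen;
		}
	}
}

int main(void)
{
	struct shared_cache c = { .use_cache = true };
	bool shared = false;
	bool hit;

	cache_store(&c, 2, 4096, 10, false);
	hit = cache_lookup(&c, 2, 4096, 10, &shared);
	printf("hit: %d (shared=%d)\n", hit, shared);
	printf("different bytenr: %d\n", cache_lookup(&c, 2, 8192, 10, &shared));
	printf("generation moved: %d\n", cache_lookup(&c, 2, 4096, 11, &shared));
	return 0;
}
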
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index ba454032dbe2..8e69584d538d 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -17,6 +17,21 @@ struct inode_fs_paths {
struct btrfs_data_container *fspath;
};
+struct btrfs_backref_shared_cache_entry {
+ u64 bytenr;
+ u64 gen;
+ bool is_shared;
+};
+
+struct btrfs_backref_shared_cache {
+ /*
+ * A path from a root to a leaf that has a file extent item pointing to
+ * a given data extent should never exceed the maximum b+tree height.
+ */
+ struct btrfs_backref_shared_cache_entry entries[BTRFS_MAX_LEVEL];
+ bool use_cache;
+};
+
typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
void *ctx);
@@ -35,8 +50,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
bool ignore_offset);
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
- iterate_extent_inodes_t *iterate, void *ctx,
+ struct btrfs_path *path, void *ctx,
bool ignore_offset);
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
@@ -63,8 +77,10 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
struct btrfs_inode_extref **ret_extref,
u64 *found_off);
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
- struct ulist *roots, struct ulist *tmp_ulist);
+int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+ u64 extent_gen,
+ struct ulist *roots, struct ulist *tmp,
+ struct btrfs_backref_shared_cache *cache);
int __init btrfs_prelim_ref_init(void);
void __cold btrfs_prelim_ref_exit(void);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index ede389f2602d..deebc8ddbd93 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -440,39 +440,26 @@ void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
btrfs_put_caching_control(caching_ctl);
}
-int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
+static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
+ struct btrfs_caching_control *caching_ctl)
+{
+ wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
+ return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
+}
+
+static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{
struct btrfs_caching_control *caching_ctl;
- int ret = 0;
+ int ret;
caching_ctl = btrfs_get_caching_control(cache);
if (!caching_ctl)
return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
-
- wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
- if (cache->cached == BTRFS_CACHE_ERROR)
- ret = -EIO;
+ ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
btrfs_put_caching_control(caching_ctl);
return ret;
}
-static bool space_cache_v1_done(struct btrfs_block_group *cache)
-{
- bool ret;
-
- spin_lock(&cache->lock);
- ret = cache->cached != BTRFS_CACHE_FAST;
- spin_unlock(&cache->lock);
-
- return ret;
-}
-
-void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
- struct btrfs_caching_control *caching_ctl)
-{
- wait_event(caching_ctl->wait, space_cache_v1_done(cache));
-}
-
#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group *block_group)
{
@@ -606,8 +593,6 @@ next:
if (need_resched() ||
rwsem_is_contended(&fs_info->commit_root_sem)) {
- if (wakeup)
- caching_ctl->progress = last;
btrfs_release_path(path);
up_read(&fs_info->commit_root_sem);
mutex_unlock(&caching_ctl->mutex);
@@ -631,9 +616,6 @@ next:
key.objectid = last;
key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
-
- if (wakeup)
- caching_ctl->progress = last;
btrfs_release_path(path);
goto next;
}
@@ -668,7 +650,6 @@ next:
total_found += add_new_free_space(block_group, last,
block_group->start + block_group->length);
- caching_ctl->progress = (u64)-1;
out:
btrfs_free_path(path);
@@ -738,8 +719,6 @@ done:
}
#endif
- caching_ctl->progress = (u64)-1;
-
up_read(&fs_info->commit_root_sem);
btrfs_free_excluded_extents(block_group);
mutex_unlock(&caching_ctl->mutex);
@@ -750,9 +729,8 @@ done:
btrfs_put_block_group(block_group);
}
-int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
+int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
{
- DEFINE_WAIT(wait);
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_caching_control *caching_ctl = NULL;
int ret = 0;
@@ -769,7 +747,6 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
mutex_init(&caching_ctl->mutex);
init_waitqueue_head(&caching_ctl->wait);
caching_ctl->block_group = cache;
- caching_ctl->progress = cache->start;
refcount_set(&caching_ctl->count, 2);
btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
@@ -785,11 +762,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
}
WARN_ON(cache->caching_ctl);
cache->caching_ctl = caching_ctl;
- if (btrfs_test_opt(fs_info, SPACE_CACHE))
- cache->cached = BTRFS_CACHE_FAST;
- else
- cache->cached = BTRFS_CACHE_STARTED;
- cache->has_caching_ctl = 1;
+ cache->cached = BTRFS_CACHE_STARTED;
spin_unlock(&cache->lock);
write_lock(&fs_info->block_group_cache_lock);
@@ -801,8 +774,8 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
out:
- if (load_cache_only && caching_ctl)
- btrfs_wait_space_cache_v1_finished(cache, caching_ctl);
+ if (wait && caching_ctl)
+ ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
if (caching_ctl)
btrfs_put_caching_control(caching_ctl);
@@ -1005,32 +978,31 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
kobject_put(kobj);
}
- if (block_group->has_caching_ctl)
- caching_ctl = btrfs_get_caching_control(block_group);
if (block_group->cached == BTRFS_CACHE_STARTED)
btrfs_wait_block_group_cache_done(block_group);
- if (block_group->has_caching_ctl) {
- write_lock(&fs_info->block_group_cache_lock);
- if (!caching_ctl) {
- struct btrfs_caching_control *ctl;
-
- list_for_each_entry(ctl,
- &fs_info->caching_block_groups, list)
- if (ctl->block_group == block_group) {
- caching_ctl = ctl;
- refcount_inc(&caching_ctl->count);
- break;
- }
- }
- if (caching_ctl)
- list_del_init(&caching_ctl->list);
- write_unlock(&fs_info->block_group_cache_lock);
- if (caching_ctl) {
- /* Once for the caching bgs list and once for us. */
- btrfs_put_caching_control(caching_ctl);
- btrfs_put_caching_control(caching_ctl);
+
+ write_lock(&fs_info->block_group_cache_lock);
+ caching_ctl = btrfs_get_caching_control(block_group);
+ if (!caching_ctl) {
+ struct btrfs_caching_control *ctl;
+
+ list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
+ if (ctl->block_group == block_group) {
+ caching_ctl = ctl;
+ refcount_inc(&caching_ctl->count);
+ break;
+ }
}
}
+ if (caching_ctl)
+ list_del_init(&caching_ctl->list);
+ write_unlock(&fs_info->block_group_cache_lock);
+
+ if (caching_ctl) {
+ /* Once for the caching bgs list and once for us. */
+ btrfs_put_caching_control(caching_ctl);
+ btrfs_put_caching_control(caching_ctl);
+ }
spin_lock(&trans->transaction->dirty_bgs_lock);
WARN_ON(!list_empty(&block_group->dirty_list));
@@ -1051,8 +1023,14 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
< block_group->zone_unusable);
WARN_ON(block_group->space_info->disk_total
< block_group->length * factor);
+ WARN_ON(test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &block_group->runtime_flags) &&
+ block_group->space_info->active_total_bytes
+ < block_group->length);
}
block_group->space_info->total_bytes -= block_group->length;
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
+ block_group->space_info->active_total_bytes -= block_group->length;
block_group->space_info->bytes_readonly -=
(block_group->length - block_group->zone_unusable);
block_group->space_info->bytes_zone_unusable -=
@@ -1081,7 +1059,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
goto out;
spin_lock(&block_group->lock);
- block_group->removed = 1;
+ set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
+
/*
* At this point trimming or scrub can't start on this block group,
* because we removed the block group from the rbtree
@@ -1316,6 +1295,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
+ if (btrfs_fs_closing(fs_info))
+ return;
+
/*
* Long running balances can keep us blocked here for eternity, so
* simply skip deletion if we're unable to get the mutex.
@@ -1555,6 +1537,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
+ if (btrfs_fs_closing(fs_info))
+ return;
+
if (!btrfs_should_reclaim(fs_info))
return;
@@ -1635,9 +1620,11 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
div64_u64(zone_unusable * 100, bg->length));
trace_btrfs_reclaim_block_group(bg);
ret = btrfs_relocate_chunk(fs_info, bg->start);
- if (ret)
+ if (ret) {
+ btrfs_dec_block_group_ro(bg);
btrfs_err(fs_info, "error relocating chunk %llu",
bg->start);
+ }
next:
btrfs_put_block_group(bg);
@@ -1816,11 +1803,10 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
stripe_nr = physical - map->stripes[i].physical;
stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
- if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
stripe_nr = stripe_nr * map->num_stripes + i;
stripe_nr = div_u64(stripe_nr, map->sub_stripes);
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- stripe_nr = stripe_nr * map->num_stripes + i;
}
/*
* The remaining case would be for RAID56, multiply by
@@ -1901,16 +1887,6 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
return 0;
}
-static void link_block_group(struct btrfs_block_group *cache)
-{
- struct btrfs_space_info *space_info = cache->space_info;
- int index = btrfs_bg_flags_to_raid_index(cache->flags);
-
- down_write(&space_info->groups_sem);
- list_add_tail(&cache->list, &space_info->block_groups[index]);
- up_write(&space_info->groups_sem);
-}
-
static struct btrfs_block_group *btrfs_create_block_group_cache(
struct btrfs_fs_info *fs_info, u64 start)
{
@@ -1948,7 +1924,8 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
atomic_set(&cache->frozen, 0);
mutex_init(&cache->free_space_lock);
- btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
+ cache->full_stripe_locks_root.root = RB_ROOT;
+ mutex_init(&cache->full_stripe_locks_root.lock);
return cache;
}
@@ -2013,7 +1990,6 @@ static int read_one_block_group(struct btrfs_fs_info *info,
int need_clear)
{
struct btrfs_block_group *cache;
- struct btrfs_space_info *space_info;
const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
int ret;
@@ -2089,11 +2065,9 @@ static int read_one_block_group(struct btrfs_fs_info *info,
/* Should not have any excluded extents. Just in case, though. */
btrfs_free_excluded_extents(cache);
} else if (cache->length == cache->used) {
- cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
btrfs_free_excluded_extents(cache);
} else if (cache->used == 0) {
- cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
add_new_free_space(cache, cache->start,
cache->start + cache->length);
@@ -2106,13 +2080,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
goto error;
}
trace_btrfs_add_block_group(info, cache, 0);
- btrfs_update_space_info(info, cache->flags, cache->length,
- cache->used, cache->bytes_super,
- cache->zone_unusable, &space_info);
-
- cache->space_info = space_info;
-
- link_block_group(cache);
+ btrfs_add_bg_to_space_info(info, cache);
set_avail_alloc_bits(info, cache->flags);
if (btrfs_chunk_writeable(info, cache->start)) {
@@ -2136,7 +2104,6 @@ error:
static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct btrfs_space_info *space_info;
struct rb_node *node;
int ret = 0;
@@ -2156,7 +2123,6 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
/* Fill dummy cache as FULL */
bg->length = em->len;
bg->flags = map->type;
- bg->last_byte_to_unpin = (u64)-1;
bg->cached = BTRFS_CACHE_FINISHED;
bg->used = em->len;
bg->flags = map->type;
@@ -2177,10 +2143,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
break;
}
- btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
- 0, 0, &space_info);
- bg->space_info = space_info;
- link_block_group(bg);
+ btrfs_add_bg_to_space_info(fs_info, bg);
set_avail_alloc_bits(fs_info, bg->flags);
}
@@ -2200,7 +2163,16 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
int need_clear = 0;
u64 cache_gen;
- if (!root)
+ /*
+ * Either no extent root (with ibadroots rescue option) or we have
+ * unsupported RO options. The fs can never be mounted read-write, so no
+ * need to waste time searching block group items.
+ *
+ * This also allows new extent tree related changes to be RO compat,
+ * no need for a full incompat flag.
+ */
+ if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
+ ~BTRFS_FEATURE_COMPAT_RO_SUPP))
return fill_dummy_bgs(info);
key.objectid = 0;
@@ -2435,7 +2407,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
ret = insert_block_group_item(trans, block_group);
if (ret)
btrfs_abort_transaction(trans, ret);
- if (!block_group->chunk_item_inserted) {
+ if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
+ &block_group->runtime_flags)) {
mutex_lock(&fs_info->chunk_mutex);
ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
mutex_unlock(&fs_info->chunk_mutex);
@@ -2504,7 +2477,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
set_free_space_tree_thresholds(cache);
cache->used = bytes_used;
cache->flags = type;
- cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
@@ -2529,14 +2501,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
btrfs_free_excluded_extents(cache);
-#ifdef CONFIG_BTRFS_DEBUG
- if (btrfs_should_fragment_free_space(cache)) {
- u64 new_bytes_used = size - bytes_used;
-
- bytes_used += new_bytes_used >> 1;
- fragment_free_space(cache);
- }
-#endif
/*
* Ensure the corresponding space_info object is created and
* assigned to our block group. We want our bg to be added to the rbtree
@@ -2557,12 +2521,17 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
* the rbtree, update the space info's counters.
*/
trace_btrfs_add_block_group(fs_info, cache, 1);
- btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
- cache->bytes_super, cache->zone_unusable,
- &cache->space_info);
+ btrfs_add_bg_to_space_info(fs_info, cache);
btrfs_update_global_block_rsv(fs_info);
- link_block_group(cache);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(cache)) {
+ u64 new_bytes_used = size - bytes_used;
+
+ cache->space_info->bytes_used += new_bytes_used >> 1;
+ fragment_free_space(cache);
+ }
+#endif
list_add_tail(&cache->bg_list, &trans->new_bgs);
trans->delayed_ref_updates++;
@@ -2659,6 +2628,14 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
+ /*
+ * We have allocated a new chunk. We also need to activate that chunk to
+	 * grant metadata tickets for zoned filesystems.
+ */
+ ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
+ if (ret < 0)
+ goto out;
+
ret = inc_block_group_ro(cache, 0);
if (ret == -ETXTBSY)
goto unlock_out;
@@ -2871,7 +2848,7 @@ again:
cache_size *= fs_info->sectorsize;
ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
- cache_size);
+ cache_size, false);
if (ret)
goto out_put;
@@ -3297,7 +3274,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
* space back to the block group, otherwise we will leak space.
*/
if (!alloc && !btrfs_block_group_done(cache))
- btrfs_cache_block_group(cache, 1);
+ btrfs_cache_block_group(cache, true);
byte_in_group = bytenr - cache->start;
WARN_ON(byte_in_group > cache->length);
@@ -3761,6 +3738,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
* attempt.
*/
wait_for_alloc = true;
+ force = CHUNK_ALLOC_NO_FORCE;
spin_unlock(&space_info->lock);
mutex_lock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->chunk_mutex);
@@ -3884,6 +3862,14 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans,
ret = PTR_ERR(bg);
} else {
/*
+ * We have a new chunk. We also need to activate it for
+		 * zoned filesystems.
+ */
+ ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
+ if (ret < 0)
+ return;
+
+ /*
* If we fail to add the chunk item here, we end up
* trying again at phase 2 of chunk allocation, at
* btrfs_create_pending_block_groups(). So ignore
@@ -3958,35 +3944,24 @@ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
struct btrfs_block_group *block_group;
- u64 last = 0;
- while (1) {
- struct inode *inode;
+ block_group = btrfs_lookup_first_block_group(info, 0);
+ while (block_group) {
+ btrfs_wait_block_group_cache_done(block_group);
+ spin_lock(&block_group->lock);
+ if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
+ &block_group->runtime_flags)) {
+ struct inode *inode = block_group->inode;
- block_group = btrfs_lookup_first_block_group(info, last);
- while (block_group) {
- btrfs_wait_block_group_cache_done(block_group);
- spin_lock(&block_group->lock);
- if (block_group->iref)
- break;
+ block_group->inode = NULL;
spin_unlock(&block_group->lock);
- block_group = btrfs_next_block_group(block_group);
- }
- if (!block_group) {
- if (last == 0)
- break;
- last = 0;
- continue;
- }
- inode = block_group->inode;
- block_group->iref = 0;
- block_group->inode = NULL;
- spin_unlock(&block_group->lock);
- ASSERT(block_group->io_ctl.inode == NULL);
- iput(inode);
- last = block_group->start + block_group->length;
- btrfs_put_block_group(block_group);
+ ASSERT(block_group->io_ctl.inode == NULL);
+ iput(inode);
+ } else {
+ spin_unlock(&block_group->lock);
+ }
+ block_group = btrfs_next_block_group(block_group);
}
}
@@ -4122,7 +4097,7 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
spin_lock(&block_group->lock);
cleanup = (atomic_dec_and_test(&block_group->frozen) &&
- block_group->removed);
+ test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
spin_unlock(&block_group->lock);
if (cleanup) {
@@ -4143,7 +4118,7 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
* tasks trimming this block group have left 1 entry each one.
* Free them if any.
*/
- __btrfs_remove_free_space_cache(block_group->free_space_ctl);
+ btrfs_remove_free_space_cache(block_group);
}
}
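
Several hunks above replace the block group's one-bit bitfields (iref, removed, chunk_item_inserted, zone_is_active, ...) with set_bit()/test_bit()/test_and_clear_bit() on a single runtime_flags word, so individual flags can be flipped atomically and read-modify-test operations need no extra locking. A rough userspace analogue of that pattern, using C11 atomics in place of the kernel's bitops; struct and flag names are only illustrative:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the BLOCK_GROUP_FLAG_* bit indexes. */
enum { FLAG_IREF, FLAG_REMOVED, FLAG_CHUNK_ITEM_INSERTED };

struct block_group {
	atomic_ulong runtime_flags;	/* one word holds all the boolean flags */
};

static void set_flag(struct block_group *bg, int bit)
{
	atomic_fetch_or(&bg->runtime_flags, 1UL << bit);
}

static bool test_flag(struct block_group *bg, int bit)
{
	return atomic_load(&bg->runtime_flags) & (1UL << bit);
}

/*
 * Like test_and_clear_bit(): return the old value and clear the bit in one
 * atomic step, something a plain ':1' bitfield cannot offer.
 */
static bool test_and_clear_flag(struct block_group *bg, int bit)
{
	return atomic_fetch_and(&bg->runtime_flags, ~(1UL << bit)) & (1UL << bit);
}

int main(void)
{
	struct block_group bg = { 0 };

	set_flag(&bg, FLAG_IREF);
	printf("iref was set: %d\n", test_and_clear_flag(&bg, FLAG_IREF));
	printf("iref now:     %d\n", test_flag(&bg, FLAG_IREF));
	printf("removed:      %d\n", test_flag(&bg, FLAG_REMOVED));
	return 0;
}
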
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 3ac668ace50a..8fb14b99a1d1 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -46,19 +46,44 @@ enum btrfs_chunk_alloc_enum {
CHUNK_ALLOC_FORCE_FOR_EXTENT,
};
+/* Block group flags set at runtime */
+enum btrfs_block_group_flags {
+ BLOCK_GROUP_FLAG_IREF,
+ BLOCK_GROUP_FLAG_REMOVED,
+ BLOCK_GROUP_FLAG_TO_COPY,
+ BLOCK_GROUP_FLAG_RELOCATING_REPAIR,
+ BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
+ BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
+};
+
+enum btrfs_caching_type {
+ BTRFS_CACHE_NO,
+ BTRFS_CACHE_STARTED,
+ BTRFS_CACHE_FINISHED,
+ BTRFS_CACHE_ERROR,
+};
+
struct btrfs_caching_control {
struct list_head list;
struct mutex mutex;
wait_queue_head_t wait;
struct btrfs_work work;
struct btrfs_block_group *block_group;
- u64 progress;
refcount_t count;
};
/* Once caching_thread() finds this much free space, it will wake up waiters. */
#define CACHING_CTL_WAKE_UP SZ_2M
+/*
+ * Tree to record all locked full stripes of a RAID5/6 block group
+ */
+struct btrfs_full_stripe_locks_tree {
+ struct rb_root root;
+ struct mutex lock;
+};
+
struct btrfs_block_group {
struct btrfs_fs_info *fs_info;
struct inode *inode;
@@ -95,22 +120,15 @@ struct btrfs_block_group {
/* For raid56, this is a full stripe, without parity */
unsigned long full_stripe_len;
+ unsigned long runtime_flags;
unsigned int ro;
- unsigned int iref:1;
- unsigned int has_caching_ctl:1;
- unsigned int removed:1;
- unsigned int to_copy:1;
- unsigned int relocating_repair:1;
- unsigned int chunk_item_inserted:1;
- unsigned int zone_is_active:1;
int disk_cache_state;
/* Cache tracking stuff */
int cached;
struct btrfs_caching_control *caching_ctl;
- u64 last_byte_to_unpin;
struct btrfs_space_info *space_info;
@@ -262,9 +280,7 @@ void btrfs_dec_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes);
-int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache);
-int btrfs_cache_block_group(struct btrfs_block_group *cache,
- int load_cache_only);
+int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait);
void btrfs_put_caching_control(struct btrfs_caching_control *ctl);
struct btrfs_caching_control *btrfs_get_caching_control(
struct btrfs_block_group *cache);
@@ -306,8 +322,6 @@ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
-void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
- struct btrfs_caching_control *caching_ctl);
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
struct block_device *bdev, u64 physical, u64 **logical,
int *naddrs, int *stripe_len);
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index b3ee49b0b1e8..ec96285357e0 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -118,7 +118,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
if (block_rsv->reserved >= block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
block_rsv->reserved = block_rsv->size;
- block_rsv->full = 1;
+ block_rsv->full = true;
} else {
num_bytes = 0;
}
@@ -142,7 +142,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
bytes_to_add = min(num_bytes, bytes_to_add);
dest->reserved += bytes_to_add;
if (dest->reserved >= dest->size)
- dest->full = 1;
+ dest->full = true;
num_bytes -= bytes_to_add;
}
spin_unlock(&dest->lock);
@@ -171,7 +171,7 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
return 0;
}
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type)
{
memset(rsv, 0, sizeof(*rsv));
spin_lock_init(&rsv->lock);
@@ -180,7 +180,7 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv,
- unsigned short type)
+ enum btrfs_rsv_type type)
{
btrfs_init_block_rsv(rsv, type);
rsv->space_info = btrfs_find_space_info(fs_info,
@@ -188,7 +188,7 @@ void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
}
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
- unsigned short type)
+ enum btrfs_rsv_type type)
{
struct btrfs_block_rsv *block_rsv;
@@ -286,7 +286,7 @@ u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
*/
if (block_rsv == delayed_rsv)
target = global_rsv;
- else if (block_rsv != global_rsv && !delayed_rsv->full)
+ else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv))
target = delayed_rsv;
if (target && block_rsv->space_info != target->space_info)
@@ -304,7 +304,7 @@ int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes)
if (block_rsv->reserved >= num_bytes) {
block_rsv->reserved -= num_bytes;
if (block_rsv->reserved < block_rsv->size)
- block_rsv->full = 0;
+ block_rsv->full = false;
ret = 0;
}
spin_unlock(&block_rsv->lock);
@@ -319,7 +319,7 @@ void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
if (update_size)
block_rsv->size += num_bytes;
else if (block_rsv->reserved >= block_rsv->size)
- block_rsv->full = 1;
+ block_rsv->full = true;
spin_unlock(&block_rsv->lock);
}
@@ -341,7 +341,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
}
global_rsv->reserved -= num_bytes;
if (global_rsv->reserved < global_rsv->size)
- global_rsv->full = 0;
+ global_rsv->full = false;
spin_unlock(&global_rsv->lock);
btrfs_block_rsv_add_bytes(dest, num_bytes, true);
@@ -408,10 +408,7 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
btrfs_try_granting_tickets(fs_info, sinfo);
}
- if (block_rsv->reserved == block_rsv->size)
- block_rsv->full = 1;
- else
- block_rsv->full = 0;
+ block_rsv->full = (block_rsv->reserved == block_rsv->size);
if (block_rsv->size >= sinfo->total_bytes)
sinfo->force_alloc = CHUNK_ALLOC_FORCE;
@@ -427,6 +424,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
case BTRFS_CSUM_TREE_OBJECTID:
case BTRFS_EXTENT_TREE_OBJECTID:
case BTRFS_FREE_SPACE_TREE_OBJECTID:
+ case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
root->block_rsv = &fs_info->delayed_refs_rsv;
break;
case BTRFS_ROOT_TREE_OBJECTID:
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 3b67ff08d434..578c3497a455 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -9,7 +9,7 @@ enum btrfs_reserve_flush_enum;
/*
* Types of block reserves
*/
-enum {
+enum btrfs_rsv_type {
BTRFS_BLOCK_RSV_GLOBAL,
BTRFS_BLOCK_RSV_DELALLOC,
BTRFS_BLOCK_RSV_TRANS,
@@ -25,9 +25,10 @@ struct btrfs_block_rsv {
u64 reserved;
struct btrfs_space_info *space_info;
spinlock_t lock;
- unsigned short full;
- unsigned short type;
- unsigned short failfast;
+ bool full;
+ bool failfast;
+ /* Block reserve type, one of BTRFS_BLOCK_RSV_* */
+ enum btrfs_rsv_type type:8;
/*
* Qgroup equivalent for @size @reserved
@@ -49,13 +50,13 @@ struct btrfs_block_rsv {
u64 qgroup_rsv_reserved;
};
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type);
void btrfs_init_root_block_rsv(struct btrfs_root *root);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
- unsigned short type);
+ enum btrfs_rsv_type type);
void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv,
- unsigned short type);
+ enum btrfs_rsv_type type);
void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
@@ -91,4 +92,13 @@ static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL);
}
+/*
+ * Fast path to check if the reserve is full, may be carefully used outside of
+ * locks.
+ */
+static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv)
+{
+ return data_race(rsv->full);
+}
+
#endif /* BTRFS_BLOCK_RSV_H */
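
btrfs_block_rsv_full() reads rsv->full outside the spinlock that writers hold when updating it, and wraps the access in data_race() to tell KCSAN the race is intentional: a stale answer only sends the caller down the slow path once more. A small userspace analogue of that writer-under-lock, lockless-fast-path-reader shape, using a relaxed C11 atomic where the kernel relies on data_race(); all names below are illustrative:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

/* 'full' is kept in sync under 'lock', but may be peeked at without it. */
struct block_rsv {
	pthread_mutex_t lock;
	unsigned long long size;
	unsigned long long reserved;
	atomic_bool full;
};

static void rsv_add_bytes(struct block_rsv *rsv, unsigned long long bytes)
{
	pthread_mutex_lock(&rsv->lock);
	rsv->reserved += bytes;
	atomic_store_explicit(&rsv->full, rsv->reserved >= rsv->size,
			      memory_order_relaxed);
	pthread_mutex_unlock(&rsv->lock);
}

/* Fast path: tolerates a momentarily stale value instead of taking the lock. */
static bool rsv_looks_full(struct block_rsv *rsv)
{
	return atomic_load_explicit(&rsv->full, memory_order_relaxed);
}

int main(void)
{
	struct block_rsv rsv = { .lock = PTHREAD_MUTEX_INITIALIZER, .size = 100 };

	rsv_add_bytes(&rsv, 100);
	return rsv_looks_full(&rsv) ? 0 : 1;
}
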
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 33811e896623..54c2ccb36b61 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -65,6 +65,8 @@ enum {
* on the same file.
*/
BTRFS_INODE_VERITY_IN_PROGRESS,
+ /* Set when this inode is a free space inode. */
+ BTRFS_INODE_FREE_SPACE_INODE,
};
/* in memory btrfs inode */
@@ -94,7 +96,8 @@ struct btrfs_inode {
/* special utility tree used to record which mirrors have already been
* tried when checksums fail for a given block
*/
- struct extent_io_tree io_failure_tree;
+ struct rb_root io_failure_tree;
+ spinlock_t io_failure_lock;
/*
* Keep track of where the inode has extent items mapped in order to
@@ -250,11 +253,6 @@ struct btrfs_inode {
struct inode vfs_inode;
};
-static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode)
-{
- return inode->root->fs_info->sectorsize;
-}
-
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
{
return container_of(inode, struct btrfs_inode, vfs_inode);
@@ -272,26 +270,31 @@ static inline unsigned long btrfs_inode_hash(u64 objectid,
return (unsigned long)h;
}
-static inline void btrfs_insert_inode_hash(struct inode *inode)
-{
- unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root);
-
- __insert_inode_hash(inode, h);
-}
+#if BITS_PER_LONG == 32
+/*
+ * On 32 bit systems the i_ino of struct inode is 32 bits (unsigned long), so
+ * we use the inode's location objectid which is a u64 to avoid truncation.
+ */
static inline u64 btrfs_ino(const struct btrfs_inode *inode)
{
u64 ino = inode->location.objectid;
- /*
- * !ino: btree_inode
- * type == BTRFS_ROOT_ITEM_KEY: subvol dir
- */
- if (!ino || inode->location.type == BTRFS_ROOT_ITEM_KEY)
+ /* type == BTRFS_ROOT_ITEM_KEY: subvol dir */
+ if (inode->location.type == BTRFS_ROOT_ITEM_KEY)
ino = inode->vfs_inode.i_ino;
return ino;
}
+#else
+
+static inline u64 btrfs_ino(const struct btrfs_inode *inode)
+{
+ return inode->vfs_inode.i_ino;
+}
+
+#endif
+
static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
{
i_size_write(&inode->vfs_inode, size);
@@ -300,14 +303,7 @@ static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
{
- struct btrfs_root *root = inode->root;
-
- if (root == root->fs_info->tree_root &&
- btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
- return true;
- if (inode->location.objectid == BTRFS_FREE_INO_OBJECTID)
- return true;
- return false;
+ return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags);
}
static inline bool is_data_inode(struct inode *inode)
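
The btrfs_ino() split above exists because on 32-bit systems struct inode's i_ino is an unsigned long and would silently truncate a 64-bit object id, so those builds return the inode's location objectid instead (subvolume directories excepted). A tiny standalone illustration of the truncation the #if guards against; the example value is arbitrary:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* An object id that does not fit in 32 bits. */
	uint64_t location_objectid = 0x100000280ULL;

	/* What a 32-bit 'unsigned long' i_ino would keep of it. */
	uint32_t i_ino_32 = (uint32_t)location_objectid;

	printf("objectid:        %" PRIu64 "\n", location_objectid);
	printf("truncated i_ino: %" PRIu32 "\n", i_ino_32);
	return 0;
}
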
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 5d20137b7b67..98c6e5feab19 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -152,7 +152,7 @@ struct btrfsic_block {
struct btrfsic_block *next_in_same_bio;
void *orig_bio_private;
bio_end_io_t *orig_bio_end_io;
- int submit_bio_bh_rw;
+ blk_opf_t submit_bio_bh_rw;
u64 flush_gen; /* only valid if !never_written */
};
@@ -1681,7 +1681,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
u64 dev_bytenr, char **mapped_datav,
unsigned int num_pages,
struct bio *bio, int *bio_is_patched,
- int submit_bio_bh_rw)
+ blk_opf_t submit_bio_bh_rw)
{
int is_metadata;
struct btrfsic_block *block;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index f4564f32f6d9..e6635fe70067 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -8,6 +8,7 @@
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
+#include <linux/pagevec.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/time.h>
@@ -15,6 +16,7 @@
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
+#include <linux/psi.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/log2.h>
@@ -136,109 +138,14 @@ static int compression_decompress(int type, struct list_head *ws,
static int btrfs_decompress_bio(struct compressed_bio *cb);
-static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
- unsigned long disk_size)
-{
- return sizeof(struct compressed_bio) +
- (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size;
-}
-
-static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
- u64 disk_start)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- const u32 csum_size = fs_info->csum_size;
- const u32 sectorsize = fs_info->sectorsize;
- struct page *page;
- unsigned int i;
- char *kaddr;
- u8 csum[BTRFS_CSUM_SIZE];
- struct compressed_bio *cb = bio->bi_private;
- u8 *cb_sum = cb->sums;
-
- if ((inode->flags & BTRFS_INODE_NODATASUM) ||
- test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
- return 0;
-
- shash->tfm = fs_info->csum_shash;
-
- for (i = 0; i < cb->nr_pages; i++) {
- u32 pg_offset;
- u32 bytes_left = PAGE_SIZE;
- page = cb->compressed_pages[i];
-
- /* Determine the remaining bytes inside the page first */
- if (i == cb->nr_pages - 1)
- bytes_left = cb->compressed_len - i * PAGE_SIZE;
-
- /* Hash through the page sector by sector */
- for (pg_offset = 0; pg_offset < bytes_left;
- pg_offset += sectorsize) {
- kaddr = kmap_atomic(page);
- crypto_shash_digest(shash, kaddr + pg_offset,
- sectorsize, csum);
- kunmap_atomic(kaddr);
-
- if (memcmp(&csum, cb_sum, csum_size) != 0) {
- btrfs_print_data_csum_error(inode, disk_start,
- csum, cb_sum, cb->mirror_num);
- if (btrfs_bio(bio)->device)
- btrfs_dev_stat_inc_and_print(
- btrfs_bio(bio)->device,
- BTRFS_DEV_STAT_CORRUPTION_ERRS);
- return -EIO;
- }
- cb_sum += csum_size;
- disk_start += sectorsize;
- }
- }
- return 0;
-}
-
-/*
- * Reduce bio and io accounting for a compressed_bio with its corresponding bio.
- *
- * Return true if there is no pending bio nor io.
- * Return false otherwise.
- */
-static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
- unsigned int bi_size = 0;
- bool last_io = false;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- /*
- * At endio time, bi_iter.bi_size doesn't represent the real bio size.
- * Thus here we have to iterate through all segments to grab correct
- * bio size.
- */
- bio_for_each_segment_all(bvec, bio, iter_all)
- bi_size += bvec->bv_len;
-
- if (bio->bi_status)
- cb->status = bio->bi_status;
-
- ASSERT(bi_size && bi_size <= cb->compressed_len);
- last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
- &cb->pending_sectors);
- /*
- * Here we must wake up the possible error handler after all other
- * operations on @cb finished, or we can race with
- * finish_compressed_bio_*() which may free @cb.
- */
- wake_up_var(cb);
-
- return last_io;
-}
-
static void finish_compressed_bio_read(struct compressed_bio *cb)
{
unsigned int index;
struct page *page;
+ if (cb->status == BLK_STS_OK)
+ cb->status = errno_to_blk_status(btrfs_decompress_bio(cb));
+
/* Release the compressed pages */
for (index = 0; index < cb->nr_pages; index++) {
page = cb->compressed_pages[index];
@@ -247,86 +154,60 @@ static void finish_compressed_bio_read(struct compressed_bio *cb)
}
/* Do io completion on the original bio */
- if (cb->status != BLK_STS_OK) {
- cb->orig_bio->bi_status = cb->status;
- bio_endio(cb->orig_bio);
- } else {
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- /*
- * We have verified the checksum already, set page checked so
- * the end_io handlers know about it
- */
- ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
- u64 bvec_start = page_offset(bvec->bv_page) +
- bvec->bv_offset;
-
- btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb),
- bvec->bv_page, bvec_start,
- bvec->bv_len);
- }
-
- bio_endio(cb->orig_bio);
- }
+ btrfs_bio_end_io(btrfs_bio(cb->orig_bio), cb->status);
/* Finally free the cb struct */
kfree(cb->compressed_pages);
kfree(cb);
}
-/* when we finish reading compressed pages from the disk, we
- * decompress them and then run the bio end_io routines on the
- * decompressed pages (in the inode address space).
- *
- * This allows the checksumming and other IO error handling routines
- * to work normally
- *
- * The compressed pages are freed here, and it must be run
- * in process context
+/*
+ * Verify the checksums and kick off repair if needed on the uncompressed data
+ * before decompressing it into the original bio and freeing the uncompressed
+ * pages.
*/
-static void end_compressed_bio_read(struct bio *bio)
+static void end_compressed_bio_read(struct btrfs_bio *bbio)
{
- struct compressed_bio *cb = bio->bi_private;
- struct inode *inode;
- unsigned int mirror = btrfs_bio(bio)->mirror_num;
- int ret = 0;
-
- if (!dec_and_test_compressed_bio(cb, bio))
- goto out;
-
- /*
- * Record the correct mirror_num in cb->orig_bio so that
- * read-repair can work properly.
- */
- btrfs_bio(cb->orig_bio)->mirror_num = mirror;
- cb->mirror_num = mirror;
-
- /*
- * Some IO in this cb have failed, just skip checksum as there
- * is no way it could be correct.
- */
- if (cb->status != BLK_STS_OK)
- goto csum_failed;
-
- inode = cb->inode;
- ret = check_compressed_csum(BTRFS_I(inode), bio,
- bio->bi_iter.bi_sector << 9);
- if (ret)
- goto csum_failed;
+ struct compressed_bio *cb = bbio->private;
+ struct inode *inode = cb->inode;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *bi = BTRFS_I(inode);
+ bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) &&
+ !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+ blk_status_t status = bbio->bio.bi_status;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ u32 offset;
+
+ btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
+ u64 start = bbio->file_offset + offset;
+
+ if (!status &&
+ (!csum || !btrfs_check_data_csum(inode, bbio, offset,
+ bv.bv_page, bv.bv_offset))) {
+ btrfs_clean_io_failure(bi, start, bv.bv_page,
+ bv.bv_offset);
+ } else {
+ int ret;
+
+ refcount_inc(&cb->pending_ios);
+ ret = btrfs_repair_one_sector(inode, bbio, offset,
+ bv.bv_page, bv.bv_offset,
+ btrfs_submit_data_read_bio);
+ if (ret) {
+ refcount_dec(&cb->pending_ios);
+ status = errno_to_blk_status(ret);
+ }
+ }
+ }
- /* ok, we're the last bio for this extent, lets start
- * the decompression.
- */
- ret = btrfs_decompress_bio(cb);
+ if (status)
+ cb->status = status;
-csum_failed:
- if (ret)
- cb->status = errno_to_blk_status(ret);
- finish_compressed_bio_read(cb);
-out:
- bio_put(bio);
+ if (refcount_dec_and_test(&cb->pending_ios))
+ finish_compressed_bio_read(cb);
+ btrfs_bio_free_csum(bbio);
+ bio_put(&bbio->bio);
}
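
The pending_ios counter works like a plain reference count: the submitter sets it to 1 for itself, every bio allocated for the compressed_bio takes another reference, and whichever context drops the count to zero, submitter or end_io handler, runs the finish routine. Below is a minimal user-space sketch of that ownership scheme, using C11 atomics and pthreads in place of bios; the struct and function names only mirror the kernel code and nothing here is btrfs API.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct compressed_bio {
	atomic_int pending_ios;	/* starts at 1: the submitter's own reference */
	int status;
};

static void finish_compressed_bio(struct compressed_bio *cb)
{
	/* Runs exactly once, in whichever context dropped the last reference. */
	printf("finish, status=%d\n", cb->status);
}

static void put_compressed_bio(struct compressed_bio *cb)
{
	/* atomic_fetch_sub() returns the old value: old == 1 means we were last. */
	if (atomic_fetch_sub(&cb->pending_ios, 1) == 1)
		finish_compressed_bio(cb);
}

static void *endio(void *arg)	/* stands in for a bio end_io handler */
{
	put_compressed_bio(arg);
	return NULL;
}

int main(void)
{
	struct compressed_bio cb = { .pending_ios = 1, .status = 0 };
	pthread_t t[3];

	for (int i = 0; i < 3; i++) {
		atomic_fetch_add(&cb.pending_ios, 1);	/* one reference per "bio" */
		pthread_create(&t[i], NULL, endio, &cb);
	}
	/* Submitter drops its own reference; no wait_var_event() dance needed. */
	put_compressed_bio(&cb);
	for (int i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	return 0;
}
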
/*
@@ -339,8 +220,7 @@ static noinline void end_compressed_writeback(struct inode *inode,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long index = cb->start >> PAGE_SHIFT;
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
- struct page *pages[16];
- unsigned long nr_pages = end_index - index + 1;
+ struct folio_batch fbatch;
const int errno = blk_status_to_errno(cb->status);
int i;
int ret;
@@ -348,24 +228,23 @@ static noinline void end_compressed_writeback(struct inode *inode,
if (errno)
mapping_set_error(inode->i_mapping, errno);
- while (nr_pages > 0) {
- ret = find_get_pages_contig(inode->i_mapping, index,
- min_t(unsigned long,
- nr_pages, ARRAY_SIZE(pages)), pages);
- if (ret == 0) {
- nr_pages -= 1;
- index += 1;
- continue;
- }
+ folio_batch_init(&fbatch);
+ while (index <= end_index) {
+ ret = filemap_get_folios(inode->i_mapping, &index, end_index,
+ &fbatch);
+
+ if (ret == 0)
+ return;
+
for (i = 0; i < ret; i++) {
+ struct folio *folio = fbatch.folios[i];
+
if (errno)
- SetPageError(pages[i]);
- btrfs_page_clamp_clear_writeback(fs_info, pages[i],
+ folio_set_error(folio);
+ btrfs_page_clamp_clear_writeback(fs_info, &folio->page,
cb->start, cb->len);
- put_page(pages[i]);
}
- nr_pages -= ret;
- index += ret;
+ folio_batch_release(&fbatch);
}
/* the inode may be gone now */
}
@@ -403,6 +282,14 @@ static void finish_compressed_bio_write(struct compressed_bio *cb)
kfree(cb);
}
+static void btrfs_finish_compressed_write_work(struct work_struct *work)
+{
+ struct compressed_bio *cb =
+ container_of(work, struct compressed_bio, write_end_work);
+
+ finish_compressed_bio_write(cb);
+}
+
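
The work handler above only receives a pointer to the embedded work_struct and recovers the surrounding compressed_bio with container_of(), which lets the heavyweight write completion run from a workqueue instead of bio end_io context. A stand-alone sketch of that recovery step, using a simplified offsetof-based container_of (the kernel macro adds type checking):

#include <stddef.h>
#include <stdio.h>

/* Simplified container_of: the kernel version adds compile-time type checks. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work_struct { int pending; };

struct compressed_bio {
	int status;
	struct work_struct write_end_work;	/* embedded member */
};

/* The work handler only gets the embedded member... */
static void write_end_work_fn(struct work_struct *work)
{
	/* ...and recovers the containing compressed_bio from it. */
	struct compressed_bio *cb =
		container_of(work, struct compressed_bio, write_end_work);

	printf("finishing write, status=%d\n", cb->status);
}

int main(void)
{
	struct compressed_bio cb = { .status = 0 };

	write_end_work_fn(&cb.write_end_work);
	return 0;
}
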
/*
* Do the cleanup once all the compressed pages hit the disk. This will clear
* writeback on the file pages and free the compressed pages.
@@ -410,31 +297,20 @@ static void finish_compressed_bio_write(struct compressed_bio *cb)
* This also calls the writeback end hooks for the file pages so that metadata
* and checksums can be updated in the file.
*/
-static void end_compressed_bio_write(struct bio *bio)
+static void end_compressed_bio_write(struct btrfs_bio *bbio)
{
- struct compressed_bio *cb = bio->bi_private;
-
- if (!dec_and_test_compressed_bio(cb, bio))
- goto out;
+ struct compressed_bio *cb = bbio->private;
- btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+ if (bbio->bio.bi_status)
+ cb->status = bbio->bio.bi_status;
- finish_compressed_bio_write(cb);
-out:
- bio_put(bio);
-}
+ if (refcount_dec_and_test(&cb->pending_ios)) {
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
-static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
- struct bio *bio, int mirror_num)
-{
- blk_status_t ret;
-
- ASSERT(bio->bi_iter.bi_size);
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- if (ret)
- return ret;
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- return ret;
+ btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio);
+ queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
+ }
+ bio_put(&bbio->bio);
}
/*
@@ -455,7 +331,8 @@ static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
- unsigned int opf, bio_end_io_t endio_func,
+ blk_opf_t opf,
+ btrfs_bio_end_io_t endio_func,
u64 *next_stripe_start)
{
struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
@@ -464,12 +341,8 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte
struct bio *bio;
int ret;
- bio = btrfs_bio_alloc(BIO_MAX_VECS);
-
+ bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb);
bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
- bio->bi_opf = opf;
- bio->bi_private = cb;
- bio->bi_end_io = endio_func;
em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
if (IS_ERR(em)) {
@@ -487,7 +360,7 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte
return ERR_PTR(ret);
}
*next_stripe_start = disk_bytenr + geom.len;
-
+ refcount_inc(&cb->pending_ios);
return bio;
}
@@ -505,7 +378,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned int compressed_len,
struct page **compressed_pages,
unsigned int nr_pages,
- unsigned int write_flags,
+ blk_opf_t write_flags,
struct cgroup_subsys_state *blkcg_css,
bool writeback)
{
@@ -514,26 +387,25 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
struct compressed_bio *cb;
u64 cur_disk_bytenr = disk_start;
u64 next_stripe_start;
- blk_status_t ret;
+ blk_status_t ret = BLK_STS_OK;
int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
const bool use_append = btrfs_use_zone_append(inode, disk_start);
- const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
+ const enum req_op bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(len, fs_info->sectorsize));
- cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
+ cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
if (!cb)
return BLK_STS_RESOURCE;
- refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
+ refcount_set(&cb->pending_ios, 1);
cb->status = BLK_STS_OK;
cb->inode = &inode->vfs_inode;
cb->start = start;
cb->len = len;
- cb->mirror_num = 0;
cb->compressed_pages = compressed_pages;
cb->compressed_len = compressed_len;
cb->writeback = writeback;
- cb->orig_bio = NULL;
+ INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
cb->nr_pages = nr_pages;
if (blkcg_css)
@@ -554,8 +426,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
&next_stripe_start);
if (IS_ERR(bio)) {
ret = errno_to_blk_status(PTR_ERR(bio));
- bio = NULL;
- goto finish_cb;
+ break;
}
if (blkcg_css)
bio->bi_opf |= REQ_CGROUP_PUNT;
@@ -599,44 +470,24 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
if (submit) {
if (!skip_sum) {
ret = btrfs_csum_one_bio(inode, bio, start, true);
- if (ret)
- goto finish_cb;
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ break;
+ }
}
- ret = submit_compressed_bio(fs_info, bio, 0);
- if (ret)
- goto finish_cb;
+ ASSERT(bio->bi_iter.bi_size);
+ btrfs_submit_bio(fs_info, bio, 0);
bio = NULL;
}
cond_resched();
}
- if (blkcg_css)
- kthread_associate_blkcg(NULL);
-
- return 0;
-finish_cb:
if (blkcg_css)
kthread_associate_blkcg(NULL);
- if (bio) {
- bio->bi_status = ret;
- bio_endio(bio);
- }
- /* Last byte of @cb is submitted, endio will free @cb */
- if (cur_disk_bytenr == disk_start + compressed_len)
- return ret;
-
- wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
- (disk_start + compressed_len - cur_disk_bytenr) >>
- fs_info->sectorsize_bits);
- /*
- * Even with previous bio ended, we should still have io not yet
- * submitted, thus need to finish manually.
- */
- ASSERT(refcount_read(&cb->pending_sectors));
- /* Now we are the only one referring @cb, can finish it safely. */
- finish_compressed_bio_write(cb);
+ if (refcount_dec_and_test(&cb->pending_ios))
+ finish_compressed_bio_write(cb);
return ret;
}
@@ -660,7 +511,8 @@ static u64 bio_end_offset(struct bio *bio)
*/
static noinline int add_ra_bio_pages(struct inode *inode,
u64 compressed_end,
- struct compressed_bio *cb)
+ struct compressed_bio *cb,
+ int *memstall, unsigned long *pflags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long end_index;
@@ -729,6 +581,11 @@ static noinline int add_ra_bio_pages(struct inode *inode,
continue;
}
+ if (!*memstall && PageWorkingset(page)) {
+ psi_memstall_enter(pflags);
+ *memstall = 1;
+ }
+
ret = set_page_extent_mapped(page);
if (ret < 0) {
unlock_page(page);
@@ -737,7 +594,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
}
page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
- lock_extent(tree, cur, page_end);
+ lock_extent(tree, cur, page_end, NULL);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
read_unlock(&em_tree->lock);
@@ -751,7 +608,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
(cur + fs_info->sectorsize > extent_map_end(em)) ||
(em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
- unlock_extent(tree, cur, page_end);
+ unlock_extent(tree, cur, page_end, NULL);
unlock_page(page);
put_page(page);
break;
@@ -765,14 +622,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
int zeros;
zeros = PAGE_SIZE - zero_offset;
memzero_page(page, zero_offset, zeros);
- flush_dcache_page(page);
}
}
add_size = min(em->start + em->len, page_end + 1) - cur;
ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
if (ret != add_size) {
- unlock_extent(tree, cur, page_end);
+ unlock_extent(tree, cur, page_end, NULL);
unlock_page(page);
put_page(page);
break;
@@ -816,10 +672,11 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
u64 em_len;
u64 em_start;
struct extent_map *em;
+ unsigned long pflags;
+ int memstall = 0;
blk_status_t ret;
int ret2;
int i;
- u8 *sums;
em_tree = &BTRFS_I(inode)->extent_tree;
@@ -837,17 +694,15 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
compressed_len = em->block_len;
- cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
+ cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
if (!cb) {
ret = BLK_STS_RESOURCE;
goto out;
}
- refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
+ refcount_set(&cb->pending_ios, 1);
cb->status = BLK_STS_OK;
cb->inode = inode;
- cb->mirror_num = mirror_num;
- sums = cb->sums;
cb->start = em->orig_start;
em_len = em->len;
@@ -874,7 +729,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
goto fail;
}
- add_ra_bio_pages(inode, em_start + em_len, cb);
+ add_ra_bio_pages(inode, em_start + em_len, cb, &memstall, &pflags);
/* include any pages we added in add_ra-bio_pages */
cb->len = bio->bi_iter.bi_size;
@@ -893,9 +748,8 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
REQ_OP_READ, end_compressed_bio_read,
&next_stripe_start);
if (IS_ERR(comp_bio)) {
- ret = errno_to_blk_status(PTR_ERR(comp_bio));
- comp_bio = NULL;
- goto finish_cb;
+ cb->status = errno_to_blk_status(PTR_ERR(comp_bio));
+ break;
}
}
/*
@@ -931,22 +785,35 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
submit = true;
if (submit) {
- unsigned int nr_sectors;
+ /* Save the original iter for read repair */
+ if (bio_op(comp_bio) == REQ_OP_READ)
+ btrfs_bio(comp_bio)->iter = comp_bio->bi_iter;
- ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
- if (ret)
- goto finish_cb;
+ /*
+ * Save the initial offset of this chunk, as there
+ * is no direct correlation between compressed pages and
+ * the original file offset. The field is only used for
+ * printing error messages.
+ */
+ btrfs_bio(comp_bio)->file_offset = file_offset;
- nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
- fs_info->sectorsize);
- sums += fs_info->csum_size * nr_sectors;
+ ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL);
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(comp_bio), ret);
+ break;
+ }
- ret = submit_compressed_bio(fs_info, comp_bio, mirror_num);
- if (ret)
- goto finish_cb;
+ ASSERT(comp_bio->bi_iter.bi_size);
+ btrfs_submit_bio(fs_info, comp_bio, mirror_num);
comp_bio = NULL;
}
}
+
+ if (memstall)
+ psi_memstall_leave(&pflags);
+
+ if (refcount_dec_and_test(&cb->pending_ios))
+ finish_compressed_bio_read(cb);
return;
fail:
@@ -961,28 +828,8 @@ fail:
kfree(cb);
out:
free_extent_map(em);
- bio->bi_status = ret;
- bio_endio(bio);
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
return;
-finish_cb:
- if (comp_bio) {
- comp_bio->bi_status = ret;
- bio_endio(comp_bio);
- }
- /* All bytes of @cb is submitted, endio will free @cb */
- if (cur_disk_byte == disk_bytenr + compressed_len)
- return;
-
- wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
- (disk_bytenr + compressed_len - cur_disk_byte) >>
- fs_info->sectorsize_bits);
- /*
- * Even with previous bio ended, we should still have io not yet
- * submitted, thus need to finish @cb manually.
- */
- ASSERT(refcount_read(&cb->pending_sectors));
- /* Now we are the only one referring @cb, can finish it safely. */
- finish_compressed_bio_read(cb);
}
/*
@@ -1481,7 +1328,6 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
ASSERT(copy_start - decompressed < buf_len);
memcpy_to_page(bvec.bv_page, bvec.bv_offset,
buf + copy_start - decompressed, copy_len);
- flush_dcache_page(bvec.bv_page);
cur_offset += copy_len;
bio_advance(orig_bio, copy_len);
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 2707404389a5..1aa02903de69 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -30,8 +30,8 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
#define BTRFS_ZLIB_DEFAULT_LEVEL 3
struct compressed_bio {
- /* Number of sectors with unfinished IO (unsubmitted or unfinished) */
- refcount_t pending_sectors;
+ /* Number of outstanding bios */
+ refcount_t pending_ios;
/* Number of compressed pages in the array */
unsigned int nr_pages;
@@ -59,16 +59,12 @@ struct compressed_bio {
/* IO errors */
blk_status_t status;
- int mirror_num;
- /* for reads, this is the bio we are copying the data into */
- struct bio *orig_bio;
-
- /*
- * the start of a variable length array of checksums only
- * used by reads
- */
- u8 sums[];
+ union {
+ /* For reads, this is the bio we are copying the data into */
+ struct bio *orig_bio;
+ struct work_struct write_end_work;
+ };
};
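
Since a compressed_bio is used either for a read (which needs orig_bio) or for a write (which needs write_end_work), the two members can share storage. A tiny stand-alone comparison of the two layouts; the structs below are stripped-down stand-ins, not the real definitions:

#include <stdio.h>

struct bio;				/* opaque here */
struct work_struct { void *func; void *data; };

struct cb_with_union {
	union {
		struct bio *orig_bio;			/* reads */
		struct work_struct write_end_work;	/* writes */
	};
};

struct cb_without_union {
	struct bio *orig_bio;
	struct work_struct write_end_work;
};

int main(void)
{
	printf("with union:    %zu bytes\n", sizeof(struct cb_with_union));
	printf("without union: %zu bytes\n", sizeof(struct cb_without_union));
	return 0;
}
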
static inline unsigned int btrfs_compress_type(unsigned int type_level)
@@ -99,7 +95,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned int compressed_len,
struct page **compressed_pages,
unsigned int nr_pages,
- unsigned int write_flags,
+ blk_opf_t write_flags,
struct cgroup_subsys_state *blkcg_css,
bool writeback);
void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6e556031a8f3..a9543f01184c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -114,6 +114,22 @@ noinline void btrfs_release_path(struct btrfs_path *p)
}
/*
+ * We want the transaction abort to print a stack trace only for errors where
+ * the cause could be a bug, e.g. due to ENOSPC, and not for common errors
+ * that are caused by external factors.
+ */
+bool __cold abort_should_print_stack(int errno)
+{
+ switch (errno) {
+ case -EIO:
+ case -EROFS:
+ case -ENOMEM:
+ return false;
+ }
+ return true;
+}
+
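
abort_should_print_stack() is just an allow/deny classifier: the listed errors are treated as expected external failures and stay quiet, while anything else (for example -ENOSPC, which may indicate a reservation bug) gets a stack trace. A quick user-space check of the same switch pattern, with arbitrarily chosen sample errors:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Same shape as abort_should_print_stack(): listed errors are "expected". */
static bool should_print_stack(int err)
{
	switch (err) {
	case -EIO:
	case -EROFS:
	case -ENOMEM:
		return false;
	}
	return true;
}

int main(void)
{
	int samples[] = { -EIO, -EROFS, -ENOMEM, -ENOSPC, -EPERM };

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%d -> %s\n", samples[i],
		       should_print_stack(samples[i]) ? "stack" : "quiet");
	return 0;
}
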
+/*
* safely gets a reference on the root node of a tree. A lock
* is not taken, so a concurrent writer may put a different node
* at the root of the tree. See btrfs_lock_root_node for the
@@ -1447,6 +1463,11 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
return 0;
}
+ if (p->nowait) {
+ free_extent_buffer(tmp);
+ return -EAGAIN;
+ }
+
if (unlock_up)
btrfs_unlock_up_safe(p, level + 1);
@@ -1467,6 +1488,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
ret = -EAGAIN;
goto out;
+ } else if (p->nowait) {
+ return -EAGAIN;
}
if (unlock_up) {
@@ -1634,7 +1657,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
* We don't know the level of the root node until we actually
* have it read locked
*/
- b = btrfs_read_lock_root_node(root);
+ if (p->nowait) {
+ b = btrfs_try_read_lock_root_node(root);
+ if (IS_ERR(b))
+ return b;
+ } else {
+ b = btrfs_read_lock_root_node(root);
+ }
level = btrfs_header_level(b);
if (level > write_lock_level)
goto out;
@@ -1910,6 +1939,13 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
WARN_ON(p->nodes[0] != NULL);
BUG_ON(!cow && ins_len);
+ /*
+ * For now only allow nowait for read-only operations. There's no strict
+ * reason we couldn't support it for writes, we simply only need it for
+ * reads, so that is all that is implemented.
+ */
+ ASSERT(!p->nowait || !cow);
+
if (ins_len < 0) {
lowest_unlock = 2;
@@ -1936,7 +1972,12 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (p->need_commit_sem) {
ASSERT(p->search_commit_root);
- down_read(&fs_info->commit_root_sem);
+ if (p->nowait) {
+ if (!down_read_trylock(&fs_info->commit_root_sem))
+ return -EAGAIN;
+ } else {
+ down_read(&fs_info->commit_root_sem);
+ }
}
again:
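
The nowait flag turns every blocking lock acquisition along the search path into a trylock that fails with -EAGAIN, so a caller that must not sleep (e.g. a NOWAIT read issued through io_uring) can fall back to a context where blocking is allowed. A user-space analogue of that pattern with a pthread rwlock; the helper names are illustrative only:

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t commit_root_sem = PTHREAD_RWLOCK_INITIALIZER;

/* nowait == true: never block, report -EAGAIN so the caller can retry later. */
static int search(bool nowait)
{
	if (nowait) {
		if (pthread_rwlock_tryrdlock(&commit_root_sem) != 0)
			return -EAGAIN;
	} else {
		pthread_rwlock_rdlock(&commit_root_sem);
	}

	/* ... do the read-only search under the lock ... */

	pthread_rwlock_unlock(&commit_root_sem);
	return 0;
}

int main(void)
{
	/* Hold the lock as a writer so the nowait path has to give up. */
	pthread_rwlock_wrlock(&commit_root_sem);
	printf("nowait search: %d (expect %d)\n", search(true), -EAGAIN);
	pthread_rwlock_unlock(&commit_root_sem);

	printf("blocking search: %d\n", search(false));
	return 0;
}
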
@@ -2075,11 +2116,22 @@ cow_done:
if (!p->skip_locking) {
level = btrfs_header_level(b);
+
+ btrfs_maybe_reset_lockdep_class(root, b);
+
if (level <= write_lock_level) {
btrfs_tree_lock(b);
p->locks[level] = BTRFS_WRITE_LOCK;
} else {
- btrfs_tree_read_lock(b);
+ if (p->nowait) {
+ if (!btrfs_try_tree_read_lock(b)) {
+ free_extent_buffer(b);
+ ret = -EAGAIN;
+ goto done;
+ }
+ } else {
+ btrfs_tree_read_lock(b);
+ }
p->locks[level] = BTRFS_READ_LOCK;
}
p->nodes[level] = b;
@@ -2128,6 +2180,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
lowest_level = p->lowest_level;
WARN_ON(p->nodes[0] != NULL);
+ ASSERT(!p->nowait);
if (p->search_commit_root) {
BUG_ON(time_seq);
@@ -4429,6 +4482,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
int ret = 1;
int keep_locks = path->keep_locks;
+ ASSERT(!path->nowait);
path->keep_locks = 1;
again:
cur = btrfs_read_lock_root_node(root);
@@ -4609,6 +4663,8 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
int ret;
int i;
+ ASSERT(!path->nowait);
+
nritems = btrfs_header_nritems(path->nodes[0]);
if (nritems == 0)
return 1;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0e49b1a0c071..9e6d48ff4597 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -42,7 +42,6 @@ struct btrfs_delayed_ref_root;
struct btrfs_space_info;
struct btrfs_block_group;
extern struct kmem_cache *btrfs_trans_handle_cachep;
-extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;
extern struct kmem_cache *btrfs_free_space_cachep;
extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
@@ -50,6 +49,11 @@ struct btrfs_ordered_sum;
struct btrfs_ref;
struct btrfs_bio;
struct btrfs_ioctl_encoded_io_args;
+struct btrfs_device;
+struct btrfs_fs_devices;
+struct btrfs_balance_control;
+struct btrfs_delayed_root;
+struct reloc_control;
#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
@@ -107,14 +111,6 @@ struct btrfs_ioctl_encoded_io_args;
#define BTRFS_STAT_CURR 0
#define BTRFS_STAT_PREV 1
-/*
- * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size
- */
-static inline u32 count_max_extents(u64 size)
-{
- return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
-}
-
static inline unsigned long btrfs_chunk_item_size(int num_stripes)
{
BUG_ON(num_stripes == 0);
@@ -230,6 +226,13 @@ struct btrfs_root_backup {
#define BTRFS_SUPER_INFO_SIZE 4096
/*
+ * The reserved space at the beginning of each device.
+ * It covers the primary super block, leaves space for potential use by other
+ * tools such as bootloaders, and lowers the potential damage from an
+ * accidental overwrite.
+ */
+#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M)
+
+/*
* the super block basically lists the main trees of the FS
* it currently lacks any block count etc etc
*/
@@ -248,8 +251,12 @@ struct btrfs_super_block {
__le64 chunk_root;
__le64 log_root;
- /* this will help find the new super based on the log root */
- __le64 log_root_transid;
+ /*
+ * This member has never been used since the very beginning, thus it's
+ * always 0 regardless of kernel version. We always use generation + 1 to
+ * read the log tree root, so mark it deprecated here.
+ */
+ __le64 __unused_log_root_transid;
__le64 total_bytes;
__le64 bytes_used;
__le64 root_dir_objectid;
@@ -277,14 +284,9 @@ struct btrfs_super_block {
/* the UUID written into btree blocks */
u8 metadata_uuid[BTRFS_FSID_SIZE];
- /* Extent tree v2 */
- __le64 block_group_root;
- __le64 block_group_root_generation;
- u8 block_group_root_level;
-
/* future expansion */
- u8 reserved8[7];
- __le64 reserved[25];
+ u8 reserved8[8];
+ __le64 reserved[27];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
@@ -304,7 +306,8 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
#define BTRFS_FEATURE_COMPAT_RO_SUPP \
(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \
BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
- BTRFS_FEATURE_COMPAT_RO_VERITY)
+ BTRFS_FEATURE_COMPAT_RO_VERITY | \
+ BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE)
#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
@@ -440,9 +443,10 @@ struct btrfs_path {
* header (ie. sizeof(struct btrfs_item) is not included).
*/
unsigned int search_for_extension:1;
+ /* Stop search if any locks need to be taken (for read) */
+ unsigned int nowait:1;
};
-#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
- sizeof(struct btrfs_item))
+
struct btrfs_dev_replace {
u64 replace_state; /* see #define above */
time64_t time_started; /* seconds since 1-Jan-1970 */
@@ -499,22 +503,6 @@ struct btrfs_free_cluster {
struct list_head block_group_list;
};
-enum btrfs_caching_type {
- BTRFS_CACHE_NO,
- BTRFS_CACHE_STARTED,
- BTRFS_CACHE_FAST,
- BTRFS_CACHE_FINISHED,
- BTRFS_CACHE_ERROR,
-};
-
-/*
- * Tree to record all locked full stripes of a RAID5/6 block group
- */
-struct btrfs_full_stripe_locks_tree {
- struct rb_root root;
- struct mutex lock;
-};
-
/* Discard control. */
/*
* Async discard uses multiple lists to differentiate the discard filter
@@ -546,42 +534,6 @@ struct btrfs_discard_ctl {
atomic64_t discard_bytes_saved;
};
-void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
-
-/* fs_info */
-struct reloc_control;
-struct btrfs_device;
-struct btrfs_fs_devices;
-struct btrfs_balance_control;
-struct btrfs_delayed_root;
-
-/*
- * Block group or device which contains an active swapfile. Used for preventing
- * unsafe operations while a swapfile is active.
- *
- * These are sorted on (ptr, inode) (note that a block group or device can
- * contain more than one swapfile). We compare the pointer values because we
- * don't actually care what the object is, we just need a quick check whether
- * the object exists in the rbtree.
- */
-struct btrfs_swapfile_pin {
- struct rb_node node;
- void *ptr;
- struct inode *inode;
- /*
- * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr
- * points to a struct btrfs_device.
- */
- bool is_block_group;
- /*
- * Only used when 'is_block_group' is true and it is the number of
- * extents used by a swapfile for this block group ('ptr' field).
- */
- int bg_extent_count;
-};
-
-bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
-
enum {
BTRFS_FS_CLOSING_START,
BTRFS_FS_CLOSING_DONE,
@@ -635,6 +587,9 @@ enum {
/* Indicate we have half completed snapshot deletions pending. */
BTRFS_FS_UNFINISHED_DROPS,
+ /* Indicate we have to finish a zone to do next allocation. */
+ BTRFS_FS_NEED_ZONE_FINISH,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
@@ -656,6 +611,18 @@ enum btrfs_exclusive_operation {
BTRFS_EXCLOP_SWAP_ACTIVATE,
};
+/* Store data about transaction commits, exported via sysfs. */
+struct btrfs_commit_stats {
+ /* Total number of commits */
+ u64 commit_count;
+ /* The maximum commit duration so far in ns */
+ u64 max_commit_dur;
+ /* The last commit duration in ns */
+ u64 last_commit_dur;
+ /* The total commit duration in ns */
+ u64 total_commit_dur;
+};
+
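
Keeping these counters up to date is simple bookkeeping per commit: bump the count, record the last duration, track the maximum, and accumulate the total. A stand-alone sketch of that update, with clock_gettime() standing in for the kernel's ktime helpers:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct commit_stats {
	uint64_t commit_count;
	uint64_t max_commit_dur;	/* ns */
	uint64_t last_commit_dur;	/* ns */
	uint64_t total_commit_dur;	/* ns */
};

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static void update_commit_stats(struct commit_stats *stats, uint64_t start_ns)
{
	uint64_t dur = now_ns() - start_ns;

	stats->commit_count++;
	stats->last_commit_dur = dur;
	if (dur > stats->max_commit_dur)
		stats->max_commit_dur = dur;
	stats->total_commit_dur += dur;
}

int main(void)
{
	struct commit_stats stats = { 0 };
	uint64_t start = now_ns();

	/* ... the commit work would happen here ... */
	update_commit_stats(&stats, start);
	printf("commits=%llu last=%lluns\n",
	       (unsigned long long)stats.commit_count,
	       (unsigned long long)stats.last_commit_dur);
	return 0;
}
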
struct btrfs_fs_info {
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
@@ -675,9 +642,8 @@ struct btrfs_fs_info {
rwlock_t global_root_lock;
struct rb_root global_root_tree;
- /* The xarray that holds all the FS roots */
- spinlock_t fs_roots_lock;
- struct xarray fs_roots;
+ spinlock_t fs_roots_radix_lock;
+ struct radix_tree_root fs_roots_radix;
/* block group cache stuff */
rwlock_t block_group_cache_lock;
@@ -851,11 +817,11 @@ struct btrfs_fs_info {
struct btrfs_workqueue *hipri_workers;
struct btrfs_workqueue *delalloc_workers;
struct btrfs_workqueue *flush_workers;
- struct btrfs_workqueue *endio_workers;
- struct btrfs_workqueue *endio_meta_workers;
- struct btrfs_workqueue *endio_raid56_workers;
+ struct workqueue_struct *endio_workers;
+ struct workqueue_struct *endio_meta_workers;
+ struct workqueue_struct *endio_raid56_workers;
struct workqueue_struct *rmw_workers;
- struct btrfs_workqueue *endio_meta_write_workers;
+ struct workqueue_struct *compressed_write_workers;
struct btrfs_workqueue *endio_write_workers;
struct btrfs_workqueue *endio_freespace_worker;
struct btrfs_workqueue *caching_workers;
@@ -874,6 +840,7 @@ struct btrfs_fs_info {
struct kobject *space_info_kobj;
struct kobject *qgroups_kobj;
+ struct kobject *discard_kobj;
/* used to keep from writing metadata until there is a nice batch */
struct percpu_counter dirty_metadata_bytes;
@@ -989,16 +956,17 @@ struct btrfs_fs_info {
struct completion qgroup_rescan_completion;
struct btrfs_work qgroup_rescan_work;
bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */
+ u8 qgroup_drop_subtree_thres;
/* filesystem state */
unsigned long fs_state;
struct btrfs_delayed_root *delayed_root;
- /* Extent buffer xarray */
+ /* Extent buffer radix tree */
spinlock_t buffer_lock;
/* Entries are eb->start / sectorsize */
- struct xarray extent_buffers;
+ struct radix_tree_root buffer_radix;
/* next backup root to be overwritten */
int backup_root_index;
@@ -1033,6 +1001,12 @@ struct btrfs_fs_info {
u32 csums_per_leaf;
u32 stripesize;
+ /*
+ * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on a regular
+ * filesystem, on zoned filesystems it depends on the device constraints.
+ */
+ u64 max_extent_size;
+
/* Block groups and devices containing active swapfiles. */
spinlock_t swapfile_pins_lock;
struct rb_root swapfile_pins;
@@ -1048,6 +1022,8 @@ struct btrfs_fs_info {
*/
u64 zone_size;
+ /* Max size to emit ZONE_APPEND write command */
+ u64 max_zone_append_size;
struct mutex zoned_meta_io_lock;
spinlock_t treelog_bg_lock;
u64 treelog_bg;
@@ -1065,6 +1041,26 @@ struct btrfs_fs_info {
spinlock_t zone_active_bgs_lock;
struct list_head zone_active_bgs;
+ /* Updates are not protected by any lock */
+ struct btrfs_commit_stats commit_stats;
+
+ /*
+ * Last generation where we dropped a non-relocation root.
+ * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen()
+ * to change it and to read it, respectively.
+ */
+ u64 last_root_drop_gen;
+
+ /*
+ * Annotations for transaction events (structures are empty when
+ * compiled without lockdep).
+ */
+ struct lockdep_map btrfs_trans_num_writers_map;
+ struct lockdep_map btrfs_trans_num_extwriters_map;
+ struct lockdep_map btrfs_state_change_map[4];
+ struct lockdep_map btrfs_trans_pending_ordered_map;
+ struct lockdep_map btrfs_ordered_extent_map;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -1072,7 +1068,6 @@ struct btrfs_fs_info {
#ifdef CONFIG_BTRFS_DEBUG
struct kobject *debug_kobj;
- struct kobject *discard_debug_kobj;
struct list_head allocated_roots;
spinlock_t eb_leak_lock;
@@ -1080,12 +1075,85 @@ struct btrfs_fs_info {
#endif
};
+static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info,
+ u64 gen)
+{
+ WRITE_ONCE(fs_info->last_root_drop_gen, gen);
+}
+
+static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info)
+{
+ return READ_ONCE(fs_info->last_root_drop_gen);
+}
+
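
last_root_drop_gen is read and written without a lock, so the accessors use WRITE_ONCE()/READ_ONCE() to keep each access a single, untorn load or store. A user-space analogue using relaxed C11 atomics, which give the same no-tearing/no-fusing guarantee (and, like the kernel macros, no ordering guarantee):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Relaxed atomics stand in for READ_ONCE()/WRITE_ONCE(); not kernel code. */
static _Atomic uint64_t last_root_drop_gen;

static void set_last_root_drop_gen(uint64_t gen)
{
	atomic_store_explicit(&last_root_drop_gen, gen, memory_order_relaxed);
}

static uint64_t get_last_root_drop_gen(void)
{
	return atomic_load_explicit(&last_root_drop_gen, memory_order_relaxed);
}

int main(void)
{
	set_last_root_drop_gen(42);
	printf("gen=%llu\n", (unsigned long long)get_last_root_drop_gen());
	return 0;
}
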
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
return sb->s_fs_info;
}
/*
+ * Take the number of bytes to be checksummed and figure out how many leaves
+ * it would require to store the csums for that many bytes.
+ */
+static inline u64 btrfs_csum_bytes_to_leaves(
+ const struct btrfs_fs_info *fs_info, u64 csum_bytes)
+{
+ const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits;
+
+ return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf);
+}
+
+/*
+ * Use this if we would be adding new items, as we could split nodes as we cow
+ * down the tree.
+ */
+static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
+ unsigned num_items)
+{
+ return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
+}
+
+/*
+ * Doing a truncate or a modification won't result in new nodes or leaves, just
+ * what we need for COW.
+ */
+static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
+ unsigned num_items)
+{
+ return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
+}
+
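
These helpers convert byte counts into worst-case metadata reservations: checksum bytes become a number of csum leaves, and item counts become nodesize * BTRFS_MAX_LEVEL (doubled when inserts may split nodes). A stand-alone recomputation with assumed example geometry; the csums_per_leaf value below is made up for illustration:

#include <stdint.h>
#include <stdio.h>

#define BTRFS_MAX_LEVEL 8
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* Example geometry, not read from a real filesystem. */
	const uint64_t nodesize = 16 * 1024;
	const unsigned int sectorsize_bits = 12;	/* 4K sectors */
	const uint64_t csums_per_leaf = 400;		/* assumed value */

	uint64_t csum_bytes = 1ULL << 20;		/* 1 MiB of checksummed data */
	uint64_t num_csums = csum_bytes >> sectorsize_bits;
	uint64_t csum_leaves = DIV_ROUND_UP(num_csums, csums_per_leaf);

	unsigned int num_items = 3;
	uint64_t insert_size = nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
	uint64_t cow_size = nodesize * BTRFS_MAX_LEVEL * num_items;

	printf("csum leaves for 1 MiB: %llu\n", (unsigned long long)csum_leaves);
	printf("insert reservation:    %llu bytes\n", (unsigned long long)insert_size);
	printf("cow-only reservation:  %llu bytes\n", (unsigned long long)cow_size);
	return 0;
}
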
+#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
+ sizeof(struct btrfs_item))
+
+static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
+{
+ return fs_info->zone_size > 0;
+}
+
+/*
+ * Count how many fs_info->max_extent_size units are needed to cover @size
+ */
+static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
+{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (!fs_info)
+ return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
+#endif
+
+ return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
+}
+
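
count_max_extents() is a round-up division by the per-filesystem extent size limit, which is BTRFS_MAX_EXTENT_SIZE on regular filesystems but may be smaller on zoned devices. A quick stand-alone check of the arithmetic; div_u64() is replaced by plain 64-bit division and the zoned limit is just an example value:

#include <stdint.h>
#include <stdio.h>

static uint32_t count_max_extents(uint64_t max_extent_size, uint64_t size)
{
	return (uint32_t)((size + max_extent_size - 1) / max_extent_size);
}

int main(void)
{
	const uint64_t regular = 128ULL << 20;	/* BTRFS_MAX_EXTENT_SIZE */
	const uint64_t zoned = 64ULL << 20;	/* example zone-append limit */
	const uint64_t size = 300ULL << 20;	/* 300 MiB of delalloc */

	printf("regular: %u extents\n", count_max_extents(regular, size));	/* 3 */
	printf("zoned:   %u extents\n", count_max_extents(zoned, size));	/* 5 */
	return 0;
}
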
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation op);
+
+/*
* The state of btrfs root
*/
enum {
@@ -1119,8 +1187,7 @@ enum {
*/
BTRFS_ROOT_SHAREABLE,
BTRFS_ROOT_TRACK_DIRTY,
- /* The root is tracked in fs_info::fs_roots */
- BTRFS_ROOT_REGISTERED,
+ BTRFS_ROOT_IN_RADIX,
BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
BTRFS_ROOT_DEFRAG_RUNNING,
BTRFS_ROOT_FORCE_COW,
@@ -1144,8 +1211,86 @@ enum {
BTRFS_ROOT_ORPHAN_CLEANUP,
/* This root has a drop operation that was started previously. */
BTRFS_ROOT_UNFINISHED_DROP,
+ /* This reloc root needs to have its buffers lockdep class reset. */
+ BTRFS_ROOT_RESET_LOCKDEP_CLASS,
+};
+
+enum btrfs_lockdep_trans_states {
+ BTRFS_LOCKDEP_TRANS_COMMIT_START,
+ BTRFS_LOCKDEP_TRANS_UNBLOCKED,
+ BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED,
+ BTRFS_LOCKDEP_TRANS_COMPLETED,
};
+/*
+ * Lockdep annotation for wait events.
+ *
+ * @owner: The struct where the lockdep map is defined
+ * @lock: The lockdep map corresponding to a wait event
+ *
+ * This macro is used to annotate a wait event. In this case a thread acquires
+ * the lockdep map as writer (exclusive lock) because it has to block until all
+ * the threads that hold the lock as readers signal the condition for the wait
+ * event and release their locks.
+ */
+#define btrfs_might_wait_for_event(owner, lock) \
+ do { \
+ rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \
+ rwsem_release(&owner->lock##_map, _THIS_IP_); \
+ } while (0)
+
+/*
+ * Protection for the resource/condition of a wait event.
+ *
+ * @owner: The struct where the lockdep map is defined
+ * @lock: The lockdep map corresponding to a wait event
+ *
+ * Many threads can modify the condition for the wait event at the same time
+ * and signal the threads that block on the wait event. The threads that modify
+ * the condition and do the signaling acquire the lock as readers (shared
+ * lock).
+ */
+#define btrfs_lockdep_acquire(owner, lock) \
+ rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_)
+
+/*
+ * Used after signaling the condition for a wait event to release the lockdep
+ * map held by a reader thread.
+ */
+#define btrfs_lockdep_release(owner, lock) \
+ rwsem_release(&owner->lock##_map, _THIS_IP_)
+
+/*
+ * Macros for the transaction states wait events, similar to the generic wait
+ * event macros.
+ */
+#define btrfs_might_wait_for_state(owner, i) \
+ do { \
+ rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \
+ rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \
+ } while (0)
+
+#define btrfs_trans_state_lockdep_acquire(owner, i) \
+ rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_)
+
+#define btrfs_trans_state_lockdep_release(owner, i) \
+ rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_)
+
+/* Initialization of the lockdep map */
+#define btrfs_lockdep_init_map(owner, lock) \
+ do { \
+ static struct lock_class_key lock##_key; \
+ lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \
+ } while (0)
+
+/* Initialization of the transaction states lockdep maps. */
+#define btrfs_state_lockdep_init_map(owner, lock, state) \
+ do { \
+ static struct lock_class_key lock##_key; \
+ lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \
+ &lock##_key, 0); \
+ } while (0)
+
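
The annotation models a wait event as a read-write lock: every thread that may still need to signal the condition holds the map as a reader, and the waiter briefly takes it as a writer, so lockdep records the "writer must wait for all readers" dependency. The user-space analogue below shows the same shape with a pthread rwlock, the difference being that the rwlock really blocks while the lockdep maps only record dependencies:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t trans_num_writers_map = PTHREAD_RWLOCK_INITIALIZER;

/* A potential signaler: holds the map as a reader while it may still change
 * the wait condition (compare btrfs_lockdep_acquire()/btrfs_lockdep_release()).
 */
static void *writer_task(void *arg)
{
	(void)arg;
	pthread_rwlock_rdlock(&trans_num_writers_map);
	usleep(1000);				/* ... transaction work ... */
	pthread_rwlock_unlock(&trans_num_writers_map);
	return NULL;
}

/* The waiter: take and drop the map as a writer, which only succeeds once no
 * reader (potential signaler) holds it (compare btrfs_might_wait_for_event()).
 */
static void might_wait_for_event(void)
{
	pthread_rwlock_wrlock(&trans_num_writers_map);
	pthread_rwlock_unlock(&trans_num_writers_map);
}

int main(void)
{
	pthread_t t[4];

	for (int i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, writer_task, NULL);

	might_wait_for_event();
	printf("waiter proceeded\n");

	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	return 0;
}
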
static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
{
clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
@@ -1224,10 +1369,10 @@ struct btrfs_root {
struct rb_root inode_tree;
/*
- * Xarray that keeps track of delayed nodes of every inode, protected
- * by inode_lock
+ * radix tree that keeps track of delayed nodes of every inode,
+ * protected by inode_lock
*/
- struct xarray delayed_nodes;
+ struct radix_tree_root delayed_nodes_tree;
/*
* right now this just gets used so that a root has its own devid
* for stat. It may be used for more later
@@ -1330,6 +1475,8 @@ struct btrfs_replace_extent_info {
* existing extent into a file range.
*/
bool is_new_extent;
+ /* Indicate if we should update the inode's mtime and ctime. */
+ bool update_times;
/* Meaningful only if is_new_extent is true. */
int qgroup_reserved;
/*
@@ -2361,17 +2508,6 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
num_devices, 64);
-/*
- * For extent tree v2 we overload the extent root with the block group root, as
- * we will have multiple extent roots.
- */
-BTRFS_SETGET_STACK_FUNCS(backup_block_group_root, struct btrfs_root_backup,
- extent_root, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_gen, struct btrfs_root_backup,
- extent_root_gen, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_level,
- struct btrfs_root_backup, extent_root_level, 8);
-
/* struct btrfs_balance_item */
BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
@@ -2475,8 +2611,6 @@ BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
chunk_root_level, 8);
BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
log_root, 64);
-BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
- log_root_transid, 64);
BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
log_root_level, 8);
BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
@@ -2506,13 +2640,6 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
uuid_tree_generation, 64);
-BTRFS_SETGET_STACK_FUNCS(super_block_group_root, struct btrfs_super_block,
- block_group_root, 64);
-BTRFS_SETGET_STACK_FUNCS(super_block_group_root_generation,
- struct btrfs_super_block,
- block_group_root_generation, 64);
-BTRFS_SETGET_STACK_FUNCS(super_block_group_root_level, struct btrfs_super_block,
- block_group_root_level, 8);
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
@@ -2733,37 +2860,6 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
-/*
- * Take the number of bytes to be checksummmed and figure out how many leaves
- * it would require to store the csums for that many bytes.
- */
-static inline u64 btrfs_csum_bytes_to_leaves(
- const struct btrfs_fs_info *fs_info, u64 csum_bytes)
-{
- const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits;
-
- return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf);
-}
-
-/*
- * Use this if we would be adding new items, as we could split nodes as we cow
- * down the tree.
- */
-static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
- unsigned num_items)
-{
- return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
-}
-
-/*
- * Doing a truncate or a modification won't result in new nodes or leaves, just
- * what we need for COW.
- */
-static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
- unsigned num_items)
-{
- return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
-}
int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 num_bytes);
@@ -3221,12 +3317,9 @@ int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst);
-int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 objectid, u64 pos,
- u64 disk_offset, u64 disk_num_bytes,
- u64 num_bytes, u64 offset, u64 ram_bytes,
- u8 compression, u8 encryption, u16 other_encoding);
+int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid, u64 pos,
+ u64 num_bytes);
int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
@@ -3237,7 +3330,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
u64 offset, bool one_ordered);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
- struct list_head *list, int search_commit);
+ struct list_head *list, int search_commit,
+ bool nowait);
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const struct btrfs_path *path,
struct btrfs_file_extent_item *fi,
@@ -3251,16 +3345,21 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
-void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
- int mirror_num, enum btrfs_compression_type compress_type);
+void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num);
+void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, enum btrfs_compression_type compress_type);
+int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
+ u32 pgoff, u8 *csum, const u8 * const csum_expected);
+int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page, u32 pgoff);
unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
u32 bio_offset, struct page *page,
u64 start, u64 end);
-struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
- u64 start, u64 len);
+int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page, u32 pgoff);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes, bool strict);
+ u64 *ram_bytes, bool nowait, bool strict);
void __btrfs_del_delalloc_inode(struct btrfs_root *root,
struct btrfs_inode *inode);
@@ -3305,9 +3404,9 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args);
struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
struct inode *dir);
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
- unsigned *bits);
+ u32 bits);
void btrfs_clear_delalloc_extent(struct inode *inode,
- struct extent_state *state, unsigned *bits);
+ struct extent_state *state, u32 bits);
void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
struct extent_state *other);
void btrfs_split_delalloc_extent(struct inode *inode,
@@ -3315,7 +3414,6 @@ void btrfs_split_delalloc_extent(struct inode *inode,
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
void btrfs_evict_inode(struct inode *inode);
-int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
void btrfs_free_inode(struct inode *inode);
@@ -3353,12 +3451,21 @@ int btrfs_writepage_cow_fixup(struct page *page);
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
u64 end, bool uptodate);
+int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
+ int compress_type);
+int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+ u64 file_offset, u64 disk_bytenr,
+ u64 disk_io_size,
+ struct page **pages);
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
struct btrfs_ioctl_encoded_io_args *encoded);
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded);
-ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before);
+ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before);
+struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before);
extern const struct dentry_operations btrfs_dentry_operations;
@@ -3390,15 +3497,6 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
-bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation type);
-bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation type);
-void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
-void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
-void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation op);
-
/* file.c */
int __init btrfs_auto_defrag_init(void);
@@ -3408,8 +3506,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
-void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
- int skip_pinned);
extern const struct file_operations btrfs_file_operations;
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
@@ -3429,8 +3525,10 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
struct extent_state **cached, bool noreserve);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes);
+ size_t *write_bytes, bool nowait);
void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
+bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
+ u64 *delalloc_start_ret, u64 *delalloc_end_ret);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -3696,21 +3794,26 @@ const char * __attribute_const__ btrfs_decode_error(int errno);
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
- unsigned int line, int errno);
+ unsigned int line, int errno, bool first_hit);
+
+bool __cold abort_should_print_stack(int errno);
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
- * detected, that way the exact line number is reported.
+ * detected, that way the exact stack trace is reported for some errors.
*/
#define btrfs_abort_transaction(trans, errno) \
do { \
+ bool first = false; \
/* Report first abort since mount */ \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
- if ((errno) != -EIO && (errno) != -EROFS) { \
- WARN(1, KERN_DEBUG \
+ first = true; \
+ if (WARN(abort_should_print_stack(errno), \
+ KERN_DEBUG \
"BTRFS: Transaction aborted (error %d)\n", \
- (errno)); \
+ (errno))) { \
+ /* Stack trace printed. */ \
} else { \
btrfs_debug((trans)->fs_info, \
"Transaction aborted (error %d)", \
@@ -3718,7 +3821,7 @@ do { \
} \
} \
__btrfs_abort_transaction((trans), __func__, \
- __LINE__, (errno)); \
+ __LINE__, (errno), first); \
} while (0)
#ifdef CONFIG_PRINTK_INDEX
@@ -3935,16 +4038,9 @@ int btrfs_scrub_cancel(struct btrfs_fs_info *info);
int btrfs_scrub_cancel_dev(struct btrfs_device *dev);
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_scrub_progress *progress);
-static inline void btrfs_init_full_stripe_locks_tree(
- struct btrfs_full_stripe_locks_tree *locks_root)
-{
- locks_root->root = RB_ROOT;
- mutex_init(&locks_root->lock);
-}
/* dev-replace.c */
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
-void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
@@ -3971,6 +4067,7 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
extern const struct fsverity_operations btrfs_verityops;
int btrfs_drop_verity_items(struct btrfs_inode *inode);
+int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size);
BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item,
encryption, 8);
@@ -3988,6 +4085,12 @@ static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
return 0;
}
+static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf,
+ size_t buf_size)
+{
+ return -EPERM;
+}
+
#endif
/* Sanity test specific functions */
@@ -4004,11 +4107,6 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
}
#endif
-static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
-{
- return fs_info->zone_size > 0;
-}
-
static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
{
return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 36ab0859a263..118b2e20b2e1 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -127,9 +127,11 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
}
int btrfs_check_data_free_space(struct btrfs_inode *inode,
- struct extent_changeset **reserved, u64 start, u64 len)
+ struct extent_changeset **reserved, u64 start,
+ u64 len, bool noflush)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
int ret;
/* align the range */
@@ -137,7 +139,12 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode,
round_down(start, fs_info->sectorsize);
start = round_down(start, fs_info->sectorsize);
- ret = btrfs_alloc_data_chunk_ondemand(inode, len);
+ if (noflush)
+ flush = BTRFS_RESERVE_NO_FLUSH;
+ else if (btrfs_is_free_space_inode(inode))
+ flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
+
+ ret = btrfs_reserve_data_bytes(fs_info, len, flush);
if (ret < 0)
return ret;
@@ -273,7 +280,7 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
u64 num_bytes, u64 disk_num_bytes,
u64 *meta_reserve, u64 *qgroup_reserve)
{
- u64 nr_extents = count_max_extents(num_bytes);
+ u64 nr_extents = count_max_extents(fs_info, num_bytes);
u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
@@ -350,7 +357,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
* needs to free the reservation we just made.
*/
spin_lock(&inode->lock);
- nr_extents = count_max_extents(num_bytes);
+ nr_extents = count_max_extents(fs_info, num_bytes);
btrfs_mod_outstanding_extents(inode, nr_extents);
inode->csum_bytes += disk_num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
@@ -413,7 +420,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
unsigned num_extents;
spin_lock(&inode->lock);
- num_extents = count_max_extents(num_bytes);
+ num_extents = count_max_extents(fs_info, num_bytes);
btrfs_mod_outstanding_extents(inode, -num_extents);
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
@@ -454,7 +461,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
{
int ret;
- ret = btrfs_check_data_free_space(inode, reserved, start, len);
+ ret = btrfs_check_data_free_space(inode, reserved, start, len, false);
if (ret < 0)
return ret;
ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
index 28bf5c3ef430..e07d46043455 100644
--- a/fs/btrfs/delalloc-space.h
+++ b/fs/btrfs/delalloc-space.h
@@ -7,7 +7,8 @@ struct extent_changeset;
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
int btrfs_check_data_free_space(struct btrfs_inode *inode,
- struct extent_changeset **reserved, u64 start, u64 len);
+ struct extent_changeset **reserved, u64 start, u64 len,
+ bool noflush);
void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
void btrfs_delalloc_release_space(struct btrfs_inode *inode,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 66779ab3ed4a..cac5169eaf8d 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -52,18 +52,6 @@ static inline void btrfs_init_delayed_node(
INIT_LIST_HEAD(&delayed_node->p_list);
}
-static inline int btrfs_is_continuous_delayed_item(
- struct btrfs_delayed_item *item1,
- struct btrfs_delayed_item *item2)
-{
- if (item1->key.type == BTRFS_DIR_INDEX_KEY &&
- item1->key.objectid == item2->key.objectid &&
- item1->key.type == item2->key.type &&
- item1->key.offset + 1 == item2->key.offset)
- return 1;
- return 0;
-}
-
static struct btrfs_delayed_node *btrfs_get_delayed_node(
struct btrfs_inode *btrfs_inode)
{
@@ -78,7 +66,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
}
spin_lock(&root->inode_lock);
- node = xa_load(&root->delayed_nodes, ino);
+ node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
if (node) {
if (btrfs_inode->delayed_node) {
@@ -90,9 +78,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
/*
* It's possible that we're racing into the middle of removing
- * this node from the xarray. In this case, the refcount
+ * this node from the radix tree. In this case, the refcount
* was zero and it should never go back to one. Just return
- * NULL like it was never in the xarray at all; our release
+ * NULL like it was never in the radix at all; our release
* function is in the process of removing it.
*
* Some implementations of refcount_inc refuse to bump the
@@ -100,7 +88,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
* here, refcount_inc() may decide to just WARN_ONCE() instead
* of actually bumping the refcount.
*
- * If this node is properly in the xarray, we want to bump the
+ * If this node is properly in the radix, we want to bump the
* refcount twice, once for the inode and once for this get
* operation.
*/
@@ -128,30 +116,36 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
u64 ino = btrfs_ino(btrfs_inode);
int ret;
- do {
- node = btrfs_get_delayed_node(btrfs_inode);
- if (node)
- return node;
+again:
+ node = btrfs_get_delayed_node(btrfs_inode);
+ if (node)
+ return node;
- node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
- if (!node)
- return ERR_PTR(-ENOMEM);
- btrfs_init_delayed_node(node, root, ino);
+ node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
+ if (!node)
+ return ERR_PTR(-ENOMEM);
+ btrfs_init_delayed_node(node, root, ino);
- /* Cached in the inode and can be accessed */
- refcount_set(&node->refs, 2);
+ /* cached in the btrfs inode and can be accessed */
+ refcount_set(&node->refs, 2);
- spin_lock(&root->inode_lock);
- ret = xa_insert(&root->delayed_nodes, ino, node, GFP_NOFS);
- if (ret) {
- spin_unlock(&root->inode_lock);
- kmem_cache_free(delayed_node_cache, node);
- if (ret != -EBUSY)
- return ERR_PTR(ret);
- }
- } while (ret);
+ ret = radix_tree_preload(GFP_NOFS);
+ if (ret) {
+ kmem_cache_free(delayed_node_cache, node);
+ return ERR_PTR(ret);
+ }
+
+ spin_lock(&root->inode_lock);
+ ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
+ if (ret == -EEXIST) {
+ spin_unlock(&root->inode_lock);
+ kmem_cache_free(delayed_node_cache, node);
+ radix_tree_preload_end();
+ goto again;
+ }
btrfs_inode->delayed_node = node;
spin_unlock(&root->inode_lock);
+ radix_tree_preload_end();
return node;
}
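
The reverted pattern is: look the node up first, allocate and preload outside the spinlock, insert under the lock, and if another task raced us in (-EEXIST) free our copy and retry the lookup. A user-space sketch of just that race-handling shape, using a single mutex-guarded slot instead of a radix tree:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct delayed_node { int refs; };

static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;
static struct delayed_node *slot;	/* stands in for the radix tree entry */

static struct delayed_node *get_or_create_delayed_node(void)
{
again:
	pthread_mutex_lock(&inode_lock);
	if (slot) {				/* fast path: already present */
		struct delayed_node *existing = slot;

		existing->refs++;
		pthread_mutex_unlock(&inode_lock);
		return existing;
	}
	pthread_mutex_unlock(&inode_lock);

	/* Allocate outside the lock (the kernel also preloads memory here). */
	struct delayed_node *node = calloc(1, sizeof(*node));

	if (!node)
		return NULL;
	node->refs = 2;				/* one for the cache, one for us */

	pthread_mutex_lock(&inode_lock);
	if (slot) {				/* lost the race: -EEXIST */
		pthread_mutex_unlock(&inode_lock);
		free(node);
		goto again;
	}
	slot = node;
	pthread_mutex_unlock(&inode_lock);
	return node;
}

int main(void)
{
	struct delayed_node *a = get_or_create_delayed_node();
	struct delayed_node *b = get_or_create_delayed_node();

	printf("same node: %s, refs=%d\n", a == b ? "yes" : "no", b->refs);
	free(a);
	return 0;
}
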
@@ -270,7 +264,8 @@ static void __btrfs_release_delayed_node(
* back up. We can delete it now.
*/
ASSERT(refcount_read(&delayed_node->refs) == 0);
- xa_erase(&root->delayed_nodes, delayed_node->inode_id);
+ radix_tree_delete(&root->delayed_nodes_tree,
+ delayed_node->inode_id);
spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, delayed_node);
}
@@ -307,15 +302,21 @@ static inline void btrfs_release_prepared_delayed_node(
__btrfs_release_delayed_node(node, 1);
}
-static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
+static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
+ struct btrfs_delayed_node *node,
+ enum btrfs_delayed_item_type type)
{
struct btrfs_delayed_item *item;
+
item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
if (item) {
item->data_len = data_len;
- item->ins_or_del = 0;
+ item->type = type;
item->bytes_reserved = 0;
- item->delayed_node = NULL;
+ item->delayed_node = node;
+ RB_CLEAR_NODE(&item->rb_node);
+ INIT_LIST_HEAD(&item->log_list);
+ item->logged = false;
refcount_set(&item->refs, 1);
}
return item;
@@ -324,89 +325,46 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
/*
* __btrfs_lookup_delayed_item - look up the delayed item by key
* @delayed_node: pointer to the delayed node
- * @key: the key to look up
- * @prev: used to store the prev item if the right item isn't found
- * @next: used to store the next item if the right item isn't found
+ * @index: the dir index value to lookup (offset of a dir index key)
*
* Note: if we don't find the right item, we will return the prev item and
* the next item.
*/
static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
struct rb_root *root,
- struct btrfs_key *key,
- struct btrfs_delayed_item **prev,
- struct btrfs_delayed_item **next)
+ u64 index)
{
- struct rb_node *node, *prev_node = NULL;
+ struct rb_node *node = root->rb_node;
struct btrfs_delayed_item *delayed_item = NULL;
- int ret = 0;
-
- node = root->rb_node;
while (node) {
delayed_item = rb_entry(node, struct btrfs_delayed_item,
rb_node);
- prev_node = node;
- ret = btrfs_comp_cpu_keys(&delayed_item->key, key);
- if (ret < 0)
+ if (delayed_item->index < index)
node = node->rb_right;
- else if (ret > 0)
+ else if (delayed_item->index > index)
node = node->rb_left;
else
return delayed_item;
}
- if (prev) {
- if (!prev_node)
- *prev = NULL;
- else if (ret < 0)
- *prev = delayed_item;
- else if ((node = rb_prev(prev_node)) != NULL) {
- *prev = rb_entry(node, struct btrfs_delayed_item,
- rb_node);
- } else
- *prev = NULL;
- }
-
- if (next) {
- if (!prev_node)
- *next = NULL;
- else if (ret > 0)
- *next = delayed_item;
- else if ((node = rb_next(prev_node)) != NULL) {
- *next = rb_entry(node, struct btrfs_delayed_item,
- rb_node);
- } else
- *next = NULL;
- }
return NULL;
}
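With the btrfs_key replaced by a plain u64 dir index, the lookup above reduces to one integer comparison per tree node and no longer needs to report prev/next items. A minimal userspace sketch of the same search, offered only as an illustration: a sorted array stands in for the rbtree and all names and values below are invented.

#include <stdio.h>
#include <stdint.h>

/* Simplified stand-in for a delayed item keyed only by its dir index. */
struct item { uint64_t index; };

/* Binary search by index, mirroring the single-compare rbtree walk. */
static const struct item *lookup(const struct item *items, int n, uint64_t index)
{
	int lo = 0, hi = n - 1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;

		if (items[mid].index < index)
			lo = mid + 1;		/* go right */
		else if (items[mid].index > index)
			hi = mid - 1;		/* go left */
		else
			return &items[mid];	/* exact match */
	}
	return NULL;				/* not found, no prev/next needed */
}

int main(void)
{
	const struct item items[] = { {2}, {3}, {5}, {8}, {13} };

	printf("index 5 %s\n", lookup(items, 5, 5) ? "found" : "missing");
	printf("index 6 %s\n", lookup(items, 5, 6) ? "found" : "missing");
	return 0;
}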
-static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
- struct btrfs_delayed_node *delayed_node,
- struct btrfs_key *key)
-{
- return __btrfs_lookup_delayed_item(&delayed_node->ins_root.rb_root, key,
- NULL, NULL);
-}
-
static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
- struct btrfs_delayed_item *ins,
- int action)
+ struct btrfs_delayed_item *ins)
{
struct rb_node **p, *node;
struct rb_node *parent_node = NULL;
struct rb_root_cached *root;
struct btrfs_delayed_item *item;
- int cmp;
bool leftmost = true;
- if (action == BTRFS_DELAYED_INSERTION_ITEM)
+ if (ins->type == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_node->ins_root;
- else if (action == BTRFS_DELAYED_DELETION_ITEM)
- root = &delayed_node->del_root;
else
- BUG();
+ root = &delayed_node->del_root;
+
p = &root->rb_root.rb_node;
node = &ins->rb_node;
@@ -415,11 +373,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
item = rb_entry(parent_node, struct btrfs_delayed_item,
rb_node);
- cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
- if (cmp < 0) {
+ if (item->index < ins->index) {
p = &(*p)->rb_right;
leftmost = false;
- } else if (cmp > 0) {
+ } else if (item->index > ins->index) {
p = &(*p)->rb_left;
} else {
return -EEXIST;
@@ -428,33 +385,16 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
rb_link_node(node, parent_node, p);
rb_insert_color_cached(node, root, leftmost);
- ins->delayed_node = delayed_node;
- ins->ins_or_del = action;
- if (ins->key.type == BTRFS_DIR_INDEX_KEY &&
- action == BTRFS_DELAYED_INSERTION_ITEM &&
- ins->key.offset >= delayed_node->index_cnt)
- delayed_node->index_cnt = ins->key.offset + 1;
+ if (ins->type == BTRFS_DELAYED_INSERTION_ITEM &&
+ ins->index >= delayed_node->index_cnt)
+ delayed_node->index_cnt = ins->index + 1;
delayed_node->count++;
atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
return 0;
}
-static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node,
- struct btrfs_delayed_item *item)
-{
- return __btrfs_add_delayed_item(node, item,
- BTRFS_DELAYED_INSERTION_ITEM);
-}
-
-static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
- struct btrfs_delayed_item *item)
-{
- return __btrfs_add_delayed_item(node, item,
- BTRFS_DELAYED_DELETION_ITEM);
-}
-
static void finish_one_item(struct btrfs_delayed_root *delayed_root)
{
int seq = atomic_inc_return(&delayed_root->items_seq);
@@ -470,21 +410,21 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
struct rb_root_cached *root;
struct btrfs_delayed_root *delayed_root;
- /* Not associated with any delayed_node */
- if (!delayed_item->delayed_node)
+ /* Not inserted, ignore it. */
+ if (RB_EMPTY_NODE(&delayed_item->rb_node))
return;
+
delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
BUG_ON(!delayed_root);
- BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM &&
- delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM);
- if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM)
+ if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_item->delayed_node->ins_root;
else
root = &delayed_item->delayed_node->del_root;
rb_erase_cached(&delayed_item->rb_node, root);
+ RB_CLEAR_NODE(&delayed_item->rb_node);
delayed_item->delayed_node->count--;
finish_one_item(delayed_root);
@@ -539,12 +479,11 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item(
}
static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_delayed_item *item)
{
struct btrfs_block_rsv *src_rsv;
struct btrfs_block_rsv *dst_rsv;
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
u64 num_bytes;
int ret;
@@ -564,9 +503,15 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
if (!ret) {
trace_btrfs_space_reservation(fs_info, "delayed_item",
- item->key.objectid,
+ item->delayed_node->inode_id,
num_bytes, 1);
- item->bytes_reserved = num_bytes;
+ /*
+ * For insertions we track reserved metadata space by accounting
+ * for the number of leaves that will be used, based on the delayed
+ * node's curr_index_batch_size and index_item_leaves fields.
+ */
+ if (item->type == BTRFS_DELAYED_DELETION_ITEM)
+ item->bytes_reserved = num_bytes;
}
return ret;
@@ -587,11 +532,26 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
* to release/reserve qgroup space.
*/
trace_btrfs_space_reservation(fs_info, "delayed_item",
- item->key.objectid, item->bytes_reserved,
- 0);
+ item->delayed_node->inode_id,
+ item->bytes_reserved, 0);
btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
}
+static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node,
+ unsigned int num_leaves)
+{
+ struct btrfs_fs_info *fs_info = node->root->fs_info;
+ const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves);
+
+ /* There are no space reservations during log replay, bail out. */
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ return;
+
+ trace_btrfs_space_reservation(fs_info, "delayed_item", node->inode_id,
+ bytes, 0);
+ btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, bytes, NULL);
+}
+
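btrfs_delayed_item_release_leaves() returns num_leaves worth of insert reservation to the delayed block reserve. Below is a standalone approximation of that arithmetic; the per-item factor of 2 * nodesize * max level mirrors what btrfs_calc_insert_metadata_size() computes in this series, but the nodesize and leaf count used here are only example values.

#include <stdio.h>
#include <stdint.h>

#define MAX_LEVEL	8	/* BTRFS_MAX_LEVEL in this series */

/*
 * Rough model of the insert-metadata reservation: each reserved item may
 * need to COW one node per level along two paths. This only approximates
 * btrfs_calc_insert_metadata_size().
 */
static uint64_t calc_insert_metadata_size(uint32_t nodesize, unsigned int num_items)
{
	return (uint64_t)nodesize * MAX_LEVEL * 2 * num_items;
}

int main(void)
{
	uint32_t nodesize = 16 * 1024;	/* common default nodesize */
	unsigned int leaves = 3;	/* e.g. node->index_item_leaves */

	printf("releasing %u leaves frees %llu reserved bytes\n",
	       leaves,
	       (unsigned long long)calc_insert_metadata_size(nodesize, leaves));
	return 0;
}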
static int btrfs_delayed_inode_reserve_metadata(
struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -665,36 +625,78 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
}
/*
- * Insert a single delayed item or a batch of delayed items that have consecutive
- * keys if they exist.
+ * Insert a single delayed item or a batch of delayed items, as many as possible
+ * that fit in a leaf. The delayed items (dir index keys) are sorted by their key
+ * in the rbtree, and if there's a gap between two consecutive dir index items,
+ * then it means at some point we had delayed dir indexes to add but they got
+ * removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them
+ * into the subvolume tree. Dir index keys also have their offsets coming from a
+ * monotonically increasing counter, so we can't get new keys with an offset that
+ * fits within a gap between delayed dir index items.
*/
static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_delayed_item *first_item)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_delayed_node *node = first_item->delayed_node;
LIST_HEAD(item_list);
struct btrfs_delayed_item *curr;
struct btrfs_delayed_item *next;
- const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info);
+ const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info);
struct btrfs_item_batch batch;
+ struct btrfs_key first_key;
+ const u32 first_data_size = first_item->data_len;
int total_size;
char *ins_data = NULL;
int ret;
+ bool continuous_keys_only = false;
+
+ lockdep_assert_held(&node->mutex);
+
+ /*
+ * During normal operation the delayed index offset is continuously
+ * increasing, so we can batch insert all items as there will not be any
+ * overlapping keys in the tree.
+ *
+ * The exception to this is log replay, where we may have interleaved
+ * offsets in the tree, so our batch needs to be continuous keys only in
+ * order to ensure we do not end up with out of order items in our leaf.
+ */
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ continuous_keys_only = true;
+
+ /*
+ * For delayed items to insert, we track reserved metadata bytes based
+ * on the number of leaves that we will use.
+ * See btrfs_insert_delayed_dir_index() and
+ * btrfs_delayed_item_reserve_metadata().
+ */
+ ASSERT(first_item->bytes_reserved == 0);
list_add_tail(&first_item->tree_list, &item_list);
- batch.total_data_size = first_item->data_len;
+ batch.total_data_size = first_data_size;
batch.nr = 1;
- total_size = first_item->data_len + sizeof(struct btrfs_item);
+ total_size = first_data_size + sizeof(struct btrfs_item);
curr = first_item;
while (true) {
int next_size;
next = __btrfs_next_delayed_item(curr);
- if (!next || !btrfs_is_continuous_delayed_item(curr, next))
+ if (!next)
break;
+ /*
+ * We cannot allow gaps in the key space if we're doing log
+ * replay.
+ */
+ if (continuous_keys_only && (next->index != curr->index + 1))
+ break;
+
+ ASSERT(next->bytes_reserved == 0);
+
next_size = next->data_len + sizeof(struct btrfs_item);
if (total_size + next_size > max_size)
break;
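The loop above grows the batch until the next delayed item would overflow one leaf's data area, and during log replay it additionally stops at the first gap in the index sequence. A self-contained sketch of that sizing logic; the leaf capacity and item header constants are placeholders, not the real BTRFS_LEAF_DATA_SIZE or sizeof(struct btrfs_item).

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define LEAF_DATA_SIZE	16067	/* illustrative leaf data capacity */
#define ITEM_HEADER	25	/* stand-in for the per-item header size */

struct ditem { uint64_t index; uint32_t data_len; };

/* Count how many consecutive delayed items fit in one leaf-sized batch. */
static int batch_count(const struct ditem *items, int n, bool continuous_keys_only)
{
	int total = items[0].data_len + ITEM_HEADER;
	int count = 1;

	for (int i = 1; i < n; i++) {
		/* During log replay only strictly consecutive indexes batch. */
		if (continuous_keys_only && items[i].index != items[i - 1].index + 1)
			break;

		int next = items[i].data_len + ITEM_HEADER;

		if (total + next > LEAF_DATA_SIZE)
			break;
		total += next;
		count++;
	}
	return count;
}

int main(void)
{
	struct ditem items[] = { {10, 60}, {11, 60}, {13, 60}, {14, 60} };

	printf("normal batch: %d items\n", batch_count(items, 4, false));
	printf("log replay batch: %d items\n", batch_count(items, 4, true));
	return 0;
}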
@@ -707,8 +709,11 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
}
if (batch.nr == 1) {
- batch.keys = &first_item->key;
- batch.data_sizes = &first_item->data_len;
+ first_key.objectid = node->inode_id;
+ first_key.type = BTRFS_DIR_INDEX_KEY;
+ first_key.offset = first_item->index;
+ batch.keys = &first_key;
+ batch.data_sizes = &first_data_size;
} else {
struct btrfs_key *ins_keys;
u32 *ins_sizes;
@@ -725,7 +730,9 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
batch.keys = ins_keys;
batch.data_sizes = ins_sizes;
list_for_each_entry(curr, &item_list, tree_list) {
- ins_keys[i] = curr->key;
+ ins_keys[i].objectid = node->inode_id;
+ ins_keys[i].type = BTRFS_DIR_INDEX_KEY;
+ ins_keys[i].offset = curr->index;
ins_sizes[i] = curr->data_len;
i++;
}
@@ -751,9 +758,41 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
*/
btrfs_release_path(path);
+ ASSERT(node->index_item_leaves > 0);
+
+ /*
+ * For normal operations we will batch an entire leaf's worth of delayed
+ * items, so if there are more items to process we can decrement
+ * index_item_leaves by 1 as we inserted 1 leaf's worth of items.
+ *
+ * However for log replay we may not have inserted an entire leaf's
+ * worth of items, we may have not had continuous items, so decrementing
+ * here would mess up the index_item_leaves accounting. For this case
+ * only clean up the accounting when there are no items left.
+ */
+ if (next && !continuous_keys_only) {
+ /*
+ * We inserted one batch of items into a leaf and there are more
+ * items to flush in a future batch, so now release one unit of
+ * metadata space from the delayed block reserve, corresponding
+ * to the leaf we just flushed.
+ */
+ btrfs_delayed_item_release_leaves(node, 1);
+ node->index_item_leaves--;
+ } else if (!next) {
+ /*
+ * There are no more items to insert. We can have a number of
+ * reserved leaves > 1 here - this happens when many dir index
+ * items are added and then removed before they are flushed (file
+ * names with a very short life that never span a transaction). So
+ * release all remaining leaves.
+ */
+ btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
+ node->index_item_leaves = 0;
+ }
+
list_for_each_entry_safe(curr, next, &item_list, tree_list) {
list_del(&curr->tree_list);
- btrfs_delayed_item_release_metadata(root, curr);
btrfs_release_delayed_item(curr);
}
out:
@@ -789,62 +828,77 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_delayed_item *item)
{
+ const u64 ino = item->delayed_node->inode_id;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_delayed_item *curr, *next;
- struct extent_buffer *leaf;
- struct btrfs_key key;
- struct list_head head;
- int nitems, i, last_item;
- int ret = 0;
+ struct extent_buffer *leaf = path->nodes[0];
+ LIST_HEAD(batch_list);
+ int nitems, slot, last_slot;
+ int ret;
+ u64 total_reserved_size = item->bytes_reserved;
- BUG_ON(!path->nodes[0]);
+ ASSERT(leaf != NULL);
- leaf = path->nodes[0];
+ slot = path->slots[0];
+ last_slot = btrfs_header_nritems(leaf) - 1;
+ /*
+ * Our caller always gives us a path pointing to an existing item, so
+ * this can not happen.
+ */
+ ASSERT(slot <= last_slot);
+ if (WARN_ON(slot > last_slot))
+ return -ENOENT;
- i = path->slots[0];
- last_item = btrfs_header_nritems(leaf) - 1;
- if (i > last_item)
- return -ENOENT; /* FIXME: Is errno suitable? */
+ nitems = 1;
+ curr = item;
+ list_add_tail(&curr->tree_list, &batch_list);
- next = item;
- INIT_LIST_HEAD(&head);
- btrfs_item_key_to_cpu(leaf, &key, i);
- nitems = 0;
/*
- * count the number of the dir index items that we can delete in batch
+ * Keep checking if the next delayed item matches the next item in the
+ * leaf - if so, we can add it to the batch of items to delete from the
+ * leaf.
*/
- while (btrfs_comp_cpu_keys(&next->key, &key) == 0) {
- list_add_tail(&next->tree_list, &head);
- nitems++;
+ while (slot < last_slot) {
+ struct btrfs_key key;
- curr = next;
next = __btrfs_next_delayed_item(curr);
if (!next)
break;
- if (!btrfs_is_continuous_delayed_item(curr, next))
+ slot++;
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino ||
+ key.type != BTRFS_DIR_INDEX_KEY ||
+ key.offset != next->index)
break;
-
- i++;
- if (i > last_item)
- break;
- btrfs_item_key_to_cpu(leaf, &key, i);
+ nitems++;
+ curr = next;
+ list_add_tail(&curr->tree_list, &batch_list);
+ total_reserved_size += curr->bytes_reserved;
}
- if (!nitems)
- return 0;
-
ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
if (ret)
- goto out;
+ return ret;
- list_for_each_entry_safe(curr, next, &head, tree_list) {
- btrfs_delayed_item_release_metadata(root, curr);
+ /* In case of BTRFS_FS_LOG_RECOVERING items won't have reserved space */
+ if (total_reserved_size > 0) {
+ /*
+ * Check btrfs_delayed_item_reserve_metadata() to see why we
+ * don't need to release/reserve qgroup space.
+ */
+ trace_btrfs_space_reservation(fs_info, "delayed_item", ino,
+ total_reserved_size, 0);
+ btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv,
+ total_reserved_size, NULL);
+ }
+
+ list_for_each_entry_safe(curr, next, &batch_list, tree_list) {
list_del(&curr->tree_list);
btrfs_release_delayed_item(curr);
}
-out:
- return ret;
+ return 0;
}
static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
@@ -852,43 +906,57 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_delayed_node *node)
{
- struct btrfs_delayed_item *curr, *prev;
+ struct btrfs_key key;
int ret = 0;
-do_again:
- mutex_lock(&node->mutex);
- curr = __btrfs_first_delayed_deletion_item(node);
- if (!curr)
- goto delete_fail;
+ key.objectid = node->inode_id;
+ key.type = BTRFS_DIR_INDEX_KEY;
+
+ while (ret == 0) {
+ struct btrfs_delayed_item *item;
+
+ mutex_lock(&node->mutex);
+ item = __btrfs_first_delayed_deletion_item(node);
+ if (!item) {
+ mutex_unlock(&node->mutex);
+ break;
+ }
+
+ key.offset = item->index;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ /*
+ * There's no matching item in the leaf. This means we
+ * have already deleted this item in a past run of the
+ * delayed items. We ignore errors when running delayed
+ * items from an async context, through a work queue job
+ * running btrfs_async_run_delayed_root(), and don't
+ * release delayed items that failed to complete. This
+ * is because we will retry later, and at transaction
+ * commit time we always run delayed items and will
+ * then deal with errors if they fail to run again.
+ *
+ * So just release delayed items for which we can't find
+ * an item in the tree, and move to the next item.
+ */
+ btrfs_release_path(path);
+ btrfs_release_delayed_item(item);
+ ret = 0;
+ } else if (ret == 0) {
+ ret = btrfs_batch_delete_items(trans, root, path, item);
+ btrfs_release_path(path);
+ }
- ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
- if (ret < 0)
- goto delete_fail;
- else if (ret > 0) {
/*
- * can't find the item which the node points to, so this node
- * is invalid, just drop it.
+ * We unlock and relock on each iteration to prevent
+ * blocking other tasks for too long while we are being run from
+ * the async context (work queue job). Those tasks are typically
+ * running system calls like creat/mkdir/rename/unlink/etc which
+ * need to add delayed items to this delayed node.
*/
- prev = curr;
- curr = __btrfs_next_delayed_item(prev);
- btrfs_release_delayed_item(prev);
- ret = 0;
- btrfs_release_path(path);
- if (curr) {
- mutex_unlock(&node->mutex);
- goto do_again;
- } else
- goto delete_fail;
+ mutex_unlock(&node->mutex);
}
- btrfs_batch_delete_items(trans, root, path, curr);
- btrfs_release_path(path);
- mutex_unlock(&node->mutex);
- goto do_again;
-
-delete_fail:
- btrfs_release_path(path);
- mutex_unlock(&node->mutex);
return ret;
}
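btrfs_delete_delayed_items() now drops and re-takes the node mutex on every iteration so that tasks adding delayed items (creat, mkdir, rename, unlink and friends) are not blocked for the whole flush. A tiny pthread sketch of that unlock-per-iteration shape, with a simple counter standing in for one batch of deletions.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t node_mutex = PTHREAD_MUTEX_INITIALIZER;
static int pending_items = 5;

/*
 * Process one batch per critical section, dropping the lock between
 * iterations so tasks that want to add delayed items are not starved
 * while the async flush runs.
 */
static void flush_delayed_items(void)
{
	while (1) {
		pthread_mutex_lock(&node_mutex);
		if (pending_items == 0) {
			pthread_mutex_unlock(&node_mutex);
			break;
		}
		pending_items--;			/* "delete one batch" */
		printf("flushed one batch, %d left\n", pending_items);
		pthread_mutex_unlock(&node_mutex);	/* let producers in */
	}
}

int main(void)
{
	flush_delayed_items();
	return 0;
}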
@@ -1347,24 +1415,28 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
struct btrfs_disk_key *disk_key, u8 type,
u64 index)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info);
struct btrfs_delayed_node *delayed_node;
struct btrfs_delayed_item *delayed_item;
struct btrfs_dir_item *dir_item;
+ bool reserve_leaf_space;
+ u32 data_len;
int ret;
delayed_node = btrfs_get_or_create_delayed_node(dir);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
- delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len);
+ delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len,
+ delayed_node,
+ BTRFS_DELAYED_INSERTION_ITEM);
if (!delayed_item) {
ret = -ENOMEM;
goto release_node;
}
- delayed_item->key.objectid = btrfs_ino(dir);
- delayed_item->key.type = BTRFS_DIR_INDEX_KEY;
- delayed_item->key.offset = index;
+ delayed_item->index = index;
dir_item = (struct btrfs_dir_item *)delayed_item->data;
dir_item->location = *disk_key;
@@ -1374,15 +1446,51 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
btrfs_set_stack_dir_type(dir_item, type);
memcpy((char *)(dir_item + 1), name, name_len);
- ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, delayed_item);
- /*
- * we have reserved enough space when we start a new transaction,
- * so reserving metadata failure is impossible
- */
- BUG_ON(ret);
+ data_len = delayed_item->data_len + sizeof(struct btrfs_item);
mutex_lock(&delayed_node->mutex);
- ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
+
+ if (delayed_node->index_item_leaves == 0 ||
+ delayed_node->curr_index_batch_size + data_len > leaf_data_size) {
+ delayed_node->curr_index_batch_size = data_len;
+ reserve_leaf_space = true;
+ } else {
+ delayed_node->curr_index_batch_size += data_len;
+ reserve_leaf_space = false;
+ }
+
+ if (reserve_leaf_space) {
+ ret = btrfs_delayed_item_reserve_metadata(trans, delayed_item);
+ /*
+ * Space was reserved for a dir index item insertion when we
+ * started the transaction, so getting a failure here should be
+ * impossible.
+ */
+ if (WARN_ON(ret)) {
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_item(delayed_item);
+ goto release_node;
+ }
+
+ delayed_node->index_item_leaves++;
+ } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
+ const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ /*
+ * Adding the new dir index item does not require touching another
+ * leaf, so we can release 1 unit of metadata that was previously
+ * reserved when starting the transaction. This applies only to
+ * the case where we started the transaction, and excludes the
+ * transaction join case (used when replaying log trees).
+ */
+ trace_btrfs_space_reservation(fs_info, "transaction",
+ trans->transid, bytes, 0);
+ btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL);
+ ASSERT(trans->bytes_reserved >= bytes);
+ trans->bytes_reserved -= bytes;
+ }
+
+ ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
@@ -1399,19 +1507,48 @@ release_node:
static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_node *node,
- struct btrfs_key *key)
+ u64 index)
{
struct btrfs_delayed_item *item;
mutex_lock(&node->mutex);
- item = __btrfs_lookup_delayed_insertion_item(node, key);
+ item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index);
if (!item) {
mutex_unlock(&node->mutex);
return 1;
}
- btrfs_delayed_item_release_metadata(node->root, item);
+ /*
+ * For delayed items to insert, we track reserved metadata bytes based
+ * on the number of leaves that we will use.
+ * See btrfs_insert_delayed_dir_index() and
+ * btrfs_delayed_item_reserve_metadata().
+ */
+ ASSERT(item->bytes_reserved == 0);
+ ASSERT(node->index_item_leaves > 0);
+
+ /*
+ * If there's only one leaf reserved, we can decrement this item from the
+ * current batch, otherwise we can not because we don't know which leaf
+ * it belongs to. With the current limit on delayed items, we rarely
+ * accumulate enough dir index items to fill more than one leaf (even
+ * when using a leaf size of 4K).
+ */
+ if (node->index_item_leaves == 1) {
+ const u32 data_len = item->data_len + sizeof(struct btrfs_item);
+
+ ASSERT(node->curr_index_batch_size >= data_len);
+ node->curr_index_batch_size -= data_len;
+ }
+
btrfs_release_delayed_item(item);
+
+ /* If we now have no more dir index items, we can release all leaves. */
+ if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) {
+ btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
+ node->index_item_leaves = 0;
+ }
+
mutex_unlock(&node->mutex);
return 0;
}
@@ -1421,31 +1558,25 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
{
struct btrfs_delayed_node *node;
struct btrfs_delayed_item *item;
- struct btrfs_key item_key;
int ret;
node = btrfs_get_or_create_delayed_node(dir);
if (IS_ERR(node))
return PTR_ERR(node);
- item_key.objectid = btrfs_ino(dir);
- item_key.type = BTRFS_DIR_INDEX_KEY;
- item_key.offset = index;
-
- ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node,
- &item_key);
+ ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index);
if (!ret)
goto end;
- item = btrfs_alloc_delayed_item(0);
+ item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM);
if (!item) {
ret = -ENOMEM;
goto end;
}
- item->key = item_key;
+ item->index = index;
- ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item);
+ ret = btrfs_delayed_item_reserve_metadata(trans, item);
/*
* we have reserved enough space when we start a new transaction,
* so reserving metadata failure is impossible.
@@ -1458,7 +1589,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
}
mutex_lock(&node->mutex);
- ret = __btrfs_add_delayed_deletion_item(node, item);
+ ret = __btrfs_add_delayed_item(node, item);
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
@@ -1574,9 +1705,9 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
int ret = 0;
list_for_each_entry(curr, del_list, readdir_list) {
- if (curr->key.offset > index)
+ if (curr->index > index)
break;
- if (curr->key.offset == index) {
+ if (curr->index == index) {
ret = 1;
break;
}
@@ -1610,13 +1741,13 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
list_del(&curr->readdir_list);
- if (curr->key.offset < ctx->pos) {
+ if (curr->index < ctx->pos) {
if (refcount_dec_and_test(&curr->refs))
kfree(curr);
continue;
}
- ctx->pos = curr->key.offset;
+ ctx->pos = curr->index;
di = (struct btrfs_dir_item *)curr->data;
name = (char *)(di + 1);
@@ -1826,12 +1957,17 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
mutex_lock(&delayed_node->mutex);
curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
while (curr_item) {
- btrfs_delayed_item_release_metadata(root, curr_item);
prev_item = curr_item;
curr_item = __btrfs_next_delayed_item(prev_item);
btrfs_release_delayed_item(prev_item);
}
+ if (delayed_node->index_item_leaves > 0) {
+ btrfs_delayed_item_release_leaves(delayed_node,
+ delayed_node->index_item_leaves);
+ delayed_node->index_item_leaves = 0;
+ }
+
curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
while (curr_item) {
btrfs_delayed_item_release_metadata(root, curr_item);
@@ -1863,35 +1999,34 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
- unsigned long index = 0;
- struct btrfs_delayed_node *delayed_node;
+ u64 inode_id = 0;
struct btrfs_delayed_node *delayed_nodes[8];
+ int i, n;
while (1) {
- int n = 0;
-
spin_lock(&root->inode_lock);
- if (xa_empty(&root->delayed_nodes)) {
+ n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
+ (void **)delayed_nodes, inode_id,
+ ARRAY_SIZE(delayed_nodes));
+ if (!n) {
spin_unlock(&root->inode_lock);
- return;
+ break;
}
- xa_for_each_start(&root->delayed_nodes, index, delayed_node, index) {
+ inode_id = delayed_nodes[n - 1]->inode_id + 1;
+ for (i = 0; i < n; i++) {
/*
* Don't increase refs in case the node is dead and
* about to be removed from the tree in the loop below
*/
- if (refcount_inc_not_zero(&delayed_node->refs)) {
- delayed_nodes[n] = delayed_node;
- n++;
- }
- if (n >= ARRAY_SIZE(delayed_nodes))
- break;
+ if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
+ delayed_nodes[i] = NULL;
}
- index++;
spin_unlock(&root->inode_lock);
- for (int i = 0; i < n; i++) {
+ for (i = 0; i < n; i++) {
+ if (!delayed_nodes[i])
+ continue;
__btrfs_kill_delayed_node(delayed_nodes[i]);
btrfs_release_delayed_node(delayed_nodes[i]);
}
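btrfs_kill_all_delayed_nodes() pages through the delayed nodes with gang lookups, resuming each pass at the last inode id found plus one. The same pagination pattern modeled in userspace, with a sorted array standing in for radix_tree_gang_lookup() and arbitrary example ids.

#include <stdio.h>
#include <stdint.h>

/* Sorted "tree" of delayed node ids, standing in for the radix tree. */
static const uint64_t node_ids[] = { 3, 7, 8, 20, 21, 22, 40, 41, 42, 90 };
#define NR_NODES (sizeof(node_ids) / sizeof(node_ids[0]))

/* Return up to max ids >= start, like a gang lookup. */
static int gang_lookup(uint64_t start, uint64_t *out, int max)
{
	int n = 0;

	for (unsigned int i = 0; i < NR_NODES && n < max; i++)
		if (node_ids[i] >= start)
			out[n++] = node_ids[i];
	return n;
}

int main(void)
{
	uint64_t batch[4];
	uint64_t next = 0;
	int n;

	/* Resume each pass from the last id found plus one. */
	while ((n = gang_lookup(next, batch, 4)) != 0) {
		for (int i = 0; i < n; i++)
			printf("killing delayed node %llu\n",
			       (unsigned long long)batch[i]);
		next = batch[n - 1] + 1;
	}
	return 0;
}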
@@ -1912,3 +2047,113 @@ void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
}
}
+void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_delayed_item *item;
+
+ node = btrfs_get_delayed_node(inode);
+ if (!node)
+ return;
+
+ mutex_lock(&node->mutex);
+ item = __btrfs_first_delayed_insertion_item(node);
+ while (item) {
+ /*
+ * It's possible that the item is already in a log list. This
+ * can happen in case two tasks are trying to log the same
+ * directory. For example if we have tasks A and task B:
+ *
+ * Task A collected the delayed items into a log list while
+ * under the inode's log_mutex (at btrfs_log_inode()), but it
+ * only releases the items after logging the inodes they point
+ * to (if they are new inodes), which happens after unlocking
+ * the log mutex;
+ *
+ * Task B enters btrfs_log_inode() and acquires the log_mutex
+ * of the same directory inode, before task A releases the
+ * delayed items. This can happen for example when logging some
+ * inode we need to trigger logging of its parent directory, so
+ * logging two files that have the same parent directory can
+ * lead to this.
+ *
+ * If this happens, just ignore delayed items already in a log
+ * list. All the tasks logging the directory are under a log
+ * transaction and whichever finishes first can not sync the log
+ * before the other completes and leaves the log transaction.
+ */
+ if (!item->logged && list_empty(&item->log_list)) {
+ refcount_inc(&item->refs);
+ list_add_tail(&item->log_list, ins_list);
+ }
+ item = __btrfs_next_delayed_item(item);
+ }
+
+ item = __btrfs_first_delayed_deletion_item(node);
+ while (item) {
+ /* It may be non-empty, for the same reason mentioned above. */
+ if (!item->logged && list_empty(&item->log_list)) {
+ refcount_inc(&item->refs);
+ list_add_tail(&item->log_list, del_list);
+ }
+ item = __btrfs_next_delayed_item(item);
+ }
+ mutex_unlock(&node->mutex);
+
+ /*
+ * We are called during inode logging, which means the inode is in use
+ * and can not be evicted before we finish logging the inode. So we never
+ * have the last reference on the delayed inode.
+ * Also, we don't use btrfs_release_delayed_node() because that would
+ * requeue the delayed inode (change its order in the list of prepared
+ * nodes) and we don't want to make that change because we don't create or
+ * delete delayed items.
+ */
+ ASSERT(refcount_read(&node->refs) > 1);
+ refcount_dec(&node->refs);
+}
+
+void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_delayed_item *item;
+ struct btrfs_delayed_item *next;
+
+ node = btrfs_get_delayed_node(inode);
+ if (!node)
+ return;
+
+ mutex_lock(&node->mutex);
+
+ list_for_each_entry_safe(item, next, ins_list, log_list) {
+ item->logged = true;
+ list_del_init(&item->log_list);
+ if (refcount_dec_and_test(&item->refs))
+ kfree(item);
+ }
+
+ list_for_each_entry_safe(item, next, del_list, log_list) {
+ item->logged = true;
+ list_del_init(&item->log_list);
+ if (refcount_dec_and_test(&item->refs))
+ kfree(item);
+ }
+
+ mutex_unlock(&node->mutex);
+
+ /*
+ * We are called during inode logging, which means the inode is in use
+ * and can not be evicted before we finish logging the inode. So we never
+ * have the last reference on the delayed inode.
+ * Also, we don't use btrfs_release_delayed_node() because that would
+ * requeue the delayed inode (change its order in the list of prepared
+ * nodes) and we don't want to make that change because we don't create or
+ * delete delayed items.
+ */
+ ASSERT(refcount_read(&node->refs) > 1);
+ refcount_dec(&node->refs);
+}
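Both logging helpers above lean on the delayed items being reference counted: collecting an item for the log takes an extra reference, and the item is freed only when neither the delayed tree nor a log list holds one. A stripped-down sketch of that lifetime, using a plain int instead of refcount_t and no locking.

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

/* A refcounted item that can sit on both the delayed tree and a log list. */
struct item {
	int refs;
	bool logged;
};

/* Drop one reference; free the item when the last reference goes away. */
static void put_item(struct item *item)
{
	if (--item->refs == 0) {
		printf("freeing item\n");
		free(item);
	} else {
		printf("item still has %d reference(s)\n", item->refs);
	}
}

int main(void)
{
	struct item *item = malloc(sizeof(*item));

	if (!item)
		return 1;
	item->refs = 1;		/* reference held by the delayed tree */
	item->logged = false;

	item->refs++;		/* log code takes its own reference */
	item->logged = true;	/* so a second log pass skips this item */
	put_item(item);		/* log code done: tree reference remains */
	put_item(item);		/* tree releases it: item is freed */
	return 0;
}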
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index b2412160c5bc..0163ca637a96 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -16,9 +16,10 @@
#include <linux/refcount.h>
#include "ctree.h"
-/* types of the delayed item */
-#define BTRFS_DELAYED_INSERTION_ITEM 1
-#define BTRFS_DELAYED_DELETION_ITEM 2
+enum btrfs_delayed_item_type {
+ BTRFS_DELAYED_INSERTION_ITEM,
+ BTRFS_DELAYED_DELETION_ITEM
+};
struct btrfs_delayed_root {
spinlock_t lock;
@@ -58,18 +59,42 @@ struct btrfs_delayed_node {
u64 index_cnt;
unsigned long flags;
int count;
+ /*
+ * The size of the next batch of dir index items to insert (if this
+ * node is from a directory inode). Protected by @mutex.
+ */
+ u32 curr_index_batch_size;
+ /*
+ * Number of leaves reserved for inserting dir index items (if this
+ * node belongs to a directory inode). This may be larger than the
+ * actual number of leaves we end up using. Protected by @mutex.
+ */
+ u32 index_item_leaves;
};
struct btrfs_delayed_item {
struct rb_node rb_node;
- struct btrfs_key key;
+ /* Offset value of the corresponding dir index key. */
+ u64 index;
struct list_head tree_list; /* used for batch insert/delete items */
struct list_head readdir_list; /* used for readdir items */
+ /*
+ * Used when logging a directory.
+ * Insertions and deletions to this list are protected by the parent
+ * delayed node's mutex.
+ */
+ struct list_head log_list;
u64 bytes_reserved;
struct btrfs_delayed_node *delayed_node;
refcount_t refs;
- int ins_or_del;
- u32 data_len;
+ enum btrfs_delayed_item_type type:8;
+ /*
+ * Track if this delayed item was already logged.
+ * Protected by the mutex of the parent delayed inode.
+ */
+ bool logged;
+ /* The maximum leaf size is 64K, so u16 is more than enough. */
+ u16 data_len;
char data[];
};
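The payload of struct btrfs_delayed_item lives inline behind the char data[] flexible array member, and since an item must fit in a leaf of at most 64K a u16 data_len is enough. A self-contained userspace illustration of that allocation pattern; the field names echo the struct above but nothing here is kernel code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

/* Simplified delayed item: header plus inline payload. */
struct ditem {
	uint64_t index;
	uint16_t data_len;	/* leaves are at most 64K, so u16 suffices */
	char data[];		/* flexible array member, no extra pointer */
};

static struct ditem *alloc_ditem(uint64_t index, const void *payload, uint16_t len)
{
	/* One allocation covers both the header and the payload. */
	struct ditem *item = malloc(sizeof(*item) + len);

	if (!item)
		return NULL;
	item->index = index;
	item->data_len = len;
	memcpy(item->data, payload, len);
	return item;
}

int main(void)
{
	const char name[] = "hello.txt";
	struct ditem *item = alloc_ditem(42, name, sizeof(name));

	if (!item)
		return 1;
	printf("index %llu, %u payload bytes: %s\n",
	       (unsigned long long)item->index, item->data_len, item->data);
	free(item);
	return 0;
}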
@@ -133,6 +158,14 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
struct list_head *ins_list);
+/* Used during directory logging. */
+void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list);
+void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list);
+
/* for init */
int __init btrfs_delayed_inode_init(void);
void __cold btrfs_delayed_inode_exit(void);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 99f37fca2e96..36a3debe9493 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -132,7 +132,7 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
spin_lock(&delayed_rsv->lock);
delayed_rsv->size += num_bytes;
- delayed_rsv->full = 0;
+ delayed_rsv->full = false;
spin_unlock(&delayed_rsv->lock);
trans->delayed_ref_updates = 0;
}
@@ -175,7 +175,7 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
if (num_bytes)
delayed_refs_rsv->reserved += num_bytes;
if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
- delayed_refs_rsv->full = 1;
+ delayed_refs_rsv->full = true;
spin_unlock(&delayed_refs_rsv->lock);
if (num_bytes)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index a7dd6ba25e99..61e58066b5fd 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -165,7 +165,7 @@ no_valid_dev_replace_entry_found:
*/
if (btrfs_find_device(fs_info->fs_devices, &args)) {
btrfs_err(fs_info,
- "replace devid present without an active replace item");
+"replace without active item, run 'device scan --forget' on the target device");
ret = -EUCLEAN;
} else {
dev_replace->srcdev = NULL;
@@ -545,10 +545,7 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
if (!cache)
continue;
- spin_lock(&cache->lock);
- cache->to_copy = 1;
- spin_unlock(&cache->lock);
-
+ set_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
btrfs_put_block_group(cache);
}
if (iter_ret < 0)
@@ -577,7 +574,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
return true;
spin_lock(&cache->lock);
- if (cache->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
spin_unlock(&cache->lock);
return true;
}
@@ -587,7 +584,8 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
ASSERT(!IS_ERR(em));
map = em->map_lookup;
- num_extents = cur_extent = 0;
+ num_extents = 0;
+ cur_extent = 0;
for (i = 0; i < map->num_stripes; i++) {
/* We have more device extent to copy */
if (srcdev != map->stripes[i].dev)
@@ -609,9 +607,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
}
/* Last stripe on this device */
- spin_lock(&cache->lock);
- cache->to_copy = 0;
- spin_unlock(&cache->lock);
+ clear_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
return true;
}
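The dev-replace hunks switch the block group's to_copy and removed fields from spinlock-protected integers to single bits in a runtime_flags word toggled with set_bit()/clear_bit()/test_bit(). A userspace approximation of the same idea with C11 atomics; the flag names are copied for readability, but the real kernel helpers and memory-ordering rules differ.

#include <stdio.h>
#include <stdatomic.h>

enum {
	BLOCK_GROUP_FLAG_TO_COPY,	/* illustrative flag bit numbers */
	BLOCK_GROUP_FLAG_REMOVED,
};

struct block_group {
	atomic_ulong runtime_flags;	/* one word of independent flag bits */
};

static void set_flag(struct block_group *bg, int bit)
{
	atomic_fetch_or(&bg->runtime_flags, 1UL << bit);
}

static void clear_flag(struct block_group *bg, int bit)
{
	atomic_fetch_and(&bg->runtime_flags, ~(1UL << bit));
}

static int test_flag(struct block_group *bg, int bit)
{
	return (atomic_load(&bg->runtime_flags) >> bit) & 1;
}

int main(void)
{
	struct block_group bg = { .runtime_flags = 0 };

	set_flag(&bg, BLOCK_GROUP_FLAG_TO_COPY);
	printf("to_copy=%d removed=%d\n",
	       test_flag(&bg, BLOCK_GROUP_FLAG_TO_COPY),
	       test_flag(&bg, BLOCK_GROUP_FLAG_REMOVED));
	clear_flag(&bg, BLOCK_GROUP_FLAG_TO_COPY);
	printf("to_copy=%d after clear\n", test_flag(&bg, BLOCK_GROUP_FLAG_TO_COPY));
	return 0;
}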
@@ -1128,8 +1124,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
up_write(&dev_replace->rwsem);
/* Scrub for replace must not be running in suspended state */
- ret = btrfs_scrub_cancel(fs_info);
- ASSERT(ret != -ENOTCONN);
+ btrfs_scrub_cancel(fs_info);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
@@ -1288,11 +1283,6 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
return 1;
}
-void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
-{
- percpu_counter_inc(&fs_info->dev_replace.bio_counter);
-}
-
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 3911049a5f23..6084b313056a 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -7,6 +7,10 @@
#define BTRFS_DEV_REPLACE_H
struct btrfs_ioctl_dev_replace_args;
+struct btrfs_fs_info;
+struct btrfs_trans_handle;
+struct btrfs_dev_replace;
+struct btrfs_block_group;
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4ba005c41983..d99bf7c64611 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -5,6 +5,7 @@
#include <linux/fs.h>
#include <linux/blkdev.h>
+#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
@@ -50,7 +51,6 @@
BTRFS_SUPER_FLAG_METADUMP |\
BTRFS_SUPER_FLAG_METADUMP_V2)
-static void end_workqueue_fn(struct btrfs_work *work);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info);
@@ -63,40 +63,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
-/*
- * btrfs_end_io_wq structs are used to do processing in task context when an IO
- * is complete. This is used during reads to verify checksums, and it is used
- * by writes to insert metadata for new file extents after IO is complete.
- */
-struct btrfs_end_io_wq {
- struct bio *bio;
- bio_end_io_t *end_io;
- void *private;
- struct btrfs_fs_info *info;
- blk_status_t status;
- enum btrfs_wq_endio_type metadata;
- struct btrfs_work work;
-};
-
-static struct kmem_cache *btrfs_end_io_wq_cache;
-
-int __init btrfs_end_io_wq_init(void)
-{
- btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
- sizeof(struct btrfs_end_io_wq),
- 0,
- SLAB_MEM_SPREAD,
- NULL);
- if (!btrfs_end_io_wq_cache)
- return -ENOMEM;
- return 0;
-}
-
-void __cold btrfs_end_io_wq_exit(void)
-{
- kmem_cache_destroy(btrfs_end_io_wq_cache);
-}
-
static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
{
if (fs_info->csum_shash)
@@ -121,88 +87,6 @@ struct async_submit_bio {
};
/*
- * Lockdep class keys for extent_buffer->lock's in this root. For a given
- * eb, the lockdep key is determined by the btrfs_root it belongs to and
- * the level the eb occupies in the tree.
- *
- * Different roots are used for different purposes and may nest inside each
- * other and they require separate keysets. As lockdep keys should be
- * static, assign keysets according to the purpose of the root as indicated
- * by btrfs_root->root_key.objectid. This ensures that all special purpose
- * roots have separate keysets.
- *
- * Lock-nesting across peer nodes is always done with the immediate parent
- * node locked thus preventing deadlock. As lockdep doesn't know this, use
- * subclass to avoid triggering lockdep warning in such cases.
- *
- * The key is set by the readpage_end_io_hook after the buffer has passed
- * csum validation but before the pages are unlocked. It is also set by
- * btrfs_init_new_buffer on freshly allocated blocks.
- *
- * We also add a check to make sure the highest level of the tree is the
- * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
- * needs update as well.
- */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# if BTRFS_MAX_LEVEL != 8
-# error
-# endif
-
-#define DEFINE_LEVEL(stem, level) \
- .names[level] = "btrfs-" stem "-0" #level,
-
-#define DEFINE_NAME(stem) \
- DEFINE_LEVEL(stem, 0) \
- DEFINE_LEVEL(stem, 1) \
- DEFINE_LEVEL(stem, 2) \
- DEFINE_LEVEL(stem, 3) \
- DEFINE_LEVEL(stem, 4) \
- DEFINE_LEVEL(stem, 5) \
- DEFINE_LEVEL(stem, 6) \
- DEFINE_LEVEL(stem, 7)
-
-static struct btrfs_lockdep_keyset {
- u64 id; /* root objectid */
- /* Longest entry: btrfs-free-space-00 */
- char names[BTRFS_MAX_LEVEL][20];
- struct lock_class_key keys[BTRFS_MAX_LEVEL];
-} btrfs_lockdep_keysets[] = {
- { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") },
- { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") },
- { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") },
- { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") },
- { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") },
- { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") },
- { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") },
- { .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") },
- { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") },
- { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") },
- { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
- { .id = 0, DEFINE_NAME("tree") },
-};
-
-#undef DEFINE_LEVEL
-#undef DEFINE_NAME
-
-void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
- int level)
-{
- struct btrfs_lockdep_keyset *ks;
-
- BUG_ON(level >= ARRAY_SIZE(ks->keys));
-
- /* find the matching keyset, id 0 is the default entry */
- for (ks = btrfs_lockdep_keysets; ks->id; ks++)
- if (ks->id == objectid)
- break;
-
- lockdep_set_class_and_name(&eb->lock,
- &ks->keys[level], ks->names[level]);
-}
-
-#endif
-
-/*
* Compute the csum of a btree block and store the result to provided buffer.
*/
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
@@ -247,22 +131,21 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (atomic)
return -EAGAIN;
- lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
- &cached_state);
+ lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state);
if (extent_buffer_uptodate(eb) &&
btrfs_header_generation(eb) == parent_transid) {
ret = 0;
goto out;
}
btrfs_err_rl(eb->fs_info,
- "parent transid verify failed on %llu wanted %llu found %llu",
- eb->start,
+"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
+ eb->start, eb->read_mirror,
parent_transid, btrfs_header_generation(eb));
ret = 1;
clear_extent_buffer_uptodate(eb);
out:
- unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
- &cached_state);
+ unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+ &cached_state);
return ret;
}
@@ -283,11 +166,9 @@ static bool btrfs_supported_super_csum(u16 csum_type)
* Return 0 if the superblock checksum type matches the checksum value of that
* algorithm. Pass the raw disk superblock data.
*/
-static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
- char *raw_disk_sb)
+int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
+ const struct btrfs_super_block *disk_sb)
{
- struct btrfs_super_block *disk_sb =
- (struct btrfs_super_block *)raw_disk_sb;
char result[BTRFS_CSUM_SIZE];
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
@@ -298,7 +179,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
* BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
* filled with zeros and is included in the checksum.
*/
- crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
+ crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
if (memcmp(disk_sb->csum, result, fs_info->csum_size))
@@ -485,7 +366,7 @@ static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
fs_info->nodesize);
- /* A dirty eb shouldn't disappear from extent_buffers */
+ /* A dirty eb shouldn't disappear from buffer_radix */
if (WARN_ON(!eb))
return -EUCLEAN;
@@ -586,21 +467,23 @@ static int validate_extent_buffer(struct extent_buffer *eb)
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
- btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
- eb->start, found_start);
+ btrfs_err_rl(fs_info,
+ "bad tree block start, mirror %u want %llu have %llu",
+ eb->read_mirror, eb->start, found_start);
ret = -EIO;
goto out;
}
if (check_tree_block_fsid(eb)) {
- btrfs_err_rl(fs_info, "bad fsid on block %llu",
- eb->start);
+ btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
+ eb->start, eb->read_mirror);
ret = -EIO;
goto out;
}
found_level = btrfs_header_level(eb);
if (found_level >= BTRFS_MAX_LEVEL) {
- btrfs_err(fs_info, "bad tree block level %d on %llu",
- (int)btrfs_header_level(eb), eb->start);
+ btrfs_err(fs_info,
+ "bad tree block level, mirror %u level %d on logical %llu",
+ eb->read_mirror, btrfs_header_level(eb), eb->start);
ret = -EIO;
goto out;
}
@@ -611,8 +494,8 @@ static int validate_extent_buffer(struct extent_buffer *eb)
if (memcmp(result, header_csum, csum_size) != 0) {
btrfs_warn_rl(fs_info,
- "checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
- eb->start,
+"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
+ eb->start, eb->read_mirror,
CSUM_FMT_VALUE(csum_size, header_csum),
CSUM_FMT_VALUE(csum_size, result),
btrfs_header_level(eb));
@@ -637,8 +520,8 @@ static int validate_extent_buffer(struct extent_buffer *eb)
set_extent_buffer_uptodate(eb);
else
btrfs_err(fs_info,
- "block=%llu read time tree block corruption detected",
- eb->start);
+ "read time tree block corruption detected on logical %llu mirror %u",
+ eb->start, eb->read_mirror);
out:
return ret;
}
@@ -739,58 +622,6 @@ err:
return ret;
}
-static void end_workqueue_bio(struct bio *bio)
-{
- struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
- struct btrfs_fs_info *fs_info;
- struct btrfs_workqueue *wq;
-
- fs_info = end_io_wq->info;
- end_io_wq->status = bio->bi_status;
-
- if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
- wq = fs_info->endio_meta_write_workers;
- else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
- wq = fs_info->endio_freespace_worker;
- else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- wq = fs_info->endio_raid56_workers;
- else
- wq = fs_info->endio_write_workers;
- } else {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- wq = fs_info->endio_raid56_workers;
- else if (end_io_wq->metadata)
- wq = fs_info->endio_meta_workers;
- else
- wq = fs_info->endio_workers;
- }
-
- btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
- btrfs_queue_work(wq, &end_io_wq->work);
-}
-
-blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- enum btrfs_wq_endio_type metadata)
-{
- struct btrfs_end_io_wq *end_io_wq;
-
- end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
- if (!end_io_wq)
- return BLK_STS_RESOURCE;
-
- end_io_wq->private = bio->bi_private;
- end_io_wq->end_io = bio->bi_end_io;
- end_io_wq->info = info;
- end_io_wq->status = 0;
- end_io_wq->bio = bio;
- end_io_wq->metadata = metadata;
-
- bio->bi_private = end_io_wq;
- bio->bi_end_io = end_workqueue_bio;
- return 0;
-}
-
static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async;
@@ -813,17 +644,14 @@ static void run_one_async_start(struct btrfs_work *work)
*/
static void run_one_async_done(struct btrfs_work *work)
{
- struct async_submit_bio *async;
- struct inode *inode;
- blk_status_t ret;
-
- async = container_of(work, struct async_submit_bio, work);
- inode = async->inode;
+ struct async_submit_bio *async =
+ container_of(work, struct async_submit_bio, work);
+ struct inode *inode = async->inode;
+ struct btrfs_bio *bbio = btrfs_bio(async->bio);
/* If an error occurred we just want to clean up the bio and move on */
if (async->status) {
- async->bio->bi_status = async->status;
- bio_endio(async->bio);
+ btrfs_bio_end_io(bbio, async->status);
return;
}
@@ -833,11 +661,7 @@ static void run_one_async_done(struct btrfs_work *work)
* This changes nothing when cgroups aren't in use.
*/
async->bio->bi_opf |= REQ_CGROUP_PUNT;
- ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
- if (ret) {
- async->bio->bi_status = ret;
- bio_endio(async->bio);
- }
+ btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
}
static void run_one_async_free(struct btrfs_work *work)
@@ -848,16 +672,23 @@ static void run_one_async_free(struct btrfs_work *work)
kfree(async);
}
-blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
- int mirror_num, u64 dio_file_offset,
- extent_submit_bio_start_t *submit_bio_start)
+/*
+ * Submit bio to an async queue.
+ *
+ * Return:
+ * - true if the work has been successfully submitted
+ * - false in case of error
+ */
+bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
+ u64 dio_file_offset,
+ extent_submit_bio_start_t *submit_bio_start)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
if (!async)
- return BLK_STS_RESOURCE;
+ return false;
async->inode = inode;
async->bio = bio;
@@ -875,7 +706,7 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
btrfs_queue_work(fs_info->hipri_workers, &async->work);
else
btrfs_queue_work(fs_info->workers, &async->work);
- return 0;
+ return true;
}
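btrfs_wq_submit_bio() now reports a plain bool: false only when allocating the async context fails, in which case the caller checksums and submits the bio synchronously itself. A minimal sketch of that try-async-else-fallback shape; the request struct and messages are invented for the example.

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

struct request { int id; };

/* Model of the bool-returning async submit: false only on allocation failure. */
static bool wq_submit(struct request *req)
{
	void *async = malloc(64);	/* async work context */

	if (!async)
		return false;		/* caller falls back to the sync path */
	printf("request %d queued for async checksumming\n", req->id);
	free(async);			/* a real work item would free itself later */
	return true;
}

static void submit(struct request *req)
{
	if (wq_submit(req))
		return;			/* async path owns the request now */

	/* Fallback: checksum and submit synchronously in the caller. */
	printf("request %d submitted synchronously\n", req->id);
}

int main(void)
{
	struct request req = { .id = 1 };

	submit(&req);
	return 0;
}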
static blk_status_t btree_csum_one_bio(struct bio *bio)
@@ -901,7 +732,7 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
{
/*
* when we're called for a write, we're already in the async
- * submission context. Just jump into btrfs_map_bio
+ * submission context. Just jump into btrfs_submit_bio.
*/
return btree_csum_one_bio(bio);
}
@@ -921,59 +752,56 @@ static bool should_async_write(struct btrfs_fs_info *fs_info,
void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_bio *bbio = btrfs_bio(bio);
blk_status_t ret;
+ bio->bi_opf |= REQ_META;
+
if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
- /*
- * called for a read, do the setup so that checksum validation
- * can happen in the async kernel threads
- */
- ret = btrfs_bio_wq_end_io(fs_info, bio,
- BTRFS_WQ_ENDIO_METADATA);
- if (!ret)
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- } else if (!should_async_write(fs_info, BTRFS_I(inode))) {
- ret = btree_csum_one_bio(bio);
- if (!ret)
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- } else {
- /*
- * kthread helpers are used to submit writes so that
- * checksumming can happen in parallel across all CPUs
- */
- ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
- btree_submit_bio_start);
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+ return;
}
+ /*
+ * Kthread helpers are used to submit writes so that checksumming can
+ * happen in parallel across all CPUs.
+ */
+ if (should_async_write(fs_info, BTRFS_I(inode)) &&
+ btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start))
+ return;
+
+ ret = btree_csum_one_bio(bio);
if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
+ btrfs_bio_end_io(bbio, ret);
+ return;
}
+
+ btrfs_submit_bio(fs_info, bio, mirror_num);
}
#ifdef CONFIG_MIGRATION
-static int btree_migratepage(struct address_space *mapping,
- struct page *newpage, struct page *page,
- enum migrate_mode mode)
+static int btree_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src, enum migrate_mode mode)
{
/*
* we can't safely write a btree page from here,
* we haven't done the locking hook
*/
- if (PageDirty(page))
+ if (folio_test_dirty(src))
return -EAGAIN;
/*
* Buffers may be managed in a filesystem specific way.
* We must have no buffers or drop them.
*/
- if (page_has_private(page) &&
- !try_to_release_page(page, GFP_KERNEL))
+ if (folio_get_private(src) &&
+ !filemap_release_folio(src, GFP_KERNEL))
return -EAGAIN;
- return migrate_page(mapping, newpage, page, mode);
+ return migrate_folio(mapping, dst, src, mode);
}
+#else
+#define btree_migrate_folio NULL
#endif
-
static int btree_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -1073,10 +901,8 @@ static const struct address_space_operations btree_aops = {
.writepages = btree_writepages,
.release_folio = btree_release_folio,
.invalidate_folio = btree_invalidate_folio,
-#ifdef CONFIG_MIGRATION
- .migratepage = btree_migratepage,
-#endif
- .dirty_folio = btree_dirty_folio,
+ .migrate_folio = btree_migrate_folio,
+ .dirty_folio = btree_dirty_folio,
};
struct extent_buffer *btrfs_find_create_tree_block(
@@ -1158,7 +984,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
- xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);
+ INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
btrfs_init_root_block_rsv(root);
@@ -1210,9 +1036,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&root->leak_list);
- spin_lock(&fs_info->fs_roots_lock);
+ spin_lock(&fs_info->fs_roots_radix_lock);
list_add_tail(&root->leak_list, &fs_info->allocated_roots);
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
#endif
}
@@ -1659,11 +1485,12 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
{
struct btrfs_root *root;
- spin_lock(&fs_info->fs_roots_lock);
- root = xa_load(&fs_info->fs_roots, (unsigned long)root_id);
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)root_id);
if (root)
root = btrfs_grab_root(root);
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
return root;
}
@@ -1692,6 +1519,9 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
if (objectid == BTRFS_UUID_TREE_OBJECTID)
return btrfs_grab_root(fs_info->uuid_root) ?
fs_info->uuid_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_BLOCK_GROUP_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->block_group_root) ?
+ fs_info->block_group_root : ERR_PTR(-ENOENT);
if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
struct btrfs_root *root = btrfs_global_root(fs_info, &key);
@@ -1705,14 +1535,20 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
{
int ret;
- spin_lock(&fs_info->fs_roots_lock);
- ret = xa_insert(&fs_info->fs_roots, (unsigned long)root->root_key.objectid,
- root, GFP_NOFS);
+ ret = radix_tree_preload(GFP_NOFS);
+ if (ret)
+ return ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ ret = radix_tree_insert(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ root);
if (ret == 0) {
btrfs_grab_root(root);
- set_bit(BTRFS_ROOT_REGISTERED, &root->state);
+ set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
}
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ radix_tree_preload_end();
return ret;
}
@@ -1864,7 +1700,7 @@ again:
fail:
/*
* If our caller provided us an anonymous device, then it's his
- * responsability to free it in case we fail. So we have to set our
+ * responsibility to free it in case we fail. So we have to set our
* root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
* and once again by our caller.
*/
@@ -1947,25 +1783,6 @@ struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
return root;
}
-/*
- * called by the kthread helper functions to finally call the bio end_io
- * functions. This is where read checksum verification actually happens
- */
-static void end_workqueue_fn(struct btrfs_work *work)
-{
- struct bio *bio;
- struct btrfs_end_io_wq *end_io_wq;
-
- end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
- bio = end_io_wq->bio;
-
- bio->bi_status = end_io_wq->status;
- bio->bi_private = end_io_wq->private;
- bio->bi_end_io = end_io_wq->end_io;
- bio_endio(bio);
- kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
-}
-
static int cleaner_kthread(void *arg)
{
struct btrfs_fs_info *fs_info = arg;
@@ -2161,14 +1978,7 @@ static void backup_super_roots(struct btrfs_fs_info *info)
btrfs_set_backup_chunk_root_level(root_backup,
btrfs_header_level(info->chunk_root->node));
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) {
- btrfs_set_backup_block_group_root(root_backup,
- info->block_group_root->node->start);
- btrfs_set_backup_block_group_root_gen(root_backup,
- btrfs_header_generation(info->block_group_root->node));
- btrfs_set_backup_block_group_root_level(root_backup,
- btrfs_header_level(info->block_group_root->node));
- } else {
+ if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) {
struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
@@ -2272,10 +2082,14 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->delalloc_workers);
btrfs_destroy_workqueue(fs_info->hipri_workers);
btrfs_destroy_workqueue(fs_info->workers);
- btrfs_destroy_workqueue(fs_info->endio_workers);
- btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+ if (fs_info->endio_workers)
+ destroy_workqueue(fs_info->endio_workers);
+ if (fs_info->endio_raid56_workers)
+ destroy_workqueue(fs_info->endio_raid56_workers);
if (fs_info->rmw_workers)
destroy_workqueue(fs_info->rmw_workers);
+ if (fs_info->compressed_write_workers)
+ destroy_workqueue(fs_info->compressed_write_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
btrfs_destroy_workqueue(fs_info->delayed_workers);
@@ -2289,8 +2103,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
* the queues used for metadata I/O, since tasks from those other work
* queues can do metadata I/O operations.
*/
- btrfs_destroy_workqueue(fs_info->endio_meta_workers);
- btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
+ if (fs_info->endio_meta_workers)
+ destroy_workqueue(fs_info->endio_meta_workers);
}
static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2342,9 +2156,9 @@ void btrfs_put_root(struct btrfs_root *root)
btrfs_drew_lock_destroy(&root->snapshot_lock);
free_root_extent_buffers(root);
#ifdef CONFIG_BTRFS_DEBUG
- spin_lock(&root->fs_info->fs_roots_lock);
+ spin_lock(&root->fs_info->fs_roots_radix_lock);
list_del_init(&root->leak_list);
- spin_unlock(&root->fs_info->fs_roots_lock);
+ spin_unlock(&root->fs_info->fs_roots_radix_lock);
#endif
kfree(root);
}
@@ -2352,21 +2166,28 @@ void btrfs_put_root(struct btrfs_root *root)
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *root;
- unsigned long index = 0;
+ int ret;
+ struct btrfs_root *gang[8];
+ int i;
while (!list_empty(&fs_info->dead_roots)) {
- root = list_entry(fs_info->dead_roots.next,
- struct btrfs_root, root_list);
- list_del(&root->root_list);
+ gang[0] = list_entry(fs_info->dead_roots.next,
+ struct btrfs_root, root_list);
+ list_del(&gang[0]->root_list);
- if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
- btrfs_drop_and_free_fs_root(fs_info, root);
- btrfs_put_root(root);
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
+ btrfs_drop_and_free_fs_root(fs_info, gang[0]);
+ btrfs_put_root(gang[0]);
}
- xa_for_each(&fs_info->fs_roots, index, root) {
- btrfs_drop_and_free_fs_root(fs_info, root);
+ while (1) {
+ ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, 0,
+ ARRAY_SIZE(gang));
+ if (!ret)
+ break;
+ for (i = 0; i < ret; i++)
+ btrfs_drop_and_free_fs_root(fs_info, gang[i]);
}
}
@@ -2395,6 +2216,8 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
{
struct inode *inode = fs_info->btree_inode;
+ unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID,
+ fs_info->tree_root);
inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
set_nlink(inode, 1);
@@ -2408,14 +2231,15 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
- IO_TREE_BTREE_INODE_IO, inode);
- BTRFS_I(inode)->io_tree.track_uptodate = false;
+ IO_TREE_BTREE_INODE_IO, NULL);
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
- memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
+ BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID;
+ BTRFS_I(inode)->location.type = 0;
+ BTRFS_I(inode)->location.offset = 0;
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
- btrfs_insert_inode_hash(inode);
+ __insert_inode_hash(inode, hash);
}
static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
@@ -2434,6 +2258,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
fs_info->qgroup_seq = 1;
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_rescan_running = false;
+ fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
mutex_init(&fs_info->qgroup_rescan_lock);
}
@@ -2462,25 +2287,18 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->fixup_workers =
btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
- /*
- * endios are largely parallel and should have a very
- * low idle thresh
- */
fs_info->endio_workers =
- btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
+ alloc_workqueue("btrfs-endio", flags, max_active);
fs_info->endio_meta_workers =
- btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
- max_active, 4);
- fs_info->endio_meta_write_workers =
- btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
- max_active, 2);
+ alloc_workqueue("btrfs-endio-meta", flags, max_active);
fs_info->endio_raid56_workers =
- btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
- max_active, 4);
+ alloc_workqueue("btrfs-endio-raid56", flags, max_active);
fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
fs_info->endio_write_workers =
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
max_active, 2);
+ fs_info->compressed_write_workers =
+ alloc_workqueue("btrfs-compressed-write", flags, max_active);
fs_info->endio_freespace_worker =
btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
max_active, 0);
@@ -2495,7 +2313,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
if (!(fs_info->workers && fs_info->hipri_workers &&
fs_info->delalloc_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
- fs_info->endio_meta_write_workers &&
+ fs_info->compressed_write_workers &&
fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->fixup_workers &&
@@ -2522,6 +2340,9 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
fs_info->csum_shash = csum_shash;
+ btrfs_info(fs_info, "using %s (%s) checksum algorithm",
+ btrfs_super_csum_name(csum_type),
+ crypto_shash_driver_name(csum_shash));
return 0;
}
@@ -2701,10 +2522,24 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
if (ret)
return ret;
- location.objectid = BTRFS_DEV_TREE_OBJECTID;
location.type = BTRFS_ROOT_ITEM_KEY;
location.offset = 0;
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
+ location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->block_group_root = root;
+ }
+ }
+
+ location.objectid = BTRFS_DEV_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
@@ -2716,7 +2551,9 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
fs_info->dev_root = root;
}
/* Initialize fs_info for all devices in any case */
- btrfs_init_devices_late(fs_info);
+ ret = btrfs_init_devices_late(fs_info);
+ if (ret)
+ goto out;
/*
* This tree can share blocks with some other fs tree during relocation
@@ -2772,8 +2609,8 @@ out:
* 1, 2 2nd and 3rd backup copy
* -1 skip bytenr check
*/
-static int validate_super(struct btrfs_fs_info *fs_info,
- struct btrfs_super_block *sb, int mirror_num)
+int btrfs_validate_super(struct btrfs_fs_info *fs_info,
+ struct btrfs_super_block *sb, int mirror_num)
{
u64 nodesize = btrfs_super_nodesize(sb);
u64 sectorsize = btrfs_super_sectorsize(sb);
@@ -2875,6 +2712,18 @@ static int validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
+ /*
+ * Artificial requirement for block-group-tree to force newer features
+ * (free-space-tree, no-holes) so the test matrix is smaller.
+ */
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
+ (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
+ !btrfs_fs_incompat(fs_info, NO_HOLES))) {
+ btrfs_err(fs_info,
+ "block-group-tree feature requires fres-space-tree and no-holes");
+ ret = -EINVAL;
+ }
+
if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
@@ -2957,7 +2806,7 @@ static int validate_super(struct btrfs_fs_info *fs_info,
*/
static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
{
- return validate_super(fs_info, fs_info->super_copy, 0);
+ return btrfs_validate_super(fs_info, fs_info->super_copy, 0);
}
/*
@@ -2971,7 +2820,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
{
int ret;
- ret = validate_super(fs_info, sb, -1);
+ ret = btrfs_validate_super(fs_info, sb, -1);
if (ret < 0)
goto out;
if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
@@ -3032,17 +2881,7 @@ static int load_important_roots(struct btrfs_fs_info *fs_info)
btrfs_warn(fs_info, "couldn't read tree root");
return ret;
}
-
- if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
- return 0;
-
- bytenr = btrfs_super_block_group_root(sb);
- gen = btrfs_super_block_group_root_generation(sb);
- level = btrfs_super_block_group_root_level(sb);
- ret = load_super_root(fs_info->block_group_root, bytenr, gen, level);
- if (ret)
- btrfs_warn(fs_info, "couldn't read block group root");
- return ret;
+ return 0;
}
static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
@@ -3054,16 +2893,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
int ret = 0;
int i;
- if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
- struct btrfs_root *root;
-
- root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID,
- GFP_KERNEL);
- if (!root)
- return -ENOMEM;
- fs_info->block_group_root = root;
- }
-
for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
if (handle_error) {
if (!IS_ERR(tree_root->node))
@@ -3134,8 +2963,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
- xa_init_flags(&fs_info->fs_roots, GFP_ATOMIC);
- xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC);
+ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+ INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
@@ -3143,7 +2972,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->caching_block_groups);
spin_lock_init(&fs_info->delalloc_root_lock);
spin_lock_init(&fs_info->trans_lock);
- spin_lock_init(&fs_info->fs_roots_lock);
+ spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->super_lock);
@@ -3162,6 +2991,19 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
mutex_init(&fs_info->zoned_data_reloc_io_lock);
seqlock_init(&fs_info->profiles_lock);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered);
+ btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start,
+ BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked,
+ BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed,
+ BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed,
+ BTRFS_LOCKDEP_TRANS_COMPLETED);
+
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
@@ -3247,6 +3089,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->sectorsize_bits = ilog2(4096);
fs_info->stripesize = 4096;
+ fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
+
spin_lock_init(&fs_info->swapfile_pins_lock);
fs_info->swapfile_pins = RB_ROOT;
@@ -3374,7 +3218,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
/*
* btrfs_find_orphan_roots() is responsible for finding all the dead
* roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
- * them into the fs_info->fs_roots. This must be done before
+ * them into the fs_info->fs_roots_radix tree. This must be done before
* calling btrfs_orphan_cleanup() on the tree root. If we don't do it
* first, then btrfs_orphan_cleanup() will delete a dead root's orphan
* item before the root's tree is deleted - this means that if we unmount
@@ -3449,6 +3293,112 @@ out:
return ret;
}
+/*
+ * Do various sanity and dependency checks of different features.
+ *
+ * This is the place for less strict checks (like for subpage or artificial
+ * feature dependencies).
+ *
+ * For strict checks or possible corruption detection, see
+ * btrfs_validate_super().
+ *
+ * This should be called after btrfs_parse_options(), as some mount options
+ * (space cache related) can modify on-disk format like free space tree and
+ * screw up certain feature dependencies.
+ */
+int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb)
+{
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ u64 incompat = btrfs_super_incompat_flags(disk_super);
+ const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super);
+ const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP);
+
+ if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
+ btrfs_err(fs_info,
+ "cannot mount because of unknown incompat features (0x%llx)",
+ incompat);
+ return -EINVAL;
+ }
+
+ /* Runtime limitation for mixed block groups. */
+ if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+ (fs_info->sectorsize != fs_info->nodesize)) {
+ btrfs_err(fs_info,
+"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
+ fs_info->nodesize, fs_info->sectorsize);
+ return -EINVAL;
+ }
+
+ /* Mixed backref is an always-enabled feature. */
+ incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+
+ /* Set compression related flags just in case. */
+ if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
+ incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+ else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
+ incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
+
+ /*
+ * An ancient flag, which should really be marked deprecated.
+ * Such a runtime limitation doesn't really need an incompat flag.
+ */
+ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
+ incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+
+ if (compat_ro_unsupp && !sb_rdonly(sb)) {
+ btrfs_err(fs_info,
+ "cannot mount read-write because of unknown compat_ro features (0x%llx)",
+ compat_ro);
+ return -EINVAL;
+ }
+
+ /*
+ * If we have unsupported RO compat features, then even though we are
+ * mounted read-only we must not cause any metadata writes, including
+ * log replay, or we could break whatever the new feature requires.
+ */
+ if (compat_ro_unsupp && btrfs_super_log_root(disk_super) &&
+ !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
+ btrfs_err(fs_info,
+"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
+ compat_ro);
+ return -EINVAL;
+ }
+
+ /*
+ * Artificial limitations for block group tree, to force
+ * block-group-tree to rely on no-holes and free-space-tree.
+ */
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
+ (!btrfs_fs_incompat(fs_info, NO_HOLES) ||
+ !btrfs_test_opt(fs_info, FREE_SPACE_TREE))) {
+ btrfs_err(fs_info,
+"block-group-tree feature requires no-holes and free-space-tree features");
+ return -EINVAL;
+ }
+
+ /*
+ * Subpage runtime limitation on v1 cache.
+ *
+ * V1 space cache still has some hard coded PAGE_SIZE usage, while
+ * we're already defaulting to the v2 cache; no need to bother with v1
+ * as it's going to be deprecated anyway.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ btrfs_warn(fs_info,
+ "v1 space cache is not supported for page size %lu with sectorsize %u",
+ PAGE_SIZE, fs_info->sectorsize);
+ return -EINVAL;
+ }
+
+ /* This can be called by remount, we need to protect the super block. */
+ spin_lock(&fs_info->super_lock);
+ btrfs_set_super_incompat_flags(disk_super, incompat);
+ spin_unlock(&fs_info->super_lock);
+
+ return 0;
+}
+
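A standalone sketch of the mask logic applied by btrfs_check_features() above: any unknown incompat bit refuses the mount outright, while unknown compat_ro bits only refuse a read-write mount. The constants and the can_mount() helper are hypothetical, not the real btrfs feature bits.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical supported-feature masks, standing in for the *_SUPP macros. */
#define FEAT_INCOMPAT_SUPP	0x07ULL
#define FEAT_COMPAT_RO_SUPP	0x03ULL

static bool can_mount(uint64_t incompat, uint64_t compat_ro, bool rdonly)
{
	/* Unknown incompat bits: refuse the mount entirely. */
	if (incompat & ~FEAT_INCOMPAT_SUPP)
		return false;
	/*
	 * Unknown compat_ro bits: the fs is still safe to read, so only a
	 * read-write mount is refused.
	 */
	if ((compat_ro & ~FEAT_COMPAT_RO_SUPP) && !rdonly)
		return false;
	return true;
}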
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
char *options)
{
@@ -3529,7 +3479,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
* We want to check superblock checksum, the type is stored inside.
* Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
*/
- if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) {
+ if (btrfs_check_super_csum(fs_info, disk_super)) {
btrfs_err(fs_info, "superblock checksum mismatch");
err = -EINVAL;
btrfs_release_disk_super(disk_super);
@@ -3578,16 +3528,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
*/
fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
- /*
- * Flag our filesystem as having big metadata blocks if they are bigger
- * than the page size.
- */
- if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
- if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
- btrfs_info(fs_info,
- "flagging fs with big metadata feature");
- features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
- }
/* Set up fs_info before parsing mount options */
nodesize = btrfs_super_nodesize(disk_super);
@@ -3608,51 +3548,9 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
- features = btrfs_super_incompat_flags(disk_super) &
- ~BTRFS_FEATURE_INCOMPAT_SUPP;
- if (features) {
- btrfs_err(fs_info,
- "cannot mount because of unsupported optional features (0x%llx)",
- features);
- err = -EINVAL;
- goto fail_alloc;
- }
-
- features = btrfs_super_incompat_flags(disk_super);
- features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
- if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
- features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
- else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
- features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
-
- if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
- btrfs_info(fs_info, "has skinny extents");
-
- /*
- * mixed block groups end up with duplicate but slightly offset
- * extent buffers for the same range. It leads to corruptions
- */
- if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
- (sectorsize != nodesize)) {
- btrfs_err(fs_info,
-"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
- nodesize, sectorsize);
- goto fail_alloc;
- }
-
- /*
- * Needn't use the lock because there is no other task which will
- * update the flag.
- */
- btrfs_set_super_incompat_flags(disk_super, features);
-
- features = btrfs_super_compat_ro_flags(disk_super) &
- ~BTRFS_FEATURE_COMPAT_RO_SUPP;
- if (!sb_rdonly(sb) && features) {
- btrfs_err(fs_info,
- "cannot mount read-write because of unsupported optional features (0x%llx)",
- features);
- err = -EINVAL;
+ ret = btrfs_check_features(fs_info, sb);
+ if (ret < 0) {
+ err = ret;
goto fail_alloc;
}
@@ -3995,7 +3893,7 @@ static void btrfs_end_super_write(struct bio *bio)
}
struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
- int copy_num)
+ int copy_num, bool drop_cache)
{
struct btrfs_super_block *super;
struct page *page;
@@ -4013,6 +3911,19 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
return ERR_PTR(-EINVAL);
+ if (drop_cache) {
+ /* This should only be called with the primary sb. */
+ ASSERT(copy_num == 0);
+
+ /*
+ * Drop the page of the primary superblock, so later read will
+ * always read from the device.
+ */
+ invalidate_inode_pages2_range(mapping,
+ bytenr >> PAGE_SHIFT,
+ (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
+ }
+
page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
if (IS_ERR(page))
return ERR_CAST(page);
@@ -4044,7 +3955,7 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
for (i = 0; i < 1; i++) {
- super = btrfs_read_dev_one_super(bdev, i);
+ super = btrfs_read_dev_one_super(bdev, i, false);
if (IS_ERR(super))
continue;
@@ -4499,11 +4410,12 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
{
bool drop_ref = false;
- spin_lock(&fs_info->fs_roots_lock);
- xa_erase(&fs_info->fs_roots, (unsigned long)root->root_key.objectid);
- if (test_and_clear_bit(BTRFS_ROOT_REGISTERED, &root->state))
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ radix_tree_delete(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid);
+ if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
drop_ref = true;
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
if (BTRFS_FS_ERROR(fs_info)) {
ASSERT(root->log_root == NULL);
@@ -4519,48 +4431,50 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *roots[8];
- unsigned long index = 0;
- int i;
+ u64 root_objectid = 0;
+ struct btrfs_root *gang[8];
+ int i = 0;
int err = 0;
- int grabbed;
+ unsigned int ret = 0;
while (1) {
- struct btrfs_root *root;
-
- spin_lock(&fs_info->fs_roots_lock);
- if (!xa_find(&fs_info->fs_roots, &index, ULONG_MAX, XA_PRESENT)) {
- spin_unlock(&fs_info->fs_roots_lock);
- return err;
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, root_objectid,
+ ARRAY_SIZE(gang));
+ if (!ret) {
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ break;
}
+ root_objectid = gang[ret - 1]->root_key.objectid + 1;
- grabbed = 0;
- xa_for_each_start(&fs_info->fs_roots, index, root, index) {
- /* Avoid grabbing roots in dead_roots */
- if (btrfs_root_refs(&root->root_item) > 0)
- roots[grabbed++] = btrfs_grab_root(root);
- if (grabbed >= ARRAY_SIZE(roots))
- break;
+ for (i = 0; i < ret; i++) {
+ /* Avoid grabbing roots in dead_roots */
+ if (btrfs_root_refs(&gang[i]->root_item) == 0) {
+ gang[i] = NULL;
+ continue;
+ }
+ /* Grab all the search results for later use */
+ gang[i] = btrfs_grab_root(gang[i]);
}
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
- for (i = 0; i < grabbed; i++) {
- if (!roots[i])
+ for (i = 0; i < ret; i++) {
+ if (!gang[i])
continue;
- index = roots[i]->root_key.objectid;
- err = btrfs_orphan_cleanup(roots[i]);
+ root_objectid = gang[i]->root_key.objectid;
+ err = btrfs_orphan_cleanup(gang[i]);
if (err)
- goto out;
- btrfs_put_root(roots[i]);
+ break;
+ btrfs_put_root(gang[i]);
}
- index++;
+ root_objectid++;
}
-out:
- /* Release the roots that remain uncleaned due to error */
- for (; i < grabbed; i++) {
- if (roots[i])
- btrfs_put_root(roots[i]);
+ /* release the uncleaned roots due to error */
+ for (; i < ret; i++) {
+ if (gang[i])
+ btrfs_put_root(gang[i]);
}
return err;
}
@@ -4634,6 +4548,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
/*
+ * If we had UNFINISHED_DROPS we could still be processing them, so
+ * clear that bit and wake up relocation so it can stop.
+ * We must do this before stopping the block group reclaim task, because
+ * at btrfs_relocate_block_group() we wait for this bit, and after the
+ * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we
+ * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will
+ * return 1.
+ */
+ btrfs_wake_unfinished_drop(fs_info);
+
+ /*
* We may have the reclaim task running and relocating a data block group,
* in which case it may create delayed iputs. So stop it before we park
* the cleaner kthread otherwise we can get new delayed iputs after
@@ -4651,12 +4576,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
*/
kthread_park(fs_info->cleaner_kthread);
- /*
- * If we had UNFINISHED_DROPS we could still be processing them, so
- * clear that bit and wake up relocation so it can stop.
- */
- btrfs_wake_unfinished_drop(fs_info);
-
/* wait for the qgroup rescan worker to stop */
btrfs_qgroup_wait_for_completion(fs_info, false);
@@ -4679,6 +4598,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
/* clear out the rbtree of defraggable inodes */
btrfs_cleanup_defrag_inodes(fs_info);
+ /*
+ * After we parked the cleaner kthread, ordered extents may have
+ * completed and created new delayed iputs. If one of the async reclaim
+ * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
+ * can hang forever trying to stop it, because if a delayed iput is
+ * added after it ran btrfs_run_delayed_iputs() and before it called
+ * btrfs_wait_on_delayed_iputs(), it will hang forever since there is
+ * no one else to run iputs.
+ *
+ * So wait for all ongoing ordered extents to complete and then run
+ * delayed iputs. This works because once we reach this point no one
+ * can either create new ordered extents nor create delayed iputs
+ * through some other means.
+ *
+ * Also note that btrfs_wait_ordered_roots() is not safe here, because
+ * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
+ * but the delayed iput for the respective inode is made only when doing
+ * the final btrfs_put_ordered_extent() (which must happen at
+ * btrfs_finish_ordered_io() when we are unmounting).
+ */
+ btrfs_flush_workqueue(fs_info->endio_write_workers);
+ /* Ordered extents for free space inodes. */
+ btrfs_flush_workqueue(fs_info->endio_freespace_worker);
+ btrfs_run_delayed_iputs(fs_info);
+
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);
@@ -4879,28 +4823,31 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
{
- unsigned long index = 0;
- int grabbed = 0;
- struct btrfs_root *roots[8];
+ struct btrfs_root *gang[8];
+ u64 root_objectid = 0;
+ int ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, root_objectid,
+ ARRAY_SIZE(gang))) != 0) {
+ int i;
- spin_lock(&fs_info->fs_roots_lock);
- while ((grabbed = xa_extract(&fs_info->fs_roots, (void **)roots, index,
- ULONG_MAX, 8, XA_PRESENT))) {
- for (int i = 0; i < grabbed; i++)
- roots[i] = btrfs_grab_root(roots[i]);
- spin_unlock(&fs_info->fs_roots_lock);
+ for (i = 0; i < ret; i++)
+ gang[i] = btrfs_grab_root(gang[i]);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
- for (int i = 0; i < grabbed; i++) {
- if (!roots[i])
+ for (i = 0; i < ret; i++) {
+ if (!gang[i])
continue;
- index = roots[i]->root_key.objectid;
- btrfs_free_log(NULL, roots[i]);
- btrfs_put_root(roots[i]);
+ root_objectid = gang[i]->root_key.objectid;
+ btrfs_free_log(NULL, gang[i]);
+ btrfs_put_root(gang[i]);
}
- index++;
- spin_lock(&fs_info->fs_roots_lock);
+ root_objectid++;
+ spin_lock(&fs_info->fs_roots_radix_lock);
}
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
btrfs_free_log_root_tree(NULL, fs_info);
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 4ee8c42c9f78..9fa923e005a3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -17,13 +17,6 @@
*/
#define BTRFS_BDEV_BLOCKSIZE (4096)
-enum btrfs_wq_endio_type {
- BTRFS_WQ_ENDIO_DATA,
- BTRFS_WQ_ENDIO_METADATA,
- BTRFS_WQ_ENDIO_FREE_SPACE,
- BTRFS_WQ_ENDIO_RAID56,
-};
-
static inline u64 btrfs_sb_offset(int mirror)
{
u64 start = SZ_16K;
@@ -49,14 +42,19 @@ struct extent_buffer *btrfs_find_create_tree_block(
void btrfs_clean_tree_block(struct extent_buffer *buf);
void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info);
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
+int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
+ const struct btrfs_super_block *disk_sb);
int __cold open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options);
void __cold close_ctree(struct btrfs_fs_info *fs_info);
+int btrfs_validate_super(struct btrfs_fs_info *fs_info,
+ struct btrfs_super_block *sb, int mirror_num);
+int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
- int copy_num);
+ int copy_num, bool drop_cache);
int btrfs_commit_super(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
struct btrfs_key *key);
@@ -110,7 +108,7 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
{
- if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
return fs_info->block_group_root;
return btrfs_extent_root(fs_info, 0);
}
@@ -121,11 +119,9 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid,
int level, struct btrfs_key *first_key);
-blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- enum btrfs_wq_endio_type metadata);
-blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
- int mirror_num, u64 dio_file_offset,
- extent_submit_bio_start_t *submit_bio_start);
+bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
+ u64 dio_file_offset,
+ extent_submit_bio_start_t *submit_bio_start);
blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
int mirror_num);
int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
@@ -145,17 +141,5 @@ int btree_lock_page_hook(struct page *page, void *data,
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid);
int btrfs_init_root_free_objectid(struct btrfs_root *root);
-int __init btrfs_end_io_wq_init(void);
-void __cold btrfs_end_io_wq_exit(void);
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-void btrfs_set_buffer_lockdep_class(u64 objectid,
- struct extent_buffer *eb, int level);
-#else
-static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
- struct extent_buffer *eb, int level)
-{
-}
-#endif
#endif
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1d4c2397d0d6..fab7eb76e53b 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -58,7 +58,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
}
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation,
+ u64 root_objectid, u64 generation,
int check_generation)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
index f32f4113c976..5afb7ca42828 100644
--- a/fs/btrfs/export.h
+++ b/fs/btrfs/export.h
@@ -19,7 +19,7 @@ struct btrfs_fid {
} __attribute__ ((packed));
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation,
+ u64 root_objectid, u64 generation,
int check_generation);
struct dentry *btrfs_get_parent(struct dentry *child);
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
new file mode 100644
index 000000000000..83cb0378096f
--- /dev/null
+++ b/fs/btrfs/extent-io-tree.c
@@ -0,0 +1,1674 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/slab.h>
+#include <trace/events/btrfs.h>
+#include "ctree.h"
+#include "extent-io-tree.h"
+#include "btrfs_inode.h"
+#include "misc.h"
+
+static struct kmem_cache *extent_state_cache;
+
+static inline bool extent_state_in_tree(const struct extent_state *state)
+{
+ return !RB_EMPTY_NODE(&state->rb_node);
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+static LIST_HEAD(states);
+static DEFINE_SPINLOCK(leak_lock);
+
+static inline void btrfs_leak_debug_add_state(struct extent_state *state)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&leak_lock, flags);
+ list_add(&state->leak_list, &states);
+ spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline void btrfs_leak_debug_del_state(struct extent_state *state)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&leak_lock, flags);
+ list_del(&state->leak_list);
+ spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline void btrfs_extent_state_leak_debug_check(void)
+{
+ struct extent_state *state;
+
+ while (!list_empty(&states)) {
+ state = list_entry(states.next, struct extent_state, leak_list);
+ pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
+ state->start, state->end, state->state,
+ extent_state_in_tree(state),
+ refcount_read(&state->refs));
+ list_del(&state->leak_list);
+ kmem_cache_free(extent_state_cache, state);
+ }
+}
+
+#define btrfs_debug_check_extent_io_range(tree, start, end) \
+ __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
+static inline void __btrfs_debug_check_extent_io_range(const char *caller,
+ struct extent_io_tree *tree,
+ u64 start, u64 end)
+{
+ struct inode *inode = tree->private_data;
+ u64 isize;
+
+ if (!inode)
+ return;
+
+ isize = i_size_read(inode);
+ if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+ btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+ "%s: ino %llu isize %llu odd range [%llu,%llu]",
+ caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
+ }
+}
+#else
+#define btrfs_leak_debug_add_state(state) do {} while (0)
+#define btrfs_leak_debug_del_state(state) do {} while (0)
+#define btrfs_extent_state_leak_debug_check() do {} while (0)
+#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
+#endif
+
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we lookup and
+ * update the disk_i_size, but lockdep will complain because for our io_tree
+ * we hold the tree lock and then take the inode lock when setting delalloc.
+ * These two things
+ * are unrelated, so make a class for the file_extent_tree so we don't get the
+ * two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
+struct tree_entry {
+ u64 start;
+ u64 end;
+ struct rb_node rb_node;
+};
+
+void extent_io_tree_init(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *tree, unsigned int owner,
+ void *private_data)
+{
+ tree->fs_info = fs_info;
+ tree->state = RB_ROOT;
+ spin_lock_init(&tree->lock);
+ tree->private_data = private_data;
+ tree->owner = owner;
+ if (owner == IO_TREE_INODE_FILE_EXTENT)
+ lockdep_set_class(&tree->lock, &file_extent_tree_class);
+}
+
+void extent_io_tree_release(struct extent_io_tree *tree)
+{
+ spin_lock(&tree->lock);
+ /*
+ * Do a single barrier for the waitqueue_active check here, as the state
+ * of the waitqueue should not change once extent_io_tree_release is
+ * called.
+ */
+ smp_mb();
+ while (!RB_EMPTY_ROOT(&tree->state)) {
+ struct rb_node *node;
+ struct extent_state *state;
+
+ node = rb_first(&tree->state);
+ state = rb_entry(node, struct extent_state, rb_node);
+ rb_erase(&state->rb_node, &tree->state);
+ RB_CLEAR_NODE(&state->rb_node);
+ /*
+ * btree io trees aren't supposed to have tasks waiting for
+ * changes in the flags of extent states ever.
+ */
+ ASSERT(!waitqueue_active(&state->wq));
+ free_extent_state(state);
+
+ cond_resched_lock(&tree->lock);
+ }
+ spin_unlock(&tree->lock);
+}
+
+static struct extent_state *alloc_extent_state(gfp_t mask)
+{
+ struct extent_state *state;
+
+ /*
+ * The given mask might not be appropriate for the slab allocator, so
+ * drop the unsupported bits.
+ */
+ mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
+ state = kmem_cache_alloc(extent_state_cache, mask);
+ if (!state)
+ return state;
+ state->state = 0;
+ RB_CLEAR_NODE(&state->rb_node);
+ btrfs_leak_debug_add_state(state);
+ refcount_set(&state->refs, 1);
+ init_waitqueue_head(&state->wq);
+ trace_alloc_extent_state(state, mask, _RET_IP_);
+ return state;
+}
+
+static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc)
+{
+ if (!prealloc)
+ prealloc = alloc_extent_state(GFP_ATOMIC);
+
+ return prealloc;
+}
+
+void free_extent_state(struct extent_state *state)
+{
+ if (!state)
+ return;
+ if (refcount_dec_and_test(&state->refs)) {
+ WARN_ON(extent_state_in_tree(state));
+ btrfs_leak_debug_del_state(state);
+ trace_free_extent_state(state, _RET_IP_);
+ kmem_cache_free(extent_state_cache, state);
+ }
+}
+
+static int add_extent_changeset(struct extent_state *state, u32 bits,
+ struct extent_changeset *changeset,
+ int set)
+{
+ int ret;
+
+ if (!changeset)
+ return 0;
+ if (set && (state->state & bits) == bits)
+ return 0;
+ if (!set && (state->state & bits) == 0)
+ return 0;
+ changeset->bytes_changed += state->end - state->start + 1;
+ ret = ulist_add(&changeset->range_changed, state->start, state->end,
+ GFP_ATOMIC);
+ return ret;
+}
+
+static inline struct extent_state *next_state(struct extent_state *state)
+{
+ struct rb_node *next = rb_next(&state->rb_node);
+
+ if (next)
+ return rb_entry(next, struct extent_state, rb_node);
+ else
+ return NULL;
+}
+
+static inline struct extent_state *prev_state(struct extent_state *state)
+{
+ struct rb_node *next = rb_prev(&state->rb_node);
+
+ if (next)
+ return rb_entry(next, struct extent_state, rb_node);
+ else
+ return NULL;
+}
+
+/*
+ * Search @tree for an entry that contains @offset. Such an entry would have
+ * entry->start <= offset && entry->end >= offset.
+ *
+ * @tree: the tree to search
+ * @offset: offset that should fall within an entry in @tree
+ * @node_ret: pointer where new node should be anchored (used when inserting an
+ * entry in the tree)
+ * @parent_ret: points to entry which would have been the parent of the entry,
+ * containing @offset
+ *
+ * Return a pointer to the entry that contains @offset byte address and don't change
+ * @node_ret and @parent_ret.
+ *
+ * If no such entry exists, return a pointer to the first entry that ends after
+ * @offset (or NULL if there is none) and fill @node_ret and @parent_ret.
+ */
+static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree,
+ u64 offset,
+ struct rb_node ***node_ret,
+ struct rb_node **parent_ret)
+{
+ struct rb_root *root = &tree->state;
+ struct rb_node **node = &root->rb_node;
+ struct rb_node *prev = NULL;
+ struct extent_state *entry = NULL;
+
+ while (*node) {
+ prev = *node;
+ entry = rb_entry(prev, struct extent_state, rb_node);
+
+ if (offset < entry->start)
+ node = &(*node)->rb_left;
+ else if (offset > entry->end)
+ node = &(*node)->rb_right;
+ else
+ return entry;
+ }
+
+ if (node_ret)
+ *node_ret = node;
+ if (parent_ret)
+ *parent_ret = prev;
+
+ /* Search neighbors until we find the first one past the end */
+ while (entry && offset > entry->end)
+ entry = next_state(entry);
+
+ return entry;
+}
+
+/*
+ * Search offset in the tree or fill neighbor rbtree node pointers.
+ *
+ * @tree: the tree to search
+ * @offset: offset that should fall within an entry in @tree
+ * @next_ret: pointer to the first entry whose range ends after @offset
+ * @prev_ret: pointer to the first entry whose range begins before @offset
+ *
+ * Return a pointer to the entry that contains @offset byte address. If no
+ * such entry exists, then return NULL and fill @prev_ret and @next_ret.
+ * Otherwise return the found entry and other pointers are left untouched.
+ */
+static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree,
+ u64 offset,
+ struct extent_state **prev_ret,
+ struct extent_state **next_ret)
+{
+ struct rb_root *root = &tree->state;
+ struct rb_node **node = &root->rb_node;
+ struct extent_state *orig_prev;
+ struct extent_state *entry = NULL;
+
+ ASSERT(prev_ret);
+ ASSERT(next_ret);
+
+ while (*node) {
+ entry = rb_entry(*node, struct extent_state, rb_node);
+
+ if (offset < entry->start)
+ node = &(*node)->rb_left;
+ else if (offset > entry->end)
+ node = &(*node)->rb_right;
+ else
+ return entry;
+ }
+
+ orig_prev = entry;
+ while (entry && offset > entry->end)
+ entry = next_state(entry);
+ *next_ret = entry;
+ entry = orig_prev;
+
+ while (entry && offset < entry->start)
+ entry = prev_state(entry);
+ *prev_ret = entry;
+
+ return NULL;
+}
+
+/*
+ * Inexact rb-tree search, return the next entry if @offset is not found
+ */
+static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset)
+{
+ return tree_search_for_insert(tree, offset, NULL, NULL);
+}
+
+static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+{
+ btrfs_panic(tree->fs_info, err,
+ "locking error: extent tree was modified by another thread while locked");
+}
+
+/*
+ * Utility function to look for merge candidates inside a given range. Any
+ * extents with matching state are merged together into a single extent in the
+ * tree. Extents with EXTENT_LOCKED or EXTENT_BOUNDARY in their state field
+ * are not merged because the end_io handlers need to be able to do operations
+ * on them without sleeping (or doing allocations/splits).
+ *
+ * This should be called with the tree lock held.
+ */
+static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
+{
+ struct extent_state *other;
+
+ if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
+ return;
+
+ other = prev_state(state);
+ if (other && other->end == state->start - 1 &&
+ other->state == state->state) {
+ if (tree->private_data)
+ btrfs_merge_delalloc_extent(tree->private_data,
+ state, other);
+ state->start = other->start;
+ rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
+ free_extent_state(other);
+ }
+ other = next_state(state);
+ if (other && other->start == state->end + 1 &&
+ other->state == state->state) {
+ if (tree->private_data)
+ btrfs_merge_delalloc_extent(tree->private_data, state,
+ other);
+ state->end = other->end;
+ rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
+ free_extent_state(other);
+ }
+}
+
+static void set_state_bits(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits, struct extent_changeset *changeset)
+{
+ u32 bits_to_set = bits & ~EXTENT_CTLBITS;
+ int ret;
+
+ if (tree->private_data)
+ btrfs_set_delalloc_extent(tree->private_data, state, bits);
+
+ ret = add_extent_changeset(state, bits_to_set, changeset, 1);
+ BUG_ON(ret < 0);
+ state->state |= bits_to_set;
+}
+
+/*
+ * Insert an extent_state struct into the tree. 'bits' are set on the
+ * struct before it is inserted.
+ *
+ * This may return -EEXIST if the extent is already there, in which case the
+ * state struct is freed.
+ *
+ * The tree lock is not taken internally. This is a utility function and
+ * probably isn't what you want to call (see set/clear_extent_bit).
+ */
+static int insert_state(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits, struct extent_changeset *changeset)
+{
+ struct rb_node **node;
+ struct rb_node *parent;
+ const u64 end = state->end;
+
+ set_state_bits(tree, state, bits, changeset);
+
+ node = &tree->state.rb_node;
+ while (*node) {
+ struct extent_state *entry;
+
+ parent = *node;
+ entry = rb_entry(parent, struct extent_state, rb_node);
+
+ if (end < entry->start) {
+ node = &(*node)->rb_left;
+ } else if (end > entry->end) {
+ node = &(*node)->rb_right;
+ } else {
+ btrfs_err(tree->fs_info,
+ "found node %llu %llu on insert of %llu %llu",
+ entry->start, entry->end, state->start, end);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&state->rb_node, parent, node);
+ rb_insert_color(&state->rb_node, &tree->state);
+
+ merge_state(tree, state);
+ return 0;
+}
+
+/*
+ * Insert state to @tree to the location given by @node and @parent.
+ */
+static void insert_state_fast(struct extent_io_tree *tree,
+ struct extent_state *state, struct rb_node **node,
+ struct rb_node *parent, unsigned bits,
+ struct extent_changeset *changeset)
+{
+ set_state_bits(tree, state, bits, changeset);
+ rb_link_node(&state->rb_node, parent, node);
+ rb_insert_color(&state->rb_node, &tree->state);
+ merge_state(tree, state);
+}
+
+/*
+ * Split a given extent state struct in two, inserting the preallocated
+ * struct 'prealloc' as the newly created second half. 'split' indicates an
+ * offset inside 'orig' where it should be split.
+ *
+ * Before calling,
+ * the tree has 'orig' at [orig->start, orig->end]. After calling, there
+ * are two extent state structs in the tree:
+ * prealloc: [orig->start, split - 1]
+ * orig: [ split, orig->end ]
+ *
+ * The tree locks are not taken by this function. They need to be held
+ * by the caller.
+ */
+static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
+ struct extent_state *prealloc, u64 split)
+{
+ struct rb_node *parent = NULL;
+ struct rb_node **node;
+
+ if (tree->private_data)
+ btrfs_split_delalloc_extent(tree->private_data, orig, split);
+
+ prealloc->start = orig->start;
+ prealloc->end = split - 1;
+ prealloc->state = orig->state;
+ orig->start = split;
+
+ parent = &orig->rb_node;
+ node = &parent;
+ while (*node) {
+ struct extent_state *entry;
+
+ parent = *node;
+ entry = rb_entry(parent, struct extent_state, rb_node);
+
+ if (prealloc->end < entry->start) {
+ node = &(*node)->rb_left;
+ } else if (prealloc->end > entry->end) {
+ node = &(*node)->rb_right;
+ } else {
+ free_extent_state(prealloc);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&prealloc->rb_node, parent, node);
+ rb_insert_color(&prealloc->rb_node, &tree->state);
+
+ return 0;
+}
+
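A toy userspace check of the split invariant documented above, with arbitrary numbers and a hypothetical helper (not kernel code): splitting [0, 8191] at 4096 must leave prealloc = [0, 4095] and orig = [4096, 8191].

#include <assert.h>
#include <stdint.h>

struct range { uint64_t start, end; };

/* Mirrors the split rule: prealloc takes [orig->start, split - 1]. */
static void split_range(struct range *orig, struct range *prealloc, uint64_t split)
{
	prealloc->start = orig->start;
	prealloc->end = split - 1;
	orig->start = split;
}

int main(void)
{
	struct range orig = { 0, 8191 };
	struct range pre;

	split_range(&orig, &pre, 4096);
	assert(pre.start == 0 && pre.end == 4095);
	assert(orig.start == 4096 && orig.end == 8191);
	return 0;
}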
+/*
+ * Utility function to clear some bits in an extent state struct. It will
+ * optionally wake up anyone waiting on this state (wake == 1).
+ *
+ * If no bits are set on the state struct after clearing things, the
+ * struct is freed and removed from the tree
+ */
+static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits, int wake,
+ struct extent_changeset *changeset)
+{
+ struct extent_state *next;
+ u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
+ int ret;
+
+ if (tree->private_data)
+ btrfs_clear_delalloc_extent(tree->private_data, state, bits);
+
+ ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
+ BUG_ON(ret < 0);
+ state->state &= ~bits_to_clear;
+ if (wake)
+ wake_up(&state->wq);
+ if (state->state == 0) {
+ next = next_state(state);
+ if (extent_state_in_tree(state)) {
+ rb_erase(&state->rb_node, &tree->state);
+ RB_CLEAR_NODE(&state->rb_node);
+ free_extent_state(state);
+ } else {
+ WARN_ON(1);
+ }
+ } else {
+ merge_state(tree, state);
+ next = next_state(state);
+ }
+ return next;
+}
+
+/*
+ * Clear some bits on a range in the tree. This may require splitting or
+ * inserting elements in the tree, so the gfp mask is used to indicate which
+ * allocations or sleeping are allowed.
+ *
+ * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given
+ * range from the tree regardless of state (ie for truncate).
+ *
+ * The range [start, end] is inclusive.
+ *
+ * This takes the tree lock, and returns 0 on success and < 0 on error.
+ */
+int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state,
+ gfp_t mask, struct extent_changeset *changeset)
+{
+ struct extent_state *state;
+ struct extent_state *cached;
+ struct extent_state *prealloc = NULL;
+ u64 last_end;
+ int err;
+ int clear = 0;
+ int wake;
+ int delete = (bits & EXTENT_CLEAR_ALL_BITS);
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
+
+ if (delete)
+ bits |= ~EXTENT_CTLBITS;
+
+ if (bits & EXTENT_DELALLOC)
+ bits |= EXTENT_NORESERVE;
+
+ wake = (bits & EXTENT_LOCKED) ? 1 : 0;
+ if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
+ clear = 1;
+again:
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
+ /*
+ * Don't care about allocation failure here because we might end
+ * up not needing the pre-allocated extent state at all, which
+ * is the case if we only have in the tree extent states that
+ * cover our input range and don't cover any other range.
+ * If we end up needing a new extent state we allocate it later.
+ */
+ prealloc = alloc_extent_state(mask);
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state) {
+ cached = *cached_state;
+
+ if (clear) {
+ *cached_state = NULL;
+ cached_state = NULL;
+ }
+
+ if (cached && extent_state_in_tree(cached) &&
+ cached->start <= start && cached->end > start) {
+ if (clear)
+ refcount_dec(&cached->refs);
+ state = cached;
+ goto hit_next;
+ }
+ if (clear)
+ free_extent_state(cached);
+ }
+
+ /* This search will find the extents that end after our range starts. */
+ state = tree_search(tree, start);
+ if (!state)
+ goto out;
+hit_next:
+ if (state->start > end)
+ goto out;
+ WARN_ON(state->end < start);
+ last_end = state->end;
+
+ /* The state doesn't have the wanted bits, go ahead. */
+ if (!(state->state & bits)) {
+ state = next_state(state);
+ goto next;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state | or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on second
+ * half.
+ *
+ * If the extent we found extends past our range, we just split and
+ * search again. It'll get split again the next time though.
+ *
+ * If the extent we found is inside our range, we clear the desired bit
+ * on it.
+ */
+
+ if (state->start < start) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ state = clear_state_bit(tree, state, bits, wake, changeset);
+ goto next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * We need to split the extent, and clear the bit on the first half.
+ */
+ if (state->start <= end && state->end > end) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ if (wake)
+ wake_up(&state->wq);
+
+ clear_state_bit(tree, prealloc, bits, wake, changeset);
+
+ prealloc = NULL;
+ goto out;
+ }
+
+ state = clear_state_bit(tree, state, bits, wake, changeset);
+next:
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start <= end && state && !need_resched())
+ goto hit_next;
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (gfpflags_allow_blocking(mask))
+ cond_resched();
+ goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return 0;
+
+}
+
+static void wait_on_state(struct extent_io_tree *tree,
+ struct extent_state *state)
+ __releases(tree->lock)
+ __acquires(tree->lock)
+{
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&tree->lock);
+ schedule();
+ spin_lock(&tree->lock);
+ finish_wait(&state->wq, &wait);
+}
+
+/*
+ * Wait for one or more bits to clear on a range in the state tree.
+ * The range [start, end] is inclusive.
+ * The tree lock is taken by this function.
+ */
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits)
+{
+ struct extent_state *state;
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+
+ spin_lock(&tree->lock);
+again:
+ while (1) {
+ /*
+ * This search will find all the extents that end after our
+ * range starts.
+ */
+ state = tree_search(tree, start);
+process_node:
+ if (!state)
+ break;
+ if (state->start > end)
+ goto out;
+
+ if (state->state & bits) {
+ start = state->start;
+ refcount_inc(&state->refs);
+ wait_on_state(tree, state);
+ free_extent_state(state);
+ goto again;
+ }
+ start = state->end + 1;
+
+ if (start > end)
+ break;
+
+ if (!cond_resched_lock(&tree->lock)) {
+ state = next_state(state);
+ goto process_node;
+ }
+ }
+out:
+ spin_unlock(&tree->lock);
+}
+
+static void cache_state_if_flags(struct extent_state *state,
+ struct extent_state **cached_ptr,
+ unsigned flags)
+{
+ if (cached_ptr && !(*cached_ptr)) {
+ if (!flags || (state->state & flags)) {
+ *cached_ptr = state;
+ refcount_inc(&state->refs);
+ }
+ }
+}
+
+static void cache_state(struct extent_state *state,
+ struct extent_state **cached_ptr)
+{
+ return cache_state_if_flags(state, cached_ptr,
+ EXTENT_LOCKED | EXTENT_BOUNDARY);
+}
+
+/*
+ * Find the first state struct with 'bits' set after 'start', and return it.
+ * tree->lock must be held. NULL will returned if nothing was found after
+ * 'start'.
+ */
+static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
+ u64 start, u32 bits)
+{
+ struct extent_state *state;
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search(tree, start);
+ while (state) {
+ if (state->end >= start && (state->state & bits))
+ return state;
+ state = next_state(state);
+ }
+ return NULL;
+}
+
+/*
+ * Find the first offset in the io tree with one or more @bits set.
+ *
+ * Note: If there are multiple bits set in @bits, any of them will match.
+ *
+ * Return 0 if we find something, and update @start_ret and @end_ret.
+ * Return 1 if we found nothing.
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->end == start - 1 && extent_state_in_tree(state)) {
+ while ((state = next_state(state)) != NULL) {
+ if (state->state & bits)
+ goto got_it;
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ goto out;
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ }
+
+ state = find_first_extent_bit_state(tree, start, bits);
+got_it:
+ if (state) {
+ cache_state_if_flags(state, cached_state, 0);
+ *start_ret = state->start;
+ *end_ret = state->end;
+ ret = 0;
+ }
+out:
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
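A minimal usage sketch for find_first_extent_bit() as defined above, assuming the declarations from extent-io-tree.h; the walker function and the choice of EXTENT_DIRTY are illustrative only.

#include "extent-io-tree.h"

/* Walk every range with EXTENT_DIRTY set, one inclusive [start, end] at a time. */
static void walk_dirty_ranges(struct extent_io_tree *tree)
{
	u64 start = 0;
	u64 found_start;
	u64 found_end;

	/* Returns 0 while a matching range exists, 1 once nothing is left. */
	while (!find_first_extent_bit(tree, start, &found_start, &found_end,
				      EXTENT_DIRTY, NULL)) {
		/* ... process [found_start, found_end] here ... */
		start = found_end + 1;
	}
}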
+/*
+ * Find a contiguous area of bits
+ *
+ * @tree: io tree to check
+ * @start: offset to start the search from
+ * @start_ret: the first offset we found with the bits set
+ * @end_ret: the final contiguous range of the bits that were set
+ * @bits: bits to look for
+ *
+ * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
+ * to set bits appropriately, and then merge them again. During this time it
+ * will drop the tree->lock, so use this helper if you want to find the actual
+ * contiguous area for given bits. We will search to the first bit we find, and
+ * then walk down the tree until we find a non-contiguous area. The area
+ * returned will be the full contiguous area with the bits set.
+ */
+int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits)
+{
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ state = find_first_extent_bit_state(tree, start, bits);
+ if (state) {
+ *start_ret = state->start;
+ *end_ret = state->end;
+ while ((state = next_state(state)) != NULL) {
+ if (state->start > (*end_ret + 1))
+ break;
+ *end_ret = state->end;
+ }
+ ret = 0;
+ }
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/*
+ * Find a contiguous range of bytes in the file marked as delalloc, not more
+ * than 'max_bytes'. start and end are used to return the range.
+ *
+ * True is returned if we find something, false if nothing was in the tree.
+ */
+bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
+ u64 *end, u64 max_bytes,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+ u64 cur_start = *start;
+ bool found = false;
+ u64 total_bytes = 0;
+
+ spin_lock(&tree->lock);
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search(tree, cur_start);
+ if (!state) {
+ *end = (u64)-1;
+ goto out;
+ }
+
+ while (state) {
+ if (found && (state->start != cur_start ||
+ (state->state & EXTENT_BOUNDARY))) {
+ goto out;
+ }
+ if (!(state->state & EXTENT_DELALLOC)) {
+ if (!found)
+ *end = state->end;
+ goto out;
+ }
+ if (!found) {
+ *start = state->start;
+ *cached_state = state;
+ refcount_inc(&state->refs);
+ }
+ found = true;
+ *end = state->end;
+ cur_start = state->end + 1;
+ total_bytes += state->end - state->start + 1;
+ if (total_bytes >= max_bytes)
+ break;
+ state = next_state(state);
+ }
+out:
+ spin_unlock(&tree->lock);
+ return found;
+}
+
+/*
+ * Set some bits on a range in the tree. This may require allocations or
+ * sleeping, so the gfp mask is used to indicate what is allowed.
+ *
+ * If any of the exclusive bits are set, this will fail with -EEXIST if some
+ * part of the range already has the desired bits set. The start of the
+ * existing range is returned in failed_start in this case.
+ *
+ * [start, end] is inclusive. This takes the tree lock.
+ */
+static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u64 *failed_start,
+ struct extent_state **cached_state,
+ struct extent_changeset *changeset, gfp_t mask)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node **p;
+ struct rb_node *parent;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+ u32 exclusive_bits = (bits & EXTENT_LOCKED);
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
+
+ if (exclusive_bits)
+ ASSERT(failed_start);
+ else
+ ASSERT(failed_start == NULL);
+again:
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
+ /*
+ * Don't care about allocation failure here because we might end
+ * up not needing the pre-allocated extent state at all, which
+ * is the case if we only have in the tree extent states that
+ * cover our input range and don't cover any other range.
+ * If we end up needing a new extent state we allocate it later.
+ */
+ prealloc = alloc_extent_state(mask);
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start <= start && state->end > start &&
+ extent_state_in_tree(state))
+ goto hit_next;
+ }
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search_for_insert(tree, start, &p, &parent);
+ if (!state) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ prealloc->start = start;
+ prealloc->end = end;
+ insert_state_fast(tree, prealloc, p, parent, bits, changeset);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ goto out;
+ }
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going
+ */
+ if (state->start == start && state->end <= end) {
+ if (state->state & exclusive_bits) {
+ *failed_start = state->start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ set_state_bits(tree, state, bits, changeset);
+ cache_state(state, cached_state);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on the
+ * second half.
+ *
+ * If the extent we found extends past our range, we just split and
+ * search again. It'll get split again the next time though.
+ *
+ * If the extent we found is inside our range, we set the desired bit
+ * on it.
+ */
+ if (state->start < start) {
+ if (state->state & exclusive_bits) {
+ *failed_start = start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ /*
+ * If this extent already has all the bits we want set, then
+ * skip it; there is no need to split it or do anything else with it.
+ */
+ if ((state->state & bits) == bits) {
+ start = state->end + 1;
+ cache_state(state, cached_state);
+ goto search_again;
+ }
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, bits, changeset);
+ cache_state(state, cached_state);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and ignore the
+ * extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+
+ /*
+ * Avoid freeing 'prealloc' if it can be merged with the next
+ * extent.
+ */
+ prealloc->start = start;
+ prealloc->end = this_end;
+ err = insert_state(tree, prealloc, bits, changeset);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * We need to split the extent, and set the bit on the first half
+ */
+ if (state->start <= end && state->end > end) {
+ if (state->state & exclusive_bits) {
+ *failed_start = start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ set_state_bits(tree, prealloc, bits, changeset);
+ cache_state(prealloc, cached_state);
+ merge_state(tree, prealloc);
+ prealloc = NULL;
+ goto out;
+ }
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (gfpflags_allow_blocking(mask))
+ cond_resched();
+ goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+}
+
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state, gfp_t mask)
+{
+ return __set_extent_bit(tree, start, end, bits, NULL, cached_state,
+ NULL, mask);
+}
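
A hypothetical caller of this public wrapper might look like the sketch below (illustration only; the function name is invented). The end offset is inclusive, hence the "- 1".

/* Hypothetical helper built on set_extent_bit(). */
static int mark_range_dirty(struct extent_io_tree *tree, u64 start, u64 len)
{
	/* Mark [start, start + len - 1] dirty; may sleep with GFP_NOFS. */
	return set_extent_bit(tree, start, start + len - 1,
			      EXTENT_DIRTY, NULL, GFP_NOFS);
}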
+
+/*
+ * Convert all bits in a given range from one bit to another
+ *
+ * @tree: the io tree to search
+ * @start: the start offset in bytes
+ * @end: the end offset in bytes (inclusive)
+ * @bits: the bits to set in this range
+ * @clear_bits: the bits to clear in this range
+ * @cached_state: state that we're going to cache
+ *
+ * This will go through and set bits for the given range. If any states exist
+ * already in this range they are set with the given bits and cleared of the
+ * clear_bits. This is only meant to be used by things that are mergeable, i.e.
+ * converting from say DELALLOC to DIRTY. This is not meant to be used with
+ * boundary bits like LOCK.
+ *
+ * All allocations are done with GFP_NOFS.
+ */
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u32 clear_bits,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node **p;
+ struct rb_node *parent;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+ bool first_iteration = true;
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
+ clear_bits);
+
+again:
+ if (!prealloc) {
+ /*
+ * Best effort, don't worry if extent state allocation fails
+ * here for the first iteration. We might have a cached state
+ * that matches exactly the target range, in which case no
+ * extent state allocations are needed. We'll only know this
+ * after locking the tree.
+ */
+ prealloc = alloc_extent_state(GFP_NOFS);
+ if (!prealloc && !first_iteration)
+ return -ENOMEM;
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start <= start && state->end > start &&
+ extent_state_in_tree(state))
+ goto hit_next;
+ }
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search_for_insert(tree, start, &p, &parent);
+ if (!state) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ prealloc->start = start;
+ prealloc->end = end;
+ insert_state_fast(tree, prealloc, p, parent, bits, NULL);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ goto out;
+ }
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going.
+ */
+ if (state->start == start && state->end <= end) {
+ set_state_bits(tree, state, bits, NULL);
+ cache_state(state, cached_state);
+ state = clear_state_bit(tree, state, clear_bits, 0, NULL);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on the
+ * second half.
+ *
+ * If the extent we found extends past our range, we just split and
+ * search again. It'll get split again the next time though.
+ *
+ * If the extent we found is inside our range, we set the desired bit
+ * on it.
+ */
+ if (state->start < start) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, bits, NULL);
+ cache_state(state, cached_state);
+ state = clear_state_bit(tree, state, clear_bits, 0, NULL);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and ignore the
+ * extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Avoid freeing 'prealloc' if it can be merged with the next
+ * extent.
+ */
+ prealloc->start = start;
+ prealloc->end = this_end;
+ err = insert_state(tree, prealloc, bits, NULL);
+ if (err)
+ extent_io_tree_panic(tree, err);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * We need to split the extent, and set the bit on the first half.
+ */
+ if (state->start <= end && state->end > end) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ set_state_bits(tree, prealloc, bits, NULL);
+ cache_state(prealloc, cached_state);
+ clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
+ prealloc = NULL;
+ goto out;
+ }
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ cond_resched();
+ first_iteration = false;
+ goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+}
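
As a sketch of the conversion semantics described above (illustrative only, the wrapper name is invented), a caller that swaps one mergeable bit for another over a range could look like this:

/* Hypothetical wrapper around convert_extent_bit(). */
static int convert_dirty_to_new(struct extent_io_tree *tree, u64 start, u64 end)
{
	/* Sets EXTENT_NEW and clears EXTENT_DIRTY on every state in the range. */
	return convert_extent_bit(tree, start, end, EXTENT_NEW,
				  EXTENT_DIRTY, NULL);
}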
+
+/*
+ * Find the first range that has @bits not set. This range could start before
+ * @start.
+ *
+ * @tree: the tree to search
+ * @start: offset at/after which the found extent should start
+ * @start_ret: records the beginning of the range
+ * @end_ret: records the end of the range (inclusive)
+ * @bits: the set of bits which must be unset
+ *
+ * Since an unallocated range is also considered one that doesn't have the bits
+ * set, it's possible that @end_ret contains -1; this happens when the range
+ * spans (last_range_end, end of device]. In this case it's up to the caller to
+ * trim @end_ret to the appropriate size.
+ */
+void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits)
+{
+ struct extent_state *state;
+ struct extent_state *prev = NULL, *next;
+
+ spin_lock(&tree->lock);
+
+ /* Find first extent with bits cleared */
+ while (1) {
+ state = tree_search_prev_next(tree, start, &prev, &next);
+ if (!state && !next && !prev) {
+ /*
+ * Tree is completely empty, send full range and let
+ * caller deal with it
+ */
+ *start_ret = 0;
+ *end_ret = -1;
+ goto out;
+ } else if (!state && !next) {
+ /*
+ * We are past the last allocated chunk, set start at
+ * the end of the last extent.
+ */
+ *start_ret = prev->end + 1;
+ *end_ret = -1;
+ goto out;
+ } else if (!state) {
+ state = next;
+ }
+
+ /*
+ * At this point 'state' either contains 'start' or start is
+ * before 'state'
+ */
+ if (in_range(start, state->start, state->end - state->start + 1)) {
+ if (state->state & bits) {
+ /*
+ * |--range with bits sets--|
+ * |
+ * start
+ */
+ start = state->end + 1;
+ } else {
+ /*
+ * 'start' falls within a range that doesn't
+ * have the bits set, so take its start as the
+ * beginning of the desired range
+ *
+ * |--range with bits cleared----|
+ * |
+ * start
+ */
+ *start_ret = state->start;
+ break;
+ }
+ } else {
+ /*
+ * |---prev range---|---hole/unset---|---node range---|
+ * |
+ * start
+ *
+ * or
+ *
+ * |---hole/unset--||--first node--|
+ * 0 |
+ * start
+ */
+ if (prev)
+ *start_ret = prev->end + 1;
+ else
+ *start_ret = 0;
+ break;
+ }
+ }
+
+ /*
+ * Find the longest stretch from start until an entry which has the
+ * bits set
+ */
+ while (state) {
+ if (state->end >= start && !(state->state & bits)) {
+ *end_ret = state->end;
+ } else {
+ *end_ret = state->start - 1;
+ break;
+ }
+ state = next_state(state);
+ }
+out:
+ spin_unlock(&tree->lock);
+}
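
A hedged sketch of how a caller might consume the two return offsets (the function name is hypothetical and the bit set is left generic):

/* Hypothetical caller of find_first_clear_extent_bit(). */
static void print_next_gap(struct extent_io_tree *tree, u64 from, u32 bits)
{
	u64 start;
	u64 end;

	find_first_clear_extent_bit(tree, from, &start, &end, bits);
	/* 'end' may be (u64)-1 when the gap runs to the end of the tree. */
	pr_debug("gap without the given bits: [%llu, %llu]\n", start, end);
}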
+
+/*
+ * Count the number of bytes in the tree that have the given bits set. This
+ * can be fairly slow, except for EXTENT_DIRTY which is cached. The total
+ * number of bytes found is returned.
+ */
+u64 count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end, u64 max_bytes,
+ u32 bits, int contig)
+{
+ struct extent_state *state;
+ u64 cur_start = *start;
+ u64 total_bytes = 0;
+ u64 last = 0;
+ int found = 0;
+
+ if (WARN_ON(search_end <= cur_start))
+ return 0;
+
+ spin_lock(&tree->lock);
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search(tree, cur_start);
+ while (state) {
+ if (state->start > search_end)
+ break;
+ if (contig && found && state->start > last + 1)
+ break;
+ if (state->end >= cur_start && (state->state & bits) == bits) {
+ total_bytes += min(search_end, state->end) + 1 -
+ max(cur_start, state->start);
+ if (total_bytes >= max_bytes)
+ break;
+ if (!found) {
+ *start = max(cur_start, state->start);
+ found = 1;
+ }
+ last = state->end;
+ } else if (contig && found) {
+ break;
+ }
+ state = next_state(state);
+ }
+ spin_unlock(&tree->lock);
+ return total_bytes;
+}
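
For example (sketch only, the helper name is invented), summing delalloc bytes over an inclusive range with no contiguity requirement could be done as follows; note the function warns and returns 0 when search_end <= *start.

/* Hypothetical wrapper around count_range_bits(). */
static u64 delalloc_bytes_in_range(struct extent_io_tree *tree, u64 start, u64 end)
{
	u64 cur = start;

	/* 'contig' is 0, so non-contiguous delalloc ranges are summed up. */
	return count_range_bits(tree, &cur, end, (u64)-1, EXTENT_DELALLOC, 0);
}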
+
+/*
+ * Search a range in the state tree for given bits. If 'filled' == 1, this
+ * returns 1 only if every extent in the range has the bits set. Otherwise, 1
+ * is returned if any bit in the range is found set.
+ */
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, int filled, struct extent_state *cached)
+{
+ struct extent_state *state = NULL;
+ int bitset = 0;
+
+ spin_lock(&tree->lock);
+ if (cached && extent_state_in_tree(cached) && cached->start <= start &&
+ cached->end > start)
+ state = cached;
+ else
+ state = tree_search(tree, start);
+ while (state && start <= end) {
+ if (filled && state->start > start) {
+ bitset = 0;
+ break;
+ }
+
+ if (state->start > end)
+ break;
+
+ if (state->state & bits) {
+ bitset = 1;
+ if (!filled)
+ break;
+ } else if (filled) {
+ bitset = 0;
+ break;
+ }
+
+ if (state->end == (u64)-1)
+ break;
+
+ start = state->end + 1;
+ if (start > end)
+ break;
+ state = next_state(state);
+ }
+
+ /* We ran out of states and were still inside of our range. */
+ if (filled && !state)
+ bitset = 0;
+ spin_unlock(&tree->lock);
+ return bitset;
+}
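
A minimal sketch of the 'filled' semantics (the helper name is hypothetical):

/* Hypothetical helper; non-zero only if the whole range is uptodate. */
static int range_fully_uptodate(struct extent_io_tree *tree, u64 start, u64 end)
{
	/* 'filled' == 1: every extent in [start, end] must have the bit set. */
	return test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
}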
+
+/* Wrappers around set/clear extent bit */
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset)
+{
+ /*
+ * We don't support EXTENT_LOCKED yet, as the current changeset will
+ * record any bits changed, so for the EXTENT_LOCKED case it would
+ * either fail with -EEXIST or the changeset would record the whole
+ * range.
+ */
+ ASSERT(!(bits & EXTENT_LOCKED));
+
+ return __set_extent_bit(tree, start, end, bits, NULL, NULL, changeset,
+ GFP_NOFS);
+}
+
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset)
+{
+ /*
+ * We don't support the EXTENT_LOCKED case, for the same reason as
+ * set_record_extent_bits().
+ */
+ ASSERT(!(bits & EXTENT_LOCKED));
+
+ return __clear_extent_bit(tree, start, end, bits, NULL, GFP_NOFS,
+ changeset);
+}
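
A hedged sketch of the changeset-recording variant (the helper name is hypothetical and the changeset is assumed to be set up by the caller); EXTENT_LOCKED must not be in the bit mask, as the ASSERTs above enforce.

/* Hypothetical wrapper around set_record_extent_bits(). */
static int reserve_range_recorded(struct extent_io_tree *tree, u64 start,
				  u64 end, struct extent_changeset *changeset)
{
	/* Records which parts of the range actually changed state. */
	return set_record_extent_bits(tree, start, end, EXTENT_QGROUP_RESERVED,
				      changeset);
}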
+
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ int err;
+ u64 failed_start;
+
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
+ NULL, NULL, GFP_NOFS);
+ if (err == -EEXIST) {
+ if (failed_start > start)
+ clear_extent_bit(tree, start, failed_start - 1,
+ EXTENT_LOCKED, NULL);
+ return 0;
+ }
+ return 1;
+}
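
Note the return convention: 1 when the whole range was locked, 0 when some part was already locked (in which case any partial lock is rolled back). A hedged sketch of a nowait-style caller (function name invented):

/* Hypothetical nowait-style caller of try_lock_extent(). */
static int lock_range_nowait(struct extent_io_tree *tree, u64 start, u64 end)
{
	/* try_lock_extent() returns 1 on success and 0 if the range is busy. */
	if (!try_lock_extent(tree, start, end))
		return -EAGAIN;
	return 0;
}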
+
+/*
+ * Either insert or lock the state struct between start and end. If the range
+ * is already locked, wait for it to become unlocked and then retry.
+ */
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state)
+{
+ int err;
+ u64 failed_start;
+
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
+ cached_state, NULL, GFP_NOFS);
+ while (err == -EEXIST) {
+ if (failed_start != start)
+ clear_extent_bit(tree, start, failed_start - 1,
+ EXTENT_LOCKED, cached_state);
+
+ wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
+ &failed_start, cached_state, NULL,
+ GFP_NOFS);
+ }
+ return err;
+}
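
The usual pairing with the new unlock_extent() signature from the header (sketch only; the wrapper name and the elided work are placeholders):

/* Hypothetical lock/do-work/unlock pattern with a cached extent state. */
static void with_locked_range(struct extent_io_tree *tree, u64 start, u64 end)
{
	struct extent_state *cached = NULL;

	lock_extent(tree, start, end, &cached);
	/* ... operate on the locked, inclusive range [start, end] here ... */
	unlock_extent(tree, start, end, &cached);
}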
+
+void __cold extent_state_free_cachep(void)
+{
+ btrfs_extent_state_leak_debug_check();
+ kmem_cache_destroy(extent_state_cache);
+}
+
+int __init extent_state_init_cachep(void)
+{
+ extent_state_cache = kmem_cache_create("btrfs_extent_state",
+ sizeof(struct extent_state), 0,
+ SLAB_MEM_SPREAD, NULL);
+ if (!extent_state_cache)
+ return -ENOMEM;
+
+ return 0;
+}
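
Purely to illustrate the pairing of the two new helpers (the init/exit functions below are hypothetical and do not reflect how btrfs actually wires this up):

/* Hypothetical module wiring for the extent state cache. */
static int __init example_init(void)
{
	return extent_state_init_cachep();
}

static void __exit example_exit(void)
{
	/* Runs the extent state leak check before destroying the cache. */
	extent_state_free_cachep();
}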
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index c3eb52dbe61c..a855f40dd61d 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -17,7 +17,6 @@ struct io_failure_record;
#define EXTENT_NODATASUM (1U << 7)
#define EXTENT_CLEAR_META_RESV (1U << 8)
#define EXTENT_NEED_WAIT (1U << 9)
-#define EXTENT_DAMAGED (1U << 10)
#define EXTENT_NORESERVE (1U << 11)
#define EXTENT_QGROUP_RESERVED (1U << 12)
#define EXTENT_CLEAR_DATA_RESV (1U << 13)
@@ -35,10 +34,18 @@ struct io_failure_record;
* delalloc bytes decremented, in an atomic way to prevent races with stat(2).
*/
#define EXTENT_ADD_INODE_BYTES (1U << 15)
+
+/*
+ * Set during truncate when we're clearing an entire range and we just want the
+ * extent states to go away.
+ */
+#define EXTENT_CLEAR_ALL_BITS (1U << 16)
+
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
EXTENT_CLEAR_DATA_RESV)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | \
- EXTENT_ADD_INODE_BYTES)
+ EXTENT_ADD_INODE_BYTES | \
+ EXTENT_CLEAR_ALL_BITS)
/*
* Redefined bits above which are used only in the device allocation tree,
@@ -56,7 +63,6 @@ enum {
IO_TREE_FS_EXCLUDED_EXTENTS,
IO_TREE_BTREE_INODE_IO,
IO_TREE_INODE_IO,
- IO_TREE_INODE_IO_FAILURE,
IO_TREE_RELOC_BLOCKS,
IO_TREE_TRANS_DIRTY_PAGES,
IO_TREE_ROOT_DIRTY_LOG_PAGES,
@@ -70,8 +76,6 @@ struct extent_io_tree {
struct rb_root state;
struct btrfs_fs_info *fs_info;
void *private_data;
- u64 dirty_bytes;
- bool track_uptodate;
/* Who owns this io tree, should be one of IO_TREE_* */
u8 owner;
@@ -89,33 +93,23 @@ struct extent_state {
refcount_t refs;
u32 state;
- struct io_failure_record *failrec;
-
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
#endif
};
-int __init extent_state_cache_init(void);
-void __cold extent_state_cache_exit(void);
-
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner,
void *private_data);
void extent_io_tree_release(struct extent_io_tree *tree);
-int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached);
-
-static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return lock_extent_bits(tree, start, end, NULL);
-}
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached);
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
-int __init extent_io_init(void);
-void __cold extent_io_exit(void);
+int __init extent_state_init_cachep(void);
+void __cold extent_state_free_cachep(void);
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
@@ -126,72 +120,66 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, int filled, struct extent_state *cached_state);
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset);
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int wake, int delete,
- struct extent_state **cached);
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int wake, int delete,
- struct extent_state **cached, gfp_t mask,
- struct extent_changeset *changeset);
+ u32 bits, struct extent_state **cached, gfp_t mask,
+ struct extent_changeset *changeset);
-static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 end, u32 bits,
+ struct extent_state **cached)
{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL);
+ return __clear_extent_bit(tree, start, end, bits, cached,
+ GFP_NOFS, NULL);
}
-static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached)
{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- GFP_NOFS, NULL);
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
+ GFP_NOFS, NULL);
}
-static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
- u64 start, u64 end, struct extent_state **cached)
+static inline int unlock_extent_atomic(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- GFP_ATOMIC, NULL);
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
+ GFP_ATOMIC, NULL);
}
static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
u64 end, u32 bits)
{
- int wake = 0;
-
- if (bits & EXTENT_LOCKED)
- wake = 1;
-
- return clear_extent_bit(tree, start, end, bits, wake, 0, NULL);
+ return clear_extent_bit(tree, start, end, bits, NULL);
}
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, unsigned exclusive_bits, u64 *failed_start,
- struct extent_state **cached_state, gfp_t mask,
- struct extent_changeset *changeset);
-int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits);
+ u32 bits, struct extent_state **cached_state, gfp_t mask);
+
+static inline int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start,
+ u64 end, u32 bits)
+{
+ return set_extent_bit(tree, start, end, bits, NULL, GFP_NOWAIT);
+}
static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
u64 end, u32 bits)
{
- return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
- NULL);
+ return set_extent_bit(tree, start, end, bits, NULL, GFP_NOFS);
}
static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state)
{
- return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
- cached_state, GFP_NOFS, NULL);
+ return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
+ cached_state, GFP_NOFS, NULL);
}
static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
u64 end, gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, NULL,
- mask, NULL);
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, mask);
}
static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
@@ -199,7 +187,7 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
{
return clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, cached);
+ EXTENT_DO_ACCOUNTING, cached);
}
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -211,30 +199,29 @@ static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits,
- 0, NULL, cached_state, GFP_NOFS, NULL);
+ EXTENT_DELALLOC | extra_bits,
+ cached_state, GFP_NOFS);
}
static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
- 0, NULL, cached_state, GFP_NOFS, NULL);
+ EXTENT_DELALLOC | EXTENT_DEFRAG,
+ cached_state, GFP_NOFS);
}
static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
u64 end)
{
- return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, NULL,
- GFP_NOFS, NULL);
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS);
}
static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state, gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
- cached_state, mask, NULL);
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE,
+ cached_state, mask);
}
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -244,24 +231,9 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits);
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits);
-int extent_invalidate_folio(struct extent_io_tree *tree,
- struct folio *folio, size_t offset);
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
-
-/* This should be reworked in the future and put elsewhere. */
-struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start);
-int set_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record *failrec);
-void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
- u64 end);
-int free_io_failure(struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree,
- struct io_failure_record *rec);
-int clean_io_failure(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree, u64 start,
- struct page *page, u64 ino, unsigned int pg_offset);
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits);
#endif /* BTRFS_EXTENT_IO_TREE_H */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0867c5cd6e01..2801c991814f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1269,7 +1269,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
return ret;
}
-static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes)
+static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
{
struct btrfs_device *dev = stripe->dev;
struct btrfs_fs_info *fs_info = dev->fs_info;
@@ -1316,76 +1316,60 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 discarded_bytes = 0;
u64 end = bytenr + num_bytes;
u64 cur = bytenr;
- struct btrfs_io_context *bioc = NULL;
/*
- * Avoid races with device replace and make sure our bioc has devices
- * associated to its stripes that don't go away while we are discarding.
+ * Avoid races with device replace and make sure the devices in the
+ * stripes don't go away while we are discarding.
*/
btrfs_bio_counter_inc_blocked(fs_info);
while (cur < end) {
- struct btrfs_io_stripe *stripe;
+ struct btrfs_discard_stripe *stripes;
+ unsigned int num_stripes;
int i;
num_bytes = end - cur;
- /* Tell the block device(s) that the sectors can be discarded */
- ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
- &num_bytes, &bioc, 0);
- /*
- * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
- * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
- * thus we can't continue anyway.
- */
- if (ret < 0)
- goto out;
+ stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
+ if (IS_ERR(stripes)) {
+ ret = PTR_ERR(stripes);
+ if (ret == -EOPNOTSUPP)
+ ret = 0;
+ break;
+ }
- stripe = bioc->stripes;
- for (i = 0; i < bioc->num_stripes; i++, stripe++) {
+ for (i = 0; i < num_stripes; i++) {
+ struct btrfs_discard_stripe *stripe = stripes + i;
u64 bytes;
- struct btrfs_device *device = stripe->dev;
- if (!device->bdev) {
+ if (!stripe->dev->bdev) {
ASSERT(btrfs_test_opt(fs_info, DEGRADED));
continue;
}
- if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
+ &stripe->dev->dev_state))
continue;
ret = do_discard_extent(stripe, &bytes);
- if (!ret) {
- discarded_bytes += bytes;
- } else if (ret != -EOPNOTSUPP) {
+ if (ret) {
/*
- * Logic errors or -ENOMEM, or -EIO, but
- * unlikely to happen.
- *
- * And since there are two loops, explicitly
- * go to out to avoid confusion.
+ * Keep going if discard is not supported by the
+ * device.
*/
- btrfs_put_bioc(bioc);
- goto out;
+ if (ret != -EOPNOTSUPP)
+ break;
+ ret = 0;
+ } else {
+ discarded_bytes += bytes;
}
-
- /*
- * Just in case we get back EOPNOTSUPP for some reason,
- * just ignore the return value so we don't screw up
- * people calling discard_extent.
- */
- ret = 0;
}
- btrfs_put_bioc(bioc);
+ kfree(stripes);
+ if (ret)
+ break;
cur += num_bytes;
}
-out:
btrfs_bio_counter_dec(fs_info);
-
if (actual_bytes)
*actual_bytes = discarded_bytes;
-
-
- if (ret == -EOPNOTSUPP)
- ret = 0;
return ret;
}
@@ -2236,6 +2220,12 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
}
if (!mutex_trylock(&head->mutex)) {
+ if (path->nowait) {
+ spin_unlock(&delayed_refs->lock);
+ btrfs_put_transaction(cur_trans);
+ return -EAGAIN;
+ }
+
refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
@@ -2567,17 +2557,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
return -EINVAL;
/*
- * pull in the free space cache (if any) so that our pin
- * removes the free space from the cache. We have load_only set
- * to one because the slow code to read in the free extents does check
- * the pinned extents.
- */
- btrfs_cache_block_group(cache, 1);
- /*
- * Make sure we wait until the cache is completely built in case it is
- * missing or is invalid and therefore needs to be rebuilt.
+ * Fully cache the free space first so that our pin removes the free space
+ * from the cache.
*/
- ret = btrfs_wait_block_group_cache_done(cache);
+ ret = btrfs_cache_block_group(cache, true);
if (ret)
goto out;
@@ -2600,12 +2583,7 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
if (!block_group)
return -EINVAL;
- btrfs_cache_block_group(block_group, 1);
- /*
- * Make sure we wait until the cache is completely built in case it is
- * missing or is invalid and therefore needs to be rebuilt.
- */
- ret = btrfs_wait_block_group_cache_done(block_group);
+ ret = btrfs_cache_block_group(block_group, true);
if (ret)
goto out;
@@ -2714,13 +2692,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
len = cache->start + cache->length - start;
len = min(len, end + 1 - start);
- down_read(&fs_info->commit_root_sem);
- if (start < cache->last_byte_to_unpin && return_free_space) {
- u64 add_len = min(len, cache->last_byte_to_unpin - start);
-
- btrfs_add_free_space(cache, start, add_len);
- }
- up_read(&fs_info->commit_root_sem);
+ if (return_free_space)
+ btrfs_add_free_space(cache, start, len);
start += len;
total_unpinned += len;
@@ -3322,21 +3295,22 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
}
/*
- * If this is a leaf and there are tree mod log users, we may
- * have recorded mod log operations that point to this leaf.
- * So we must make sure no one reuses this leaf's extent before
- * mod log operations are applied to a node, otherwise after
- * rewinding a node using the mod log operations we get an
- * inconsistent btree, as the leaf's extent may now be used as
- * a node or leaf for another different btree.
+ * If there are tree mod log users we may have recorded mod log
+ * operations for this node. If we re-allocate this node we
+ * could replay operations on this node that happened when it
+ * existed in a completely different root. For example if it
+ * was part of root A, then was reallocated to root B, and we
+ * are doing a btrfs_old_search_slot(root b), we could replay
+ * operations that happened when the block was part of root A,
+ * giving us an inconsistent view of the btree.
+ *
* We are safe from races here because at this point no other
* node or root points to this extent buffer, so if after this
- * check a new tree mod log user joins, it will not be able to
- * find a node pointing to this leaf and record operations that
- * point to this leaf.
+ * check a new tree mod log user joins we will not have an
+ * existing log of operations on this node that we have to
+ * contend with.
*/
- if (btrfs_header_level(buf) == 0 &&
- test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
+ if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
must_pin = true;
if (must_pin || btrfs_is_zoned(fs_info)) {
@@ -3832,7 +3806,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
block_group->start == fs_info->data_reloc_bg ||
fs_info->data_reloc_bg == 0);
- if (block_group->ro) {
+ if (block_group->ro ||
+ test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
ret = 1;
goto out;
}
@@ -3894,8 +3869,24 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
out:
if (ret && ffe_ctl->for_treelog)
fs_info->treelog_bg = 0;
- if (ret && ffe_ctl->for_data_reloc)
+ if (ret && ffe_ctl->for_data_reloc &&
+ fs_info->data_reloc_bg == block_group->start) {
+ /*
+ * Do not allow further allocations from this block group.
+ * Compared to increasing the ->ro, setting the
+ * ->zoned_data_reloc_ongoing flag still allows nocow
+ * writers to come in. See btrfs_inc_nocow_writers().
+ *
+ * We need to disable allocations here to avoid allocating a
+ * regular (non-relocation data) extent. With a mix of relocation
+ * extents and regular extents, we could dispatch WRITE commands
+ * (for relocation extents) and ZONE APPEND commands (for
+ * regular extents) at the same time to the same zone, which
+ * easily breaks the write pointer.
+ */
+ set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
fs_info->data_reloc_bg = 0;
+ }
spin_unlock(&fs_info->relocation_bg_lock);
spin_unlock(&fs_info->treelog_bg_lock);
spin_unlock(&block_group->lock);
@@ -3965,23 +3956,63 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl,
}
}
-static bool can_allocate_chunk(struct btrfs_fs_info *fs_info,
- struct find_free_extent_ctl *ffe_ctl)
+static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ /* If we can activate new zone, just allocate a chunk and use it */
+ if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
+ return 0;
+
+ /*
+ * We already reached the max active zones. Try to finish one block
+ * group to make room for a new block group. This is only possible
+ * for a data block group because btrfs_zone_finish() may need to wait
+ * for a running transaction which can cause a deadlock for metadata
+ * allocation.
+ */
+ if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
+ int ret = btrfs_zone_finish_one_bg(fs_info);
+
+ if (ret == 1)
+ return 0;
+ else if (ret < 0)
+ return ret;
+ }
+
+ /*
+ * If we have enough free space left in an already active block group
+ * and we can't activate any other zone now, do not allow allocating a
+ * new chunk and let find_free_extent() retry with a smaller size.
+ */
+ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
+ return -ENOSPC;
+
+ /*
+ * Not even min_alloc_size is left in any block group. Since we cannot
+ * activate a new block group, allocating one may not help. Let's tell
+ * the caller to try again and hope it makes progress by writing some
+ * parts of the region. That is only possible for data block groups,
+ * where a part of the region can be written.
+ */
+ if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
+ return -EAGAIN;
+
+ /*
+ * We cannot activate a new block group and there is not enough space
+ * left in any block group. So, allocating a new block group may not
+ * help. But there is nothing else to do anyway, so let's go with it.
+ */
+ return 0;
+}
+
+static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
{
switch (ffe_ctl->policy) {
case BTRFS_EXTENT_ALLOC_CLUSTERED:
- return true;
+ return 0;
case BTRFS_EXTENT_ALLOC_ZONED:
- /*
- * If we have enough free space left in an already
- * active block group and we can't activate any other
- * zone now, do not allow allocating a new chunk and
- * let find_free_extent() retry with a smaller size.
- */
- if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
- !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
- return false;
- return true;
+ return can_allocate_chunk_zoned(fs_info, ffe_ctl);
default:
BUG();
}
@@ -4063,8 +4094,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
int exist = 0;
/* Check if the allocation policy allows creating a new chunk */
- if (!can_allocate_chunk(fs_info, ffe_ctl))
- return -ENOSPC;
+ ret = can_allocate_chunk(fs_info, ffe_ctl);
+ if (ret)
+ return ret;
trans = current->journal_info;
if (trans)
@@ -4358,7 +4390,7 @@ have_block_group:
ffe_ctl->cached = btrfs_block_group_done(block_group);
if (unlikely(!ffe_ctl->cached)) {
ffe_ctl->have_caching_bg = true;
- ret = btrfs_cache_block_group(block_group, 0);
+ ret = btrfs_cache_block_group(block_group, false);
/*
* If we get ENOMEM here or something else we want to
@@ -4826,6 +4858,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *buf;
+ u64 lockdep_owner = owner;
buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
if (IS_ERR(buf))
@@ -4845,11 +4878,29 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
/*
+ * The reloc trees are just snapshots, so we need them to appear to be
+ * just like any other fs tree WRT lockdep.
+ *
+ * The exception however is in replace_path() in relocation, where we
+ * hold the lock on the original fs root and then search for the reloc
+ * root. At that point we need to make sure any reloc root buffers are
+ * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
+ * lockdep happy.
+ */
+ if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
+ !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
+ lockdep_owner = BTRFS_FS_TREE_OBJECTID;
+
+ /* btrfs_clean_tree_block() accesses generation field. */
+ btrfs_set_header_generation(buf, trans->transid);
+
+ /*
* This needs to stay, because we could allocate a freed block from an
* old tree into a new tree, so we need to make sure this new block is
* set to the appropriate level and owner.
*/
- btrfs_set_buffer_lockdep_class(owner, buf, level);
+ btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
+
__btrfs_tree_lock(buf, nest);
btrfs_clean_tree_block(buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
@@ -5594,6 +5645,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
*/
int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
+ const bool is_reloc_root = (root->root_key.objectid ==
+ BTRFS_TREE_RELOC_OBJECTID);
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
@@ -5753,6 +5806,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out_end_trans;
}
+ if (!is_reloc_root)
+ btrfs_set_last_root_drop_gen(fs_info, trans->transid);
+
btrfs_end_transaction_throttle(trans);
if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
btrfs_debug(fs_info,
@@ -5787,7 +5843,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out_end_trans;
}
- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ if (!is_reloc_root) {
ret = btrfs_find_root(tree_root, &root->root_key, path,
NULL, NULL);
if (ret < 0) {
@@ -5813,12 +5869,15 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
btrfs_qgroup_free_meta_all_pertrans(root);
- if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
btrfs_add_dropped_root(trans, root);
else
btrfs_put_root(root);
root_dropped = true;
out_end_trans:
+ if (!is_reloc_root)
+ btrfs_set_last_root_drop_gen(fs_info, trans->transid);
+
btrfs_end_transaction_throttle(trans);
out_free:
kfree(wc);
@@ -5976,7 +6035,7 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
*/
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
{
- u64 start = SZ_1M, len = 0, end = 0;
+ u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
int ret;
*trimmed = 0;
@@ -6020,8 +6079,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
break;
}
- /* Ensure we skip the reserved area in the first 1M */
- start = max_t(u64, start, SZ_1M);
+ /* Ensure we skip the reserved space on each device. */
+ start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
/*
* If find_first_clear_extent_bit find a range that spans the
@@ -6112,13 +6171,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
if (end - start >= range->minlen) {
if (!btrfs_block_group_done(cache)) {
- ret = btrfs_cache_block_group(cache, 0);
- if (ret) {
- bg_failed++;
- bg_ret = ret;
- continue;
- }
- ret = btrfs_wait_block_group_cache_done(cache);
+ ret = btrfs_cache_block_group(cache, true);
if (ret) {
bg_failed++;
bg_ret = ret;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8f6b544ae616..4dcf22e051ff 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -31,38 +31,27 @@
#include "block-group.h"
#include "compression.h"
-static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
-static struct bio_set btrfs_bioset;
-
-static inline bool extent_state_in_tree(const struct extent_state *state)
-{
- return !RB_EMPTY_NODE(&state->rb_node);
-}
#ifdef CONFIG_BTRFS_DEBUG
-static LIST_HEAD(states);
-static DEFINE_SPINLOCK(leak_lock);
-
-static inline void btrfs_leak_debug_add(spinlock_t *lock,
- struct list_head *new,
- struct list_head *head)
+static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
unsigned long flags;
- spin_lock_irqsave(lock, flags);
- list_add(new, head);
- spin_unlock_irqrestore(lock, flags);
+ spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+ list_add(&eb->leak_list, &fs_info->allocated_ebs);
+ spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
-static inline void btrfs_leak_debug_del(spinlock_t *lock,
- struct list_head *entry)
+static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
unsigned long flags;
- spin_lock_irqsave(lock, flags);
- list_del(entry);
- spin_unlock_irqrestore(lock, flags);
+ spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+ list_del(&eb->leak_list);
+ spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
@@ -91,62 +80,22 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
}
spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
-
-static inline void btrfs_extent_state_leak_debug_check(void)
-{
- struct extent_state *state;
-
- while (!list_empty(&states)) {
- state = list_entry(states.next, struct extent_state, leak_list);
- pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
- state->start, state->end, state->state,
- extent_state_in_tree(state),
- refcount_read(&state->refs));
- list_del(&state->leak_list);
- kmem_cache_free(extent_state_cache, state);
- }
-}
-
-#define btrfs_debug_check_extent_io_range(tree, start, end) \
- __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
-static inline void __btrfs_debug_check_extent_io_range(const char *caller,
- struct extent_io_tree *tree, u64 start, u64 end)
-{
- struct inode *inode = tree->private_data;
- u64 isize;
-
- if (!inode || !is_data_inode(inode))
- return;
-
- isize = i_size_read(inode);
- if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
- btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
- "%s: ino %llu isize %llu odd range [%llu,%llu]",
- caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
- }
-}
#else
-#define btrfs_leak_debug_add(lock, new, head) do {} while (0)
-#define btrfs_leak_debug_del(lock, entry) do {} while (0)
-#define btrfs_extent_state_leak_debug_check() do {} while (0)
-#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
+#define btrfs_leak_debug_add_eb(eb) do {} while (0)
+#define btrfs_leak_debug_del_eb(eb) do {} while (0)
#endif
-struct tree_entry {
- u64 start;
- u64 end;
- struct rb_node rb_node;
-};
-
/*
* Structure to record info about the bio being assembled, and other info like
* how many bytes are there before stripe/ordered extent boundary.
*/
struct btrfs_bio_ctrl {
struct bio *bio;
+ int mirror_num;
enum btrfs_compression_type compress_type;
u32 len_to_stripe_boundary;
u32 len_to_oe_boundary;
+ btrfs_bio_end_io_t end_io_func;
};
struct extent_page_data {
@@ -160,93 +109,59 @@ struct extent_page_data {
unsigned int sync_io:1;
};
-static int add_extent_changeset(struct extent_state *state, u32 bits,
- struct extent_changeset *changeset,
- int set)
+static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
- int ret;
-
- if (!changeset)
- return 0;
- if (set && (state->state & bits) == bits)
- return 0;
- if (!set && (state->state & bits) == 0)
- return 0;
- changeset->bytes_changed += state->end - state->start + 1;
- ret = ulist_add(&changeset->range_changed, state->start, state->end,
- GFP_ATOMIC);
- return ret;
-}
+ struct bio *bio;
+ struct bio_vec *bv;
+ struct inode *inode;
+ int mirror_num;
-static void submit_one_bio(struct bio *bio, int mirror_num,
- enum btrfs_compression_type compress_type)
-{
- struct extent_io_tree *tree = bio->bi_private;
+ if (!bio_ctrl->bio)
+ return;
- bio->bi_private = NULL;
+ bio = bio_ctrl->bio;
+ bv = bio_first_bvec_all(bio);
+ inode = bv->bv_page->mapping->host;
+ mirror_num = bio_ctrl->mirror_num;
/* Caller should ensure the bio has at least some range added */
ASSERT(bio->bi_iter.bi_size);
- if (is_data_inode(tree->private_data))
- btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
- compress_type);
- else
- btrfs_submit_metadata_bio(tree->private_data, bio, mirror_num);
- /*
- * Above submission hooks will handle the error by ending the bio,
- * which will do the cleanup properly. So here we should not return
- * any error, or the caller of submit_extent_page() will do cleanup
- * again, causing problems.
- */
-}
+ btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset;
-/* Cleanup unsubmitted bios */
-static void end_write_bio(struct extent_page_data *epd, int ret)
-{
- struct bio *bio = epd->bio_ctrl.bio;
+ if (!is_data_inode(inode))
+ btrfs_submit_metadata_bio(inode, bio, mirror_num);
+ else if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+ btrfs_submit_data_write_bio(inode, bio, mirror_num);
+ else
+ btrfs_submit_data_read_bio(inode, bio, mirror_num,
+ bio_ctrl->compress_type);
- if (bio) {
- bio->bi_status = errno_to_blk_status(ret);
- bio_endio(bio);
- epd->bio_ctrl.bio = NULL;
- }
+ /* The bio is owned by the end_io handler now */
+ bio_ctrl->bio = NULL;
}
/*
- * Submit bio from extent page data via submit_one_bio
- *
- * Return 0 if everything is OK.
- * Return <0 for error.
+ * Submit or fail the current bio in an extent_page_data structure.
*/
-static void flush_write_bio(struct extent_page_data *epd)
+static void submit_write_bio(struct extent_page_data *epd, int ret)
{
struct bio *bio = epd->bio_ctrl.bio;
- if (bio) {
- submit_one_bio(bio, 0, 0);
- /*
- * Clean up of epd->bio is handled by its endio function.
- * And endio is either triggered by successful bio execution
- * or the error handler of submit bio hook.
- * So at this point, no matter what happened, we don't need
- * to clean up epd->bio.
- */
+ if (!bio)
+ return;
+
+ if (ret) {
+ ASSERT(ret < 0);
+ btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
+ /* The bio is owned by the end_io handler now */
epd->bio_ctrl.bio = NULL;
+ } else {
+ submit_one_bio(&epd->bio_ctrl);
}
}
-int __init extent_state_cache_init(void)
-{
- extent_state_cache = kmem_cache_create("btrfs_extent_state",
- sizeof(struct extent_state), 0,
- SLAB_MEM_SPREAD, NULL);
- if (!extent_state_cache)
- return -ENOMEM;
- return 0;
-}
-
-int __init extent_io_init(void)
+int __init extent_buffer_init_cachep(void)
{
extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
sizeof(struct extent_buffer), 0,
@@ -254,32 +169,10 @@ int __init extent_io_init(void)
if (!extent_buffer_cache)
return -ENOMEM;
- if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
- offsetof(struct btrfs_bio, bio),
- BIOSET_NEED_BVECS))
- goto free_buffer_cache;
-
- if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
- goto free_bioset;
-
return 0;
-
-free_bioset:
- bioset_exit(&btrfs_bioset);
-
-free_buffer_cache:
- kmem_cache_destroy(extent_buffer_cache);
- extent_buffer_cache = NULL;
- return -ENOMEM;
}
-void __cold extent_state_cache_exit(void)
-{
- btrfs_extent_state_leak_debug_check();
- kmem_cache_destroy(extent_state_cache);
-}
-
-void __cold extent_io_exit(void)
+void __cold extent_buffer_free_cachep(void)
{
/*
* Make sure all delayed rcu free are flushed before we
@@ -287,1222 +180,6 @@ void __cold extent_io_exit(void)
*/
rcu_barrier();
kmem_cache_destroy(extent_buffer_cache);
- bioset_exit(&btrfs_bioset);
-}
-
-/*
- * For the file_extent_tree, we want to hold the inode lock when we lookup and
- * update the disk_i_size, but lockdep will complain because our io_tree we hold
- * the tree lock and get the inode lock when setting delalloc. These two things
- * are unrelated, so make a class for the file_extent_tree so we don't get the
- * two locking patterns mixed up.
- */
-static struct lock_class_key file_extent_tree_class;
-
-void extent_io_tree_init(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *tree, unsigned int owner,
- void *private_data)
-{
- tree->fs_info = fs_info;
- tree->state = RB_ROOT;
- tree->dirty_bytes = 0;
- spin_lock_init(&tree->lock);
- tree->private_data = private_data;
- tree->owner = owner;
- if (owner == IO_TREE_INODE_FILE_EXTENT)
- lockdep_set_class(&tree->lock, &file_extent_tree_class);
-}
-
-void extent_io_tree_release(struct extent_io_tree *tree)
-{
- spin_lock(&tree->lock);
- /*
- * Do a single barrier for the waitqueue_active check here, the state
- * of the waitqueue should not change once extent_io_tree_release is
- * called.
- */
- smp_mb();
- while (!RB_EMPTY_ROOT(&tree->state)) {
- struct rb_node *node;
- struct extent_state *state;
-
- node = rb_first(&tree->state);
- state = rb_entry(node, struct extent_state, rb_node);
- rb_erase(&state->rb_node, &tree->state);
- RB_CLEAR_NODE(&state->rb_node);
- /*
- * btree io trees aren't supposed to have tasks waiting for
- * changes in the flags of extent states ever.
- */
- ASSERT(!waitqueue_active(&state->wq));
- free_extent_state(state);
-
- cond_resched_lock(&tree->lock);
- }
- spin_unlock(&tree->lock);
-}
-
-static struct extent_state *alloc_extent_state(gfp_t mask)
-{
- struct extent_state *state;
-
- /*
- * The given mask might be not appropriate for the slab allocator,
- * drop the unsupported bits
- */
- mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
- state = kmem_cache_alloc(extent_state_cache, mask);
- if (!state)
- return state;
- state->state = 0;
- state->failrec = NULL;
- RB_CLEAR_NODE(&state->rb_node);
- btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
- refcount_set(&state->refs, 1);
- init_waitqueue_head(&state->wq);
- trace_alloc_extent_state(state, mask, _RET_IP_);
- return state;
-}
-
-void free_extent_state(struct extent_state *state)
-{
- if (!state)
- return;
- if (refcount_dec_and_test(&state->refs)) {
- WARN_ON(extent_state_in_tree(state));
- btrfs_leak_debug_del(&leak_lock, &state->leak_list);
- trace_free_extent_state(state, _RET_IP_);
- kmem_cache_free(extent_state_cache, state);
- }
-}
-
-static struct rb_node *tree_insert(struct rb_root *root,
- struct rb_node *search_start,
- u64 offset,
- struct rb_node *node,
- struct rb_node ***p_in,
- struct rb_node **parent_in)
-{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct tree_entry *entry;
-
- if (p_in && parent_in) {
- p = *p_in;
- parent = *parent_in;
- goto do_insert;
- }
-
- p = search_start ? &search_start : &root->rb_node;
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct tree_entry, rb_node);
-
- if (offset < entry->start)
- p = &(*p)->rb_left;
- else if (offset > entry->end)
- p = &(*p)->rb_right;
- else
- return parent;
- }
-
-do_insert:
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
- return NULL;
-}
-
-/**
- * Search @tree for an entry that contains @offset. Such entry would have
- * entry->start <= offset && entry->end >= offset.
- *
- * @tree: the tree to search
- * @offset: offset that should fall within an entry in @tree
- * @next_ret: pointer to the first entry whose range ends after @offset
- * @prev_ret: pointer to the first entry whose range begins before @offset
- * @p_ret: pointer where new node should be anchored (used when inserting an
- * entry in the tree)
- * @parent_ret: points to entry which would have been the parent of the entry,
- * containing @offset
- *
- * This function returns a pointer to the entry that contains @offset byte
- * address. If no such entry exists, then NULL is returned and the other
- * pointer arguments to the function are filled, otherwise the found entry is
- * returned and other pointers are left untouched.
- */
-static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
- struct rb_node **next_ret,
- struct rb_node **prev_ret,
- struct rb_node ***p_ret,
- struct rb_node **parent_ret)
-{
- struct rb_root *root = &tree->state;
- struct rb_node **n = &root->rb_node;
- struct rb_node *prev = NULL;
- struct rb_node *orig_prev = NULL;
- struct tree_entry *entry;
- struct tree_entry *prev_entry = NULL;
-
- while (*n) {
- prev = *n;
- entry = rb_entry(prev, struct tree_entry, rb_node);
- prev_entry = entry;
-
- if (offset < entry->start)
- n = &(*n)->rb_left;
- else if (offset > entry->end)
- n = &(*n)->rb_right;
- else
- return *n;
- }
-
- if (p_ret)
- *p_ret = n;
- if (parent_ret)
- *parent_ret = prev;
-
- if (next_ret) {
- orig_prev = prev;
- while (prev && offset > prev_entry->end) {
- prev = rb_next(prev);
- prev_entry = rb_entry(prev, struct tree_entry, rb_node);
- }
- *next_ret = prev;
- prev = orig_prev;
- }
-
- if (prev_ret) {
- prev_entry = rb_entry(prev, struct tree_entry, rb_node);
- while (prev && offset < prev_entry->start) {
- prev = rb_prev(prev);
- prev_entry = rb_entry(prev, struct tree_entry, rb_node);
- }
- *prev_ret = prev;
- }
- return NULL;
-}
-
-static inline struct rb_node *
-tree_search_for_insert(struct extent_io_tree *tree,
- u64 offset,
- struct rb_node ***p_ret,
- struct rb_node **parent_ret)
-{
- struct rb_node *next= NULL;
- struct rb_node *ret;
-
- ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
- if (!ret)
- return next;
- return ret;
-}
-
-static inline struct rb_node *tree_search(struct extent_io_tree *tree,
- u64 offset)
-{
- return tree_search_for_insert(tree, offset, NULL, NULL);
-}
-
-/*
- * utility function to look for merge candidates inside a given range.
- * Any extents with matching state are merged together into a single
- * extent in the tree. Extents with EXTENT_IO in their state field
- * are not merged because the end_io handlers need to be able to do
- * operations on them without sleeping (or doing allocations/splits).
- *
- * This should be called with the tree lock held.
- */
-static void merge_state(struct extent_io_tree *tree,
- struct extent_state *state)
-{
- struct extent_state *other;
- struct rb_node *other_node;
-
- if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
- return;
-
- other_node = rb_prev(&state->rb_node);
- if (other_node) {
- other = rb_entry(other_node, struct extent_state, rb_node);
- if (other->end == state->start - 1 &&
- other->state == state->state) {
- if (tree->private_data &&
- is_data_inode(tree->private_data))
- btrfs_merge_delalloc_extent(tree->private_data,
- state, other);
- state->start = other->start;
- rb_erase(&other->rb_node, &tree->state);
- RB_CLEAR_NODE(&other->rb_node);
- free_extent_state(other);
- }
- }
- other_node = rb_next(&state->rb_node);
- if (other_node) {
- other = rb_entry(other_node, struct extent_state, rb_node);
- if (other->start == state->end + 1 &&
- other->state == state->state) {
- if (tree->private_data &&
- is_data_inode(tree->private_data))
- btrfs_merge_delalloc_extent(tree->private_data,
- state, other);
- state->end = other->end;
- rb_erase(&other->rb_node, &tree->state);
- RB_CLEAR_NODE(&other->rb_node);
- free_extent_state(other);
- }
- }
-}
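
As a rough userspace illustration of the merge rule above (adjacent ranges with identical state collapse into one, unless a locked/boundary bit blocks it), here is a hedged sketch with invented bit and struct names; the real code walks the rbtree, erases the absorbed node and calls the delalloc accounting hooks, all of which are omitted.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define X_LOCKED   (1u << 0)	/* stand-ins for EXTENT_LOCKED/EXTENT_BOUNDARY */
#define X_BOUNDARY (1u << 1)
#define X_DIRTY    (1u << 2)

struct xstate { uint64_t start, end; uint32_t state; };

/* Merge b into a when they touch and carry exactly the same state bits. */
static bool try_merge(struct xstate *a, const struct xstate *b)
{
	if ((a->state | b->state) & (X_LOCKED | X_BOUNDARY))
		return false;			/* never merge locked/boundary */
	if (a->end + 1 != b->start || a->state != b->state)
		return false;
	a->end = b->end;			/* b would be erased and freed */
	return true;
}

int main(void)
{
	struct xstate a = { 0, 4095, X_DIRTY };
	struct xstate b = { 4096, 8191, X_DIRTY };

	if (try_merge(&a, &b))
		printf("merged -> [%llu, %llu]\n",
		       (unsigned long long)a.start, (unsigned long long)a.end);
	return 0;
}
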
-
-static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state, u32 *bits,
- struct extent_changeset *changeset);
-
-/*
- * insert an extent_state struct into the tree. 'bits' are set on the
- * struct before it is inserted.
- *
- * This may return -EEXIST if the extent is already there, in which case the
- * state struct is freed.
- *
- * The tree lock is not taken internally. This is a utility function and
- * probably isn't what you want to call (see set/clear_extent_bit).
- */
-static int insert_state(struct extent_io_tree *tree,
- struct extent_state *state, u64 start, u64 end,
- struct rb_node ***p,
- struct rb_node **parent,
- u32 *bits, struct extent_changeset *changeset)
-{
- struct rb_node *node;
-
- if (end < start) {
- btrfs_err(tree->fs_info,
- "insert state: end < start %llu %llu", end, start);
- WARN_ON(1);
- }
- state->start = start;
- state->end = end;
-
- set_state_bits(tree, state, bits, changeset);
-
- node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
- if (node) {
- struct extent_state *found;
- found = rb_entry(node, struct extent_state, rb_node);
- btrfs_err(tree->fs_info,
- "found node %llu %llu on insert of %llu %llu",
- found->start, found->end, start, end);
- return -EEXIST;
- }
- merge_state(tree, state);
- return 0;
-}
-
-/*
- * split a given extent state struct in two, inserting the preallocated
- * struct 'prealloc' as the newly created second half. 'split' indicates an
- * offset inside 'orig' where it should be split.
- *
- * Before calling, the tree has 'orig' at [orig->start, orig->end]. After
- * calling, there are two extent state structs in the tree:
- * prealloc: [orig->start, split - 1]
- * orig: [ split, orig->end ]
- *
- * The tree locks are not taken by this function. They need to be held
- * by the caller.
- */
-static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
- struct extent_state *prealloc, u64 split)
-{
- struct rb_node *node;
-
- if (tree->private_data && is_data_inode(tree->private_data))
- btrfs_split_delalloc_extent(tree->private_data, orig, split);
-
- prealloc->start = orig->start;
- prealloc->end = split - 1;
- prealloc->state = orig->state;
- orig->start = split;
-
- node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
- &prealloc->rb_node, NULL, NULL);
- if (node) {
- free_extent_state(prealloc);
- return -EEXIST;
- }
- return 0;
-}
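
The split contract documented above (prealloc takes [orig->start, split - 1], orig keeps [split, orig->end]) can be checked with a small standalone sketch; the names below are illustrative only and the rbtree insertion and delalloc hook are left out.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct xstate { uint64_t start, end; uint32_t state; };

/* Mirror the split rule: prealloc becomes the first half, orig the second. */
static void split_range(struct xstate *orig, struct xstate *prealloc,
			uint64_t split)
{
	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;
}

int main(void)
{
	struct xstate orig = { 0, 8191, 0x4 };
	struct xstate prealloc;

	split_range(&orig, &prealloc, 4096);
	assert(prealloc.start == 0 && prealloc.end == 4095);
	assert(orig.start == 4096 && orig.end == 8191);
	printf("prealloc [%llu,%llu], orig [%llu,%llu]\n",
	       (unsigned long long)prealloc.start,
	       (unsigned long long)prealloc.end,
	       (unsigned long long)orig.start,
	       (unsigned long long)orig.end);
	return 0;
}
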
-
-static struct extent_state *next_state(struct extent_state *state)
-{
- struct rb_node *next = rb_next(&state->rb_node);
- if (next)
- return rb_entry(next, struct extent_state, rb_node);
- else
- return NULL;
-}
-
-/*
- * utility function to clear some bits in an extent state struct.
- * It will optionally wake up anyone waiting on this state (wake == 1).
- *
- * If no bits are set on the state struct after clearing things, the
- * struct is freed and removed from the tree
- */
-static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
- struct extent_state *state,
- u32 *bits, int wake,
- struct extent_changeset *changeset)
-{
- struct extent_state *next;
- u32 bits_to_clear = *bits & ~EXTENT_CTLBITS;
- int ret;
-
- if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
- u64 range = state->end - state->start + 1;
- WARN_ON(range > tree->dirty_bytes);
- tree->dirty_bytes -= range;
- }
-
- if (tree->private_data && is_data_inode(tree->private_data))
- btrfs_clear_delalloc_extent(tree->private_data, state, bits);
-
- ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
- BUG_ON(ret < 0);
- state->state &= ~bits_to_clear;
- if (wake)
- wake_up(&state->wq);
- if (state->state == 0) {
- next = next_state(state);
- if (extent_state_in_tree(state)) {
- rb_erase(&state->rb_node, &tree->state);
- RB_CLEAR_NODE(&state->rb_node);
- free_extent_state(state);
- } else {
- WARN_ON(1);
- }
- } else {
- merge_state(tree, state);
- next = next_state(state);
- }
- return next;
-}
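
A hedged sketch of the clear_state_bit() rule above: the masked bits are dropped, and a state left with no bits at all would be removed from the tree and freed (modelled here by a boolean return instead of erasing an rbtree node). All names are made up for the example.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define X_DIRTY    (1u << 0)
#define X_DELALLOC (1u << 1)

struct xstate { uint64_t start, end; uint32_t state; };

/* Clear 'bits'; report whether the state is now empty and would be freed. */
static bool clear_bits(struct xstate *s, uint32_t bits)
{
	s->state &= ~bits;
	return s->state == 0;
}

int main(void)
{
	struct xstate s = { 0, 4095, X_DIRTY | X_DELALLOC };

	printf("empty after clearing DIRTY: %d\n", clear_bits(&s, X_DIRTY));
	printf("empty after clearing DELALLOC: %d\n", clear_bits(&s, X_DELALLOC));
	return 0;
}
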
-
-static struct extent_state *
-alloc_extent_state_atomic(struct extent_state *prealloc)
-{
- if (!prealloc)
- prealloc = alloc_extent_state(GFP_ATOMIC);
-
- return prealloc;
-}
-
-static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
-{
- btrfs_panic(tree->fs_info, err,
- "locking error: extent tree was modified by another thread while locked");
-}
-
-/*
- * clear some bits on a range in the tree. This may require splitting
- * or inserting elements in the tree, so the gfp mask is used to
- * indicate which allocations or sleeping are allowed.
- *
- * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
- * the given range from the tree regardless of state (i.e. for truncate).
- *
- * the range [start, end] is inclusive.
- *
- * This takes the tree lock, and returns 0 on success and < 0 on error.
- */
-int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int wake, int delete,
- struct extent_state **cached_state,
- gfp_t mask, struct extent_changeset *changeset)
-{
- struct extent_state *state;
- struct extent_state *cached;
- struct extent_state *prealloc = NULL;
- struct rb_node *node;
- u64 last_end;
- int err;
- int clear = 0;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
- trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
-
- if (bits & EXTENT_DELALLOC)
- bits |= EXTENT_NORESERVE;
-
- if (delete)
- bits |= ~EXTENT_CTLBITS;
-
- if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
- clear = 1;
-again:
- if (!prealloc && gfpflags_allow_blocking(mask)) {
- /*
- * Don't care for allocation failure here because we might end
- * up not needing the pre-allocated extent state at all, which
- * is the case if the tree only contains extent states that cover
- * our input range and don't cover any other range.
- * If we end up needing a new extent state we allocate it later.
- */
- prealloc = alloc_extent_state(mask);
- }
-
- spin_lock(&tree->lock);
- if (cached_state) {
- cached = *cached_state;
-
- if (clear) {
- *cached_state = NULL;
- cached_state = NULL;
- }
-
- if (cached && extent_state_in_tree(cached) &&
- cached->start <= start && cached->end > start) {
- if (clear)
- refcount_dec(&cached->refs);
- state = cached;
- goto hit_next;
- }
- if (clear)
- free_extent_state(cached);
- }
- /*
- * this search will find the extents that end after
- * our range starts
- */
- node = tree_search(tree, start);
- if (!node)
- goto out;
- state = rb_entry(node, struct extent_state, rb_node);
-hit_next:
- if (state->start > end)
- goto out;
- WARN_ON(state->end < start);
- last_end = state->end;
-
- /* the state doesn't have the wanted bits, go ahead */
- if (!(state->state & bits)) {
- state = next_state(state);
- goto next;
- }
-
- /*
- * | ---- desired range ---- |
- * | state | or
- * | ------------- state -------------- |
- *
- * We need to split the extent we found, and may flip
- * bits on second half.
- *
- * If the extent we found extends past our range, we
- * just split and search again. It'll get split again
- * the next time though.
- *
- * If the extent we found is inside our range, we clear
- * the desired bit on it.
- */
-
- if (state->start < start) {
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, err);
-
- prealloc = NULL;
- if (err)
- goto out;
- if (state->end <= end) {
- state = clear_state_bit(tree, state, &bits, wake,
- changeset);
- goto next;
- }
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state |
- * We need to split the extent, and clear the bit
- * on the first half
- */
- if (state->start <= end && state->end > end) {
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, err);
-
- if (wake)
- wake_up(&state->wq);
-
- clear_state_bit(tree, prealloc, &bits, wake, changeset);
-
- prealloc = NULL;
- goto out;
- }
-
- state = clear_state_bit(tree, state, &bits, wake, changeset);
-next:
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- if (start <= end && state && !need_resched())
- goto hit_next;
-
-search_again:
- if (start > end)
- goto out;
- spin_unlock(&tree->lock);
- if (gfpflags_allow_blocking(mask))
- cond_resched();
- goto again;
-
-out:
- spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
-
- return 0;
-
-}
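
To make the boundary handling in __clear_extent_bit() concrete, here is a hedged, userspace-only walk-through of clearing a bit over [4096, 12287] when a single state spans [0, 16383]: the state is split at both edges so only the middle piece loses the bit. The array model and bit name are invented; the real function additionally handles caching, waking waiters and re-searching the tree.

#include <stdint.h>
#include <stdio.h>

#define X_DIRTY (1u << 0)

struct xstate { uint64_t start, end; uint32_t state; };

int main(void)
{
	/* One state covering [0, 16383] with DIRTY set. */
	struct xstate s[3] = { { 0, 16383, X_DIRTY } };
	uint64_t start = 4096, end = 12287;

	/* Split at 'start': s[0] keeps the left part, s[1] the rest. */
	s[1] = (struct xstate){ start, s[0].end, s[0].state };
	s[0].end = start - 1;

	/* Split at 'end' + 1: s[1] keeps the middle, s[2] the right part. */
	s[2] = (struct xstate){ end + 1, s[1].end, s[1].state };
	s[1].end = end;

	/* Only the middle piece, fully inside [start, end], is cleared. */
	s[1].state &= ~X_DIRTY;

	for (int i = 0; i < 3; i++)
		printf("[%llu, %llu] state=%u\n",
		       (unsigned long long)s[i].start,
		       (unsigned long long)s[i].end, s[i].state);
	return 0;
}
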
-
-static void wait_on_state(struct extent_io_tree *tree,
- struct extent_state *state)
- __releases(tree->lock)
- __acquires(tree->lock)
-{
- DEFINE_WAIT(wait);
- prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock(&tree->lock);
- schedule();
- spin_lock(&tree->lock);
- finish_wait(&state->wq, &wait);
-}
-
-/*
- * waits for one or more bits to clear on a range in the state tree.
- * The range [start, end] is inclusive.
- * The tree lock is taken by this function
- */
-static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits)
-{
- struct extent_state *state;
- struct rb_node *node;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
-
- spin_lock(&tree->lock);
-again:
- while (1) {
- /*
- * this search will find all the extents that end after
- * our range starts
- */
- node = tree_search(tree, start);
-process_node:
- if (!node)
- break;
-
- state = rb_entry(node, struct extent_state, rb_node);
-
- if (state->start > end)
- goto out;
-
- if (state->state & bits) {
- start = state->start;
- refcount_inc(&state->refs);
- wait_on_state(tree, state);
- free_extent_state(state);
- goto again;
- }
- start = state->end + 1;
-
- if (start > end)
- break;
-
- if (!cond_resched_lock(&tree->lock)) {
- node = rb_next(node);
- goto process_node;
- }
- }
-out:
- spin_unlock(&tree->lock);
-}
-
-static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state,
- u32 *bits, struct extent_changeset *changeset)
-{
- u32 bits_to_set = *bits & ~EXTENT_CTLBITS;
- int ret;
-
- if (tree->private_data && is_data_inode(tree->private_data))
- btrfs_set_delalloc_extent(tree->private_data, state, bits);
-
- if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
- u64 range = state->end - state->start + 1;
- tree->dirty_bytes += range;
- }
- ret = add_extent_changeset(state, bits_to_set, changeset, 1);
- BUG_ON(ret < 0);
- state->state |= bits_to_set;
-}
-
-static void cache_state_if_flags(struct extent_state *state,
- struct extent_state **cached_ptr,
- unsigned flags)
-{
- if (cached_ptr && !(*cached_ptr)) {
- if (!flags || (state->state & flags)) {
- *cached_ptr = state;
- refcount_inc(&state->refs);
- }
- }
-}
-
-static void cache_state(struct extent_state *state,
- struct extent_state **cached_ptr)
-{
- return cache_state_if_flags(state, cached_ptr,
- EXTENT_LOCKED | EXTENT_BOUNDARY);
-}
-
-/*
- * set some bits on a range in the tree. This may require allocations or
- * sleeping, so the gfp mask is used to indicate what is allowed.
- *
- * If any of the exclusive bits are set, this will fail with -EEXIST if some
- * part of the range already has the exclusive bits set. The start of the
- * existing range is returned in failed_start in this case.
- *
- * [start, end] is inclusive. This takes the tree lock.
- */
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- u32 exclusive_bits, u64 *failed_start,
- struct extent_state **cached_state, gfp_t mask,
- struct extent_changeset *changeset)
-{
- struct extent_state *state;
- struct extent_state *prealloc = NULL;
- struct rb_node *node;
- struct rb_node **p;
- struct rb_node *parent;
- int err = 0;
- u64 last_start;
- u64 last_end;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
- trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
-
- if (exclusive_bits)
- ASSERT(failed_start);
- else
- ASSERT(failed_start == NULL);
-again:
- if (!prealloc && gfpflags_allow_blocking(mask)) {
- /*
- * Don't care for allocation failure here because we might end
- * up not needing the pre-allocated extent state at all, which
- * is the case if the tree only contains extent states that cover
- * our input range and don't cover any other range.
- * If we end up needing a new extent state we allocate it later.
- */
- prealloc = alloc_extent_state(mask);
- }
-
- spin_lock(&tree->lock);
- if (cached_state && *cached_state) {
- state = *cached_state;
- if (state->start <= start && state->end > start &&
- extent_state_in_tree(state)) {
- node = &state->rb_node;
- goto hit_next;
- }
- }
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search_for_insert(tree, start, &p, &parent);
- if (!node) {
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = insert_state(tree, prealloc, start, end,
- &p, &parent, &bits, changeset);
- if (err)
- extent_io_tree_panic(tree, err);
-
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- goto out;
- }
- state = rb_entry(node, struct extent_state, rb_node);
-hit_next:
- last_start = state->start;
- last_end = state->end;
-
- /*
- * | ---- desired range ---- |
- * | state |
- *
- * Just lock what we found and keep going
- */
- if (state->start == start && state->end <= end) {
- if (state->state & exclusive_bits) {
- *failed_start = state->start;
- err = -EEXIST;
- goto out;
- }
-
- set_state_bits(tree, state, &bits, changeset);
- cache_state(state, cached_state);
- merge_state(tree, state);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- state = next_state(state);
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- goto search_again;
- }
-
- /*
- * | ---- desired range ---- |
- * | state |
- * or
- * | ------------- state -------------- |
- *
- * We need to split the extent we found, and may flip bits on
- * second half.
- *
- * If the extent we found extends past our
- * range, we just split and search again. It'll get split
- * again the next time though.
- *
- * If the extent we found is inside our range, we set the
- * desired bit on it.
- */
- if (state->start < start) {
- if (state->state & exclusive_bits) {
- *failed_start = start;
- err = -EEXIST;
- goto out;
- }
-
- /*
- * If this extent already has all the bits we want set, then
- * skip it, not necessary to split it or do anything with it.
- */
- if ((state->state & bits) == bits) {
- start = state->end + 1;
- cache_state(state, cached_state);
- goto search_again;
- }
-
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, err);
-
- prealloc = NULL;
- if (err)
- goto out;
- if (state->end <= end) {
- set_state_bits(tree, state, &bits, changeset);
- cache_state(state, cached_state);
- merge_state(tree, state);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- state = next_state(state);
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- }
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state | or | state |
- *
- * There's a hole, we need to insert something in it and
- * ignore the extent we found.
- */
- if (state->start > start) {
- u64 this_end;
- if (end < last_start)
- this_end = end;
- else
- this_end = last_start - 1;
-
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
-
- /*
- * Avoid freeing 'prealloc' if it can be merged with
- * the later extent.
- */
- err = insert_state(tree, prealloc, start, this_end,
- NULL, NULL, &bits, changeset);
- if (err)
- extent_io_tree_panic(tree, err);
-
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- start = this_end + 1;
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state |
- * We need to split the extent, and set the bit
- * on the first half
- */
- if (state->start <= end && state->end > end) {
- if (state->state & exclusive_bits) {
- *failed_start = start;
- err = -EEXIST;
- goto out;
- }
-
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, err);
-
- set_state_bits(tree, prealloc, &bits, changeset);
- cache_state(prealloc, cached_state);
- merge_state(tree, prealloc);
- prealloc = NULL;
- goto out;
- }
-
-search_again:
- if (start > end)
- goto out;
- spin_unlock(&tree->lock);
- if (gfpflags_allow_blocking(mask))
- cond_resched();
- goto again;
-
-out:
- spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
-
- return err;
-
-}
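
A minimal sketch, assuming a simplified single-state model, of the exclusive-bit contract documented above for set_extent_bit(): if any exclusive bit is already present in an overlapping state, the call fails with -EEXIST and reports where the conflict starts via failed_start. Apart from the EEXIST errno, all names below are invented.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define X_LOCKED (1u << 0)

struct xstate { uint64_t start, end; uint32_t state; };

/* Set 'bits' on an overlapping state unless an exclusive bit is present. */
static int set_bits(struct xstate *s, uint32_t bits, uint32_t exclusive_bits,
		    uint64_t *failed_start)
{
	if (s->state & exclusive_bits) {
		*failed_start = s->start;
		return -EEXIST;
	}
	s->state |= bits;
	return 0;
}

int main(void)
{
	struct xstate s = { 4096, 8191, X_LOCKED };
	uint64_t failed_start = 0;
	int ret = set_bits(&s, X_LOCKED, X_LOCKED, &failed_start);

	if (ret == -EEXIST)
		printf("range already locked starting at %llu\n",
		       (unsigned long long)failed_start);
	return 0;
}
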
-
-/**
- * convert_extent_bit - convert all bits in a given range from one bit to
- * another
- * @tree: the io tree to search
- * @start: the start offset in bytes
- * @end: the end offset in bytes (inclusive)
- * @bits: the bits to set in this range
- * @clear_bits: the bits to clear in this range
- * @cached_state: state that we're going to cache
- *
- * This will go through and set bits for the given range. If any states exist
- * already in this range they are set with the given bits and cleared of the
- * clear_bits. This is only meant to be used by things that are mergeable, i.e.
- * converting from, say, DELALLOC to DIRTY. This is not meant to be used with
- * boundary bits like EXTENT_LOCKED.
- *
- * All allocations are done with GFP_NOFS.
- */
-int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, u32 clear_bits,
- struct extent_state **cached_state)
-{
- struct extent_state *state;
- struct extent_state *prealloc = NULL;
- struct rb_node *node;
- struct rb_node **p;
- struct rb_node *parent;
- int err = 0;
- u64 last_start;
- u64 last_end;
- bool first_iteration = true;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
- trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
- clear_bits);
-
-again:
- if (!prealloc) {
- /*
- * Best effort, don't worry if extent state allocation fails
- * here for the first iteration. We might have a cached state
- * that matches exactly the target range, in which case no
- * extent state allocations are needed. We'll only know this
- * after locking the tree.
- */
- prealloc = alloc_extent_state(GFP_NOFS);
- if (!prealloc && !first_iteration)
- return -ENOMEM;
- }
-
- spin_lock(&tree->lock);
- if (cached_state && *cached_state) {
- state = *cached_state;
- if (state->start <= start && state->end > start &&
- extent_state_in_tree(state)) {
- node = &state->rb_node;
- goto hit_next;
- }
- }
-
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search_for_insert(tree, start, &p, &parent);
- if (!node) {
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
- err = insert_state(tree, prealloc, start, end,
- &p, &parent, &bits, NULL);
- if (err)
- extent_io_tree_panic(tree, err);
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- goto out;
- }
- state = rb_entry(node, struct extent_state, rb_node);
-hit_next:
- last_start = state->start;
- last_end = state->end;
-
- /*
- * | ---- desired range ---- |
- * | state |
- *
- * Just lock what we found and keep going
- */
- if (state->start == start && state->end <= end) {
- set_state_bits(tree, state, &bits, NULL);
- cache_state(state, cached_state);
- state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- goto search_again;
- }
-
- /*
- * | ---- desired range ---- |
- * | state |
- * or
- * | ------------- state -------------- |
- *
- * We need to split the extent we found, and may flip bits on
- * second half.
- *
- * If the extent we found extends past our
- * range, we just split and search again. It'll get split
- * again the next time though.
- *
- * If the extent we found is inside our range, we set the
- * desired bit on it.
- */
- if (state->start < start) {
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, err);
- prealloc = NULL;
- if (err)
- goto out;
- if (state->end <= end) {
- set_state_bits(tree, state, &bits, NULL);
- cache_state(state, cached_state);
- state = clear_state_bit(tree, state, &clear_bits, 0,
- NULL);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- }
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state | or | state |
- *
- * There's a hole, we need to insert something in it and
- * ignore the extent we found.
- */
- if (state->start > start) {
- u64 this_end;
- if (end < last_start)
- this_end = end;
- else
- this_end = last_start - 1;
-
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
-
- /*
- * Avoid freeing 'prealloc' if it can be merged with
- * the later extent.
- */
- err = insert_state(tree, prealloc, start, this_end,
- NULL, NULL, &bits, NULL);
- if (err)
- extent_io_tree_panic(tree, err);
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- start = this_end + 1;
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state |
- * We need to split the extent, and set the bit
- * on the first half
- */
- if (state->start <= end && state->end > end) {
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
-
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, err);
-
- set_state_bits(tree, prealloc, &bits, NULL);
- cache_state(prealloc, cached_state);
- clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
- prealloc = NULL;
- goto out;
- }
-
-search_again:
- if (start > end)
- goto out;
- spin_unlock(&tree->lock);
- cond_resched();
- first_iteration = false;
- goto again;
-
-out:
- spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
-
- return err;
-}
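
As a rough illustration of convert_extent_bit()'s effect (set one group of bits and clear another over the same range, done under a single tree lock in the real code), here is a hedged standalone sketch with invented bit names and a single state.

#include <stdint.h>
#include <stdio.h>

#define X_DELALLOC (1u << 0)
#define X_DIRTY    (1u << 1)

struct xstate { uint64_t start, end; uint32_t state; };

/* Apply the convert rule to one state: set 'bits', clear 'clear_bits'. */
static void convert(struct xstate *s, uint32_t bits, uint32_t clear_bits)
{
	s->state |= bits;
	s->state &= ~clear_bits;
}

int main(void)
{
	struct xstate s = { 0, 4095, X_DELALLOC };

	convert(&s, X_DIRTY, X_DELALLOC);
	printf("state=0x%x (DIRTY set, DELALLOC cleared)\n", s.state);
	return 0;
}
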
-
-/* wrappers around set/clear extent bit */
-int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_changeset *changeset)
-{
- /*
- * We don't support EXTENT_LOCKED yet, as current changeset will
- * record any bits changed, so for EXTENT_LOCKED case, it will
- * either fail with -EEXIST or changeset will record the whole
- * range.