aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/cache.c201
-rw-r--r--fs/9p/cache.h25
-rw-r--r--fs/9p/fid.c31
-rw-r--r--fs/9p/v9fs.c23
-rw-r--r--fs/9p/v9fs.h24
-rw-r--r--fs/9p/vfs_addr.c190
-rw-r--r--fs/9p/vfs_dir.c13
-rw-r--r--fs/9p/vfs_file.c9
-rw-r--r--fs/9p/vfs_inode.c52
-rw-r--r--fs/9p/vfs_inode_dotl.c35
-rw-r--r--fs/9p/vfs_super.c3
-rw-r--r--fs/Kconfig38
-rw-r--r--fs/Kconfig.binfmt21
-rw-r--r--fs/Makefile4
-rw-r--r--fs/adfs/inode.c17
-rw-r--r--fs/adfs/super.c2
-rw-r--r--fs/affs/file.c27
-rw-r--r--fs/affs/super.c2
-rw-r--r--fs/affs/symlink.c5
-rw-r--r--fs/afs/Makefile3
-rw-r--r--fs/afs/cache.c68
-rw-r--r--fs/afs/callback.c2
-rw-r--r--fs/afs/cell.c12
-rw-r--r--fs/afs/dir.c62
-rw-r--r--fs/afs/dir_edit.c10
-rw-r--r--fs/afs/dir_silly.c4
-rw-r--r--fs/afs/dynroot.c1
-rw-r--r--fs/afs/file.c136
-rw-r--r--fs/afs/fs_operation.c6
-rw-r--r--fs/afs/inode.c162
-rw-r--r--fs/afs/internal.h75
-rw-r--r--fs/afs/main.c14
-rw-r--r--fs/afs/misc.c5
-rw-r--r--fs/afs/proc.c6
-rw-r--r--fs/afs/rotate.c4
-rw-r--r--fs/afs/rxrpc.c8
-rw-r--r--fs/afs/security.c3
-rw-r--r--fs/afs/super.c14
-rw-r--r--fs/afs/volume.c32
-rw-r--r--fs/afs/write.c132
-rw-r--r--fs/aio.c35
-rw-r--r--fs/attr.c26
-rw-r--r--fs/befs/linuxvfs.c19
-rw-r--r--fs/bfs/file.c14
-rw-r--r--fs/bfs/inode.c2
-rw-r--r--fs/binfmt_elf.c180
-rw-r--r--fs/binfmt_elf_fdpic.c20
-rw-r--r--fs/binfmt_elf_test.c64
-rw-r--r--fs/binfmt_flat.c232
-rw-r--r--fs/btrfs/Kconfig3
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/acl.c39
-rw-r--r--fs/btrfs/async-thread.c122
-rw-r--r--fs/btrfs/async-thread.h7
-rw-r--r--fs/btrfs/backref.c84
-rw-r--r--fs/btrfs/block-group.c355
-rw-r--r--fs/btrfs/block-group.h13
-rw-r--r--fs/btrfs/block-rsv.c84
-rw-r--r--fs/btrfs/block-rsv.h5
-rw-r--r--fs/btrfs/btrfs_inode.h94
-rw-r--r--fs/btrfs/check-integrity.c173
-rw-r--r--fs/btrfs/check-integrity.h6
-rw-r--r--fs/btrfs/compression.c142
-rw-r--r--fs/btrfs/compression.h14
-rw-r--r--fs/btrfs/ctree.c763
-rw-r--r--fs/btrfs/ctree.h433
-rw-r--r--fs/btrfs/delalloc-space.c25
-rw-r--r--fs/btrfs/delayed-inode.c87
-rw-r--r--fs/btrfs/delayed-ref.c29
-rw-r--r--fs/btrfs/delayed-ref.h1
-rw-r--r--fs/btrfs/dev-replace.c88
-rw-r--r--fs/btrfs/dir-item.c43
-rw-r--r--fs/btrfs/disk-io.c1017
-rw-r--r--fs/btrfs/disk-io.h23
-rw-r--r--fs/btrfs/extent-io-tree.h4
-rw-r--r--fs/btrfs/extent-tree.c392
-rw-r--r--fs/btrfs/extent_io.c842
-rw-r--r--fs/btrfs/extent_io.h49
-rw-r--r--fs/btrfs/extent_map.c6
-rw-r--r--fs/btrfs/extent_map.h8
-rw-r--r--fs/btrfs/file-item.c109
-rw-r--r--fs/btrfs/file.c574
-rw-r--r--fs/btrfs/free-space-cache.c333
-rw-r--r--fs/btrfs/free-space-cache.h10
-rw-r--r--fs/btrfs/free-space-tree.c58
-rw-r--r--fs/btrfs/inode-item.c344
-rw-r--r--fs/btrfs/inode-item.h96
-rw-r--r--fs/btrfs/inode.c3790
-rw-r--r--fs/btrfs/ioctl.c1043
-rw-r--r--fs/btrfs/locking.c3
-rw-r--r--fs/btrfs/lzo.c20
-rw-r--r--fs/btrfs/ordered-data.c132
-rw-r--r--fs/btrfs/ordered-data.h25
-rw-r--r--fs/btrfs/print-tree.c13
-rw-r--r--fs/btrfs/props.c106
-rw-r--r--fs/btrfs/props.h8
-rw-r--r--fs/btrfs/qgroup.c134
-rw-r--r--fs/btrfs/qgroup.h12
-rw-r--r--fs/btrfs/raid56.c809
-rw-r--r--fs/btrfs/raid56.h9
-rw-r--r--fs/btrfs/reada.c1086
-rw-r--r--fs/btrfs/ref-verify.c8
-rw-r--r--fs/btrfs/reflink.c88
-rw-r--r--fs/btrfs/relocation.c95
-rw-r--r--fs/btrfs/root-tree.c24
-rw-r--r--fs/btrfs/scrub.c2053
-rw-r--r--fs/btrfs/send.c834
-rw-r--r--fs/btrfs/send.h2
-rw-r--r--fs/btrfs/space-info.c109
-rw-r--r--fs/btrfs/space-info.h10
-rw-r--r--fs/btrfs/subpage.c57
-rw-r--r--fs/btrfs/subpage.h2
-rw-r--r--fs/btrfs/super.c153
-rw-r--r--fs/btrfs/sysfs.c72
-rw-r--r--fs/btrfs/tests/btrfs-tests.c25
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c17
-rw-r--r--fs/btrfs/tests/extent-io-tests.c52
-rw-r--r--fs/btrfs/tests/extent-map-tests.c2
-rw-r--r--fs/btrfs/tests/free-space-tests.c186
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c5
-rw-r--r--fs/btrfs/tests/qgroup-tests.c5
-rw-r--r--fs/btrfs/transaction.c382
-rw-r--r--fs/btrfs/transaction.h8
-rw-r--r--fs/btrfs/tree-checker.c171
-rw-r--r--fs/btrfs/tree-checker.h1
-rw-r--r--fs/btrfs/tree-defrag.c8
-rw-r--r--fs/btrfs/tree-log.c1620
-rw-r--r--fs/btrfs/tree-log.h7
-rw-r--r--fs/btrfs/uuid-tree.c10
-rw-r--r--fs/btrfs/verity.c2
-rw-r--r--fs/btrfs/volumes.c479
-rw-r--r--fs/btrfs/volumes.h65
-rw-r--r--fs/btrfs/xattr.c59
-rw-r--r--fs/btrfs/zoned.c497
-rw-r--r--fs/btrfs/zoned.h58
-rw-r--r--fs/btrfs/zstd.c14
-rw-r--r--fs/buffer.c369
-rw-r--r--fs/cachefiles/Kconfig19
-rw-r--r--fs/cachefiles/Makefile7
-rw-r--r--fs/cachefiles/bind.c278
-rw-r--r--fs/cachefiles/cache.c383
-rw-r--r--fs/cachefiles/daemon.c292
-rw-r--r--fs/cachefiles/error_inject.c46
-rw-r--r--fs/cachefiles/interface.c747
-rw-r--r--fs/cachefiles/internal.h334
-rw-r--r--fs/cachefiles/io.c435
-rw-r--r--fs/cachefiles/key.c201
-rw-r--r--fs/cachefiles/main.c22
-rw-r--r--fs/cachefiles/namei.c1232
-rw-r--r--fs/cachefiles/ondemand.c503
-rw-r--r--fs/cachefiles/rdwr.c972
-rw-r--r--fs/cachefiles/security.c2
-rw-r--r--fs/cachefiles/volume.c139
-rw-r--r--fs/cachefiles/xattr.c436
-rw-r--r--fs/ceph/addr.c555
-rw-r--r--fs/ceph/cache.c220
-rw-r--r--fs/ceph/cache.h103
-rw-r--r--fs/ceph/caps.c280
-rw-r--r--fs/ceph/debugfs.c5
-rw-r--r--fs/ceph/dir.c17
-rw-r--r--fs/ceph/file.c167
-rw-r--r--fs/ceph/inode.c136
-rw-r--r--fs/ceph/locks.c8
-rw-r--r--fs/ceph/mds_client.c203
-rw-r--r--fs/ceph/mds_client.h17
-rw-r--r--fs/ceph/metric.c65
-rw-r--r--fs/ceph/metric.h63
-rw-r--r--fs/ceph/quota.c36
-rw-r--r--fs/ceph/snap.c271
-rw-r--r--fs/ceph/strings.c1
-rw-r--r--fs/ceph/super.c190
-rw-r--r--fs/ceph/super.h87
-rw-r--r--fs/ceph/xattr.c37
-rw-r--r--fs/cifs/Makefile6
-rw-r--r--fs/cifs/cache.c105
-rw-r--r--fs/cifs/cifs_debug.c33
-rw-r--r--fs/cifs/cifs_spnego.c4
-rw-r--r--fs/cifs/cifs_spnego.h3
-rw-r--r--fs/cifs/cifs_swn.c13
-rw-r--r--fs/cifs/cifsacl.c9
-rw-r--r--fs/cifs/cifsencrypt.c14
-rw-r--r--fs/cifs/cifsfs.c87
-rw-r--r--fs/cifs/cifsfs.h6
-rw-r--r--fs/cifs/cifsglob.h312
-rw-r--r--fs/cifs/cifspdu.h16
-rw-r--r--fs/cifs/cifsproto.h58
-rw-r--r--fs/cifs/cifssmb.c100
-rw-r--r--fs/cifs/connect.c583
-rw-r--r--fs/cifs/dfs_cache.c117
-rw-r--r--fs/cifs/dir.c5
-rw-r--r--fs/cifs/file.c383
-rw-r--r--fs/cifs/fs_context.c83
-rw-r--r--fs/cifs/fs_context.h4
-rw-r--r--fs/cifs/fscache.c416
-rw-r--r--fs/cifs/fscache.h195
-rw-r--r--fs/cifs/inode.c40
-rw-r--r--fs/cifs/link.c3
-rw-r--r--fs/cifs/misc.c92
-rw-r--r--fs/cifs/netmisc.c5
-rw-r--r--fs/cifs/ntlmssp.h32
-rw-r--r--fs/cifs/readdir.c179
-rw-r--r--fs/cifs/sess.c493
-rw-r--r--fs/cifs/smb1ops.c28
-rw-r--r--fs/cifs/smb2glob.h13
-rw-r--r--fs/cifs/smb2inode.c7
-rw-r--r--fs/cifs/smb2misc.c41
-rw-r--r--fs/cifs/smb2ops.c535
-rw-r--r--fs/cifs/smb2pdu.c367
-rw-r--r--fs/cifs/smb2pdu.h582
-rw-r--r--fs/cifs/smb2proto.h8
-rw-r--r--fs/cifs/smb2transport.c72
-rw-r--r--fs/cifs/smbdirect.c8
-rw-r--r--fs/cifs/trace.h47
-rw-r--r--fs/cifs/transport.c141
-rw-r--r--fs/cifs/xattr.c2
-rw-r--r--fs/coda/file.c1
-rw-r--r--fs/coda/inode.c2
-rw-r--r--fs/coda/symlink.c7
-rw-r--r--fs/compat_binfmt_elf.c2
-rw-r--r--fs/configfs/dir.c20
-rw-r--r--fs/coredump.c167
-rw-r--r--fs/cramfs/README8
-rw-r--r--fs/cramfs/inode.c7
-rw-r--r--fs/crypto/bio.c13
-rw-r--r--fs/crypto/crypto.c20
-rw-r--r--fs/crypto/fname.c11
-rw-r--r--fs/crypto/fscrypt_private.h10
-rw-r--r--fs/crypto/inline_crypt.c126
-rw-r--r--fs/crypto/keyring.c64
-rw-r--r--fs/crypto/keysetup.c22
-rw-r--r--fs/crypto/policy.c132
-rw-r--r--fs/dax.c276
-rw-r--r--fs/dcache.c40
-rw-r--r--fs/debugfs/file.c2
-rw-r--r--fs/debugfs/inode.c10
-rw-r--r--fs/devpts/inode.c2
-rw-r--r--fs/direct-io.c40
-rw-r--r--fs/dlm/ast.c16
-rw-r--r--fs/dlm/debug_fs.c96
-rw-r--r--fs/dlm/dir.c5
-rw-r--r--fs/dlm/dlm_internal.h78
-rw-r--r--fs/dlm/lock.c757
-rw-r--r--fs/dlm/lock.h4
-rw-r--r--fs/dlm/lockspace.c53
-rw-r--r--fs/dlm/lockspace.h1
-rw-r--r--fs/dlm/lowcomms.c221
-rw-r--r--fs/dlm/lowcomms.h6
-rw-r--r--fs/dlm/main.c3
-rw-r--r--fs/dlm/member.c14
-rw-r--r--fs/dlm/memory.c68
-rw-r--r--fs/dlm/memory.h6
-rw-r--r--fs/dlm/midcomms.c144
-rw-r--r--fs/dlm/midcomms.h3
-rw-r--r--fs/dlm/plock.c178
-rw-r--r--fs/dlm/rcom.c122
-rw-r--r--fs/dlm/recover.c49
-rw-r--r--fs/dlm/recoverd.c3
-rw-r--r--fs/dlm/requestqueue.c37
-rw-r--r--fs/dlm/user.c16
-rw-r--r--fs/dlm/util.c92
-rw-r--r--fs/dlm/util.h8
-rw-r--r--fs/ecryptfs/main.c2
-rw-r--r--fs/ecryptfs/mmap.c20
-rw-r--r--fs/ecryptfs/super.c2
-rw-r--r--fs/efs/inode.c8
-rw-r--r--fs/efs/super.c2
-rw-r--r--fs/efs/symlink.c5
-rw-r--r--fs/erofs/Kconfig10
-rw-r--r--fs/erofs/Makefile3
-rw-r--r--fs/erofs/compress.h4
-rw-r--r--fs/erofs/data.c183
-rw-r--r--fs/erofs/decompressor.c141
-rw-r--r--fs/erofs/decompressor_lzma.c19
-rw-r--r--fs/erofs/dir.c21
-rw-r--r--fs/erofs/erofs_fs.h73
-rw-r--r--fs/erofs/fscache.c520
-rw-r--r--fs/erofs/inode.c88
-rw-r--r--fs/erofs/internal.h133
-rw-r--r--fs/erofs/namei.c59
-rw-r--r--fs/erofs/super.c389
-rw-r--r--fs/erofs/sysfs.c258
-rw-r--r--fs/erofs/xattr.c135
-rw-r--r--fs/erofs/xattr.h1
-rw-r--r--fs/erofs/zdata.c580
-rw-r--r--fs/erofs/zdata.h68
-rw-r--r--fs/erofs/zmap.c237
-rw-r--r--fs/eventpoll.c10
-rw-r--r--fs/exec.c101
-rw-r--r--fs/exfat/balloc.c10
-rw-r--r--fs/exfat/dir.c42
-rw-r--r--fs/exfat/exfat_fs.h16
-rw-r--r--fs/exfat/fatent.c49
-rw-r--r--fs/exfat/file.c25
-rw-r--r--fs/exfat/inode.c28
-rw-r--r--fs/exfat/misc.c13
-rw-r--r--fs/exfat/namei.c134
-rw-r--r--fs/exfat/nls.c2
-rw-r--r--fs/exfat/super.c49
-rw-r--r--fs/exportfs/expfs.c5
-rw-r--r--fs/ext2/dir.c9
-rw-r--r--fs/ext2/ext2.h1
-rw-r--r--fs/ext2/ialloc.c5
-rw-r--r--fs/ext2/inode.c44
-rw-r--r--fs/ext2/super.c24
-rw-r--r--fs/ext4/Makefile1
-rw-r--r--fs/ext4/acl.c10
-rw-r--r--fs/ext4/balloc.c1
-rw-r--r--fs/ext4/block_validity.c26
-rw-r--r--fs/ext4/crypto.c246
-rw-r--r--fs/ext4/dir.c7
-rw-r--r--fs/ext4/ext4.h148
-rw-r--r--fs/ext4/ext4_jbd2.c2
-rw-r--r--fs/ext4/ext4_jbd2.h2
-rw-r--r--fs/ext4/extents.c87
-rw-r--r--fs/ext4/fast_commit.c486
-rw-r--r--fs/ext4/fast_commit.h33
-rw-r--r--fs/ext4/file.c20
-rw-r--r--fs/ext4/hash.c2
-rw-r--r--fs/ext4/indirect.c2
-rw-r--r--fs/ext4/inline.c96
-rw-r--r--fs/ext4/inode.c361
-rw-r--r--fs/ext4/ioctl.c417
-rw-r--r--fs/ext4/mballoc.c492
-rw-r--r--fs/ext4/migrate.c25
-rw-r--r--fs/ext4/mmp.c2
-rw-r--r--fs/ext4/move_extent.c18
-rw-r--r--fs/ext4/namei.c262
-rw-r--r--fs/ext4/orphan.c4
-rw-r--r--fs/ext4/page-io.c33
-rw-r--r--fs/ext4/readpage.c20
-rw-r--r--fs/ext4/resize.c36
-rw-r--r--fs/ext4/super.c2326
-rw-r--r--fs/ext4/symlink.c51
-rw-r--r--fs/ext4/sysfs.c44
-rw-r--r--fs/ext4/verity.c9
-rw-r--r--fs/ext4/xattr.c9
-rw-r--r--fs/f2fs/Kconfig8
-rw-r--r--fs/f2fs/acl.c21
-rw-r--r--fs/f2fs/checkpoint.c99
-rw-r--r--fs/f2fs/compress.c103
-rw-r--r--fs/f2fs/data.c850
-rw-r--r--fs/f2fs/debug.c43
-rw-r--r--fs/f2fs/dir.c25
-rw-r--r--fs/f2fs/f2fs.h368
-rw-r--r--fs/f2fs/file.c944
-rw-r--r--fs/f2fs/gc.c268
-rw-r--r--fs/f2fs/hash.c13
-rw-r--r--fs/f2fs/inline.c37
-rw-r--r--fs/f2fs/inode.c64
-rw-r--r--fs/f2fs/iostat.c37
-rw-r--r--fs/f2fs/namei.c129
-rw-r--r--fs/f2fs/node.c175
-rw-r--r--fs/f2fs/node.h4
-rw-r--r--fs/f2fs/recovery.c47
-rw-r--r--fs/f2fs/segment.c655
-rw-r--r--fs/f2fs/segment.h48
-rw-r--r--fs/f2fs/super.c318
-rw-r--r--fs/f2fs/sysfs.c79
-rw-r--r--fs/f2fs/verity.c15
-rw-r--r--fs/f2fs/xattr.c52
-rw-r--r--fs/fat/dir.c2
-rw-r--r--fs/fat/fat.h14
-rw-r--r--fs/fat/fatent.c7
-rw-r--r--fs/fat/file.c24
-rw-r--r--fs/fat/inode.c44
-rw-r--r--fs/fat/misc.c78
-rw-r--r--fs/fat/namei_vfat.c4
-rw-r--r--fs/fcntl.c27
-rw-r--r--fs/file.c202
-rw-r--r--fs/file_table.c63
-rw-r--r--fs/freevxfs/vxfs.h27
-rw-r--r--fs/freevxfs/vxfs_bmap.c26
-rw-r--r--fs/freevxfs/vxfs_dir.h27
-rw-r--r--fs/freevxfs/vxfs_extern.h27
-rw-r--r--fs/freevxfs/vxfs_fshead.c26
-rw-r--r--fs/freevxfs/vxfs_fshead.h27
-rw-r--r--fs/freevxfs/vxfs_immed.c41
-rw-r--r--fs/freevxfs/vxfs_inode.c26
-rw-r--r--fs/freevxfs/vxfs_inode.h27
-rw-r--r--fs/freevxfs/vxfs_lookup.c26
-rw-r--r--fs/freevxfs/vxfs_olt.c26
-rw-r--r--fs/freevxfs/vxfs_olt.h27
-rw-r--r--fs/freevxfs/vxfs_subr.c43
-rw-r--r--fs/freevxfs/vxfs_super.c28
-rw-r--r--fs/fs-writeback.c134
-rw-r--r--fs/fs_context.c2
-rw-r--r--fs/fs_parser.c31
-rw-r--r--fs/fscache/Makefile6
-rw-r--r--fs/fscache/cache.c618
-rw-r--r--fs/fscache/cookie.c1450
-rw-r--r--fs/fscache/fsdef.c98
-rw-r--r--fs/fscache/internal.h314
-rw-r--r--fs/fscache/io.c377
-rw-r--r--fs/fscache/main.c147
-rw-r--r--fs/fscache/netfs.c74
-rw-r--r--fs/fscache/object.c1125
-rw-r--r--fs/fscache/operation.c633
-rw-r--r--fs/fscache/page.c1242
-rw-r--r--fs/fscache/proc.c47
-rw-r--r--fs/fscache/stats.c293
-rw-r--r--fs/fscache/volume.c517
-rw-r--r--fs/fsopen.c4
-rw-r--r--fs/fuse/Kconfig2
-rw-r--r--fs/fuse/control.c17
-rw-r--r--fs/fuse/dax.c43
-rw-r--r--fs/fuse/dev.c20
-rw-r--r--fs/fuse/dir.c103
-rw-r--r--fs/fuse/file.c52
-rw-r--r--fs/fuse/fuse_i.h34
-rw-r--r--fs/fuse/inode.c94
-rw-r--r--fs/fuse/ioctl.c11
-rw-r--r--fs/fuse/virtio_fs.c47
-rw-r--r--fs/gfs2/aops.c124
-rw-r--r--fs/gfs2/bmap.c20
-rw-r--r--fs/gfs2/file.c184
-rw-r--r--fs/gfs2/glock.c56
-rw-r--r--fs/gfs2/glock.h12
-rw-r--r--fs/gfs2/glops.c2
-rw-r--r--fs/gfs2/inode.c51
-rw-r--r--fs/gfs2/inode.h2
-rw-r--r--fs/gfs2/lock_dlm.c15
-rw-r--r--fs/gfs2/lops.c9
-rw-r--r--fs/gfs2/main.c10
-rw-r--r--fs/gfs2/meta_io.c14
-rw-r--r--fs/gfs2/meta_io.h8
-rw-r--r--fs/gfs2/ops_fstype.c4
-rw-r--r--fs/gfs2/quota.c62
-rw-r--r--fs/gfs2/recovery.c22
-rw-r--r--fs/gfs2/rgrp.c28
-rw-r--r--fs/gfs2/rgrp.h2
-rw-r--r--fs/gfs2/super.c12
-rw-r--r--fs/gfs2/sys.c5
-rw-r--r--fs/hfs/extent.c6
-rw-r--r--fs/hfs/hfs_fs.h2
-rw-r--r--fs/hfs/inode.c44
-rw-r--r--fs/hfs/mdb.c2
-rw-r--r--fs/hfs/super.c2
-rw-r--r--fs/hfsplus/extents.c8
-rw-r--r--fs/hfsplus/hfsplus_fs.h2
-rw-r--r--fs/hfsplus/hfsplus_raw.h12
-rw-r--r--fs/hfsplus/inode.c44
-rw-r--r--fs/hfsplus/super.c2
-rw-r--r--fs/hfsplus/wrapper.c5
-rw-r--r--fs/hfsplus/xattr.c4
-rw-r--r--fs/hostfs/hostfs_kern.c17
-rw-r--r--fs/hpfs/file.c13
-rw-r--r--fs/hpfs/namei.c5
-rw-r--r--fs/hpfs/super.c2
-rw-r--r--fs/hugetlbfs/inode.c126
-rw-r--r--fs/inode.c92
-rw-r--r--fs/internal.h45
-rw-r--r--fs/io-wq.c199
-rw-r--r--fs/io-wq.h25
-rw-r--r--fs/io_uring.c5806
-rw-r--r--fs/ioctl.c8
-rw-r--r--fs/iomap/Makefile4
-rw-r--r--fs/iomap/buffered-io.c690
-rw-r--r--fs/iomap/direct-io.c47
-rw-r--r--fs/iomap/fiemap.c1
-rw-r--r--fs/iomap/trace.h4
-rw-r--r--fs/isofs/compress.c5
-rw-r--r--fs/isofs/inode.c8
-rw-r--r--fs/isofs/rock.c7
-rw-r--r--fs/jbd2/commit.c39
-rw-r--r--fs/jbd2/journal.c23
-rw-r--r--fs/jbd2/transaction.c140
-rw-r--r--fs/jffs2/background.c2
-rw-r--r--fs/jffs2/build.c4
-rw-r--r--fs/jffs2/erase.c6
-rw-r--r--fs/jffs2/file.c61
-rw-r--r--fs/jffs2/fs.c5
-rw-r--r--fs/jffs2/gc.c2
-rw-r--r--fs/jffs2/jffs2_fs_i.h4
-rw-r--r--fs/jffs2/os-linux.h2
-rw-r--r--fs/jffs2/scan.c6
-rw-r--r--fs/jffs2/super.c2
-rw-r--r--fs/jfs/Makefile2
-rw-r--r--fs/jfs/inode.c35
-rw-r--r--fs/jfs/ioctl.c5
-rw-r--r--fs/jfs/jfs_dmap.c78
-rw-r--r--fs/jfs/jfs_dtree.c298
-rw-r--r--fs/jfs/jfs_extent.c255
-rw-r--r--fs/jfs/jfs_logmgr.c19
-rw-r--r--fs/jfs/jfs_metapage.c42
-rw-r--r--fs/jfs/jfs_mount.c4
-rw-r--r--fs/jfs/jfs_txnmgr.c34
-rw-r--r--fs/jfs/jfs_xtree.c961
-rw-r--r--fs/jfs/jfs_xtree.h4
-rw-r--r--fs/jfs/super.c10
-rw-r--r--fs/kernfs/dir.c163
-rw-r--r--fs/kernfs/file.c62
-rw-r--r--fs/kernfs/inode.c22
-rw-r--r--fs/kernfs/kernfs-internal.h19
-rw-r--r--fs/kernfs/mount.c15
-rw-r--r--fs/kernfs/symlink.c5
-rw-r--r--fs/ksmbd/asn1.c142
-rw-r--r--fs/ksmbd/auth.c54
-rw-r--r--fs/ksmbd/auth.h10
-rw-r--r--fs/ksmbd/connection.c30
-rw-r--r--fs/ksmbd/connection.h39
-rw-r--r--fs/ksmbd/ksmbd_netlink.h15
-rw-r--r--fs/ksmbd/mgmt/user_config.c10
-rw-r--r--fs/ksmbd/mgmt/user_config.h1
-rw-r--r--fs/ksmbd/mgmt/user_session.h1
-rw-r--r--fs/ksmbd/misc.c50
-rw-r--r--fs/ksmbd/misc.h3
-rw-r--r--fs/ksmbd/ndr.c2
-rw-r--r--fs/ksmbd/ntlmssp.h6
-rw-r--r--fs/ksmbd/oplock.c34
-rw-r--r--fs/ksmbd/oplock.h2
-rw-r--r--fs/ksmbd/server.c2
-rw-r--r--fs/ksmbd/smb2misc.c18
-rw-r--r--fs/ksmbd/smb2ops.c19
-rw-r--r--fs/ksmbd/smb2pdu.c599
-rw-r--r--fs/ksmbd/smb2pdu.h538
-rw-r--r--fs/ksmbd/smb_common.c9
-rw-r--r--fs/ksmbd/smb_common.h1
-rw-r--r--fs/ksmbd/smbacl.c20
-rw-r--r--fs/ksmbd/smbacl.h5
-rw-r--r--fs/ksmbd/transport_ipc.c5
-rw-r--r--fs/ksmbd/transport_rdma.c608
-rw-r--r--fs/ksmbd/transport_rdma.h10
-rw-r--r--fs/ksmbd/transport_tcp.c9
-rw-r--r--fs/ksmbd/vfs.c19
-rw-r--r--fs/ksmbd/vfs.h1
-rw-r--r--fs/ksmbd/vfs_cache.c2
-rw-r--r--fs/ksmbd/vfs_cache.h11
-rw-r--r--fs/ksmbd/xattr.h2
-rw-r--r--fs/libfs.c43
-rw-r--r--fs/lockd/svc.c212
-rw-r--r--fs/lockd/svclock.c6
-rw-r--r--fs/lockd/svcsubs.c18
-rw-r--r--fs/locks.c95
-rw-r--r--fs/minix/inode.c19
-rw-r--r--fs/mpage.c116
-rw-r--r--fs/namei.c202
-rw-r--r--fs/namespace.c324
-rw-r--r--fs/netfs/Makefile8
-rw-r--r--fs/netfs/buffered_read.c424
-rw-r--r--fs/netfs/internal.h50
-rw-r--r--fs/netfs/io.c657
-rw-r--r--fs/netfs/main.c20
-rw-r--r--fs/netfs/objects.c160
-rw-r--r--fs/netfs/read_helper.c1204
-rw-r--r--fs/netfs/stats.c1
-rw-r--r--fs/nfs/Makefile2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c26
-rw-r--r--fs/nfs/blocklayout/rpc_pipefs.c1
-rw-r--r--fs/nfs/callback.c96
-rw-r--r--fs/nfs/callback.h2
-rw-r--r--fs/nfs/callback_proc.c32
-rw-r--r--fs/nfs/callback_xdr.c22
-rw-r--r--fs/nfs/client.c14
-rw-r--r--fs/nfs/delegation.c2
-rw-r--r--fs/nfs/dir.c808
-rw-r--r--fs/nfs/direct.c69
-rw-r--r--fs/nfs/export.c2
-rw-r--r--fs/nfs/file.c186
-rw-r--r--fs/nfs/filelayout/filelayout.c9
-rw-r--r--fs/nfs/filelayout/filelayout.h2
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c4
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c53
-rw-r--r--fs/nfs/fs_context.c11
-rw-r--r--fs/nfs/fscache-index.c140
-rw-r--r--fs/nfs/fscache.c508
-rw-r--r--fs/nfs/fscache.h191
-rw-r--r--fs/nfs/inode.c109
-rw-r--r--fs/nfs/internal.h37
-rw-r--r--fs/nfs/nfs2xdr.c3
-rw-r--r--fs/nfs/nfs3proc.c5
-rw-r--r--fs/nfs/nfs3xdr.c30
-rw-r--r--fs/nfs/nfs42proc.c47
-rw-r--r--fs/nfs/nfs42xattr.c9
-rw-r--r--fs/nfs/nfs4_fs.h15
-rw-r--r--fs/nfs/nfs4client.c5
-rw-r--r--fs/nfs/nfs4file.c19
-rw-r--r--fs/nfs/nfs4namespace.c28
-rw-r--r--fs/nfs/nfs4proc.c468
-rw-r--r--fs/nfs/nfs4state.c96
-rw-r--r--fs/nfs/nfs4xdr.c155
-rw-r--r--fs/nfs/nfstrace.h222
-rw-r--r--fs/nfs/pagelist.c14
-rw-r--r--fs/nfs/pnfs.c73
-rw-r--r--fs/nfs/pnfs.h3
-rw-r--r--fs/nfs/pnfs_nfs.c8
-rw-r--r--fs/nfs/proc.c1
-rw-r--r--fs/nfs/read.c53
-rw-r--r--fs/nfs/super.c28
-rw-r--r--fs/nfs/symlink.c16
-rw-r--r--fs/nfs/sysfs.c3
-rw-r--r--fs/nfs/unlink.c9
-rw-r--r--fs/nfs/write.c127
-rw-r--r--fs/nfsd/Kconfig12
-rw-r--r--fs/nfsd/Makefile3
-rw-r--r--fs/nfsd/blocklayout.c1
-rw-r--r--fs/nfsd/export.c2
-rw-r--r--fs/nfsd/filecache.c178
-rw-r--r--fs/nfsd/filecache.h3
-rw-r--r--fs/nfsd/flexfilelayout.c2
-rw-r--r--fs/nfsd/netns.h27
-rw-r--r--fs/nfsd/nfs2acl.c24
-rw-r--r--fs/nfsd/nfs3proc.c177
-rw-r--r--fs/nfsd/nfs3xdr.c69
-rw-r--r--fs/nfsd/nfs4layouts.c2
-rw-r--r--fs/nfsd/nfs4proc.c301
-rw-r--r--fs/nfsd/nfs4state.c440
-rw-r--r--fs/nfsd/nfs4xdr.c43
-rw-r--r--fs/nfsd/nfscache.c35
-rw-r--r--fs/nfsd/nfsctl.c62
-rw-r--r--fs/nfsd/nfsd.h9
-rw-r--r--fs/nfsd/nfsfh.c62
-rw-r--r--fs/nfsd/nfsfh.h26
-rw-r--r--fs/nfsd/nfsproc.c18
-rw-r--r--fs/nfsd/nfssvc.c241
-rw-r--r--fs/nfsd/state.h36
-rw-r--r--fs/nfsd/stats.c2
-rw-r--r--fs/nfsd/stats.h4
-rw-r--r--fs/nfsd/trace.h261
-rw-r--r--fs/nfsd/vfs.c449
-rw-r--r--fs/nfsd/vfs.h23
-rw-r--r--fs/nfsd/xdr.h2
-rw-r--r--fs/nfsd/xdr4.h1
-rw-r--r--fs/nilfs2/btnode.c23
-rw-r--r--fs/nilfs2/btnode.h1
-rw-r--r--fs/nilfs2/btree.c27
-rw-r--r--fs/nilfs2/dat.c4
-rw-r--r--fs/nilfs2/gcinode.c7
-rw-r--r--fs/nilfs2/inode.c226
-rw-r--r--fs/nilfs2/ioctl.c6
-rw-r--r--fs/nilfs2/mdt.c46
-rw-r--r--fs/nilfs2/mdt.h6
-rw-r--r--fs/nilfs2/nilfs.h16
-rw-r--r--fs/nilfs2/page.c20
-rw-r--r--fs/nilfs2/page.h1
-rw-r--r--fs/nilfs2/recovery.c2
-rw-r--r--fs/nilfs2/segbuf.c63
-rw-r--r--fs/nilfs2/segment.c9
-rw-r--r--fs/nilfs2/sufile.c4
-rw-r--r--fs/nilfs2/super.c7
-rw-r--r--fs/nilfs2/sysfs.c13
-rw-r--r--fs/nilfs2/the_nilfs.c4
-rw-r--r--fs/notify/dnotify/dnotify.c36
-rw-r--r--fs/notify/fanotify/fanotify.c231
-rw-r--r--fs/notify/fanotify/fanotify.h154
-rw-r--r--fs/notify/fanotify/fanotify_user.c237
-rw-r--r--fs/notify/fdinfo.c21
-rw-r--r--fs/notify/fsnotify.c152
-rw-r--r--fs/notify/group.c34
-rw-r--r--fs/notify/inotify/inotify.h19
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c2
-rw-r--r--fs/notify/inotify/inotify_user.c58
-rw-r--r--fs/notify/mark.c137
-rw-r--r--fs/ntfs/aops.c61
-rw-r--r--fs/ntfs/aops.h6
-rw-r--r--fs/ntfs/attrib.c4
-rw-r--r--fs/ntfs/compress.c4
-rw-r--r--fs/ntfs/file.c8
-rw-r--r--fs/ntfs/inode.c10
-rw-r--r--fs/ntfs/mft.h2
-rw-r--r--fs/ntfs3/file.c25
-rw-r--r--fs/ntfs3/frecord.c10
-rw-r--r--fs/ntfs3/fslog.c12
-rw-r--r--fs/ntfs3/fsntfs.c36
-rw-r--r--fs/ntfs3/inode.c38
-rw-r--r--fs/ntfs3/ntfs_fs.h6
-rw-r--r--fs/ntfs3/super.c22
-rw-r--r--fs/ntfs3/xattr.c136
-rw-r--r--fs/ocfs2/alloc.c6
-rw-r--r--fs/ocfs2/aops.c55
-rw-r--r--fs/ocfs2/cluster/heartbeat.c6
-rw-r--r--fs/ocfs2/cluster/masklog.c11
-rw-r--r--fs/ocfs2/cluster/nodemanager.c2
-rw-r--r--fs/ocfs2/dir.c6
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c12
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c4
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c18
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c2
-rw-r--r--fs/ocfs2/dlm/dlmthread.c2
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c21
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c2
-rw-r--r--fs/ocfs2/dlmfs/userdlm.c17
-rw-r--r--fs/ocfs2/file.c15
-rw-r--r--fs/ocfs2/filecheck.c3
-rw-r--r--fs/ocfs2/inode.c6
-rw-r--r--fs/ocfs2/ioctl.c5
-rw-r--r--fs/ocfs2/journal.c39
-rw-r--r--fs/ocfs2/journal.h2
-rw-r--r--fs/ocfs2/localalloc.c6
-rw-r--r--fs/ocfs2/namei.c2
-rw-r--r--fs/ocfs2/ocfs2.h4
-rw-r--r--fs/ocfs2/quota_global.c25
-rw-r--r--fs/ocfs2/quota_local.c12
-rw-r--r--fs/ocfs2/refcounttree.c6
-rw-r--r--fs/ocfs2/reservations.c4
-rw-r--r--fs/ocfs2/reservations.h9
-rw-r--r--fs/ocfs2/stack_user.c18
-rw-r--r--fs/ocfs2/stackglue.c36
-rw-r--r--fs/ocfs2/suballoc.c25
-rw-r--r--fs/ocfs2/super.c202
-rw-r--r--fs/ocfs2/symlink.c5
-rw-r--r--fs/ocfs2/xattr.c2
-rw-r--r--fs/omfs/file.c14
-rw-r--r--fs/open.c84
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/orangefs/inode.c171
-rw-r--r--fs/orangefs/orangefs-bufmap.c7
-rw-r--r--fs/orangefs/orangefs-sysfs.c21
-rw-r--r--fs/orangefs/super.c2
-rw-r--r--fs/overlayfs/copy_up.c106
-rw-r--r--fs/overlayfs/dir.c147
-rw-r--r--fs/overlayfs/export.c5
-rw-r--r--fs/overlayfs/file.c56
-rw-r--r--fs/overlayfs/inode.c68
-rw-r--r--fs/overlayfs/namei.c53
-rw-r--r--fs/overlayfs/overlayfs.h232
-rw-r--r--fs/overlayfs/ovl_entry.h7
-rw-r--r--fs/overlayfs/readdir.c48
-rw-r--r--fs/overlayfs/super.c59
-rw-r--r--fs/overlayfs/util.c103
-rw-r--r--fs/pipe.c112
-rw-r--r--fs/posix_acl.c27
-rw-r--r--fs/proc/array.c10
-rw-r--r--fs/proc/base.c35
-rw-r--r--fs/proc/bootconfig.c2
-rw-r--r--fs/proc/cpuinfo.c6
-rw-r--r--fs/proc/fd.c23
-rw-r--r--fs/proc/generic.c9
-rw-r--r--fs/proc/inode.c3
-rw-r--r--fs/proc/internal.h5
-rw-r--r--fs/proc/kcore.c14
-rw-r--r--fs/proc/meminfo.c7
-rw-r--r--fs/proc/page.c1
-rw-r--r--fs/proc/proc_net.c30
-rw-r--r--fs/proc/proc_sysctl.c163
-rw-r--r--fs/proc/task_mmu.c65
-rw-r--r--fs/proc/vmcore.c183
-rw-r--r--fs/proc_namespace.c2
-rw-r--r--fs/pstore/ftrace.c46
-rw-r--r--fs/pstore/platform.c38
-rw-r--r--fs/pstore/ram_core.c4
-rw-r--r--fs/qnx4/inode.c9
-rw-r--r--fs/qnx6/inode.c8
-rw-r--r--fs/quota/dquot.c21
-rw-r--r--fs/read_write.c128
-rw-r--r--fs/reiserfs/Kconfig10
-rw-r--r--fs/reiserfs/file.c2
-rw-r--r--fs/reiserfs/inode.c92
-rw-r--r--fs/reiserfs/journal.c25
-rw-r--r--fs/reiserfs/super.c4
-rw-r--r--fs/remap_range.c129
-rw-r--r--fs/romfs/super.c11
-rw-r--r--fs/select.c64
-rw-r--r--fs/seq_file.c36
-rw-r--r--fs/signalfd.c5
-rw-r--r--fs/smbfs_common/smb2pdu.h735
-rw-r--r--fs/smbfs_common/smbfsctl.h8
-rw-r--r--fs/splice.c24
-rw-r--r--fs/squashfs/block.c19
-rw-r--r--fs/squashfs/file.c5
-rw-r--r--fs/squashfs/super.c35
-rw-r--r--fs/squashfs/symlink.c5
-rw-r--r--fs/stat.c70
-rw-r--r--fs/super.c28
-rw-r--r--fs/sync.c27
-rw-r--r--fs/sysctls.c39
-rw-r--r--fs/sysfs/file.c13
-rw-r--r--fs/sysfs/mount.c2
-rw-r--r--fs/sysv/inode.c2
-rw-r--r--fs/sysv/itree.c13
-rw-r--r--fs/sysv/super.c4
-rw-r--r--fs/tracefs/inode.c31
-rw-r--r--fs/ubifs/Makefile2
-rw-r--r--fs/ubifs/budget.c7
-rw-r--r--fs/ubifs/dir.c242
-rw-r--r--fs/ubifs/file.c85
-rw-r--r--fs/ubifs/gc.c19
-rw-r--r--fs/ubifs/io.c55
-rw-r--r--fs/ubifs/ioctl.c2
-rw-r--r--fs/ubifs/journal.c52
-rw-r--r--fs/ubifs/replay.c2
-rw-r--r--fs/ubifs/super.c27
-rw-r--r--fs/ubifs/sysfs.c154
-rw-r--r--fs/ubifs/ubifs.h39
-rw-r--r--fs/ubifs/xattr.c2
-rw-r--r--fs/udf/file.c17
-rw-r--r--fs/udf/ialloc.c2
-rw-r--r--fs/udf/inode.c22
-rw-r--r--fs/udf/namei.c8
-rw-r--r--fs/udf/super.c5
-rw-r--r--fs/udf/symlink.c5
-rw-r--r--fs/ufs/inode.c16
-rw-r--r--fs/ufs/super.c2
-rw-r--r--fs/unicode/.gitignore2
-rw-r--r--fs/unicode/Kconfig7
-rw-r--r--fs/unicode/Makefile19
-rw-r--r--fs/unicode/mkutf8data.c24
-rw-r--r--fs/unicode/utf8-core.c109
-rw-r--r--fs/unicode/utf8-norm.c262
-rw-r--r--fs/unicode/utf8-selftest.c94
-rw-r--r--fs/unicode/utf8data.c_shipped (renamed from fs/unicode/utf8data.h_shipped)22
-rw-r--r--fs/unicode/utf8n.h81
-rw-r--r--fs/userfaultfd.c45
-rw-r--r--fs/vboxsf/file.c7
-rw-r--r--fs/vboxsf/super.c2
-rw-r--r--fs/vboxsf/utils.c1
-rw-r--r--fs/verity/Kconfig1
-rw-r--r--fs/verity/enable.c33
-rw-r--r--fs/verity/fsverity_private.h13
-rw-r--r--fs/verity/measure.c43
-rw-r--r--fs/verity/open.c12
-rw-r--r--fs/verity/read_metadata.c5
-rw-r--r--fs/verity/verify.c4
-rw-r--r--fs/xattr.c143
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/kmem.c3
-rw-r--r--fs/xfs/libxfs/xfs_ag.c3
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c40
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h3
-rw-r--r--fs/xfs/libxfs/xfs_attr.c1683
-rw-r--r--fs/xfs/libxfs/xfs_attr.h195
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c64
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c37
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.h6
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c171
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h58
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c9
-rw-r--r--fs/xfs/libxfs/xfs_btree.c187
-rw-r--r--fs/xfs/libxfs/xfs_btree.h26
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c15
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h28
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h9
-rw-r--r--fs/xfs/libxfs/xfs_defer.c97
-rw-r--r--fs/xfs/libxfs/xfs_defer.h3
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c44
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h8
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h5
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h8
-rw-r--r--fs/xfs/libxfs/xfs_format.h189
-rw-r--r--fs/xfs/libxfs/xfs_fs.h78
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c8
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h2
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c118
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c51
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h76
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h87
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h16
-rw-r--r--fs/xfs/libxfs/xfs_log_rlimit.c75
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h50
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c14
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h13
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c161
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h7
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c9
-rw-r--r--fs/xfs/libxfs/xfs_sb.c80
-rw-r--r--fs/xfs/libxfs/xfs_shared.h24
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c2
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c225
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.h16
-rw-r--r--fs/xfs/libxfs/xfs_types.h11
-rw-r--r--fs/xfs/scrub/agheader.c53
-rw-r--r--fs/xfs/scrub/agheader_repair.c12
-rw-r--r--fs/xfs/scrub/attr.h2
-rw-r--r--fs/xfs/scrub/bmap.c26
-rw-r--r--fs/xfs/scrub/common.c2
-rw-r--r--fs/xfs/scrub/dir.c15
-rw-r--r--fs/xfs/scrub/inode.c34
-rw-r--r--fs/xfs/scrub/quota.c4
-rw-r--r--fs/xfs/scrub/repair.c3
-rw-r--r--fs/xfs/scrub/rtbitmap.c9
-rw-r--r--fs/xfs/scrub/scrub.c21
-rw-r--r--fs/xfs/scrub/scrub.h1
-rw-r--r--fs/xfs/xfs_acl.c7
-rw-r--r--fs/xfs/xfs_acl.h8
-rw-r--r--fs/xfs/xfs_aops.c61
-rw-r--r--fs/xfs/xfs_attr_item.c885
-rw-r--r--fs/xfs/xfs_attr_item.h54
-rw-r--r--fs/xfs/xfs_attr_list.c1
-rw-r--r--fs/xfs/xfs_bio_io.c45
-rw-r--r--fs/xfs/xfs_bmap_item.c29
-rw-r--r--fs/xfs/xfs_bmap_util.c50
-rw-r--r--fs/xfs/xfs_bmap_util.h2
-rw-r--r--fs/xfs/xfs_buf.c68
-rw-r--r--fs/xfs/xfs_buf.h47
-rw-r--r--fs/xfs/xfs_buf_item.c5
-rw-r--r--fs/xfs/xfs_buf_item.h24
-rw-r--r--fs/xfs/xfs_buf_item_recover.c68
-rw-r--r--fs/xfs/xfs_dir2_readdir.c53
-rw-r--r--fs/xfs/xfs_discard.c8
-rw-r--r--fs/xfs/xfs_dquot.c97
-rw-r--r--fs/xfs/xfs_dquot.h8
-rw-r--r--fs/xfs/xfs_error.c12
-rw-r--r--fs/xfs/xfs_error.h20
-rw-r--r--fs/xfs/xfs_extfree_item.c25
-rw-r--r--fs/xfs/xfs_file.c124
-rw-r--r--fs/xfs/xfs_filestream.c7
-rw-r--r--fs/xfs/xfs_fsmap.c10
-rw-r--r--fs/xfs/xfs_fsops.c74
-rw-r--r--fs/xfs/xfs_globals.c1
-rw-r--r--fs/xfs/xfs_icache.c50
-rw-r--r--fs/xfs/xfs_icreate_item.c1
-rw-r--r--fs/xfs/xfs_inode.c212
-rw-r--r--fs/xfs/xfs_inode.h40
-rw-r--r--fs/xfs/xfs_inode_item.c222
-rw-r--r--fs/xfs/xfs_inode_item.h1
-rw-r--r--fs/xfs/xfs_inode_item_recover.c145
-rw-r--r--fs/xfs/xfs_ioctl.c120
-rw-r--r--fs/xfs/xfs_ioctl.h11
-rw-r--r--fs/xfs/xfs_ioctl32.c31
-rw-r--r--fs/xfs/xfs_ioctl32.h22
-rw-r--r--fs/xfs/xfs_iomap.c117
-rw-r--r--fs/xfs/xfs_iomap.h12
-rw-r--r--fs/xfs/xfs_iops.c172
-rw-r--r--fs/xfs/xfs_itable.c15
-rw-r--r--fs/xfs/xfs_itable.h5
-rw-r--r--fs/xfs/xfs_iwalk.h2
-rw-r--r--fs/xfs/xfs_linux.h3
-rw-r--r--fs/xfs/xfs_log.c892
-rw-r--r--fs/xfs/xfs_log.h97
-rw-r--r--fs/xfs/xfs_log_cil.c437
-rw-r--r--fs/xfs/xfs_log_priv.h106
-rw-r--r--fs/xfs/xfs_log_recover.c175
-rw-r--r--fs/xfs/xfs_message.c58
-rw-r--r--fs/xfs/xfs_message.h61
-rw-r--r--fs/xfs/xfs_mount.c103
-rw-r--r--fs/xfs/xfs_mount.h65
-rw-r--r--fs/xfs/xfs_ondisk.h2
-rw-r--r--fs/xfs/xfs_pnfs.c49
-rw-r--r--fs/xfs/xfs_qm.c26
-rw-r--r--fs/xfs/xfs_qm.h5
-rw-r--r--fs/xfs/xfs_qm_syscalls.c37
-rw-r--r--fs/xfs/xfs_quotaops.c8
-rw-r--r--fs/xfs/xfs_refcount_item.c27
-rw-r--r--fs/xfs/xfs_reflink.c113
-rw-r--r--fs/xfs/xfs_rmap_item.c27
-rw-r--r--fs/xfs/xfs_rtalloc.c41
-rw-r--r--fs/xfs/xfs_rtalloc.h9
-rw-r--r--fs/xfs/xfs_super.c155
-rw-r--r--fs/xfs/xfs_super.h1
-rw-r--r--fs/xfs/xfs_symlink.c38
-rw-r--r--fs/xfs/xfs_sysctl.h1
-rw-r--r--fs/xfs/xfs_sysfs.c40
-rw-r--r--fs/xfs/xfs_trace.h108
-rw-r--r--fs/xfs/xfs_trans.c201
-rw-r--r--fs/xfs/xfs_trans.h46
-rw-r--r--fs/xfs/xfs_trans_ail.c51
-rw-r--r--fs/xfs/xfs_trans_dquot.c4
-rw-r--r--fs/xfs/xfs_trans_priv.h3
-rw-r--r--fs/xfs/xfs_xattr.c96
-rw-r--r--fs/xfs/xfs_xattr.h13
-rw-r--r--fs/zonefs/Makefile2
-rw-r--r--fs/zonefs/super.c359
-rw-r--r--fs/zonefs/sysfs.c139
-rw-r--r--fs/zonefs/zonefs.h18
953 files changed, 56887 insertions, 45643 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index f2ba131cede1..cebba4eaa0b5 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -16,186 +16,59 @@
#include "v9fs.h"
#include "cache.h"
-#define CACHETAG_LEN 11
-
-struct fscache_netfs v9fs_cache_netfs = {
- .name = "9p",
- .version = 0,
-};
-
-/*
- * v9fs_random_cachetag - Generate a random tag to be associated
- * with a new cache session.
- *
- * The value of jiffies is used for a fairly randomly cache tag.
- */
-
-static
-int v9fs_random_cachetag(struct v9fs_session_info *v9ses)
+int v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses,
+ const char *dev_name)
{
- v9ses->cachetag = kmalloc(CACHETAG_LEN, GFP_KERNEL);
- if (!v9ses->cachetag)
- return -ENOMEM;
+ struct fscache_volume *vcookie;
+ char *name, *p;
- return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies);
-}
-
-const struct fscache_cookie_def v9fs_cache_session_index_def = {
- .name = "9P.session",
- .type = FSCACHE_COOKIE_TYPE_INDEX,
-};
+ name = kasprintf(GFP_KERNEL, "9p,%s,%s",
+ dev_name, v9ses->cachetag ?: v9ses->aname);
+ if (!name)
+ return -ENOMEM;
-void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
-{
- /* If no cache session tag was specified, we generate a random one. */
- if (!v9ses->cachetag) {
- if (v9fs_random_cachetag(v9ses) < 0) {
- v9ses->fscache = NULL;
- kfree(v9ses->cachetag);
- v9ses->cachetag = NULL;
- return;
+ for (p = name; *p; p++)
+ if (*p == '/')
+ *p = ';';
+
+ vcookie = fscache_acquire_volume(name, NULL, NULL, 0);
+ p9_debug(P9_DEBUG_FSC, "session %p get volume %p (%s)\n",
+ v9ses, vcookie, name);
+ if (IS_ERR(vcookie)) {
+ if (vcookie != ERR_PTR(-EBUSY)) {
+ kfree(name);
+ return PTR_ERR(vcookie);
}
+ pr_err("Cache volume key already in use (%s)\n", name);
+ vcookie = NULL;
}
-
- v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
- &v9fs_cache_session_index_def,
- v9ses->cachetag,
- strlen(v9ses->cachetag),
- NULL, 0,
- v9ses, 0, true);
- p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n",
- v9ses, v9ses->fscache);
-}
-
-void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
-{
- p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n",
- v9ses, v9ses->fscache);
- fscache_relinquish_cookie(v9ses->fscache, NULL, false);
- v9ses->fscache = NULL;
+ v9ses->fscache = vcookie;
+ kfree(name);
+ return 0;
}
-static enum
-fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
- const void *buffer,
- uint16_t buflen,
- loff_t object_size)
-{
- const struct v9fs_inode *v9inode = cookie_netfs_data;
-
- if (buflen != sizeof(v9inode->qid.version))
- return FSCACHE_CHECKAUX_OBSOLETE;
-
- if (memcmp(buffer, &v9inode->qid.version,
- sizeof(v9inode->qid.version)))
- return FSCACHE_CHECKAUX_OBSOLETE;
-
- return FSCACHE_CHECKAUX_OKAY;
-}
-
-const struct fscache_cookie_def v9fs_cache_inode_index_def = {
- .name = "9p.inode",
- .type = FSCACHE_COOKIE_TYPE_DATAFILE,
- .check_aux = v9fs_cache_inode_check_aux,
-};
-
void v9fs_cache_inode_get_cookie(struct inode *inode)
{
- struct v9fs_inode *v9inode;
+ struct v9fs_inode *v9inode = V9FS_I(inode);
struct v9fs_session_info *v9ses;
+ __le32 version;
+ __le64 path;
if (!S_ISREG(inode->i_mode))
return;
-
- v9inode = V9FS_I(inode);
- if (v9inode->fscache)
+ if (WARN_ON(v9fs_inode_cookie(v9inode)))
return;
+ version = cpu_to_le32(v9inode->qid.version);
+ path = cpu_to_le64(v9inode->qid.path);
v9ses = v9fs_inode2v9ses(inode);
- v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
- &v9fs_cache_inode_index_def,
- &v9inode->qid.path,
- sizeof(v9inode->qid.path),
- &v9inode->qid.version,
- sizeof(v9inode->qid.version),
- v9inode,
- i_size_read(&v9inode->vfs_inode),
- true);
+ v9inode->netfs.cache =
+ fscache_acquire_cookie(v9fs_session_cache(v9ses),
+ 0,
+ &path, sizeof(path),
+ &version, sizeof(version),
+ i_size_read(&v9inode->netfs.inode));
p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
- inode, v9inode->fscache);
-}
-
-void v9fs_cache_inode_put_cookie(struct inode *inode)
-{
- struct v9fs_inode *v9inode = V9FS_I(inode);
-
- if (!v9inode->fscache)
- return;
- p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n",
- inode, v9inode->fscache);
-
- fscache_relinquish_cookie(v9inode->fscache, &v9inode->qid.version,
- false);
- v9inode->fscache = NULL;
-}
-
-void v9fs_cache_inode_flush_cookie(struct inode *inode)
-{
- struct v9fs_inode *v9inode = V9FS_I(inode);
-
- if (!v9inode->fscache)
- return;
- p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n",
- inode, v9inode->fscache);
-
- fscache_relinquish_cookie(v9inode->fscache, NULL, true);
- v9inode->fscache = NULL;
-}
-
-void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
-{
- struct v9fs_inode *v9inode = V9FS_I(inode);
-
- if (!v9inode->fscache)
- return;
-
- mutex_lock(&v9inode->fscache_lock);
-
- if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
- v9fs_cache_inode_flush_cookie(inode);
- else
- v9fs_cache_inode_get_cookie(inode);
-
- mutex_unlock(&v9inode->fscache_lock);
-}
-
-void v9fs_cache_inode_reset_cookie(struct inode *inode)
-{
- struct v9fs_inode *v9inode = V9FS_I(inode);
- struct v9fs_session_info *v9ses;
- struct fscache_cookie *old;
-
- if (!v9inode->fscache)
- return;
-
- old = v9inode->fscache;
-
- mutex_lock(&v9inode->fscache_lock);
- fscache_relinquish_cookie(v9inode->fscache, NULL, true);
-
- v9ses = v9fs_inode2v9ses(inode);
- v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
- &v9fs_cache_inode_index_def,
- &v9inode->qid.path,
- sizeof(v9inode->qid.path),
- &v9inode->qid.version,
- sizeof(v9inode->qid.version),
- v9inode,
- i_size_read(&v9inode->vfs_inode),
- true);
- p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n",
- inode, old, v9inode->fscache);
-
- mutex_unlock(&v9inode->fscache_lock);
+ inode, v9fs_inode_cookie(v9inode));
}
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index 7480b4b49fea..1923affcdc62 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -7,26 +7,15 @@
#ifndef _9P_CACHE_H
#define _9P_CACHE_H
-#define FSCACHE_USE_NEW_IO_API
+
#include <linux/fscache.h>
#ifdef CONFIG_9P_FSCACHE
-extern struct fscache_netfs v9fs_cache_netfs;
-extern const struct fscache_cookie_def v9fs_cache_session_index_def;
-extern const struct fscache_cookie_def v9fs_cache_inode_index_def;
-
-extern void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses);
-extern void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses);
+extern int v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses,
+ const char *dev_name);
extern void v9fs_cache_inode_get_cookie(struct inode *inode);
-extern void v9fs_cache_inode_put_cookie(struct inode *inode);
-extern void v9fs_cache_inode_flush_cookie(struct inode *inode);
-extern void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp);
-extern void v9fs_cache_inode_reset_cookie(struct inode *inode);
-
-extern int __v9fs_cache_register(void);
-extern void __v9fs_cache_unregister(void);
#else /* CONFIG_9P_FSCACHE */
@@ -34,13 +23,5 @@ static inline void v9fs_cache_inode_get_cookie(struct inode *inode)
{
}
-static inline void v9fs_cache_inode_put_cookie(struct inode *inode)
-{
-}
-
-static inline void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *file)
-{
-}
-
#endif /* CONFIG_9P_FSCACHE */
#endif /* _9P_CACHE_H */
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 6aab046c98e2..baf2b152229e 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -96,12 +96,8 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
dentry, dentry, from_kuid(&init_user_ns, uid),
any);
ret = NULL;
-
- if (d_inode(dentry))
- ret = v9fs_fid_find_inode(d_inode(dentry), uid);
-
/* we'll recheck under lock if there's anything to look in */
- if (!ret && dentry->d_fsdata) {
+ if (dentry->d_fsdata) {
struct hlist_head *h = (struct hlist_head *)&dentry->d_fsdata;
spin_lock(&dentry->d_lock);
@@ -113,6 +109,9 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
}
}
spin_unlock(&dentry->d_lock);
+ } else {
+ if (dentry->d_inode)
+ ret = v9fs_fid_find_inode(dentry->d_inode, uid);
}
return ret;
@@ -153,7 +152,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
const unsigned char **wnames, *uname;
int i, n, l, clone, access;
struct v9fs_session_info *v9ses;
- struct p9_fid *fid, *old_fid = NULL;
+ struct p9_fid *fid, *old_fid;
v9ses = v9fs_dentry2v9ses(dentry);
access = v9ses->flags & V9FS_ACCESS_MASK;
@@ -195,13 +194,12 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
if (IS_ERR(fid))
return fid;
+ refcount_inc(&fid->count);
v9fs_fid_add(dentry->d_sb->s_root, fid);
}
/* If we are root ourself just return that */
- if (dentry->d_sb->s_root == dentry) {
- refcount_inc(&fid->count);
+ if (dentry->d_sb->s_root == dentry)
return fid;
- }
/*
* Do a multipath walk with attached root.
* When walking parent we need to make sure we
@@ -213,6 +211,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
fid = ERR_PTR(n);
goto err_out;
}
+ old_fid = fid;
clone = 1;
i = 0;
while (i < n) {
@@ -222,19 +221,15 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
* walk to ensure none of the patch component change
*/
fid = p9_client_walk(fid, l, &wnames[i], clone);
+ /* non-cloning walk will return the same fid */
+ if (fid != old_fid) {
+ p9_client_clunk(old_fid);
+ old_fid = fid;
+ }
if (IS_ERR(fid)) {
- if (old_fid) {
- /*
- * If we fail, clunk fid which are mapping
- * to path component and not the last component
- * of the path.
- */
- p9_client_clunk(old_fid);
- }
kfree(wnames);
goto err_out;
}
- old_fid = fid;
i += l;
clone = 0;
}
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index e32dd5f7721b..0129de2ea31a 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -469,7 +469,11 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
#ifdef CONFIG_9P_FSCACHE
/* register the session for caching */
- v9fs_cache_session_get_cookie(v9ses);
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+ rc = v9fs_cache_session_get_cookie(v9ses, dev_name);
+ if (rc < 0)
+ goto err_clnt;
+ }
#endif
spin_lock(&v9fs_sessionlist_lock);
list_add(&v9ses->slist, &v9fs_sessionlist);
@@ -502,8 +506,7 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
}
#ifdef CONFIG_9P_FSCACHE
- if (v9ses->fscache)
- v9fs_cache_session_put_cookie(v9ses);
+ fscache_relinquish_volume(v9fs_session_cache(v9ses), NULL, false);
kfree(v9ses->cachetag);
#endif
kfree(v9ses->uname);
@@ -620,11 +623,9 @@ static void v9fs_sysfs_cleanup(void)
static void v9fs_inode_init_once(void *foo)
{
struct v9fs_inode *v9inode = (struct v9fs_inode *)foo;
-#ifdef CONFIG_9P_FSCACHE
- v9inode->fscache = NULL;
-#endif
+
memset(&v9inode->qid, 0, sizeof(v9inode->qid));
- inode_init_once(&v9inode->vfs_inode);
+ inode_init_once(&v9inode->netfs.inode);
}
/**
@@ -665,20 +666,12 @@ static int v9fs_cache_register(void)
ret = v9fs_init_inode_cache();
if (ret < 0)
return ret;
-#ifdef CONFIG_9P_FSCACHE
- ret = fscache_register_netfs(&v9fs_cache_netfs);
- if (ret < 0)
- v9fs_destroy_inode_cache();
-#endif
return ret;
}
static void v9fs_cache_unregister(void)
{
v9fs_destroy_inode_cache();
-#ifdef CONFIG_9P_FSCACHE
- fscache_unregister_netfs(&v9fs_cache_netfs);
-#endif
}
/**
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 1647a8e63671..6acabc2e7dc9 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -9,6 +9,7 @@
#define FS_9P_V9FS_H
#include <linux/backing-dev.h>
+#include <linux/netfs.h>
/**
* enum p9_session_flags - option flags for each 9P session
@@ -89,7 +90,7 @@ struct v9fs_session_info {
unsigned int cache;
#ifdef CONFIG_9P_FSCACHE
char *cachetag;
- struct fscache_cookie *fscache;
+ struct fscache_volume *fscache;
#endif
char *uname; /* user name to mount as */
@@ -108,31 +109,37 @@ struct v9fs_session_info {
#define V9FS_INO_INVALID_ATTR 0x01
struct v9fs_inode {
-#ifdef CONFIG_9P_FSCACHE
- struct mutex fscache_lock;
- struct fscache_cookie *fscache;
-#endif
+ struct netfs_inode netfs; /* Netfslib context and vfs inode */
struct p9_qid qid;
unsigned int cache_validity;
struct p9_fid *writeback_fid;
struct mutex v_mutex;
- struct inode vfs_inode;
};
static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
{
- return container_of(inode, struct v9fs_inode, vfs_inode);
+ return container_of(inode, struct v9fs_inode, netfs.inode);
}
static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inode)
{
#ifdef CONFIG_9P_FSCACHE
- return v9inode->fscache;
+ return netfs_i_cookie(&v9inode->netfs);
+#else
+ return NULL;
+#endif
+}
+
+static inline struct fscache_volume *v9fs_session_cache(struct v9fs_session_info *v9ses)
+{
+#ifdef CONFIG_9P_FSCACHE
+ return v9ses->fscache;
#else
return NULL;
#endif
}
+
extern int v9fs_show_options(struct seq_file *m, struct dentry *root);
struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
@@ -154,6 +161,7 @@ extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
extern const struct inode_operations v9fs_dir_inode_operations_dotl;
extern const struct inode_operations v9fs_file_inode_operations_dotl;
extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
+extern const struct netfs_request_ops v9fs_req_ops;
extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
struct p9_fid *fid,
struct super_block *sb, int new);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index fac918ccb305..d0833fa69faf 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -16,6 +16,7 @@
#include <linux/pagemap.h>
#include <linux/idr.h>
#include <linux/sched.h>
+#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/netfs.h>
#include <net/9p/9p.h>
@@ -27,12 +28,12 @@
#include "fid.h"
/**
- * v9fs_req_issue_op - Issue a read from 9P
+ * v9fs_issue_read - Issue a read from 9P
* @subreq: The read to make
*/
-static void v9fs_req_issue_op(struct netfs_read_subrequest *subreq)
+static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
{
- struct netfs_read_request *rreq = subreq->rreq;
+ struct netfs_io_request *rreq = subreq->rreq;
struct p9_fid *fid = rreq->netfs_priv;
struct iov_iter to;
loff_t pos = subreq->start + subreq->transferred;
@@ -42,129 +43,124 @@ static void v9fs_req_issue_op(struct netfs_read_subrequest *subreq)
iov_iter_xarray(&to, READ, &rreq->mapping->i_pages, pos, len);
total = p9_client_read(fid, pos, &to, &err);
+
+ /* if we just extended the file size, any portion not in
+ * cache won't be on server and is zeroes */
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+
netfs_subreq_terminated(subreq, err ?: total, false);
}
/**
- * v9fs_init_rreq - Initialise a read request
+ * v9fs_init_request - Initialise a read request
* @rreq: The read request
* @file: The file being read from
*/
-static void v9fs_init_rreq(struct netfs_read_request *rreq, struct file *file)
+static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
{
+ struct inode *inode = file_inode(file);
+ struct v9fs_inode *v9inode = V9FS_I(inode);
struct p9_fid *fid = file->private_data;
+ BUG_ON(!fid);
+
+ /* we might need to read from a fid that was opened write-only
+ * for read-modify-write of page cache, use the writeback fid
+ * for that */
+ if (rreq->origin == NETFS_READ_FOR_WRITE &&
+ (fid->mode & O_ACCMODE) == O_WRONLY) {
+ fid = v9inode->writeback_fid;
+ BUG_ON(!fid);
+ }
+
refcount_inc(&fid->count);
rreq->netfs_priv = fid;
+ return 0;
}
/**
- * v9fs_req_cleanup - Cleanup request initialized by v9fs_init_rreq
- * @mapping: unused mapping of request to cleanup
- * @priv: private data to cleanup, a fid, guaranted non-null.
+ * v9fs_free_request - Cleanup request initialized by v9fs_init_rreq
+ * @rreq: The I/O request to clean up
*/
-static void v9fs_req_cleanup(struct address_space *mapping, void *priv)
+static void v9fs_free_request(struct netfs_io_request *rreq)
{
- struct p9_fid *fid = priv;
+ struct p9_fid *fid = rreq->netfs_priv;
p9_client_clunk(fid);
}
/**
- * v9fs_is_cache_enabled - Determine if caching is enabled for an inode
- * @inode: The inode to check
- */
-static bool v9fs_is_cache_enabled(struct inode *inode)
-{
- struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(inode));
-
- return fscache_cookie_enabled(cookie) && !hlist_empty(&cookie->backing_objects);
-}
-
-/**
* v9fs_begin_cache_operation - Begin a cache operation for a read
* @rreq: The read request
*/
-static int v9fs_begin_cache_operation(struct netfs_read_request *rreq)
+static int v9fs_begin_cache_operation(struct netfs_io_request *rreq)
{
+#ifdef CONFIG_9P_FSCACHE
struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode));
- return fscache_begin_read_operation(rreq, cookie);
+ return fscache_begin_read_operation(&rreq->cache_resources, cookie);
+#else
+ return -ENOBUFS;
+#endif
}
-static const struct netfs_read_request_ops v9fs_req_ops = {
- .init_rreq = v9fs_init_rreq,
- .is_cache_enabled = v9fs_is_cache_enabled,
+const struct netfs_request_ops v9fs_req_ops = {
+ .init_request = v9fs_init_request,
+ .free_request = v9fs_free_request,
.begin_cache_operation = v9fs_begin_cache_operation,
- .issue_op = v9fs_req_issue_op,
- .cleanup = v9fs_req_cleanup,
+ .issue_read = v9fs_issue_read,
};
/**
- * v9fs_vfs_readpage - read an entire page in from 9P
- * @file: file being read
- * @page: structure to page
- *
- */
-static int v9fs_vfs_readpage(struct file *file, struct page *page)
-{
- struct folio *folio = page_folio(page);
-
- return netfs_readpage(file, folio, &v9fs_req_ops, NULL);
-}
-
-/**
- * v9fs_vfs_readahead - read a set of pages from 9P
- * @ractl: The readahead parameters
- */
-static void v9fs_vfs_readahead(struct readahead_control *ractl)
-{
- netfs_readahead(ractl, &v9fs_req_ops, NULL);
-}
-
-/**
- * v9fs_release_page - release the private state associated with a page
- * @page: The page to be released
+ * v9fs_release_folio - release the private state associated with a folio
+ * @folio: The folio to be released
* @gfp: The caller's allocation restrictions
*
- * Returns 1 if the page can be released, false otherwise.
+ * Returns true if the page can be released, false otherwise.
*/
-static int v9fs_release_page(struct page *page, gfp_t gfp)
+static bool v9fs_release_folio(struct folio *folio, gfp_t gfp)
{
- struct folio *folio = page_folio(page);
+ struct inode *inode = folio_inode(folio);
if (folio_test_private(folio))
- return 0;
+ return false;
#ifdef CONFIG_9P_FSCACHE
if (folio_test_fscache(folio)) {
- if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS))
- return 0;
+ if (current_is_kswapd() || !(gfp & __GFP_FS))
+ return false;
folio_wait_fscache(folio);
}
#endif
- return 1;
+ fscache_note_page_release(v9fs_inode_cookie(V9FS_I(inode)));
+ return true;
}
-/**
- * v9fs_invalidate_page - Invalidate a page completely or partially
- * @page: The page to be invalidated
- * @offset: offset of the invalidated region
- * @length: length of the invalidated region
- */
-
-static void v9fs_invalidate_page(struct page *page, unsigned int offset,
- unsigned int length)
+static void v9fs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct folio *folio = page_folio(page);
-
folio_wait_fscache(folio);
}
+static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct v9fs_inode *v9inode = priv;
+ __le32 version;
+
+ if (IS_ERR_VALUE(transferred_or_error) &&
+ transferred_or_error != -ENOBUFS) {
+ version = cpu_to_le32(v9inode->qid.version);
+ fscache_invalidate(v9fs_inode_cookie(v9inode), &version,
+ i_size_read(&v9inode->netfs.inode), 0);
+ }
+}
+
static int v9fs_vfs_write_folio_locked(struct folio *folio)
{
struct inode *inode = folio_inode(folio);
struct v9fs_inode *v9inode = V9FS_I(inode);
+ struct fscache_cookie *cookie = v9fs_inode_cookie(v9inode);
loff_t start = folio_pos(folio);
loff_t i_size = i_size_read(inode);
struct iov_iter from;
@@ -181,10 +177,21 @@ static int v9fs_vfs_write_folio_locked(struct folio *folio)
/* We should have writeback_fid always set */
BUG_ON(!v9inode->writeback_fid);
+ folio_wait_fscache(folio);
folio_start_writeback(folio);
p9_client_write(v9inode->writeback_fid, start, &from, &err);
+ if (err == 0 &&
+ fscache_cookie_enabled(cookie) &&
+ test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) {
+ folio_start_fscache(folio);
+ fscache_write_to_cache(v9fs_inode_cookie(v9inode),
+ folio_mapping(folio), start, len, i_size,
+ v9fs_write_to_cache_done, v9inode,
+ true);
+ }
+
folio_end_writeback(folio);
return err;
}
@@ -211,16 +218,8 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
return retval;
}
-/**
- * v9fs_launder_page - Writeback a dirty page
- * @page: The page to be cleaned up
- *
- * Returns 0 on success.
- */
-
-static int v9fs_launder_page(struct page *page)
+static int v9fs_launder_folio(struct folio *folio)
{
- struct folio *folio = page_folio(page);
int retval;
if (folio_clear_dirty_for_io(folio)) {
@@ -272,7 +271,7 @@ v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int flags,
+ loff_t pos, unsigned int len,
struct page **subpagep, void **fsdata)
{
int retval;
@@ -287,8 +286,7 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
* file. We need to do this before we get a lock on the page in case
* there's more than one writer competing for the same cache block.
*/
- retval = netfs_write_begin(filp, mapping, pos, len, flags, &folio, fsdata,
- &v9fs_req_ops, NULL);
+ retval = netfs_write_begin(&v9inode->netfs, filp, mapping, pos, len, &folio, fsdata);
if (retval < 0)
return retval;
@@ -303,6 +301,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
loff_t last_pos = pos + copied;
struct folio *folio = page_folio(subpage);
struct inode *inode = mapping->host;
+ struct v9fs_inode *v9inode = V9FS_I(inode);
p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
@@ -322,6 +321,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
if (last_pos > inode->i_size) {
inode_add_bytes(inode, last_pos - inode->i_size);
i_size_write(inode, last_pos);
+ fscache_update_cookie(v9fs_inode_cookie(v9inode), NULL, &last_pos);
}
folio_mark_dirty(folio);
out:
@@ -331,16 +331,30 @@ out:
return copied;
}
+#ifdef CONFIG_9P_FSCACHE
+/*
+ * Mark a page as having been made dirty and thus needing writeback. We also
+ * need to pin the cache object to write back to.
+ */
+static bool v9fs_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+ struct v9fs_inode *v9inode = V9FS_I(mapping->host);
+
+ return fscache_dirty_folio(mapping, folio, v9fs_inode_cookie(v9inode));
+}
+#else
+#define v9fs_dirty_folio filemap_dirty_folio
+#endif
const struct address_space_operations v9fs_addr_operations = {
- .readpage = v9fs_vfs_readpage,
- .readahead = v9fs_vfs_readahead,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .read_folio = netfs_read_folio,
+ .readahead = netfs_readahead,
+ .dirty_folio = v9fs_dirty_folio,
.writepage = v9fs_vfs_writepage,
.write_begin = v9fs_write_begin,
.write_end = v9fs_write_end,
- .releasepage = v9fs_release_page,
- .invalidatepage = v9fs_invalidate_page,
- .launder_page = v9fs_launder_page,
+ .release_folio = v9fs_release_folio,
+ .invalidate_folio = v9fs_invalidate_folio,
+ .launder_folio = v9fs_launder_folio,
.direct_IO = v9fs_direct_IO,
};
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 8c854d8cb0cd..958680f7f23e 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -17,6 +17,7 @@
#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/uio.h>
+#include <linux/fscache.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
@@ -205,7 +206,10 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
int v9fs_dir_release(struct inode *inode, struct file *filp)
{
+ struct v9fs_inode *v9inode = V9FS_I(inode);
struct p9_fid *fid;
+ __le32 version;
+ loff_t i_size;
fid = filp->private_data;
p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n",
@@ -216,6 +220,15 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
spin_unlock(&inode->i_lock);
p9_client_clunk(fid);
}
+
+ if ((filp->f_mode & FMODE_WRITE)) {
+ version = cpu_to_le32(v9inode->qid.version);
+ i_size = i_size_read(inode);
+ fscache_unuse_cookie(v9fs_inode_cookie(v9inode),
+ &version, &i_size);
+ } else {
+ fscache_unuse_cookie(v9fs_inode_cookie(v9inode), NULL, NULL);
+ }
return 0;
}
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 612e297f3763..2573c08f335c 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -93,7 +93,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
}
mutex_unlock(&v9inode->v_mutex);
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
- v9fs_cache_inode_set_cookie(inode, file);
+ fscache_use_cookie(v9fs_inode_cookie(v9inode),
+ file->f_mode & FMODE_WRITE);
v9fs_open_fid_add(inode, fid);
return 0;
out_error:
@@ -114,7 +115,6 @@ out_error:
static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
{
- int res = 0;
struct inode *inode = file_inode(filp);
p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
@@ -124,7 +124,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
invalidate_mapping_pages(&inode->i_data, 0, -1);
}
- return res;
+ return 0;
}
static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
@@ -139,8 +139,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
fid = filp->private_data;
BUG_ON(fid == NULL);
- if ((fl->fl_flags & FL_POSIX) != FL_POSIX)
- BUG();
+ BUG_ON((fl->fl_flags & FL_POSIX) != FL_POSIX);
res = locks_lock_file_wait(filp, fl);
if (res < 0)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 328c338ff304..3d8297714772 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -228,17 +228,13 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
{
struct v9fs_inode *v9inode;
- v9inode = kmem_cache_alloc(v9fs_inode_cache, GFP_KERNEL);
+ v9inode = alloc_inode_sb(sb, v9fs_inode_cache, GFP_KERNEL);
if (!v9inode)
return NULL;
-#ifdef CONFIG_9P_FSCACHE
- v9inode->fscache = NULL;
- mutex_init(&v9inode->fscache_lock);
-#endif
v9inode->writeback_fid = NULL;
v9inode->cache_validity = 0;
mutex_init(&v9inode->v_mutex);
- return &v9inode->vfs_inode;
+ return &v9inode->netfs.inode;
}
/**
@@ -251,6 +247,15 @@ void v9fs_free_inode(struct inode *inode)
kmem_cache_free(v9fs_inode_cache, V9FS_I(inode));
}
+/*
+ * Set parameters for the netfs library
+ */
+static void v9fs_set_netfs_context(struct inode *inode)
+{
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+ netfs_inode_init(&v9inode->netfs, &v9fs_req_ops);
+}
+
int v9fs_init_inode(struct v9fs_session_info *v9ses,
struct inode *inode, umode_t mode, dev_t rdev)
{
@@ -339,6 +344,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
err = -EINVAL;
goto error;
}
+
+ v9fs_set_netfs_context(inode);
error:
return err;
@@ -381,12 +388,16 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
void v9fs_evict_inode(struct inode *inode)
{
struct v9fs_inode *v9inode = V9FS_I(inode);
+ __le32 version;
truncate_inode_pages_final(&inode->i_data);
+ version = cpu_to_le32(v9inode->qid.version);
+ fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode,
+ &version);
clear_inode(inode);
filemap_fdatawrite(&inode->i_data);
- v9fs_cache_inode_put_cookie(inode);
+ fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false);
/* clunk the fid stashed in writeback_fid */
if (v9inode->writeback_fid) {
p9_client_clunk(v9inode->writeback_fid);
@@ -869,7 +880,8 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
file->private_data = fid;
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
- v9fs_cache_inode_set_cookie(d_inode(dentry), file);
+ fscache_use_cookie(v9fs_inode_cookie(v9inode),
+ file->f_mode & FMODE_WRITE);
v9fs_open_fid_add(inode, fid);
file->f_mode |= FMODE_CREATED;
@@ -1072,6 +1084,8 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns,
struct dentry *dentry, struct iattr *iattr)
{
int retval, use_dentry = 0;
+ struct inode *inode = d_inode(dentry);
+ struct v9fs_inode *v9inode = V9FS_I(inode);
struct v9fs_session_info *v9ses;
struct p9_fid *fid = NULL;
struct p9_wstat wstat;
@@ -1117,7 +1131,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns,
/* Write all dirty data */
if (d_is_reg(dentry))
- filemap_write_and_wait(d_inode(dentry)->i_mapping);
+ filemap_write_and_wait(inode->i_mapping);
retval = p9_client_wstat(fid, &wstat);
@@ -1128,13 +1142,15 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns,
return retval;
if ((iattr->ia_valid & ATTR_SIZE) &&
- iattr->ia_size != i_size_read(d_inode(dentry)))
- truncate_setsize(d_inode(dentry), iattr->ia_size);
+ iattr->ia_size != i_size_read(inode)) {
+ truncate_setsize(inode, iattr->ia_size);
+ fscache_resize_cookie(v9fs_inode_cookie(v9inode), iattr->ia_size);
+ }
- v9fs_invalidate_inode_attr(d_inode(dentry));
+ v9fs_invalidate_inode_attr(inode);
- setattr_copy(&init_user_ns, d_inode(dentry), iattr);
- mark_inode_dirty(d_inode(dentry));
+ setattr_copy(&init_user_ns, inode, iattr);
+ mark_inode_dirty(inode);
return 0;
}
@@ -1235,15 +1251,15 @@ static const char *v9fs_vfs_get_link(struct dentry *dentry,
return ERR_PTR(-ECHILD);
v9ses = v9fs_dentry2v9ses(dentry);
- fid = v9fs_fid_lookup(dentry);
+ if (!v9fs_proto_dotu(v9ses))
+ return ERR_PTR(-EBADF);
+
p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
+ fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
return ERR_CAST(fid);
- if (!v9fs_proto_dotu(v9ses))
- return ERR_PTR(-EBADF);
-
st = p9_client_stat(fid);
p9_client_clunk(fid);
if (IS_ERR(st))
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 7dee89ba32e7..b6eb1160296c 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -274,6 +274,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
if (IS_ERR(ofid)) {
err = PTR_ERR(ofid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+ p9_client_clunk(dfid);
goto out;
}
@@ -285,6 +286,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
if (err) {
p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n",
err);
+ p9_client_clunk(dfid);
goto error;
}
err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags),
@@ -292,6 +294,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
if (err < 0) {
p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n",
err);
+ p9_client_clunk(dfid);
goto error;
}
v9fs_invalidate_inode_attr(dir);
@@ -344,7 +347,8 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
goto err_clunk_old_fid;
file->private_data = ofid;
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
- v9fs_cache_inode_set_cookie(inode, file);
+ fscache_use_cookie(v9fs_inode_cookie(v9inode),
+ file->f_mode & FMODE_WRITE);
v9fs_open_fid_add(inode, ofid);
file->f_mode |= FMODE_CREATED;
out:
@@ -551,7 +555,10 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns,
{
int retval, use_dentry = 0;
struct p9_fid *fid = NULL;
- struct p9_iattr_dotl p9attr;
+ struct p9_iattr_dotl p9attr = {
+ .uid = INVALID_UID,
+ .gid = INVALID_GID,
+ };
struct inode *inode = d_inode(dentry);
p9_debug(P9_DEBUG_VFS, "\n");
@@ -561,14 +568,22 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns,
return retval;
p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid);
- p9attr.mode = iattr->ia_mode;
- p9attr.uid = iattr->ia_uid;
- p9attr.gid = iattr->ia_gid;
- p9attr.size = iattr->ia_size;
- p9attr.atime_sec = iattr->ia_atime.tv_sec;
- p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
- p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
- p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
+ if (iattr->ia_valid & ATTR_MODE)
+ p9attr.mode = iattr->ia_mode;
+ if (iattr->ia_valid & ATTR_UID)
+ p9attr.uid = iattr->ia_uid;
+ if (iattr->ia_valid & ATTR_GID)
+ p9attr.gid = iattr->ia_gid;
+ if (iattr->ia_valid & ATTR_SIZE)
+ p9attr.size = iattr->ia_size;
+ if (iattr->ia_valid & ATTR_ATIME_SET) {
+ p9attr.atime_sec = iattr->ia_atime.tv_sec;
+ p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
+ }
+ if (iattr->ia_valid & ATTR_MTIME_SET) {
+ p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
+ p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
+ }
if (iattr->ia_valid & ATTR_FILE) {
fid = iattr->ia_file->private_data;
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index b739e02f5ef7..97e23b4e6982 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -20,6 +20,7 @@
#include <linux/slab.h>
#include <linux/statfs.h>
#include <linux/magic.h>
+#include <linux/fscache.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
@@ -309,6 +310,7 @@ static int v9fs_write_inode(struct inode *inode,
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
return ret;
}
+ fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode));
return 0;
}
@@ -332,6 +334,7 @@ static int v9fs_write_inode_dotl(struct inode *inode,
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
return ret;
}
+ fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode));
return 0;
}
diff --git a/fs/Kconfig b/fs/Kconfig
index a6313a969bc5..5976eb33535f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -15,11 +15,11 @@ config VALIDATE_FS_PARSER
Enable this to perform validation of the parameter description for a
filesystem when it is registered.
-if BLOCK
-
config FS_IOMAP
bool
+if BLOCK
+
source "fs/ext2/Kconfig"
source "fs/ext4/Kconfig"
source "fs/jbd2/Kconfig"
@@ -42,11 +42,13 @@ source "fs/nilfs2/Kconfig"
source "fs/f2fs/Kconfig"
source "fs/zonefs/Kconfig"
+endif # BLOCK
+
config FS_DAX
bool "File system based Direct Access (DAX) support"
depends on MMU
depends on !(ARM || MIPS || SPARC)
- select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED)
+ depends on ZONE_DEVICE || FS_DAX_LIMITED
select FS_IOMAP
select DAX
help
@@ -89,8 +91,6 @@ config FS_DAX_PMD
config FS_DAX_LIMITED
bool
-endif # BLOCK
-
# Posix ACL utility routines
#
# Note: Posix ACLs can be implemented without these helpers. Never use
@@ -245,19 +245,27 @@ config HUGETLBFS
config HUGETLB_PAGE
def_bool HUGETLBFS
-config HUGETLB_PAGE_FREE_VMEMMAP
+#
+# Select this config option from the architecture Kconfig, if it is preferred
+# to enable the feature of minimizing overhead of struct page associated with
+# each HugeTLB page.
+#
+config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+ bool
+
+config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
def_bool HUGETLB_PAGE
- depends on X86_64
+ depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
depends on SPARSEMEM_VMEMMAP
-config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON
- bool "Default freeing vmemmap pages of HugeTLB to on"
+config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
+ bool "Default optimizing vmemmap pages of HugeTLB to on"
default n
- depends on HUGETLB_PAGE_FREE_VMEMMAP
+ depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP
help
- When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap
+ When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap
pages associated with each HugeTLB page is default off. Say Y here
- to enable freeing vmemmap pages of HugeTLB by default. It can then
+ to enable optimizing vmemmap pages of HugeTLB by default. It can then
be disabled on the command line via hugetlb_free_vmemmap=off.
config MEMFD_CREATE
@@ -344,7 +352,7 @@ config LOCKD
config LOCKD_V4
bool
- depends on NFSD_V3 || NFS_V3
+ depends on NFSD || NFS_V3
depends on FILE_LOCKING
default y
@@ -369,8 +377,8 @@ source "fs/ksmbd/Kconfig"
config SMBFS_COMMON
tristate
- default y if CIFS=y
- default m if CIFS=m
+ default y if CIFS=y || SMB_SERVER=y
+ default m if CIFS=m || SMB_SERVER=m
source "fs/coda/Kconfig"
source "fs/afs/Kconfig"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 4d5ae61580aa..21e154516bf2 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -28,6 +28,16 @@ config BINFMT_ELF
ld.so (check the file <file:Documentation/Changes> for location and
latest version).
+config BINFMT_ELF_KUNIT_TEST
+ bool "Build KUnit tests for ELF binary support" if !KUNIT_ALL_TESTS
+ depends on KUNIT=y && BINFMT_ELF=y
+ default KUNIT_ALL_TESTS
+ help
+ This builds the ELF loader KUnit tests, which try to gather
+ prior bug fixes into a regression test collection. This is really
+ only needed for debugging. Note that with CONFIG_COMPAT=y, the
+ compat_binfmt_elf KUnit test is also created.
+
config COMPAT_BINFMT_ELF
def_bool y
depends on COMPAT && BINFMT_ELF
@@ -36,6 +46,9 @@ config COMPAT_BINFMT_ELF
config ARCH_BINFMT_ELF_STATE
bool
+config ARCH_BINFMT_ELF_EXTRA_PHDRS
+ bool
+
config ARCH_HAVE_ELF_PROT
bool
@@ -45,7 +58,7 @@ config ARCH_USE_GNU_PROPERTY
config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y if !BINFMT_ELF
- depends on (ARM || (SUPERH && !MMU))
+ depends on ARM || ((M68K || SUPERH) && !MMU)
select ELFCORE
help
ELF FDPIC binaries are based on ELF, but allow the individual load
@@ -129,12 +142,6 @@ config BINFMT_ZFLAT
help
Support FLAT format compressed binaries
-config BINFMT_SHARED_FLAT
- bool "Enable shared FLAT support"
- depends on BINFMT_FLAT
- help
- Support FLAT shared libraries
-
config HAVE_AOUT
def_bool n
diff --git a/fs/Makefile b/fs/Makefile
index 84c5e4cdfee5..208a74e0b00e 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -6,6 +6,8 @@
# Rewritten to use lists instead of if-statements.
#
+obj-$(CONFIG_SYSCTL) += sysctls.o
+
obj-y := open.o read_write.o file_table.o super.o \
char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
ioctl.o readdir.o select.o dcache.o inode.o \
@@ -94,7 +96,7 @@ obj-$(CONFIG_EXPORTFS) += exportfs/
obj-$(CONFIG_NFSD) += nfsd/
obj-$(CONFIG_LOCKD) += lockd/
obj-$(CONFIG_NLS) += nls/
-obj-$(CONFIG_UNICODE) += unicode/
+obj-y += unicode/
obj-$(CONFIG_SYSV_FS) += sysv/
obj-$(CONFIG_SMBFS_COMMON) += smbfs_common/
obj-$(CONFIG_CIFS) += cifs/
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index adbb3a1edcbf..ee22278b0cfc 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -38,9 +38,9 @@ static int adfs_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page, adfs_get_block, wbc);
}
-static int adfs_readpage(struct file *file, struct page *page)
+static int adfs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, adfs_get_block);
+ return block_read_full_folio(folio, adfs_get_block);
}
static void adfs_write_failed(struct address_space *mapping, loff_t to)
@@ -52,13 +52,13 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to)
}
static int adfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
*pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
adfs_get_block,
&ADFS_I(mapping->host)->mmu_private);
if (unlikely(ret))
@@ -73,8 +73,9 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
}
static const struct address_space_operations adfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
- .readpage = adfs_readpage,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
+ .read_folio = adfs_read_folio,
.writepage = adfs_writepage,
.write_begin = adfs_write_begin,
.write_end = generic_write_end,
@@ -355,7 +356,6 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct super_block *sb = inode->i_sb;
struct object_info obj;
- int ret;
obj.indaddr = ADFS_I(inode)->indaddr;
obj.name_len = 0;
@@ -365,6 +365,5 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
obj.attr = ADFS_I(inode)->attr;
obj.size = inode->i_size;
- ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL);
- return ret;
+ return adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL);
}
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index bdbd26e571ed..e8bfc38239cd 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -220,7 +220,7 @@ static struct kmem_cache *adfs_inode_cachep;
static struct inode *adfs_alloc_inode(struct super_block *sb)
{
struct adfs_inode_info *ei;
- ei = kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, adfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 75ebd2b576ca..cd00a4c68a12 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -375,9 +375,9 @@ static int affs_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page, affs_get_block, wbc);
}
-static int affs_readpage(struct file *file, struct page *page)
+static int affs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, affs_get_block);
+ return block_read_full_folio(folio, affs_get_block);
}
static void affs_write_failed(struct address_space *mapping, loff_t to)
@@ -414,13 +414,13 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
static int affs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
*pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
affs_get_block,
&AFFS_I(mapping->host)->mmu_private);
if (unlikely(ret))
@@ -453,8 +453,9 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations affs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
- .readpage = affs_readpage,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
+ .read_folio = affs_read_folio,
.writepage = affs_writepage,
.write_begin = affs_write_begin,
.write_end = affs_write_end,
@@ -628,8 +629,9 @@ out:
}
static int
-affs_readpage_ofs(struct file *file, struct page *page)
+affs_read_folio_ofs(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
u32 to;
int err;
@@ -649,7 +651,7 @@ affs_readpage_ofs(struct file *file, struct page *page)
}
static int affs_write_begin_ofs(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
@@ -669,7 +671,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
}
index = pos >> PAGE_SHIFT;
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
*pagep = page;
@@ -834,8 +836,9 @@ err_bh:
}
const struct address_space_operations affs_aops_ofs = {
- .set_page_dirty = __set_page_dirty_buffers,
- .readpage = affs_readpage_ofs,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
+ .read_folio = affs_read_folio_ofs,
//.writepage = affs_writepage_ofs,
.write_begin = affs_write_begin_ofs,
.write_end = affs_write_end_ofs
@@ -885,7 +888,7 @@ affs_truncate(struct inode *inode)
loff_t isize = inode->i_size;
int res;
- res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, 0, &page, &fsdata);
+ res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, &page, &fsdata);
if (!res)
res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, page, fsdata);
else
diff --git a/fs/affs/super.c b/fs/affs/super.c
index c609005a9eaa..4c5f30a83336 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -100,7 +100,7 @@ static struct inode *affs_alloc_inode(struct super_block *sb)
{
struct affs_inode_info *i;
- i = kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL);
+ i = alloc_inode_sb(sb, affs_inode_cachep, GFP_KERNEL);
if (!i)
return NULL;
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index a7531b26e8f0..31d6446dc166 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -11,8 +11,9 @@
#include "affs.h"
-static int affs_symlink_readpage(struct file *file, struct page *page)
+static int affs_symlink_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct buffer_head *bh;
struct inode *inode = page->mapping->host;
char *link = page_address(page);
@@ -67,7 +68,7 @@ fail:
}
const struct address_space_operations affs_symlink_aops = {
- .readpage = affs_symlink_readpage,
+ .read_folio = affs_symlink_read_folio,
};
const struct inode_operations affs_symlink_inode_operations = {
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 75c4e4043d1d..e8956b65d7ff 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -3,10 +3,7 @@
# Makefile for Red Hat Linux AFS client.
#
-afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o
-
kafs-y := \
- $(afs-cache-y) \
addr_list.o \
callback.o \
cell.o \
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
deleted file mode 100644
index 037af93e3aba..000000000000
--- a/fs/afs/cache.c
+++ /dev/null
@@ -1,68 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* AFS caching stuff
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/sched.h>
-#include "internal.h"
-
-static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
- const void *buffer,
- uint16_t buflen,
- loff_t object_size);
-
-struct fscache_netfs afs_cache_netfs = {
- .name = "afs",
- .version = 2,
-};
-
-struct fscache_cookie_def afs_cell_cache_index_def = {
- .name = "AFS.cell",
- .type = FSCACHE_COOKIE_TYPE_INDEX,
-};
-
-struct fscache_cookie_def afs_volume_cache_index_def = {
- .name = "AFS.volume",
- .type = FSCACHE_COOKIE_TYPE_INDEX,
-};
-
-struct fscache_cookie_def afs_vnode_cache_index_def = {
- .name = "AFS.vnode",
- .type = FSCACHE_COOKIE_TYPE_DATAFILE,
- .check_aux = afs_vnode_cache_check_aux,
-};
-
-/*
- * check that the auxiliary data indicates that the entry is still valid
- */
-static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
- const void *buffer,
- uint16_t buflen,
- loff_t object_size)
-{
- struct afs_vnode *vnode = cookie_netfs_data;
- struct afs_vnode_cache_aux aux;
-
- _enter("{%llx,%x,%llx},%p,%u",
- vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
- buffer, buflen);
-
- memcpy(&aux, buffer, sizeof(aux));
-
- /* check the size of the data is what we're expecting */
- if (buflen != sizeof(aux)) {
- _leave(" = OBSOLETE [len %hx != %zx]", buflen, sizeof(aux));
- return FSCACHE_CHECKAUX_OBSOLETE;
- }
-
- if (vnode->status.data_version != aux.data_version) {
- _leave(" = OBSOLETE [vers %llx != %llx]",
- aux.data_version, vnode->status.data_version);
- return FSCACHE_CHECKAUX_OBSOLETE;
- }
-
- _leave(" = SUCCESS");
- return FSCACHE_CHECKAUX_OKAY;
-}
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 1b4d5809808d..a484fa642808 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -30,7 +30,7 @@ void afs_invalidate_mmap_work(struct work_struct *work)
{
struct afs_vnode *vnode = container_of(work, struct afs_vnode, cb_work);
- unmap_mapping_pages(vnode->vfs_inode.i_mapping, 0, 0, false);
+ unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);
}
void afs_server_init_callback_work(struct work_struct *work)
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index d88407fb9bc0..07ad744eef77 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -680,13 +680,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
return ret;
}
-#ifdef CONFIG_AFS_FSCACHE
- cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
- &afs_cell_cache_index_def,
- cell->name, strlen(cell->name),
- NULL, 0,
- cell, 0, true);
-#endif
ret = afs_proc_cell_setup(cell);
if (ret < 0)
return ret;
@@ -723,11 +716,6 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell)
afs_dynroot_rmdir(net, cell);
mutex_unlock(&net->proc_cells_lock);
-#ifdef CONFIG_AFS_FSCACHE
- fscache_relinquish_cookie(cell->cache, NULL, false);
- cell->cache = NULL;
-#endif
-
_leave("");
}
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index da9b4f8577a1..56ae5cd5184f 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -41,11 +41,12 @@ static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry, unsigned int flags);
-static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags);
-static void afs_dir_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length);
+static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags);
+static void afs_dir_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length);
-static int afs_dir_set_page_dirty(struct page *page)
+static bool afs_dir_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
BUG(); /* This should never happen. */
}
@@ -73,9 +74,9 @@ const struct inode_operations afs_dir_inode_operations = {
};
const struct address_space_operations afs_dir_aops = {
- .set_page_dirty = afs_dir_set_page_dirty,
- .releasepage = afs_dir_releasepage,
- .invalidatepage = afs_dir_invalidatepage,
+ .dirty_folio = afs_dir_dirty_folio,
+ .release_folio = afs_dir_release_folio,
+ .invalidate_folio = afs_dir_invalidate_folio,
};
const struct dentry_operations afs_fs_dentry_operations = {
@@ -108,7 +109,7 @@ struct afs_lookup_cookie {
*/
static void afs_dir_read_cleanup(struct afs_read *req)
{
- struct address_space *mapping = req->vnode->vfs_inode.i_mapping;
+ struct address_space *mapping = req->vnode->netfs.inode.i_mapping;
struct folio *folio;
pgoff_t last = req->nr_pages - 1;
@@ -152,7 +153,7 @@ static bool afs_dir_check_folio(struct afs_vnode *dvnode, struct folio *folio,
block = kmap_local_folio(folio, offset);
if (block->hdr.magic != AFS_DIR_MAGIC) {
printk("kAFS: %s(%lx): [%llx] bad magic %zx/%zx is %04hx\n",
- __func__, dvnode->vfs_inode.i_ino,
+ __func__, dvnode->netfs.inode.i_ino,
pos, offset, size, ntohs(block->hdr.magic));
trace_afs_dir_check_failed(dvnode, pos + offset, i_size);
kunmap_local(block);
@@ -182,7 +183,7 @@ error:
static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
{
union afs_xdr_dir_block *block;
- struct address_space *mapping = dvnode->vfs_inode.i_mapping;
+ struct address_space *mapping = dvnode->netfs.inode.i_mapping;
struct folio *folio;
pgoff_t last = req->nr_pages - 1;
size_t offset, size;
@@ -216,7 +217,7 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
*/
static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req)
{
- struct address_space *mapping = dvnode->vfs_inode.i_mapping;
+ struct address_space *mapping = dvnode->netfs.inode.i_mapping;
struct folio *folio;
pgoff_t last = req->nr_pages - 1;
int ret = 0;
@@ -268,7 +269,7 @@ static int afs_dir_open(struct inode *inode, struct file *file)
static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
__acquires(&dvnode->validate_lock)
{
- struct address_space *mapping = dvnode->vfs_inode.i_mapping;
+ struct address_space *mapping = dvnode->netfs.inode.i_mapping;
struct afs_read *req;
loff_t i_size;
int nr_pages, i;
@@ -286,7 +287,7 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
req->cleanup = afs_dir_read_cleanup;
expand:
- i_size = i_size_read(&dvnode->vfs_inode);
+ i_size = i_size_read(&dvnode->netfs.inode);
if (i_size < 2048) {
ret = afs_bad(dvnode, afs_file_error_dir_small);
goto error;
@@ -304,7 +305,7 @@ expand:
req->actual_len = i_size; /* May change */
req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */
req->data_version = dvnode->status.data_version; /* May change */
- iov_iter_xarray(&req->def_iter, READ, &dvnode->vfs_inode.i_mapping->i_pages,
+ iov_iter_xarray(&req->def_iter, READ, &dvnode->netfs.inode.i_mapping->i_pages,
0, i_size);
req->iter = &req->def_iter;
@@ -462,8 +463,11 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode,
}
/* skip if starts before the current position */
- if (offset < curr)
+ if (offset < curr) {
+ if (next > curr)
+ ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent);
continue;
+ }
/* found the next entry */
if (!dir_emit(ctx, dire->u.name, nlen,
@@ -893,7 +897,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
out_op:
if (op->error == 0) {
- inode = &op->file[1].vnode->vfs_inode;
+ inode = &op->file[1].vnode->netfs.inode;
op->file[1].vnode = NULL;
}
@@ -1135,7 +1139,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
afs_stat_v(dir, n_reval);
/* search the directory for this vnode */
- ret = afs_do_lookup_one(&dir->vfs_inode, dentry, &fid, key, &dir_version);
+ ret = afs_do_lookup_one(&dir->netfs.inode, dentry, &fid, key, &dir_version);
switch (ret) {
case 0:
/* the filename maps to something */
@@ -1166,7 +1170,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
_debug("%pd: file deleted (uq %u -> %u I:%u)",
dentry, fid.unique,
vnode->fid.unique,
- vnode->vfs_inode.i_generation);
+ vnode->netfs.inode.i_generation);
goto not_found;
}
goto out_valid;
@@ -1364,7 +1368,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry)
if (d_really_is_positive(dentry)) {
struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
- clear_nlink(&vnode->vfs_inode);
+ clear_nlink(&vnode->netfs.inode);
set_bit(AFS_VNODE_DELETED, &vnode->flags);
clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
@@ -1483,8 +1487,8 @@ static void afs_dir_remove_link(struct afs_operation *op)
/* Already done */
} else if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {
write_seqlock(&vnode->cb_lock);
- drop_nlink(&vnode->vfs_inode);
- if (vnode->vfs_inode.i_nlink == 0) {
+ drop_nlink(&vnode->netfs.inode);
+ if (vnode->netfs.inode.i_nlink == 0) {
set_bit(AFS_VNODE_DELETED, &vnode->flags);
__afs_break_callback(vnode, afs_cb_break_for_unlink);
}
@@ -1500,7 +1504,7 @@ static void afs_dir_remove_link(struct afs_operation *op)
op->error = ret;
}
- _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, op->error);
+ _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, op->error);
}
static void afs_unlink_success(struct afs_operation *op)
@@ -1676,8 +1680,8 @@ static void afs_link_success(struct afs_operation *op)
afs_update_dentry_version(op, dvp, op->dentry);
if (op->dentry_2->d_parent == op->dentry->d_parent)
afs_update_dentry_version(op, dvp, op->dentry_2);
- ihold(&vp->vnode->vfs_inode);
- d_instantiate(op->dentry, &vp->vnode->vfs_inode);
+ ihold(&vp->vnode->netfs.inode);
+ d_instantiate(op->dentry, &vp->vnode->netfs.inode);
}
static void afs_link_put(struct afs_operation *op)
@@ -2001,9 +2005,8 @@ error:
* Release a directory folio and clean up its private state if it's not busy
* - return true if the folio can now be released, false if not
*/
-static int afs_dir_releasepage(struct page *subpage, gfp_t gfp_flags)
+static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags)
{
- struct folio *folio = page_folio(subpage);
struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio));
_enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio_index(folio));
@@ -2019,13 +2022,12 @@ static int afs_dir_releasepage(struct page *subpage, gfp_t gfp_flags)
/*
* Invalidate part or all of a folio.
*/
-static void afs_dir_invalidatepage(struct page *subpage, unsigned int offset,
- unsigned int length)
+static void afs_dir_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct folio *folio = page_folio(subpage);
struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio));
- _enter("{%lu},%u,%u", folio_index(folio), offset, length);
+ _enter("{%lu},%zu,%zu", folio->index, offset, length);
BUG_ON(!folio_test_locked(folio));
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index d98e109ecee9..0ab7752d1b75 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -109,7 +109,7 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block,
*/
static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index)
{
- struct address_space *mapping = vnode->vfs_inode.i_mapping;
+ struct address_space *mapping = vnode->netfs.inode.i_mapping;
struct folio *folio;
folio = __filemap_get_folio(mapping, index,
@@ -216,7 +216,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
_enter(",,{%d,%s},", name->len, name->name);
- i_size = i_size_read(&vnode->vfs_inode);
+ i_size = i_size_read(&vnode->netfs.inode);
if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||
(i_size & (AFS_DIR_BLOCK_SIZE - 1))) {
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
@@ -336,7 +336,7 @@ found_space:
if (b < AFS_DIR_BLOCKS_WITH_CTR)
meta->meta.alloc_ctrs[b] -= need_slots;
- inode_inc_iversion_raw(&vnode->vfs_inode);
+ inode_inc_iversion_raw(&vnode->netfs.inode);
afs_stat_v(vnode, n_dir_cr);
_debug("Insert %s in %u[%u]", name->name, b, slot);
@@ -383,7 +383,7 @@ void afs_edit_dir_remove(struct afs_vnode *vnode,
_enter(",,{%d,%s},", name->len, name->name);
- i_size = i_size_read(&vnode->vfs_inode);
+ i_size = i_size_read(&vnode->netfs.inode);
if (i_size < AFS_DIR_BLOCK_SIZE ||
i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||
(i_size & (AFS_DIR_BLOCK_SIZE - 1))) {
@@ -463,7 +463,7 @@ found_dirent:
if (b < AFS_DIR_BLOCKS_WITH_CTR)
meta->meta.alloc_ctrs[b] += need_slots;
- inode_set_iversion_raw(&vnode->vfs_inode, vnode->status.data_version);
+ inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version);
afs_stat_v(vnode, n_dir_rm);
_debug("Remove %s from %u[%u]", name->name, b, slot);
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index 45cfd50a9521..bb5807e87fa4 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -131,7 +131,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode,
goto out;
} while (!d_is_negative(sdentry));
- ihold(&vnode->vfs_inode);
+ ihold(&vnode->netfs.inode);
ret = afs_do_silly_rename(dvnode, vnode, dentry, sdentry, key);
switch (ret) {
@@ -148,7 +148,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode,
d_drop(sdentry);
}
- iput(&vnode->vfs_inode);
+ iput(&vnode->netfs.inode);
dput(sdentry);
out:
_leave(" = %d", ret);
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index db832cc931c8..d7d9402ff718 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -76,6 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
/* there shouldn't be an existing inode */
BUG_ON(!(inode->i_state & I_NEW));
+ netfs_inode_init(&vnode->netfs, NULL);
inode->i_size = 0;
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
if (root) {
diff --git a/fs/afs/file.c b/fs/afs/file.c
index cb6ad61eec3b..42118a4f3383 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -14,17 +14,16 @@
#include <linux/gfp.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/mm.h>
+#include <linux/swap.h>
#include <linux/netfs.h>
#include "internal.h"
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
-static int afs_readpage(struct file *file, struct page *page);
-static int afs_symlink_readpage(struct file *file, struct page *page);
-static void afs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length);
-static int afs_releasepage(struct page *page, gfp_t gfp_flags);
+static int afs_symlink_read_folio(struct file *file, struct folio *folio);
+static void afs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length);
+static bool afs_release_folio(struct folio *folio, gfp_t gfp_flags);
-static void afs_readahead(struct readahead_control *ractl);
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
static void afs_vm_open(struct vm_area_struct *area);
static void afs_vm_close(struct vm_area_struct *area);
@@ -51,12 +50,12 @@ const struct inode_operations afs_file_inode_operations = {
};
const struct address_space_operations afs_file_aops = {
- .readpage = afs_readpage,
- .readahead = afs_readahead,
- .set_page_dirty = afs_set_page_dirty,
- .launder_page = afs_launder_page,
- .releasepage = afs_releasepage,
- .invalidatepage = afs_invalidatepage,
+ .read_folio = netfs_read_folio,
+ .readahead = netfs_readahead,
+ .dirty_folio = afs_dirty_folio,
+ .launder_folio = afs_launder_folio,
+ .release_folio = afs_release_folio,
+ .invalidate_folio = afs_invalidate_folio,
.write_begin = afs_write_begin,
.write_end = afs_write_end,
.writepage = afs_writepage,
@@ -64,9 +63,9 @@ const struct address_space_operations afs_file_aops = {
};
const struct address_space_operations afs_symlink_aops = {
- .readpage = afs_symlink_readpage,
- .releasepage = afs_releasepage,
- .invalidatepage = afs_invalidatepage,
+ .read_folio = afs_symlink_read_folio,
+ .release_folio = afs_release_folio,
+ .invalidate_folio = afs_invalidate_folio,
};
static const struct vm_operations_struct afs_vm_ops = {
@@ -158,7 +157,9 @@ int afs_open(struct inode *inode, struct file *file)
if (file->f_flags & O_TRUNC)
set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
-
+
+ fscache_use_cookie(afs_vnode_cache(vnode), file->f_mode & FMODE_WRITE);
+
file->private_data = af;
_leave(" = 0");
return 0;
@@ -177,8 +178,10 @@ error:
*/
int afs_release(struct inode *inode, struct file *file)
{
+ struct afs_vnode_cache_aux aux;
struct afs_vnode *vnode = AFS_FS_I(inode);
struct afs_file *af = file->private_data;
+ loff_t i_size;
int ret = 0;
_enter("{%llx:%llu},", vnode->fid.vid, vnode->fid.vnode);
@@ -189,6 +192,15 @@ int afs_release(struct inode *inode, struct file *file)
file->private_data = NULL;
if (af->wb)
afs_put_wb_key(af->wb);
+
+ if ((file->f_mode & FMODE_WRITE)) {
+ i_size = i_size_read(&vnode->netfs.inode);
+ afs_set_cache_aux(vnode, &aux);
+ fscache_unuse_cookie(afs_vnode_cache(vnode), &aux, &i_size);
+ } else {
+ fscache_unuse_cookie(afs_vnode_cache(vnode), NULL, NULL);
+ }
+
key_put(af->key);
kfree(af);
afs_prune_wb_keys(vnode);
@@ -226,7 +238,7 @@ void afs_put_read(struct afs_read *req)
static void afs_fetch_data_notify(struct afs_operation *op)
{
struct afs_read *req = op->fetch.req;
- struct netfs_read_subrequest *subreq = req->subreq;
+ struct netfs_io_subrequest *subreq = req->subreq;
int error = op->error;
if (error == -ECONNABORTED)
@@ -296,7 +308,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req)
return afs_do_sync_operation(op);
}
-static void afs_req_issue_op(struct netfs_read_subrequest *subreq)
+static void afs_issue_read(struct netfs_io_subrequest *subreq)
{
struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
struct afs_read *fsreq;
@@ -313,18 +325,17 @@ static void afs_req_issue_op(struct netfs_read_subrequest *subreq)
fsreq->iter = &fsreq->def_iter;
iov_iter_xarray(&fsreq->def_iter, READ,
- &fsreq->vnode->vfs_inode.i_mapping->i_pages,
+ &fsreq->vnode->netfs.inode.i_mapping->i_pages,
fsreq->pos, fsreq->len);
afs_fetch_data(fsreq->vnode, fsreq);
afs_put_read(fsreq);
}
-static int afs_symlink_readpage(struct file *file, struct page *page)
+static int afs_symlink_read_folio(struct file *file, struct folio *folio)
{
- struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+ struct afs_vnode *vnode = AFS_FS_I(folio->mapping->host);
struct afs_read *fsreq;
- struct folio *folio = page_folio(page);
int ret;
fsreq = afs_alloc_read(GFP_NOFS);
@@ -335,33 +346,32 @@ static int afs_symlink_readpage(struct file *file, struct page *page)
fsreq->len = folio_size(folio);
fsreq->vnode = vnode;
fsreq->iter = &fsreq->def_iter;
- iov_iter_xarray(&fsreq->def_iter, READ, &page->mapping->i_pages,
+ iov_iter_xarray(&fsreq->def_iter, READ, &folio->mapping->i_pages,
fsreq->pos, fsreq->len);
ret = afs_fetch_data(fsreq->vnode, fsreq);
if (ret == 0)
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return ret;
}
-static void afs_init_rreq(struct netfs_read_request *rreq, struct file *file)
+static int afs_init_request(struct netfs_io_request *rreq, struct file *file)
{
rreq->netfs_priv = key_get(afs_file_key(file));
+ return 0;
}
-static bool afs_is_cache_enabled(struct inode *inode)
-{
- struct fscache_cookie *cookie = afs_vnode_cache(AFS_FS_I(inode));
-
- return fscache_cookie_enabled(cookie) && !hlist_empty(&cookie->backing_objects);
-}
-
-static int afs_begin_cache_operation(struct netfs_read_request *rreq)
+static int afs_begin_cache_operation(struct netfs_io_request *rreq)
{
+#ifdef CONFIG_AFS_FSCACHE
struct afs_vnode *vnode = AFS_FS_I(rreq->inode);
- return fscache_begin_read_operation(rreq, afs_vnode_cache(vnode));
+ return fscache_begin_read_operation(&rreq->cache_resources,
+ afs_vnode_cache(vnode));
+#else
+ return -ENOBUFS;
+#endif
}
static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
@@ -372,38 +382,31 @@ static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;
}
-static void afs_priv_cleanup(struct address_space *mapping, void *netfs_priv)
+static void afs_free_request(struct netfs_io_request *rreq)
{
- key_put(netfs_priv);
+ key_put(rreq->netfs_priv);
}
-const struct netfs_read_request_ops afs_req_ops = {
- .init_rreq = afs_init_rreq,
- .is_cache_enabled = afs_is_cache_enabled,
+const struct netfs_request_ops afs_req_ops = {
+ .init_request = afs_init_request,
+ .free_request = afs_free_request,
.begin_cache_operation = afs_begin_cache_operation,
.check_write_begin = afs_check_write_begin,
- .issue_op = afs_req_issue_op,
- .cleanup = afs_priv_cleanup,
+ .issue_read = afs_issue_read,
};
-static int afs_readpage(struct file *file, struct page *page)
+int afs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- struct folio *folio = page_folio(page);
-
- return netfs_readpage(file, folio, &afs_req_ops, NULL);
-}
-
-static void afs_readahead(struct readahead_control *ractl)
-{
- netfs_readahead(ractl, &afs_req_ops, NULL);
+ fscache_unpin_writeback(wbc, afs_vnode_cache(AFS_FS_I(inode)));
+ return 0;
}
/*
* Adjust the dirty region of the page on truncation or full invalidation,
* getting rid of the markers altogether if the region is entirely invalidated.
*/
-static void afs_invalidate_dirty(struct folio *folio, unsigned int offset,
- unsigned int length)
+static void afs_invalidate_dirty(struct folio *folio, size_t offset,
+ size_t length)
{
struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
unsigned long priv;
@@ -460,16 +463,14 @@ full_invalidate:
* - release a page and clean up its private data if offset is 0 (indicating
* the entire page)
*/
-static void afs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void afs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct folio *folio = page_folio(page);
-
- _enter("{%lu},%u,%u", folio_index(folio), offset, length);
+ _enter("{%lu},%zu,%zu", folio->index, offset, length);
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
- if (PagePrivate(page))
+ if (folio_get_private(folio))
afs_invalidate_dirty(folio, offset, length);
folio_wait_fscache(folio);
@@ -480,23 +481,23 @@ static void afs_invalidatepage(struct page *page, unsigned int offset,
* release a page and clean up its private state if it's not busy
* - return true if the page can now be released, false if not
*/
-static int afs_releasepage(struct page *page, gfp_t gfp_flags)
+static bool afs_release_folio(struct folio *folio, gfp_t gfp)
{
- struct folio *folio = page_folio(page);
struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
_enter("{{%llx:%llu}[%lu],%lx},%x",
vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags,
- gfp_flags);
+ gfp);
- /* deny if page is being written to the cache and the caller hasn't
+ /* deny if folio is being written to the cache and the caller hasn't
* elected to wait */
#ifdef CONFIG_AFS_FSCACHE
if (folio_test_fscache(folio)) {
- if (!(gfp_flags & __GFP_DIRECT_RECLAIM) || !(gfp_flags & __GFP_FS))
+ if (current_is_kswapd() || !(gfp & __GFP_FS))
return false;
folio_wait_fscache(folio);
}
+ fscache_note_page_release(afs_vnode_cache(vnode));
#endif
if (folio_test_private(folio)) {
@@ -514,8 +515,9 @@ static void afs_add_open_mmap(struct afs_vnode *vnode)
if (atomic_inc_return(&vnode->cb_nr_mmap) == 1) {
down_write(&vnode->volume->cell->fs_open_mmaps_lock);
- list_add_tail(&vnode->cb_mmap_link,
- &vnode->volume->cell->fs_open_mmaps);
+ if (list_empty(&vnode->cb_mmap_link))
+ list_add_tail(&vnode->cb_mmap_link,
+ &vnode->volume->cell->fs_open_mmaps);
up_write(&vnode->volume->cell->fs_open_mmaps_lock);
}
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index d222dfbe976b..7a3803ce3a22 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -232,14 +232,14 @@ int afs_put_operation(struct afs_operation *op)
if (op->file[1].modification && op->file[1].vnode != op->file[0].vnode)
clear_bit(AFS_VNODE_MODIFYING, &op->file[1].vnode->flags);
if (op->file[0].put_vnode)
- iput(&op->file[0].vnode->vfs_inode);
+ iput(&op->file[0].vnode->netfs.inode);
if (op->file[1].put_vnode)
- iput(&op->file[1].vnode->vfs_inode);
+ iput(&op->file[1].vnode->netfs.inode);
if (op->more_files) {
for (i = 0; i < op->nr_files - 2; i++)
if (op->more_files[i].put_vnode)
- iput(&op->more_files[i].vnode->vfs_inode);
+ iput(&op->more_files[i].vnode->netfs.inode);
kfree(op->more_files);
}
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 16906eb592d9..64dab70d4a4f 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -54,6 +54,14 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren
}
/*
+ * Set parameters for the netfs library
+ */
+static void afs_set_netfs_context(struct afs_vnode *vnode)
+{
+ netfs_inode_init(&vnode->netfs, &afs_req_ops);
+}
+
+/*
* Initialise an inode from the vnode status.
*/
static int afs_inode_init_from_status(struct afs_operation *op,
@@ -88,7 +96,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,
inode->i_flags |= S_NOATIME;
inode->i_uid = make_kuid(&init_user_ns, status->owner);
inode->i_gid = make_kgid(&init_user_ns, status->group);
- set_nlink(&vnode->vfs_inode, status->nlink);
+ set_nlink(&vnode->netfs.inode, status->nlink);
switch (status->type) {
case AFS_FTYPE_FILE:
@@ -128,9 +136,10 @@ static int afs_inode_init_from_status(struct afs_operation *op,
}
afs_set_i_size(vnode, status->size);
+ afs_set_netfs_context(vnode);
vnode->invalid_before = status->data_version;
- inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
+ inode_set_iversion_raw(&vnode->netfs.inode, status->data_version);
if (!vp->scb.have_cb) {
/* it's a symlink we just created (the fileserver
@@ -154,7 +163,7 @@ static void afs_apply_status(struct afs_operation *op,
{
struct afs_file_status *status = &vp->scb.status;
struct afs_vnode *vnode = vp->vnode;
- struct inode *inode = &vnode->vfs_inode;
+ struct inode *inode = &vnode->netfs.inode;
struct timespec64 t;
umode_t mode;
bool data_changed = false;
@@ -237,6 +246,7 @@ static void afs_apply_status(struct afs_operation *op,
* idea of what the size should be that's not the same as
* what's on the server.
*/
+ vnode->netfs.remote_i_size = status->size;
if (change_size) {
afs_set_i_size(vnode, status->size);
inode->i_ctime = t;
@@ -279,7 +289,7 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
*/
if (vp->scb.status.abort_code == VNOVNODE) {
set_bit(AFS_VNODE_DELETED, &vnode->flags);
- clear_nlink(&vnode->vfs_inode);
+ clear_nlink(&vnode->netfs.inode);
__afs_break_callback(vnode, afs_cb_break_for_deleted);
op->flags &= ~AFS_OPERATION_DIR_CONFLICT;
}
@@ -296,8 +306,8 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
if (vp->scb.have_cb)
afs_apply_callback(op, vp);
} else if (vp->op_unlinked && !(op->flags & AFS_OPERATION_DIR_CONFLICT)) {
- drop_nlink(&vnode->vfs_inode);
- if (vnode->vfs_inode.i_nlink == 0) {
+ drop_nlink(&vnode->netfs.inode);
+ if (vnode->netfs.inode.i_nlink == 0) {
set_bit(AFS_VNODE_DELETED, &vnode->flags);
__afs_break_callback(vnode, afs_cb_break_for_deleted);
}
@@ -316,7 +326,7 @@ static void afs_fetch_status_success(struct afs_operation *op)
struct afs_vnode *vnode = vp->vnode;
int ret;
- if (vnode->vfs_inode.i_state & I_NEW) {
+ if (vnode->netfs.inode.i_state & I_NEW) {
ret = afs_inode_init_from_status(op, vp, vnode);
op->error = ret;
if (ret == 0)
@@ -413,28 +423,31 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
{
#ifdef CONFIG_AFS_FSCACHE
struct {
- u32 vnode_id;
- u32 unique;
- u32 vnode_id_ext[2]; /* Allow for a 96-bit key */
+ __be32 vnode_id;
+ __be32 unique;
+ __be32 vnode_id_ext[2]; /* Allow for a 96-bit key */
} __packed key;
struct afs_vnode_cache_aux aux;
if (vnode->status.type != AFS_FTYPE_FILE) {
- vnode->cache = NULL;
+ vnode->netfs.cache = NULL;
return;
}
- key.vnode_id = vnode->fid.vnode;
- key.unique = vnode->fid.unique;
- key.vnode_id_ext[0] = vnode->fid.vnode >> 32;
- key.vnode_id_ext[1] = vnode->fid.vnode_hi;
- aux.data_version = vnode->status.data_version;
-
- vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
- &afs_vnode_cache_index_def,
- &key, sizeof(key),
- &aux, sizeof(aux),
- vnode, vnode->status.size, true);
+ key.vnode_id = htonl(vnode->fid.vnode);
+ key.unique = htonl(vnode->fid.unique);
+ key.vnode_id_ext[0] = htonl(vnode->fid.vnode >> 32);
+ key.vnode_id_ext[1] = htonl(vnode->fid.vnode_hi);
+ afs_set_cache_aux(vnode, &aux);
+
+ afs_vnode_set_cache(vnode,
+ fscache_acquire_cookie(
+ vnode->volume->cache,
+ vnode->status.type == AFS_FTYPE_FILE ?
+ 0 : FSCACHE_ADV_SINGLE_CHUNK,
+ &key, sizeof(key),
+ &aux, sizeof(aux),
+ vnode->status.size));
#endif
}
@@ -444,7 +457,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
struct inode *afs_iget(struct afs_operation *op, struct afs_vnode_param *vp)
{
struct afs_vnode_param *dvp = &op->file[0];
- struct super_block *sb = dvp->vnode->vfs_inode.i_sb;
+ struct super_block *sb = dvp->vnode->netfs.inode.i_sb;
struct afs_vnode *vnode;
struct inode *inode;
int ret;
@@ -527,6 +540,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key)
vnode = AFS_FS_I(inode);
vnode->cb_v_break = as->volume->cb_v_break,
+ afs_set_netfs_context(vnode);
op = afs_alloc_operation(key, as->volume);
if (IS_ERR(op)) {
@@ -563,17 +577,15 @@ static void afs_zap_data(struct afs_vnode *vnode)
{
_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
-#ifdef CONFIG_AFS_FSCACHE
- fscache_invalidate(vnode->cache);
-#endif
+ afs_invalidate_cache(vnode, 0);
/* nuke all the non-dirty pages that aren't locked, mapped or being
* written back in a regular file and completely discard the pages in a
* directory or symlink */
- if (S_ISREG(vnode->vfs_inode.i_mode))
- invalidate_remote_inode(&vnode->vfs_inode);
+ if (S_ISREG(vnode->netfs.inode.i_mode))
+ invalidate_remote_inode(&vnode->netfs.inode);
else
- invalidate_inode_pages2(vnode->vfs_inode.i_mapping);
+ invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
}
/*
@@ -671,8 +683,8 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
key_serial(key));
if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
- if (vnode->vfs_inode.i_nlink)
- clear_nlink(&vnode->vfs_inode);
+ if (vnode->netfs.inode.i_nlink)
+ clear_nlink(&vnode->netfs.inode);
goto valid;
}
@@ -728,10 +740,23 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path,
{
struct inode *inode = d_inode(path->dentry);
struct afs_vnode *vnode = AFS_FS_I(inode);
- int seq = 0;
+ struct key *key;
+ int ret, seq = 0;
_enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation);
+ if (vnode->volume &&
+ !(query_flags & AT_STATX_DONT_SYNC) &&
+ !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+ key = afs_request_key(vnode->volume->cell);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+ ret = afs_validate(vnode, key);
+ key_put(key);
+ if (ret < 0)
+ return ret;
+ }
+
do {
read_seqbegin_or_lock(&vnode->cb_lock, &seq);
generic_fillattr(&init_user_ns, inode, stat);
@@ -762,9 +787,8 @@ int afs_drop_inode(struct inode *inode)
*/
void afs_evict_inode(struct inode *inode)
{
- struct afs_vnode *vnode;
-
- vnode = AFS_FS_I(inode);
+ struct afs_vnode_cache_aux aux;
+ struct afs_vnode *vnode = AFS_FS_I(inode);
_enter("{%llx:%llu.%d}",
vnode->fid.vid,
@@ -776,6 +800,9 @@ void afs_evict_inode(struct inode *inode)
ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
truncate_inode_pages_final(&inode->i_data);
+
+ afs_set_cache_aux(vnode, &aux);
+ fscache_clear_inode_writeback(afs_vnode_cache(vnode), inode, &aux);
clear_inode(inode);
while (!list_empty(&vnode->wb_keys)) {
@@ -785,16 +812,8 @@ void afs_evict_inode(struct inode *inode)
afs_put_wb_key(wbk);
}
-#ifdef CONFIG_AFS_FSCACHE
- {
- struct afs_vnode_cache_aux aux;
-
- aux.data_version = vnode->status.data_version;
- fscache_relinquish_cookie(vnode->cache, &aux,
- test_bit(AFS_VNODE_DELETED, &vnode->flags));
- vnode->cache = NULL;
- }
-#endif
+ fscache_relinquish_cookie(afs_vnode_cache(vnode),
+ test_bit(AFS_VNODE_DELETED, &vnode->flags));
afs_prune_wb_keys(vnode);
afs_put_permits(rcu_access_pointer(vnode->permit_cache));
@@ -808,7 +827,7 @@ void afs_evict_inode(struct inode *inode)
static void afs_setattr_success(struct afs_operation *op)
{
struct afs_vnode_param *vp = &op->file[0];
- struct inode *inode = &vp->vnode->vfs_inode;
+ struct inode *inode = &vp->vnode->netfs.inode;
loff_t old_i_size = i_size_read(inode);
op->setattr.old_i_size = old_i_size;
@@ -825,7 +844,7 @@ static void afs_setattr_success(struct afs_operation *op)
static void afs_setattr_edit_file(struct afs_operation *op)
{
struct afs_vnode_param *vp = &op->file[0];
- struct inode *inode = &vp->vnode->vfs_inode;
+ struct inode *inode = &vp->vnode->netfs.inode;
if (op->setattr.attr->ia_valid & ATTR_SIZE) {
loff_t size = op->setattr.attr->ia_size;
@@ -833,6 +852,9 @@ static void afs_setattr_edit_file(struct afs_operation *op)
if (size < i_size)
truncate_pagecache(inode, size);
+ if (size != i_size)
+ fscache_resize_cookie(afs_vnode_cache(vp->vnode),
+ vp->scb.status.size);
}
}
@@ -849,40 +871,67 @@ static const struct afs_operation_ops afs_setattr_operation = {
int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
struct iattr *attr)
{
+ const unsigned int supported =
+ ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID |
+ ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET | ATTR_TOUCH;
struct afs_operation *op;
struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
+ struct inode *inode = &vnode->netfs.inode;
+ loff_t i_size;
int ret;
_enter("{%llx:%llu},{n=%pd},%x",
vnode->fid.vid, vnode->fid.vnode, dentry,
attr->ia_valid);
- if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID |
- ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET |
- ATTR_TOUCH))) {
+ if (!(attr->ia_valid & supported)) {
_leave(" = 0 [unsupported]");
return 0;
}
+ i_size = i_size_read(inode);
if (attr->ia_valid & ATTR_SIZE) {
- if (!S_ISREG(vnode->vfs_inode.i_mode))
+ if (!S_ISREG(inode->i_mode))
return -EISDIR;
- ret = inode_newsize_ok(&vnode->vfs_inode, attr->ia_size);
+ ret = inode_newsize_ok(inode, attr->ia_size);
if (ret)
return ret;
- if (attr->ia_size == i_size_read(&vnode->vfs_inode))
+ if (attr->ia_size == i_size)
attr->ia_valid &= ~ATTR_SIZE;
}
- /* flush any dirty data outstanding on a regular file */
- if (S_ISREG(vnode->vfs_inode.i_mode))
- filemap_write_and_wait(vnode->vfs_inode.i_mapping);
+ fscache_use_cookie(afs_vnode_cache(vnode), true);
/* Prevent any new writebacks from starting whilst we do this. */
down_write(&vnode->validate_lock);
+ if ((attr->ia_valid & ATTR_SIZE) && S_ISREG(inode->i_mode)) {
+ loff_t size = attr->ia_size;
+
+ /* Wait for any outstanding writes to the server to complete */
+ loff_t from = min(size, i_size);
+ loff_t to = max(size, i_size);
+ ret = filemap_fdatawait_range(inode->i_mapping, from, to);
+ if (ret < 0)
+ goto out_unlock;
+
+ /* Don't talk to the server if we're just shortening in-memory
+ * writes that haven't gone to the server yet.
+ */
+ if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) &&
+ attr->ia_size < i_size &&
+ attr->ia_size > vnode->status.size) {
+ truncate_pagecache(inode, attr->ia_size);
+ fscache_resize_cookie(afs_vnode_cache(vnode),
+ attr->ia_size);
+ i_size_write(inode, attr->ia_size);
+ ret = 0;
+ goto out_unlock;
+ }
+ }
+
op = afs_alloc_operation(((attr->ia_valid & ATTR_FILE) ?
afs_file_key(attr->ia_file) : NULL),
vnode->volume);
@@ -907,6 +956,7 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
out_unlock:
up_write(&vnode->validate_lock);
+ fscache_unuse_cookie(afs_vnode_cache(vnode), NULL, NULL);
_leave(" = %d", ret);
return ret;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index aa4c0d6c9780..a6f25d9e75b5 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -14,7 +14,6 @@
#include <linux/key.h>
#include <linux/workqueue.h>
#include <linux/sched.h>
-#define FSCACHE_USE_NEW_IO_API
#include <linux/fscache.h>
#include <linux/backing-dev.h>
#include <linux/uuid.h>
@@ -208,7 +207,7 @@ struct afs_read {
loff_t file_size; /* File size returned by server */
struct key *key; /* The key to use to reissue the read */
struct afs_vnode *vnode; /* The file being read into. */
- struct netfs_read_subrequest *subreq; /* Fscache helper read request this belongs to */
+ struct netfs_io_subrequest *subreq; /* Fscache helper read request this belongs to */
afs_dataversion_t data_version; /* Version number returned by server */
refcount_t usage;
unsigned int call_debug_id;
@@ -312,7 +311,7 @@ struct afs_net {
atomic_t n_lookup; /* Number of lookups done */
atomic_t n_reval; /* Number of dentries needing revalidation */
atomic_t n_inval; /* Number of invalidations by the server */
- atomic_t n_relpg; /* Number of invalidations by releasepage */
+ atomic_t n_relpg; /* Number of invalidations by release_folio */
atomic_t n_read_dir; /* Number of directory pages read */
atomic_t n_dir_cr; /* Number of directory entry creation edits */
atomic_t n_dir_rm; /* Number of directory entry removal edits */
@@ -364,9 +363,6 @@ struct afs_cell {
struct key *anonymous_key; /* anonymous user key for this cell */
struct work_struct manager; /* Manager for init/deinit/dns */
struct hlist_node proc_link; /* /proc cell list link */
-#ifdef CONFIG_AFS_FSCACHE
- struct fscache_cookie *cache; /* caching cookie */
-#endif
time64_t dns_expiry; /* Time AFSDB/SRV record expires */
time64_t last_inactive; /* Time of last drop of usage count */
atomic_t ref; /* Struct refcount */
@@ -590,7 +586,7 @@ struct afs_volume {
#define AFS_VOLUME_BUSY 5 /* - T if volume busy notice given */
#define AFS_VOLUME_MAYBE_NO_IBULK 6 /* - T if some servers don't have InlineBulkStatus */
#ifdef CONFIG_AFS_FSCACHE
- struct fscache_cookie *cache; /* caching cookie */
+ struct fscache_volume *cache; /* Caching cookie */
#endif
struct afs_server_list __rcu *servers; /* List of servers on which volume resides */
rwlock_t servers_lock; /* Lock for ->servers */
@@ -623,15 +619,11 @@ enum afs_lock_state {
* leak from one inode to another.
*/
struct afs_vnode {
- struct inode vfs_inode; /* the VFS's inode record */
-
+ struct netfs_inode netfs; /* Netfslib context and vfs inode */
struct afs_volume *volume; /* volume on which vnode resides */
struct afs_fid fid; /* the file identifier for this inode */
struct afs_file_status status; /* AFS status info for this file */
afs_dataversion_t invalid_before; /* Child dentries are invalid before this */
-#ifdef CONFIG_AFS_FSCACHE
- struct fscache_cookie *cache; /* caching cookie */
-#endif
struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */
struct mutex io_lock; /* Lock for serialising I/O on this mutex */
struct rw_semaphore validate_lock; /* lock for validating this vnode */
@@ -678,12 +670,20 @@ struct afs_vnode {
static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode)
{
#ifdef CONFIG_AFS_FSCACHE
- return vnode->cache;
+ return netfs_i_cookie(&vnode->netfs);
#else
return NULL;
#endif
}
+static inline void afs_vnode_set_cache(struct afs_vnode *vnode,
+ struct fscache_cookie *cookie)
+{
+#ifdef CONFIG_AFS_FSCACHE
+ vnode->netfs.cache = cookie;
+#endif
+}
+
/*
* cached security record for one user's attempt to access a vnode
*/
@@ -872,9 +872,24 @@ struct afs_operation {
* Cache auxiliary data.
*/
struct afs_vnode_cache_aux {
- u64 data_version;
+ __be64 data_version;
} __packed;
+static inline void afs_set_cache_aux(struct afs_vnode *vnode,
+ struct afs_vnode_cache_aux *aux)
+{
+ aux->data_version = cpu_to_be64(vnode->status.data_version);
+}
+
+static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int flags)
+{
+ struct afs_vnode_cache_aux aux;
+
+ afs_set_cache_aux(vnode, &aux);
+ fscache_invalidate(afs_vnode_cache(vnode), &aux,
+ i_size_read(&vnode->netfs.inode), flags);
+}
+
/*
* We use folio->private to hold the amount of the folio that we've written to,
* splitting the field into two parts. However, we need to represent a range
@@ -962,13 +977,6 @@ extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16);
*/
#ifdef CONFIG_AFS_FSCACHE
extern struct fscache_netfs afs_cache_netfs;
-extern struct fscache_cookie_def afs_cell_cache_index_def;
-extern struct fscache_cookie_def afs_volume_cache_index_def;
-extern struct fscache_cookie_def afs_vnode_cache_index_def;
-#else
-#define afs_cell_cache_index_def (*(struct fscache_cookie_def *) NULL)
-#define afs_volume_cache_index_def (*(struct fscache_cookie_def *) NULL)
-#define afs_vnode_cache_index_def (*(struct fscache_cookie_def *) NULL)
#endif
/*
@@ -1059,7 +1067,7 @@ extern const struct address_space_operations afs_file_aops;
extern const struct address_space_operations afs_symlink_aops;
extern const struct inode_operations afs_file_inode_operations;
extern const struct file_operations afs_file_operations;
-extern const struct netfs_read_request_ops afs_req_ops;
+extern const struct netfs_request_ops afs_req_ops;
extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *);
extern void afs_put_wb_key(struct afs_wb_key *);
@@ -1068,6 +1076,7 @@ extern int afs_release(struct inode *, struct file *);
extern int afs_fetch_data(struct afs_vnode *, struct afs_read *);
extern struct afs_read *afs_alloc_read(gfp_t);
extern void afs_put_read(struct afs_read *);
+extern int afs_write_inode(struct inode *, struct writeback_control *);
static inline struct afs_read *afs_get_read(struct afs_read *req)
{
@@ -1203,7 +1212,7 @@ static inline struct afs_net *afs_i2net(struct inode *inode)
static inline struct afs_net *afs_v2net(struct afs_vnode *vnode)
{
- return afs_i2net(&vnode->vfs_inode);
+ return afs_i2net(&vnode->netfs.inode);
}
static inline struct afs_net *afs_sock2net(struct sock *sk)
@@ -1506,7 +1515,7 @@ extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *,
* volume.c
*/
extern struct afs_volume *afs_create_volume(struct afs_fs_context *);
-extern void afs_activate_volume(struct afs_volume *);
+extern int afs_activate_volume(struct afs_volume *);
extern void afs_deactivate_volume(struct afs_volume *);
extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace);
extern void afs_put_volume(struct afs_net *, struct afs_volume *, enum afs_volume_trace);
@@ -1515,9 +1524,13 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *);
/*
* write.c
*/
-extern int afs_set_page_dirty(struct page *);
+#ifdef CONFIG_AFS_FSCACHE
+bool afs_dirty_folio(struct address_space *, struct folio *);
+#else
+#define afs_dirty_folio filemap_dirty_folio
+#endif
extern int afs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata);
extern int afs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
@@ -1528,7 +1541,7 @@ extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *);
extern int afs_fsync(struct file *, loff_t, loff_t, int);
extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf);
extern void afs_prune_wb_keys(struct afs_vnode *);
-extern int afs_launder_page(struct page *);
+int afs_launder_folio(struct folio *);
/*
* xattr.c
@@ -1575,12 +1588,12 @@ extern void yfs_fs_store_opaque_acl2(struct afs_operation *);
*/
static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
{
- return container_of(inode, struct afs_vnode, vfs_inode);
+ return container_of(inode, struct afs_vnode, netfs.inode);
}
static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
{
- return &vnode->vfs_inode;
+ return &vnode->netfs.inode;
}
/*
@@ -1603,8 +1616,8 @@ static inline void afs_update_dentry_version(struct afs_operation *op,
*/
static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size)
{
- i_size_write(&vnode->vfs_inode, size);
- vnode->vfs_inode.i_blocks = ((size + 1023) >> 10) << 1;
+ i_size_write(&vnode->netfs.inode, size);
+ vnode->netfs.inode.i_blocks = ((size + 1023) >> 10) << 1;
}
/*
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 179004b15566..eae288c8d40a 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -186,13 +186,6 @@ static int __init afs_init(void)
if (!afs_lock_manager)
goto error_lockmgr;
-#ifdef CONFIG_AFS_FSCACHE
- /* we want to be able to cache */
- ret = fscache_register_netfs(&afs_cache_netfs);
- if (ret < 0)
- goto error_cache;
-#endif
-
ret = register_pernet_device(&afs_net_ops);
if (ret < 0)
goto error_net;
@@ -215,10 +208,6 @@ error_proc:
error_fs:
unregister_pernet_device(&afs_net_ops);
error_net:
-#ifdef CONFIG_AFS_FSCACHE
- fscache_unregister_netfs(&afs_cache_netfs);
-error_cache:
-#endif
destroy_workqueue(afs_lock_manager);
error_lockmgr:
destroy_workqueue(afs_async_calls);
@@ -245,9 +234,6 @@ static void __exit afs_exit(void)
proc_remove(afs_proc_symlink);
afs_fs_exit();
unregister_pernet_device(&afs_net_ops);
-#ifdef CONFIG_AFS_FSCACHE
- fscache_unregister_netfs(&afs_cache_netfs);
-#endif
destroy_workqueue(afs_lock_manager);
destroy_workqueue(afs_async_calls);
destroy_workqueue(afs_wq);
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 1d1a8debe472..933e67fcdab1 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -163,8 +163,11 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
return;
case -ECONNABORTED:
+ error = afs_abort_to_error(abort_code);
+ fallthrough;
+ case -ENETRESET: /* Responded, but we seem to have changed address */
e->responded = true;
- e->error = afs_abort_to_error(abort_code);
+ e->error = error;
return;
}
}
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 065a28bfa3f1..e1b863449296 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -227,7 +227,7 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
__acquires(cell->proc_lock)
{
- struct afs_cell *cell = PDE_DATA(file_inode(m->file));
+ struct afs_cell *cell = pde_data(file_inode(m->file));
rcu_read_lock();
return seq_hlist_start_head_rcu(&cell->proc_volumes, *_pos);
@@ -236,7 +236,7 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
static void *afs_proc_cell_volumes_next(struct seq_file *m, void *v,
loff_t *_pos)
{
- struct afs_cell *cell = PDE_DATA(file_inode(m->file));
+ struct afs_cell *cell = pde_data(file_inode(m->file));
return seq_hlist_next_rcu(v, &cell->proc_volumes, _pos);
}
@@ -322,7 +322,7 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos)
{
struct afs_vl_seq_net_private *priv = m->private;
struct afs_vlserver_list *vllist;
- struct afs_cell *cell = PDE_DATA(file_inode(m->file));
+ struct afs_cell *cell = pde_data(file_inode(m->file));
loff_t pos = *_pos;
rcu_read_lock();
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 79e1a5f6701b..a840c3588ebb 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -292,6 +292,10 @@ bool afs_select_fileserver(struct afs_operation *op)
op->error = error;
goto iterate_address;
+ case -ENETRESET:
+ pr_warn("kAFS: Peer reset %s (op=%x)\n",
+ op->type ? op->type->name : "???", op->debug_id);
+ fallthrough;
case -ECONNRESET:
_debug("call reset");
op->error = error;
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 23a1a92d64bb..a5434f3e57c6 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -537,6 +537,8 @@ static void afs_deliver_to_call(struct afs_call *call)
case -ENODATA:
case -EBADMSG:
case -EMSGSIZE:
+ case -ENOMEM:
+ case -EFAULT:
abort_code = RXGEN_CC_UNMARSHAL;
if (state != AFS_CALL_CL_AWAIT_REPLY)
abort_code = RXGEN_SS_UNMARSHAL;
@@ -544,7 +546,7 @@ static void afs_deliver_to_call(struct afs_call *call)
abort_code, ret, "KUM");
goto local_abort;
default:
- abort_code = RX_USER_ABORT;
+ abort_code = RX_CALL_DEAD;
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
abort_code, ret, "KER");
goto local_abort;
@@ -836,7 +838,7 @@ void afs_send_empty_reply(struct afs_call *call)
case -ENOMEM:
_debug("oom");
rxrpc_kernel_abort_call(net->socket, call->rxcall,
- RX_USER_ABORT, -ENOMEM, "KOO");
+ RXGEN_SS_MARSHAL, -ENOMEM, "KOO");
fallthrough;
default:
_leave(" [error]");
@@ -878,7 +880,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
if (n == -ENOMEM) {
_debug("oom");
rxrpc_kernel_abort_call(net->socket, call->rxcall,
- RX_USER_ABORT, -ENOMEM, "KOO");
+ RXGEN_SS_MARSHAL, -ENOMEM, "KOO");
}
_leave(" [error]");
}
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3c7a8fc4f93f..7c6a63a30394 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -219,8 +219,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
* yet.
*/
size++;
- new = kzalloc(sizeof(struct afs_permits) +
- sizeof(struct afs_permit) * size, GFP_NOFS);
+ new = kzalloc(struct_size(new, permits, size), GFP_NOFS);
if (!new)
goto out_put;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index d110def8aa8e..95d713074dc8 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -55,6 +55,7 @@ int afs_net_id;
static const struct super_operations afs_super_ops = {
.statfs = afs_statfs,
.alloc_inode = afs_alloc_inode,
+ .write_inode = afs_write_inode,
.drop_inode = afs_drop_inode,
.destroy_inode = afs_destroy_inode,
.free_inode = afs_free_inode,
@@ -658,7 +659,7 @@ static void afs_i_init_once(void *_vnode)
struct afs_vnode *vnode = _vnode;
memset(vnode, 0, sizeof(*vnode));
- inode_init_once(&vnode->vfs_inode);
+ inode_init_once(&vnode->netfs.inode);
mutex_init(&vnode->io_lock);
init_rwsem(&vnode->validate_lock);
spin_lock_init(&vnode->wb_lock);
@@ -667,6 +668,7 @@ static void afs_i_init_once(void *_vnode)
INIT_LIST_HEAD(&vnode->pending_locks);
INIT_LIST_HEAD(&vnode->granted_locks);
INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work);
+ INIT_LIST_HEAD(&vnode->cb_mmap_link);
seqlock_init(&vnode->cb_lock);
}
@@ -677,7 +679,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
{
struct afs_vnode *vnode;
- vnode = kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL);
+ vnode = alloc_inode_sb(sb, afs_inode_cachep, GFP_KERNEL);
if (!vnode)
return NULL;
@@ -686,13 +688,11 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
/* Reset anything that shouldn't leak from one inode to the next. */
memset(&vnode->fid, 0, sizeof(vnode->fid));
memset(&vnode->status, 0, sizeof(vnode->status));
+ afs_vnode_set_cache(vnode, NULL);
vnode->volume = NULL;
vnode->lock_key = NULL;
vnode->permit_cache = NULL;
-#ifdef CONFIG_AFS_FSCACHE
- vnode->cache = NULL;
-#endif
vnode->flags = 1 << AFS_VNODE_UNSET;
vnode->lock_state = AFS_VNODE_LOCK_NONE;
@@ -700,8 +700,8 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
init_rwsem(&vnode->rmdir_lock);
INIT_WORK(&vnode->cb_work, afs_invalidate_mmap_work);
- _leave(" = %p", &vnode->vfs_inode);
- return &vnode->vfs_inode;
+ _leave(" = %p", &vnode->netfs.inode);
+ return &vnode->netfs.inode;
}
static void afs_free_inode(struct inode *inode)
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index f84194b791d3..cc665cef0abe 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -9,8 +9,7 @@
#include <linux/slab.h>
#include "internal.h"
-unsigned __read_mostly afs_volume_gc_delay = 10;
-unsigned __read_mostly afs_volume_record_life = 60 * 60;
+static unsigned __read_mostly afs_volume_record_life = 60 * 60;
/*
* Insert a volume into a cell. If there's an existing volume record, that is
@@ -268,15 +267,30 @@ void afs_put_volume(struct afs_net *net, struct afs_volume *volume,
/*
* Activate a volume.
*/
-void afs_activate_volume(struct afs_volume *volume)
+int afs_activate_volume(struct afs_volume *volume)
{
#ifdef CONFIG_AFS_FSCACHE
- volume->cache = fscache_acquire_cookie(volume->cell->cache,
- &afs_volume_cache_index_def,
- &volume->vid, sizeof(volume->vid),
- NULL, 0,
- volume, 0, true);
+ struct fscache_volume *vcookie;
+ char *name;
+
+ name = kasprintf(GFP_KERNEL, "afs,%s,%llx",
+ volume->cell->name, volume->vid);
+ if (!name)
+ return -ENOMEM;
+
+ vcookie = fscache_acquire_volume(name, NULL, NULL, 0);
+ if (IS_ERR(vcookie)) {
+ if (vcookie != ERR_PTR(-EBUSY)) {
+ kfree(name);
+ return PTR_ERR(vcookie);
+ }
+ pr_err("AFS: Cache volume key already in use (%s)\n", name);
+ vcookie = NULL;
+ }
+ volume->cache = vcookie;
+ kfree(name);
#endif
+ return 0;
}
/*
@@ -287,7 +301,7 @@ void afs_deactivate_volume(struct afs_volume *volume)
_enter("%s", volume->name);
#ifdef CONFIG_AFS_FSCACHE
- fscache_relinquish_cookie(volume->cache, NULL,
+ fscache_relinquish_volume(volume->cache, NULL,
test_bit(AFS_VOLUME_DELETED, &volume->flags));
volume->cache = NULL;
#endif
diff --git a/fs/afs/write.c b/fs/afs/write.c
index ca4909baf5e6..2c885b22de34 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -12,23 +12,37 @@
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/netfs.h>
-#include <linux/fscache.h>
#include "internal.h"
+static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len,
+ loff_t i_size, bool caching);
+
+#ifdef CONFIG_AFS_FSCACHE
/*
- * mark a page as having been made dirty and thus needing writeback
+ * Mark a page as having been made dirty and thus needing writeback. We also
+ * need to pin the cache object to write back to.
*/
-int afs_set_page_dirty(struct page *page)
+bool afs_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+ return fscache_dirty_folio(mapping, folio,
+ afs_vnode_cache(AFS_FS_I(mapping->host)));
+}
+static void afs_folio_start_fscache(bool caching, struct folio *folio)
+{
+ if (caching)
+ folio_start_fscache(folio);
+}
+#else
+static void afs_folio_start_fscache(bool caching, struct folio *folio)
{
- _enter("");
- return __set_page_dirty_nobuffers(page);
}
+#endif
/*
* prepare to perform part of a write to a page
*/
int afs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **_page, void **fsdata)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
@@ -46,8 +60,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
* file. We need to do this before we get a lock on the page in case
* there's more than one writer competing for the same cache block.
*/
- ret = netfs_write_begin(file, mapping, pos, len, flags, &folio, fsdata,
- &afs_req_ops, NULL);
+ ret = netfs_write_begin(&vnode->netfs, file, mapping, pos, len, &folio, fsdata);
if (ret < 0)
return ret;
@@ -114,7 +127,7 @@ int afs_write_end(struct file *file, struct address_space *mapping,
unsigned long priv;
unsigned int f, from = offset_in_folio(folio, pos);
unsigned int t, to = from + copied;
- loff_t i_size, maybe_i_size;
+ loff_t i_size, write_end_pos;
_enter("{%llx:%llu},{%lx}",
vnode->fid.vid, vnode->fid.vnode, folio_index(folio));
@@ -131,15 +144,16 @@ int afs_write_end(struct file *file, struct address_space *mapping,
if (copied == 0)
goto out;
- maybe_i_size = pos + copied;
+ write_end_pos = pos + copied;
- i_size = i_size_read(&vnode->vfs_inode);
- if (maybe_i_size > i_size) {
+ i_size = i_size_read(&vnode->netfs.inode);
+ if (write_end_pos > i_size) {
write_seqlock(&vnode->cb_lock);
- i_size = i_size_read(&vnode->vfs_inode);
- if (maybe_i_size > i_size)
- afs_set_i_size(vnode, maybe_i_size);
+ i_size = i_size_read(&vnode->netfs.inode);
+ if (write_end_pos > i_size)
+ afs_set_i_size(vnode, write_end_pos);
write_sequnlock(&vnode->cb_lock);
+ fscache_update_cookie(afs_vnode_cache(vnode), NULL, &write_end_pos);
}
if (folio_test_private(folio)) {
@@ -243,7 +257,7 @@ static void afs_redirty_pages(struct writeback_control *wbc,
*/
static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len)
{
- struct address_space *mapping = vnode->vfs_inode.i_mapping;
+ struct address_space *mapping = vnode->netfs.inode.i_mapping;
struct folio *folio;
pgoff_t end;
@@ -342,7 +356,7 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t
{
struct afs_operation *op;
struct afs_wb_key *wbk = NULL;
- loff_t size = iov_iter_count(iter), i_size;
+ loff_t size = iov_iter_count(iter);
int ret = -ENOKEY;
_enter("%s{%llx:%llu.%u},%llx,%llx",
@@ -364,17 +378,15 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t
return -ENOMEM;
}
- i_size = i_size_read(&vnode->vfs_inode);
-
afs_op_set_vnode(op, 0, vnode);
op->file[0].dv_delta = 1;
op->file[0].modification = true;
op->store.write_iter = iter;
op->store.pos = pos;
op->store.size = size;
- op->store.i_size = max(pos + size, i_size);
+ op->store.i_size = max(pos + size, vnode->netfs.remote_i_size);
op->store.laundering = laundering;
- op->mtime = vnode->vfs_inode.i_mtime;
+ op->mtime = vnode->netfs.inode.i_mtime;
op->flags |= AFS_OPERATION_UNINTR;
op->ops = &afs_store_data_operation;
@@ -418,6 +430,7 @@ static void afs_extend_writeback(struct address_space *mapping,
loff_t start,
loff_t max_len,
bool new_content,
+ bool caching,
unsigned int *_len)
{
struct pagevec pvec;
@@ -464,7 +477,9 @@ static void afs_extend_writeback(struct address_space *mapping,
folio_put(folio);
break;
}
- if (!folio_test_dirty(folio) || folio_test_writeback(folio)) {
+ if (!folio_test_dirty(folio) ||
+ folio_test_writeback(folio) ||
+ folio_test_fscache(folio)) {
folio_unlock(folio);
folio_put(folio);
break;
@@ -512,6 +527,7 @@ static void afs_extend_writeback(struct address_space *mapping,
BUG();
if (folio_start_writeback(folio))
BUG();
+ afs_folio_start_fscache(caching, folio);
*_count -= folio_nr_pages(folio);
folio_unlock(folio);
@@ -537,8 +553,9 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
struct iov_iter iter;
unsigned long priv;
unsigned int offset, to, len, max_len;
- loff_t i_size = i_size_read(&vnode->vfs_inode);
+ loff_t i_size = i_size_read(&vnode->netfs.inode);
bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
+ bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode));
long count = wbc->nr_to_write;
int ret;
@@ -546,6 +563,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
if (folio_start_writeback(folio))
BUG();
+ afs_folio_start_fscache(caching, folio);
count -= folio_nr_pages(folio);
@@ -572,7 +590,8 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
if (len < max_len &&
(to == folio_size(folio) || new_content))
afs_extend_writeback(mapping, vnode, &count,
- start, max_len, new_content, &len);
+ start, max_len, new_content,
+ caching, &len);
len = min_t(loff_t, len, max_len);
}
@@ -585,12 +604,18 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
if (start < i_size) {
_debug("write back %x @%llx [%llx]", len, start, i_size);
+ /* Speculatively write to the cache. We have to fix this up
+ * later if the store fails.
+ */
+ afs_write_to_cache(vnode, start, len, i_size, caching);
+
iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len);
ret = afs_store_data(vnode, &iter, start, false);
} else {
_debug("write discard %x @%llx [%llx]", len, start, i_size);
/* The dirty region was entirely beyond the EOF. */
+ fscache_clear_page_bits(mapping, start, len, caching);
afs_pages_written_back(vnode, start, len);
ret = 0;
}
@@ -610,6 +635,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
case -EKEYEXPIRED:
case -EKEYREJECTED:
case -EKEYREVOKED:
+ case -ENETRESET:
afs_redirty_pages(wbc, mapping, start, len);
mapping_set_error(mapping, ret);
break;
@@ -649,6 +675,10 @@ int afs_writepage(struct page *subpage, struct writeback_control *wbc)
_enter("{%lx},", folio_index(folio));
+#ifdef CONFIG_AFS_FSCACHE
+ folio_wait_fscache(folio);
+#endif
+
start = folio_index(folio) * PAGE_SIZE;
ret = afs_write_back_from_locked_folio(folio_mapping(folio), wbc,
folio, start, LLONG_MAX - start);
@@ -671,7 +701,7 @@ static int afs_writepages_region(struct address_space *mapping,
struct folio *folio;
struct page *head_page;
ssize_t ret;
- int n;
+ int n, skips = 0;
_enter("%llx,%llx,", start, end);
@@ -714,11 +744,23 @@ static int afs_writepages_region(struct address_space *mapping,
continue;
}
- if (folio_test_writeback(folio)) {
+ if (folio_test_writeback(folio) ||
+ folio_test_fscache(folio)) {
folio_unlock(folio);
- if (wbc->sync_mode != WB_SYNC_NONE)
+ if (wbc->sync_mode != WB_SYNC_NONE) {
folio_wait_writeback(folio);
+#ifdef CONFIG_AFS_FSCACHE
+ folio_wait_fscache(folio);
+#endif
+ } else {
+ start += folio_size(folio);
+ }
folio_put(folio);
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ if (skips >= 5 || need_resched())
+ break;
+ skips++;
+ }
continue;
}
@@ -802,7 +844,7 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
_enter("{%llx:%llu},{%zu},",
vnode->fid.vid, vnode->fid.vnode, count);
- if (IS_SWAPFILE(&vnode->vfs_inode)) {
+ if (IS_SWAPFILE(&vnode->netfs.inode)) {
printk(KERN_INFO
"AFS: Attempt to write to active swap file!\n");
return -EBUSY;
@@ -915,8 +957,8 @@ void afs_prune_wb_keys(struct afs_vnode *vnode)
/* Discard unused keys */
spin_lock(&vnode->wb_lock);
- if (!mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_WRITEBACK) &&
- !mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_DIRTY)) {
+ if (!mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_WRITEBACK) &&
+ !mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_DIRTY)) {
list_for_each_entry_safe(wbk, tmp, &vnode->wb_keys, vnode_link) {
if (refcount_read(&wbk->usage) == 1)
list_move(&wbk->vnode_link, &graveyard);
@@ -935,9 +977,8 @@ void afs_prune_wb_keys(struct afs_vnode *vnode)
/*
* Clean up a page during invalidation.
*/
-int afs_launder_page(struct page *subpage)
+int afs_launder_folio(struct folio *folio)
{
- struct folio *folio = page_folio(subpage);
struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
struct iov_iter iter;
struct bio_vec bv[1];
@@ -945,7 +986,7 @@ int afs_launder_page(struct page *subpage)
unsigned int f, t;
int ret = 0;
- _enter("{%lx}", folio_index(folio));
+ _enter("{%lx}", folio->index);
priv = (unsigned long)folio_get_private(folio);
if (folio_clear_dirty_for_io(folio)) {
@@ -970,3 +1011,28 @@ int afs_launder_page(struct page *subpage)
folio_wait_fscache(folio);
return ret;
}
+
+/*
+ * Deal with the completion of writing the data to the cache.
+ */
+static void afs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct afs_vnode *vnode = priv;
+
+ if (IS_ERR_VALUE(transferred_or_error) &&
+ transferred_or_error != -ENOBUFS)
+ afs_invalidate_cache(vnode, 0);
+}
+
+/*
+ * Save the write to the cache also.
+ */
+static void afs_write_to_cache(struct afs_vnode *vnode,
+ loff_t start, size_t len, loff_t i_size,
+ bool caching)
+{
+ fscache_write_to_cache(afs_vnode_cache(vnode),
+ vnode->netfs.inode.i_mapping, start, len, i_size,
+ afs_write_to_cache_done, vnode, caching);
+}
diff --git a/fs/aio.c b/fs/aio.c
index f6f1cbffef9e..3c249b938632 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -220,9 +220,35 @@ struct aio_kiocb {
/*------ sysctl variables----*/
static DEFINE_SPINLOCK(aio_nr_lock);
-unsigned long aio_nr; /* current system wide number of aio requests */
-unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
+static unsigned long aio_nr; /* current system wide number of aio requests */
+static unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
/*----end sysctl variables---*/
+#ifdef CONFIG_SYSCTL
+static struct ctl_table aio_sysctls[] = {
+ {
+ .procname = "aio-nr",
+ .data = &aio_nr,
+ .maxlen = sizeof(aio_nr),
+ .mode = 0444,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "aio-max-nr",
+ .data = &aio_max_nr,
+ .maxlen = sizeof(aio_max_nr),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {}
+};
+
+static void __init aio_sysctl_init(void)
+{
+ register_sysctl_init("fs", aio_sysctls);
+}
+#else
+#define aio_sysctl_init() do { } while (0)
+#endif
static struct kmem_cache *kiocb_cachep;
static struct kmem_cache *kioctx_cachep;
@@ -275,6 +301,7 @@ static int __init aio_setup(void)
kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+ aio_sysctl_init();
return 0;
}
__initcall(aio_setup);
@@ -451,7 +478,7 @@ out:
#endif
static const struct address_space_operations aio_ctx_aops = {
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
#if IS_ENABLED(CONFIG_MIGRATION)
.migratepage = aio_migratepage,
#endif
@@ -1451,7 +1478,6 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
req->ki_flags = iocb_flags(req->ki_filp);
if (iocb->aio_flags & IOCB_FLAG_RESFD)
req->ki_flags |= IOCB_EVENTFD;
- req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp));
if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
/*
* If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then
@@ -1526,7 +1552,6 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
file = req->ki_filp;
if (unlikely(!(file->f_mode & FMODE_READ)))
return -EBADF;
- ret = -EINVAL;
if (unlikely(!file->f_op->read_iter))
return -EINVAL;
diff --git a/fs/attr.c b/fs/attr.c
index 66899b6e9bd8..dbe996b0dedf 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -61,9 +61,15 @@ static bool chgrp_ok(struct user_namespace *mnt_userns,
const struct inode *inode, kgid_t gid)
{
kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
- if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)) &&
- (in_group_p(gid) || gid_eq(gid, inode->i_gid)))
- return true;
+ if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) {
+ kgid_t mapped_gid;
+
+ if (gid_eq(gid, inode->i_gid))
+ return true;
+ mapped_gid = mapped_kgid_fs(mnt_userns, i_user_ns(inode), gid);
+ if (in_group_p(mapped_gid))
+ return true;
+ }
if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
return true;
if (gid_eq(kgid, INVALID_GID) &&
@@ -123,12 +129,20 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry,
/* Make sure a caller can chmod. */
if (ia_valid & ATTR_MODE) {
+ kgid_t mapped_gid;
+
if (!inode_owner_or_capable(mnt_userns, inode))
return -EPERM;
+
+ if (ia_valid & ATTR_GID)
+ mapped_gid = mapped_kgid_fs(mnt_userns,
+ i_user_ns(inode), attr->ia_gid);
+ else
+ mapped_gid = i_gid_into_mnt(mnt_userns, inode);
+
/* Also check the setgid bit! */
- if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
- i_gid_into_mnt(mnt_userns, inode)) &&
- !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
+ if (!in_group_p(mapped_gid) &&
+ !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
attr->ia_mode &= ~S_ISGID;
}
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index c1ba13d19024..be383fa46b12 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -40,7 +40,7 @@ MODULE_LICENSE("GPL");
static int befs_readdir(struct file *, struct dir_context *);
static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int);
-static int befs_readpage(struct file *file, struct page *page);
+static int befs_read_folio(struct file *file, struct folio *folio);
static sector_t befs_bmap(struct address_space *mapping, sector_t block);
static struct dentry *befs_lookup(struct inode *, struct dentry *,
unsigned int);
@@ -48,7 +48,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long);
static struct inode *befs_alloc_inode(struct super_block *sb);
static void befs_free_inode(struct inode *inode);
static void befs_destroy_inodecache(void);
-static int befs_symlink_readpage(struct file *, struct page *);
+static int befs_symlink_read_folio(struct file *, struct folio *);
static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
char **out, int *out_len);
static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
@@ -87,12 +87,12 @@ static const struct inode_operations befs_dir_inode_operations = {
};
static const struct address_space_operations befs_aops = {
- .readpage = befs_readpage,
+ .read_folio = befs_read_folio,
.bmap = befs_bmap,
};
static const struct address_space_operations befs_symlink_aops = {
- .readpage = befs_symlink_readpage,
+ .read_folio = befs_symlink_read_folio,
};
static const struct export_operations befs_export_operations = {
@@ -102,16 +102,16 @@ static const struct export_operations befs_export_operations = {
};
/*
- * Called by generic_file_read() to read a page of data
+ * Called by generic_file_read() to read a folio of data
*
* In turn, simply calls a generic block read function and
* passes it the address of befs_get_block, for mapping file
* positions to disk blocks.
*/
static int
-befs_readpage(struct file *file, struct page *page)
+befs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, befs_get_block);
+ return block_read_full_folio(folio, befs_get_block);
}
static sector_t
@@ -277,7 +277,7 @@ befs_alloc_inode(struct super_block *sb)
{
struct befs_inode_info *bi;
- bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL);
+ bi = alloc_inode_sb(sb, befs_inode_cachep, GFP_KERNEL);
if (!bi)
return NULL;
return &bi->vfs_inode;
@@ -468,8 +468,9 @@ befs_destroy_inodecache(void)
* The data stream become link name. Unless the LONG_SYMLINK
* flag is set.
*/
-static int befs_symlink_readpage(struct file *unused, struct page *page)
+static int befs_symlink_read_folio(struct file *unused, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
struct super_block *sb = inode->i_sb;
struct befs_inode_info *befs_ino = BEFS_I(inode);
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 7f8544abf636..57ae5ee6deec 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -155,9 +155,9 @@ static int bfs_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page, bfs_get_block, wbc);
}
-static int bfs_readpage(struct file *file, struct page *page)
+static int bfs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, bfs_get_block);
+ return block_read_full_folio(folio, bfs_get_block);
}
static void bfs_write_failed(struct address_space *mapping, loff_t to)
@@ -169,13 +169,12 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to)
}
static int bfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, flags, pagep,
- bfs_get_block);
+ ret = block_write_begin(mapping, pos, len, pagep, bfs_get_block);
if (unlikely(ret))
bfs_write_failed(mapping, pos + len);
@@ -188,8 +187,9 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations bfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
- .readpage = bfs_readpage,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
+ .read_folio = bfs_read_folio,
.writepage = bfs_writepage,
.write_begin = bfs_write_begin,
.write_end = generic_write_end,
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index fd691e4815c5..1926bec2c850 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -239,7 +239,7 @@ static struct kmem_cache *bfs_inode_cachep;
static struct inode *bfs_alloc_inode(struct super_block *sb)
{
struct bfs_inode_info *bi;
- bi = kmem_cache_alloc(bfs_inode_cachep, GFP_KERNEL);
+ bi = alloc_inode_sb(sb, bfs_inode_cachep, GFP_KERNEL);
if (!bi)
return NULL;
return &bi->vfs_inode;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f8c7f26f1fbb..63c7ebb0da89 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -93,7 +93,7 @@ static int elf_core_dump(struct coredump_params *cprm);
#define ELF_CORE_EFLAGS 0
#endif
-#define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1))
+#define ELF_PAGESTART(_v) ((_v) & ~(int)(ELF_MIN_ALIGN-1))
#define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1))
#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
@@ -101,8 +101,10 @@ static struct linux_binfmt elf_format = {
.module = THIS_MODULE,
.load_binary = load_elf_binary,
.load_shlib = load_elf_library,
+#ifdef CONFIG_COREDUMP
.core_dump = elf_core_dump,
.min_coredump = ELF_EXEC_PAGESIZE,
+#endif
};
#define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE))
@@ -170,8 +172,8 @@ static int padzero(unsigned long elf_bss)
static int
create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
- unsigned long load_addr, unsigned long interp_load_addr,
- unsigned long e_entry)
+ unsigned long interp_load_addr,
+ unsigned long e_entry, unsigned long phdr_addr)
{
struct mm_struct *mm = current->mm;
unsigned long p = bprm->p;
@@ -257,7 +259,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
- NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff);
+ NEW_AUX_ENT(AT_PHDR, phdr_addr);
NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
NEW_AUX_ENT(AT_PHNUM, exec->e_phnum);
NEW_AUX_ENT(AT_BASE, interp_load_addr);
@@ -399,22 +401,21 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
return(map_addr);
}
-static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr)
+static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr)
{
- int i, first_idx = -1, last_idx = -1;
+ elf_addr_t min_addr = -1;
+ elf_addr_t max_addr = 0;
+ bool pt_load = false;
+ int i;
for (i = 0; i < nr; i++) {
- if (cmds[i].p_type == PT_LOAD) {
- last_idx = i;
- if (first_idx == -1)
- first_idx = i;
+ if (phdr[i].p_type == PT_LOAD) {
+ min_addr = min(min_addr, ELF_PAGESTART(phdr[i].p_vaddr));
+ max_addr = max(max_addr, phdr[i].p_vaddr + phdr[i].p_memsz);
+ pt_load = true;
}
}
- if (first_idx == -1)
- return 0;
-
- return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz -
- ELF_PAGESTART(cmds[first_idx].p_vaddr);
+ return pt_load ? (max_addr - min_addr) : 0;
}
static int elf_read(struct file *file, void *buf, size_t len, loff_t pos)
@@ -823,8 +824,8 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr,
static int load_elf_binary(struct linux_binprm *bprm)
{
struct file *interpreter = NULL; /* to shut gcc up */
- unsigned long load_addr = 0, load_bias = 0;
- int load_addr_set = 0;
+ unsigned long load_bias = 0, phdr_addr = 0;
+ int first_pt_load = 1;
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
struct elf_phdr *elf_property_phdata = NULL;
@@ -1074,12 +1075,12 @@ out_free_interp:
vaddr = elf_ppnt->p_vaddr;
/*
- * The first time through the loop, load_addr_set is false:
+ * The first time through the loop, first_pt_load is true:
* layout will be calculated. Once set, use MAP_FIXED since
* we know we've already safely mapped the entire region with
* MAP_FIXED_NOREPLACE in the once-per-binary logic following.
*/
- if (load_addr_set) {
+ if (!first_pt_load) {
elf_flags |= MAP_FIXED;
} else if (elf_ex->e_type == ET_EXEC) {
/*
@@ -1135,14 +1136,25 @@ out_free_interp:
* is then page aligned.
*/
load_bias = ELF_PAGESTART(load_bias - vaddr);
- }
- /*
- * Calculate the entire size of the ELF mapping (total_size).
- * (Note that load_addr_set is set to true later once the
- * initial mapping is performed.)
- */
- if (!load_addr_set) {
+ /*
+ * Calculate the entire size of the ELF mapping
+ * (total_size), used for the initial mapping,
+ * due to load_addr_set which is set to true later
+ * once the initial mapping is performed.
+ *
+ * Note that this is only sensible when the LOAD
+ * segments are contiguous (or overlapping). If
+ * used for LOADs that are far apart, this would
+ * cause the holes between LOADs to be mapped,
+ * running the risk of having the mapping fail,
+ * as it would be larger than the ELF file itself.
+ *
+ * As a result, only ET_DYN does this, since
+ * some ET_EXEC (e.g. ia64) may have large virtual
+ * memory holes between LOADs.
+ *
+ */
total_size = total_mapping_size(elf_phdata,
elf_ex->e_phnum);
if (!total_size) {
@@ -1159,16 +1171,25 @@ out_free_interp:
goto out_free_dentry;
}
- if (!load_addr_set) {
- load_addr_set = 1;
- load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset);
+ if (first_pt_load) {
+ first_pt_load = 0;
if (elf_ex->e_type == ET_DYN) {
load_bias += error -
ELF_PAGESTART(load_bias + vaddr);
- load_addr += load_bias;
reloc_func_desc = load_bias;
}
}
+
+ /*
+ * Figure out which segment in the file contains the Program
+ * Header table, and map to the associated memory address.
+ */
+ if (elf_ppnt->p_offset <= elf_ex->e_phoff &&
+ elf_ex->e_phoff < elf_ppnt->p_offset + elf_ppnt->p_filesz) {
+ phdr_addr = elf_ex->e_phoff - elf_ppnt->p_offset +
+ elf_ppnt->p_vaddr;
+ }
+
k = elf_ppnt->p_vaddr;
if ((elf_ppnt->p_flags & PF_X) && k < start_code)
start_code = k;
@@ -1204,6 +1225,7 @@ out_free_interp:
}
e_entry = elf_ex->e_entry + load_bias;
+ phdr_addr += load_bias;
elf_bss += load_bias;
elf_brk += load_bias;
start_code += load_bias;
@@ -1267,8 +1289,8 @@ out_free_interp:
goto out;
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
- retval = create_elf_tables(bprm, elf_ex,
- load_addr, interp_load_addr, e_entry);
+ retval = create_elf_tables(bprm, elf_ex, interp_load_addr,
+ e_entry, phdr_addr);
if (retval < 0)
goto out;
@@ -1585,7 +1607,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid));
SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
rcu_read_unlock();
- strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
+ get_task_comm(psinfo->pr_fname, p);
return 0;
}
@@ -1619,17 +1641,16 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
* long file_ofs
* followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL...
*/
-static int fill_files_note(struct memelfnote *note)
+static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
unsigned count, size, names_ofs, remaining, n;
user_long_t *data;
user_long_t *start_end_ofs;
char *name_base, *name_curpos;
+ int i;
/* *Estimated* file count and total data size needed */
- count = mm->map_count;
+ count = cprm->vma_count;
if (count > UINT_MAX / 64)
return -EINVAL;
size = count * 64;
@@ -1651,11 +1672,12 @@ static int fill_files_note(struct memelfnote *note)
name_base = name_curpos = ((char *)data) + names_ofs;
remaining = size - names_ofs;
count = 0;
- for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *m = &cprm->vma_meta[i];
struct file *file;
const char *filename;
- file = vma->vm_file;
+ file = m->file;
if (!file)
continue;
filename = file_path(file, name_curpos, remaining);
@@ -1675,9 +1697,9 @@ static int fill_files_note(struct memelfnote *note)
memmove(name_curpos, filename, n);
name_curpos += n;
- *start_end_ofs++ = vma->vm_start;
- *start_end_ofs++ = vma->vm_end;
- *start_end_ofs++ = vma->vm_pgoff;
+ *start_end_ofs++ = m->start;
+ *start_end_ofs++ = m->end;
+ *start_end_ofs++ = m->pgoff;
count++;
}
@@ -1688,7 +1710,7 @@ static int fill_files_note(struct memelfnote *note)
* Count usually is less than mm->map_count,
* we need to move filenames down.
*/
- n = mm->map_count - count;
+ n = cprm->vma_count - count;
if (n != 0) {
unsigned shift_bytes = n * 3 * sizeof(data[0]);
memmove(name_base - shift_bytes, name_base,
@@ -1744,9 +1766,9 @@ static void do_thread_regset_writeback(struct task_struct *task,
static int fill_thread_core_info(struct elf_thread_core_info *t,
const struct user_regset_view *view,
- long signr, size_t *total)
+ long signr, struct elf_note_info *info)
{
- unsigned int i;
+ unsigned int note_iter, view_iter;
/*
* NT_PRSTATUS is the one special case, because the regset data
@@ -1760,17 +1782,17 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
PRSTATUS_SIZE, &t->prstatus);
- *total += notesize(&t->notes[0]);
+ info->size += notesize(&t->notes[0]);
do_thread_regset_writeback(t->task, &view->regsets[0]);
/*
* Each other regset might generate a note too. For each regset
- * that has no core_note_type or is inactive, we leave t->notes[i]
- * all zero and we'll know to skip writing it later.
+ * that has no core_note_type or is inactive, skip it.
*/
- for (i = 1; i < view->n; ++i) {
- const struct user_regset *regset = &view->regsets[i];
+ note_iter = 1;
+ for (view_iter = 1; view_iter < view->n; ++view_iter) {
+ const struct user_regset *regset = &view->regsets[view_iter];
int note_type = regset->core_note_type;
bool is_fpreg = note_type == NT_PRFPREG;
void *data;
@@ -1786,13 +1808,17 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
if (ret < 0)
continue;
+ if (WARN_ON_ONCE(note_iter >= info->thread_notes))
+ break;
+
if (is_fpreg)
SET_PR_FPVALID(&t->prstatus);
- fill_note(&t->notes[i], is_fpreg ? "CORE" : "LINUX",
+ fill_note(&t->notes[note_iter], is_fpreg ? "CORE" : "LINUX",
note_type, ret, data);
- *total += notesize(&t->notes[i]);
+ info->size += notesize(&t->notes[note_iter]);
+ note_iter++;
}
return 1;
@@ -1800,7 +1826,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- const kernel_siginfo_t *siginfo, struct pt_regs *regs)
+ struct coredump_params *cprm)
{
struct task_struct *dump_task = current;
const struct user_regset_view *view = task_user_regset_view(dump_task);
@@ -1872,7 +1898,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
* Now fill in each thread's information.
*/
for (t = info->thread; t != NULL; t = t->next)
- if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size))
+ if (!fill_thread_core_info(t, view, cprm->siginfo->si_signo, info))
return 0;
/*
@@ -1881,13 +1907,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm);
info->size += notesize(&info->psinfo);
- fill_siginfo_note(&info->signote, &info->csigdata, siginfo);
+ fill_siginfo_note(&info->signote, &info->csigdata, cprm->siginfo);
info->size += notesize(&info->signote);
fill_auxv_note(&info->auxv, current->mm);
info->size += notesize(&info->auxv);
- if (fill_files_note(&info->files) == 0)
+ if (fill_files_note(&info->files, cprm) == 0)
info->size += notesize(&info->files);
return 1;
@@ -2029,7 +2055,7 @@ static int elf_note_info_init(struct elf_note_info *info)
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- const kernel_siginfo_t *siginfo, struct pt_regs *regs)
+ struct coredump_params *cprm)
{
struct core_thread *ct;
struct elf_thread_status *ets;
@@ -2050,13 +2076,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
list_for_each_entry(ets, &info->thread_list, list) {
int sz;
- sz = elf_dump_thread_status(siginfo->si_signo, ets);
+ sz = elf_dump_thread_status(cprm->siginfo->si_signo, ets);
info->thread_status_size += sz;
}
/* now collect the dump for the current */
memset(info->prstatus, 0, sizeof(*info->prstatus));
- fill_prstatus(&info->prstatus->common, current, siginfo->si_signo);
- elf_core_copy_regs(&info->prstatus->pr_reg, regs);
+ fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo);
+ elf_core_copy_regs(&info->prstatus->pr_reg, cprm->regs);
/* Set up header */
fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
@@ -2072,18 +2098,18 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
sizeof(*info->psinfo), info->psinfo);
- fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo);
+ fill_siginfo_note(info->notes + 2, &info->csigdata, cprm->siginfo);
fill_auxv_note(info->notes + 3, current->mm);
info->numnote = 4;
- if (fill_files_note(info->notes + info->numnote) == 0) {
+ if (fill_files_note(info->notes + info->numnote, cprm) == 0) {
info->notes_files = info->notes + info->numnote;
info->numnote++;
}
/* Try to dump the FPU. */
- info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
- info->fpu);
+ info->prstatus->pr_fpvalid =
+ elf_core_copy_task_fpregs(current, cprm->regs, info->fpu);
if (info->prstatus->pr_fpvalid)
fill_note(info->notes + info->numnote++,
"CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu);
@@ -2169,8 +2195,7 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
static int elf_core_dump(struct coredump_params *cprm)
{
int has_dumped = 0;
- int vma_count, segs, i;
- size_t vma_data_size;
+ int segs, i;
struct elfhdr elf;
loff_t offset = 0, dataoff;
struct elf_note_info info = { };
@@ -2178,16 +2203,12 @@ static int elf_core_dump(struct coredump_params *cprm)
struct elf_shdr *shdr4extnum = NULL;
Elf_Half e_phnum;
elf_addr_t e_shoff;
- struct core_vma_metadata *vma_meta;
-
- if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
- return 0;
/*
* The number of segs are recored into ELF header as 16bit value.
* Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
*/
- segs = vma_count + elf_core_extra_phdrs();
+ segs = cprm->vma_count + elf_core_extra_phdrs();
/* for notes section */
segs++;
@@ -2201,7 +2222,7 @@ static int elf_core_dump(struct coredump_params *cprm)
* Collect all the non-memory information about the process for the
* notes. This also sets up the file header.
*/
- if (!fill_note_info(&elf, e_phnum, &info, cprm->siginfo, cprm->regs))
+ if (!fill_note_info(&elf, e_phnum, &info, cprm))
goto end_coredump;
has_dumped = 1;
@@ -2226,7 +2247,7 @@ static int elf_core_dump(struct coredump_params *cprm)
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- offset += vma_data_size;
+ offset += cprm->vma_data_size;
offset += elf_core_extra_data_size();
e_shoff = offset;
@@ -2246,8 +2267,8 @@ static int elf_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* Write program headers for segments dump */
- for (i = 0; i < vma_count; i++) {
- struct core_vma_metadata *meta = vma_meta + i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *meta = cprm->vma_meta + i;
struct elf_phdr phdr;
phdr.p_type = PT_LOAD;
@@ -2284,8 +2305,8 @@ static int elf_core_dump(struct coredump_params *cprm)
/* Align to page */
dump_skip_to(cprm, dataoff);
- for (i = 0; i < vma_count; i++) {
- struct core_vma_metadata *meta = vma_meta + i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *meta = cprm->vma_meta + i;
if (!dump_user_range(cprm, meta->start, meta->dump_size))
goto end_coredump;
@@ -2302,7 +2323,6 @@ static int elf_core_dump(struct coredump_params *cprm)
end_coredump:
free_note_info(&info);
kfree(shdr4extnum);
- kvfree(vma_meta);
kfree(phdr4note);
return has_dumped;
}
@@ -2324,3 +2344,7 @@ static void __exit exit_elf_binfmt(void)
core_initcall(init_elf_binfmt);
module_exit(exit_elf_binfmt);
MODULE_LICENSE("GPL");
+
+#ifdef CONFIG_BINFMT_ELF_KUNIT_TEST
+#include "binfmt_elf_test.c"
+#endif
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index c6f588dc4a9d..08d0c8797828 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -83,8 +83,8 @@ static struct linux_binfmt elf_fdpic_format = {
.load_binary = load_elf_fdpic_binary,
#ifdef CONFIG_ELF_CORE
.core_dump = elf_fdpic_core_dump,
-#endif
.min_coredump = ELF_EXEC_PAGESIZE,
+#endif
};
static int __init init_elf_fdpic_binfmt(void)
@@ -1465,7 +1465,7 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm,
static int elf_fdpic_core_dump(struct coredump_params *cprm)
{
int has_dumped = 0;
- int vma_count, segs;
+ int segs;
int i;
struct elfhdr *elf = NULL;
loff_t offset = 0, dataoff;
@@ -1480,8 +1480,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
elf_addr_t e_shoff;
struct core_thread *ct;
struct elf_thread_status *tmp;
- struct core_vma_metadata *vma_meta = NULL;
- size_t vma_data_size;
/* alloc memory for large data structures: too large to be on stack */
elf = kmalloc(sizeof(*elf), GFP_KERNEL);
@@ -1491,9 +1489,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
if (!psinfo)
goto end_coredump;
- if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
- goto end_coredump;
-
for (ct = current->signal->core_state->dumper.next;
ct; ct = ct->next) {
tmp = elf_dump_thread_status(cprm->siginfo->si_signo,
@@ -1513,7 +1508,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
tmp->next = thread_list;
thread_list = tmp;
- segs = vma_count + elf_core_extra_phdrs();
+ segs = cprm->vma_count + elf_core_extra_phdrs();
/* for notes section */
segs++;
@@ -1558,7 +1553,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
/* Page-align dumped data */
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- offset += vma_data_size;
+ offset += cprm->vma_data_size;
offset += elf_core_extra_data_size();
e_shoff = offset;
@@ -1578,8 +1573,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* write program headers for segments dump */
- for (i = 0; i < vma_count; i++) {
- struct core_vma_metadata *meta = vma_meta + i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *meta = cprm->vma_meta + i;
struct elf_phdr phdr;
size_t sz;
@@ -1628,7 +1623,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
dump_skip_to(cprm, dataoff);
- if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count))
+ if (!elf_fdpic_dump_segments(cprm, cprm->vma_meta, cprm->vma_count))
goto end_coredump;
if (!elf_core_write_extra_data(cprm))
@@ -1652,7 +1647,6 @@ end_coredump:
thread_list = thread_list->next;
kfree(tmp);
}
- kvfree(vma_meta);
kfree(phdr4note);
kfree(elf);
kfree(psinfo);
diff --git a/fs/binfmt_elf_test.c b/fs/binfmt_elf_test.c
new file mode 100644
index 000000000000..11d734fec366
--- /dev/null
+++ b/fs/binfmt_elf_test.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <kunit/test.h>
+
+static void total_mapping_size_test(struct kunit *test)
+{
+ struct elf_phdr empty[] = {
+ { .p_type = PT_LOAD, .p_vaddr = 0, .p_memsz = 0, },
+ { .p_type = PT_INTERP, .p_vaddr = 10, .p_memsz = 999999, },
+ };
+ /*
+ * readelf -lW /bin/mount | grep '^ .*0x0' | awk '{print "\t\t{ .p_type = PT_" \
+ * $1 ", .p_vaddr = " $3 ", .p_memsz = " $6 ", },"}'
+ */
+ struct elf_phdr mount[] = {
+ { .p_type = PT_PHDR, .p_vaddr = 0x00000040, .p_memsz = 0x0002d8, },
+ { .p_type = PT_INTERP, .p_vaddr = 0x00000318, .p_memsz = 0x00001c, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x00000000, .p_memsz = 0x0033a8, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x00004000, .p_memsz = 0x005c91, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000a000, .p_memsz = 0x0022f8, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000d330, .p_memsz = 0x000d40, },
+ { .p_type = PT_DYNAMIC, .p_vaddr = 0x0000d928, .p_memsz = 0x000200, },
+ { .p_type = PT_NOTE, .p_vaddr = 0x00000338, .p_memsz = 0x000030, },
+ { .p_type = PT_NOTE, .p_vaddr = 0x00000368, .p_memsz = 0x000044, },
+ { .p_type = PT_GNU_PROPERTY, .p_vaddr = 0x00000338, .p_memsz = 0x000030, },
+ { .p_type = PT_GNU_EH_FRAME, .p_vaddr = 0x0000b490, .p_memsz = 0x0001ec, },
+ { .p_type = PT_GNU_STACK, .p_vaddr = 0x00000000, .p_memsz = 0x000000, },
+ { .p_type = PT_GNU_RELRO, .p_vaddr = 0x0000d330, .p_memsz = 0x000cd0, },
+ };
+ size_t mount_size = 0xE070;
+ /* https://lore.kernel.org/linux-fsdevel/YfF18Dy85mCntXrx@fractal.localdomain */
+ struct elf_phdr unordered[] = {
+ { .p_type = PT_LOAD, .p_vaddr = 0x00000000, .p_memsz = 0x0033a8, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000d330, .p_memsz = 0x000d40, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x00004000, .p_memsz = 0x005c91, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000a000, .p_memsz = 0x0022f8, },
+ };
+
+ /* No headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(NULL, 0), 0);
+ KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 0), 0);
+ /* Empty headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 1), 0);
+ /* No PT_LOAD headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(&empty[1], 1), 0);
+ /* Empty PT_LOAD and non-PT_LOAD headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 2), 0);
+
+ /* Normal set of PT_LOADS, and expected size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(mount, ARRAY_SIZE(mount)), mount_size);
+ /* Unordered PT_LOADs result in same size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(unordered, ARRAY_SIZE(unordered)), mount_size);
+}
+
+static struct kunit_case binfmt_elf_test_cases[] = {
+ KUNIT_CASE(total_mapping_size_test),
+ {},
+};
+
+static struct kunit_suite binfmt_elf_test_suite = {
+ .name = KBUILD_MODNAME,
+ .test_cases = binfmt_elf_test_cases,
+};
+
+kunit_test_suite(binfmt_elf_test_suite);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 5d776f80ee50..c26545d71d39 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -68,11 +68,7 @@
#define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */
#define UNLOADED_LIB 0x7ff000ff /* Placeholder for unused library */
-#ifdef CONFIG_BINFMT_SHARED_FLAT
-#define MAX_SHARED_LIBS (4)
-#else
-#define MAX_SHARED_LIBS (1)
-#endif
+#define MAX_SHARED_LIBS (1)
#ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET
#define DATA_START_OFFSET_WORDS (0)
@@ -92,32 +88,13 @@ struct lib_info {
} lib_list[MAX_SHARED_LIBS];
};
-#ifdef CONFIG_BINFMT_SHARED_FLAT
-static int load_flat_shared_library(int id, struct lib_info *p);
-#endif
-
static int load_flat_binary(struct linux_binprm *);
-static int flat_core_dump(struct coredump_params *cprm);
static struct linux_binfmt flat_format = {
.module = THIS_MODULE,
.load_binary = load_flat_binary,
- .core_dump = flat_core_dump,
- .min_coredump = PAGE_SIZE
};
-/****************************************************************************/
-/*
- * Routine writes a core dump image in the current directory.
- * Currently only a stub-function.
- */
-
-static int flat_core_dump(struct coredump_params *cprm)
-{
- pr_warn("Process %s:%d received signr %d and should have core dumped\n",
- current->comm, current->pid, cprm->siginfo->si_signo);
- return 1;
-}
/****************************************************************************/
/*
@@ -322,51 +299,18 @@ out_free:
/****************************************************************************/
static unsigned long
-calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
+calc_reloc(unsigned long r, struct lib_info *p)
{
unsigned long addr;
- int id;
unsigned long start_brk;
unsigned long start_data;
unsigned long text_len;
unsigned long start_code;
-#ifdef CONFIG_BINFMT_SHARED_FLAT
- if (r == 0)
- id = curid; /* Relocs of 0 are always self referring */
- else {
- id = (r >> 24) & 0xff; /* Find ID for this reloc */
- r &= 0x00ffffff; /* Trim ID off here */
- }
- if (id >= MAX_SHARED_LIBS) {
- pr_err("reference 0x%lx to shared library %d", r, id);
- goto failed;
- }
- if (curid != id) {
- if (internalp) {
- pr_err("reloc address 0x%lx not in same module "
- "(%d != %d)", r, curid, id);
- goto failed;
- } else if (!p->lib_list[id].loaded &&
- load_flat_shared_library(id, p) < 0) {
- pr_err("failed to load library %d", id);
- goto failed;
- }
- /* Check versioning information (i.e. time stamps) */
- if (p->lib_list[id].build_date && p->lib_list[curid].build_date &&
- p->lib_list[curid].build_date < p->lib_list[id].build_date) {
- pr_err("library %d is younger than %d", id, curid);
- goto failed;
- }
- }
-#else
- id = 0;
-#endif
-
- start_brk = p->lib_list[id].start_brk;
- start_data = p->lib_list[id].start_data;
- start_code = p->lib_list[id].start_code;
- text_len = p->lib_list[id].text_len;
+ start_brk = p->lib_list[0].start_brk;
+ start_data = p->lib_list[0].start_data;
+ start_code = p->lib_list[0].start_code;
+ text_len = p->lib_list[0].text_len;
if (r > start_brk - start_data + text_len) {
pr_err("reloc outside program 0x%lx (0 - 0x%lx/0x%lx)",
@@ -433,8 +377,32 @@ static void old_reloc(unsigned long rl)
/****************************************************************************/
+static inline u32 __user *skip_got_header(u32 __user *rp)
+{
+ if (IS_ENABLED(CONFIG_RISCV)) {
+ /*
+ * RISC-V has a 16 byte GOT PLT header for elf64-riscv
+ * and 8 byte GOT PLT header for elf32-riscv.
+ * Skip the whole GOT PLT header, since it is reserved
+ * for the dynamic linker (ld.so).
+ */
+ u32 rp_val0, rp_val1;
+
+ if (get_user(rp_val0, rp))
+ return rp;
+ if (get_user(rp_val1, rp + 1))
+ return rp;
+
+ if (rp_val0 == 0xffffffff && rp_val1 == 0xffffffff)
+ rp += 4;
+ else if (rp_val0 == 0xffffffff)
+ rp += 2;
+ }
+ return rp;
+}
+
static int load_flat_file(struct linux_binprm *bprm,
- struct lib_info *libinfo, int id, unsigned long *extra_stack)
+ struct lib_info *libinfo, unsigned long *extra_stack)
{
struct flat_hdr *hdr;
unsigned long textpos, datapos, realdatastart;
@@ -486,14 +454,6 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
- /* Don't allow old format executables to use shared libraries */
- if (rev == OLD_FLAT_VERSION && id != 0) {
- pr_err("shared libraries are not available before rev 0x%lx\n",
- FLAT_VERSION);
- ret = -ENOEXEC;
- goto err;
- }
-
/*
* fix up the flags for the older format, there were all kinds
* of endian hacks, this only works for the simple cases
@@ -544,15 +504,13 @@ static int load_flat_file(struct linux_binprm *bprm,
}
/* Flush all traces of the currently running executable */
- if (id == 0) {
- ret = begin_new_exec(bprm);
- if (ret)
- goto err;
+ ret = begin_new_exec(bprm);
+ if (ret)
+ goto err;
- /* OK, This is the point of no return */
- set_personality(PER_LINUX_32BIT);
- setup_new_exec(bprm);
- }
+ /* OK, This is the point of no return */
+ set_personality(PER_LINUX_32BIT);
+ setup_new_exec(bprm);
/*
* calculate the extra space we need to map in
@@ -732,42 +690,40 @@ static int load_flat_file(struct linux_binprm *bprm,
text_len -= sizeof(struct flat_hdr); /* the real code len */
/* The main program needs a little extra setup in the task structure */
- if (id == 0) {
- current->mm->start_code = start_code;
- current->mm->end_code = end_code;
- current->mm->start_data = datapos;
- current->mm->end_data = datapos + data_len;
- /*
- * set up the brk stuff, uses any slack left in data/bss/stack
- * allocation. We put the brk after the bss (between the bss
- * and stack) like other platforms.
- * Userspace code relies on the stack pointer starting out at
- * an address right at the end of a page.
- */
- current->mm->start_brk = datapos + data_len + bss_len;
- current->mm->brk = (current->mm->start_brk + 3) & ~3;
+ current->mm->start_code = start_code;
+ current->mm->end_code = end_code;
+ current->mm->start_data = datapos;
+ current->mm->end_data = datapos + data_len;
+ /*
+ * set up the brk stuff, uses any slack left in data/bss/stack
+ * allocation. We put the brk after the bss (between the bss
+ * and stack) like other platforms.
+ * Userspace code relies on the stack pointer starting out at
+ * an address right at the end of a page.
+ */
+ current->mm->start_brk = datapos + data_len + bss_len;
+ current->mm->brk = (current->mm->start_brk + 3) & ~3;
#ifndef CONFIG_MMU
- current->mm->context.end_brk = memp + memp_size - stack_len;
+ current->mm->context.end_brk = memp + memp_size - stack_len;
#endif
- }
if (flags & FLAT_FLAG_KTRACE) {
pr_info("Mapping is %lx, Entry point is %x, data_start is %x\n",
textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start));
pr_info("%s %s: TEXT=%lx-%lx DATA=%lx-%lx BSS=%lx-%lx\n",
- id ? "Lib" : "Load", bprm->filename,
+ "Load", bprm->filename,
start_code, end_code, datapos, datapos + data_len,
datapos + data_len, (datapos + data_len + bss_len + 3) & ~3);
}
/* Store the current module values into the global library structure */
- libinfo->lib_list[id].start_code = start_code;
- libinfo->lib_list[id].start_data = datapos;
- libinfo->lib_list[id].start_brk = datapos + data_len + bss_len;
- libinfo->lib_list[id].text_len = text_len;
- libinfo->lib_list[id].loaded = 1;
- libinfo->lib_list[id].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos;
- libinfo->lib_list[id].build_date = ntohl(hdr->build_date);
+ libinfo->lib_list[0].start_code = start_code;
+ libinfo->lib_list[0].start_data = datapos;
+ libinfo->lib_list[0].start_brk = datapos + data_len + bss_len;
+ libinfo->lib_list[0].text_len = text_len;
+ libinfo->lib_list[0].loaded = 1;
+ libinfo->lib_list[0].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos;
+ libinfo->lib_list[0].build_date = ntohl(hdr->build_date);
/*
* We just load the allocations into some temporary memory to
@@ -782,14 +738,15 @@ static int load_flat_file(struct linux_binprm *bprm,
* image.
*/
if (flags & FLAT_FLAG_GOTPIC) {
- for (rp = (u32 __user *)datapos; ; rp++) {
+ rp = skip_got_header((u32 __user *) datapos);
+ for (; ; rp++) {
u32 addr, rp_val;
if (get_user(rp_val, rp))
return -EFAULT;
if (rp_val == 0xffffffff)
break;
if (rp_val) {
- addr = calc_reloc(rp_val, libinfo, id, 0);
+ addr = calc_reloc(rp_val, libinfo);
if (addr == RELOC_FAILED) {
ret = -ENOEXEC;
goto err;
@@ -825,7 +782,7 @@ static int load_flat_file(struct linux_binprm *bprm,
return -EFAULT;
relval = ntohl(tmp);
addr = flat_get_relocate_addr(relval);
- rp = (u32 __user *)calc_reloc(addr, libinfo, id, 1);
+ rp = (u32 __user *)calc_reloc(addr, libinfo);
if (rp == (u32 __user *)RELOC_FAILED) {
ret = -ENOEXEC;
goto err;
@@ -848,7 +805,7 @@ static int load_flat_file(struct linux_binprm *bprm,
*/
addr = ntohl((__force __be32)addr);
}
- addr = calc_reloc(addr, libinfo, id, 0);
+ addr = calc_reloc(addr, libinfo);
if (addr == RELOC_FAILED) {
ret = -ENOEXEC;
goto err;
@@ -876,7 +833,7 @@ static int load_flat_file(struct linux_binprm *bprm,
/* zero the BSS, BRK and stack areas */
if (clear_user((void __user *)(datapos + data_len), bss_len +
(memp + memp_size - stack_len - /* end brk */
- libinfo->lib_list[id].start_brk) + /* start brk */
+ libinfo->lib_list[0].start_brk) + /* start brk */
stack_len))
return -EFAULT;
@@ -887,49 +844,6 @@ err:
/****************************************************************************/
-#ifdef CONFIG_BINFMT_SHARED_FLAT
-
-/*
- * Load a shared library into memory. The library gets its own data
- * segment (including bss) but not argv/argc/environ.
- */
-
-static int load_flat_shared_library(int id, struct lib_info *libs)
-{
- /*
- * This is a fake bprm struct; only the members "buf", "file" and
- * "filename" are actually used.
- */
- struct linux_binprm bprm;
- int res;
- char buf[16];
- loff_t pos = 0;
-
- memset(&bprm, 0, sizeof(bprm));
-
- /* Create the file name */
- sprintf(buf, "/lib/lib%d.so", id);
-
- /* Open the file up */
- bprm.filename = buf;
- bprm.file = open_exec(bprm.filename);
- res = PTR_ERR(bprm.file);
- if (IS_ERR(bprm.file))
- return res;
-
- res = kernel_read(bprm.file, bprm.buf, BINPRM_BUF_SIZE, &pos);
-
- if (res >= 0)
- res = load_flat_file(&bprm, libs, id, NULL);
-
- allow_write_access(bprm.file);
- fput(bprm.file);
-
- return res;
-}
-
-#endif /* CONFIG_BINFMT_SHARED_FLAT */
-/****************************************************************************/
/*
* These are the functions used to load flat style executables and shared
@@ -961,7 +875,7 @@ static int load_flat_binary(struct linux_binprm *bprm)
stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
stack_len = ALIGN(stack_len, FLAT_STACK_ALIGN);
- res = load_flat_file(bprm, &libinfo, 0, &stack_len);
+ res = load_flat_file(bprm, &libinfo, &stack_len);
if (res < 0)
return res;
@@ -1006,20 +920,6 @@ static int load_flat_binary(struct linux_binprm *bprm)
*/
start_addr = libinfo.lib_list[0].entry;
-#ifdef CONFIG_BINFMT_SHARED_FLAT
- for (i = MAX_SHARED_LIBS-1; i > 0; i--) {
- if (libinfo.lib_list[i].loaded) {
- /* Push previos first to call address */
- unsigned long __user *sp;
- current->mm->start_stack -= sizeof(unsigned long);
- sp = (unsigned long __user *)current->mm->start_stack;
- if (put_user(start_addr, sp))
- return -EFAULT;
- start_addr = libinfo.lib_list[i].entry;
- }
- }
-#endif
-
#ifdef FLAT_PLAT_INIT
FLAT_PLAT_INIT(regs);
#endif
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 520a0f6a7d9e..183e5c4aed34 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -18,8 +18,7 @@ config BTRFS_FS
select RAID6_PQ
select XOR_BLOCKS
select SRCU
- depends on !PPC_256K_PAGES # powerpc
- depends on !PAGE_SIZE_256KB # hexagon
+ depends on PAGE_SIZE_LESS_THAN_256KB
help
Btrfs is a general purpose copy-on-write filesystem with extents,
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 3dcf9bcc2326..99f9995670ea 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -17,6 +17,7 @@ subdir-ccflags-y += $(condflags)
subdir-ccflags-y += -Wno-missing-field-initializers
subdir-ccflags-y += -Wno-sign-compare
subdir-ccflags-y += -Wno-type-limits
+subdir-ccflags-y += -Wno-shift-negative-value
obj-$(CONFIG_BTRFS_FS) := btrfs.o
@@ -27,7 +28,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
- reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
+ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
subpage.o tree-mod-log.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 0a0d0eccee4e..548d6a5477b4 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -55,9 +55,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
return acl;
}
-static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
- struct user_namespace *mnt_userns,
- struct inode *inode, struct posix_acl *acl, int type)
+int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct posix_acl *acl, int type)
{
int ret, size = 0;
const char *name;
@@ -123,40 +122,8 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
if (ret)
return ret;
}
- ret = __btrfs_set_acl(NULL, mnt_userns, inode, acl, type);
+ ret = __btrfs_set_acl(NULL, inode, acl, type);
if (ret)
inode->i_mode = old_mode;
return ret;
}
-
-int btrfs_init_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir)
-{
- struct posix_acl *default_acl, *acl;
- int ret = 0;
-
- /* this happens with subvols */
- if (!dir)
- return 0;
-
- ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
- if (ret)
- return ret;
-
- if (default_acl) {
- ret = __btrfs_set_acl(trans, &init_user_ns, inode, default_acl,
- ACL_TYPE_DEFAULT);
- posix_acl_release(default_acl);
- }
-
- if (acl) {
- if (!ret)
- ret = __btrfs_set_acl(trans, &init_user_ns, inode, acl,
- ACL_TYPE_ACCESS);
- posix_acl_release(acl);
- }
-
- if (!default_acl && !acl)
- cache_no_acl(inode);
- return ret;
-}
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 43c89952b7d2..aac240430efe 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -15,13 +15,12 @@
enum {
WORK_DONE_BIT,
WORK_ORDER_DONE_BIT,
- WORK_HIGH_PRIO_BIT,
};
#define NO_THRESHOLD (-1)
#define DFT_THRESHOLD (32)
-struct __btrfs_workqueue {
+struct btrfs_workqueue {
struct workqueue_struct *normal_wq;
/* File system this workqueue services */
@@ -48,12 +47,7 @@ struct __btrfs_workqueue {
spinlock_t thres_lock;
};
-struct btrfs_workqueue {
- struct __btrfs_workqueue *normal;
- struct __btrfs_workqueue *high;
-};
-
-struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq)
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq)
{
return wq->fs_info;
}
@@ -66,22 +60,22 @@ struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work)
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
{
/*
- * We could compare wq->normal->pending with num_online_cpus()
+ * We could compare wq->pending with num_online_cpus()
* to support "thresh == NO_THRESHOLD" case, but it requires
* moving up atomic_inc/dec in thresh_queue/exec_hook. Let's
* postpone it until someone needs the support of that case.
*/
- if (wq->normal->thresh == NO_THRESHOLD)
+ if (wq->thresh == NO_THRESHOLD)
return false;
- return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2;
+ return atomic_read(&wq->pending) > wq->thresh * 2;
}
-static struct __btrfs_workqueue *
-__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
- unsigned int flags, int limit_active, int thresh)
+struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
+ const char *name, unsigned int flags,
+ int limit_active, int thresh)
{
- struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+ struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
@@ -105,12 +99,8 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
ret->thresh = thresh;
}
- if (flags & WQ_HIGHPRI)
- ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags,
- ret->current_active, name);
- else
- ret->normal_wq = alloc_workqueue("btrfs-%s", flags,
- ret->current_active, name);
+ ret->normal_wq = alloc_workqueue("btrfs-%s", flags, ret->current_active,
+ name);
if (!ret->normal_wq) {
kfree(ret);
return NULL;
@@ -119,41 +109,7 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
INIT_LIST_HEAD(&ret->ordered_list);
spin_lock_init(&ret->list_lock);
spin_lock_init(&ret->thres_lock);
- trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
- return ret;
-}
-
-static inline void
-__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
-
-struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
- const char *name,
- unsigned int flags,
- int limit_active,
- int thresh)
-{
- struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
-
- if (!ret)
- return NULL;
-
- ret->normal = __btrfs_alloc_workqueue(fs_info, name,
- flags & ~WQ_HIGHPRI,
- limit_active, thresh);
- if (!ret->normal) {
- kfree(ret);
- return NULL;
- }
-
- if (flags & WQ_HIGHPRI) {
- ret->high = __btrfs_alloc_workqueue(fs_info, name, flags,
- limit_active, thresh);
- if (!ret->high) {
- __btrfs_destroy_workqueue(ret->normal);
- kfree(ret);
- return NULL;
- }
- }
+ trace_btrfs_workqueue_alloc(ret, name);
return ret;
}
@@ -162,7 +118,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
* This hook WILL be called in IRQ handler context,
* so workqueue_set_max_active MUST NOT be called in this hook
*/
-static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
+static inline void thresh_queue_hook(struct btrfs_workqueue *wq)
{
if (wq->thresh == NO_THRESHOLD)
return;
@@ -174,7 +130,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
* This hook is called in kthread content.
* So workqueue_set_max_active is called here.
*/
-static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
+static inline void thresh_exec_hook(struct btrfs_workqueue *wq)
{
int new_current_active;
long pending;
@@ -217,7 +173,7 @@ out:
}
}
-static void run_ordered_work(struct __btrfs_workqueue *wq,
+static void run_ordered_work(struct btrfs_workqueue *wq,
struct btrfs_work *self)
{
struct list_head *list = &wq->ordered_list;
@@ -305,7 +261,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
{
struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
normal_work);
- struct __btrfs_workqueue *wq;
+ struct btrfs_workqueue *wq = work->wq;
int need_order = 0;
/*
@@ -318,7 +274,6 @@ static void btrfs_work_helper(struct work_struct *normal_work)
*/
if (work->ordered_func)
need_order = 1;
- wq = work->wq;
trace_btrfs_work_sched(work);
thresh_exec_hook(wq);
@@ -350,8 +305,7 @@ void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
work->flags = 0;
}
-static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
- struct btrfs_work *work)
+void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work)
{
unsigned long flags;
@@ -366,54 +320,22 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
queue_work(wq->normal_wq, &work->normal_work);
}
-void btrfs_queue_work(struct btrfs_workqueue *wq,
- struct btrfs_work *work)
-{
- struct __btrfs_workqueue *dest_wq;
-
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
- dest_wq = wq->high;
- else
- dest_wq = wq->normal;
- __btrfs_queue_work(dest_wq, work);
-}
-
-static inline void
-__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
-{
- destroy_workqueue(wq->normal_wq);
- trace_btrfs_workqueue_destroy(wq);
- kfree(wq);
-}
-
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
{
if (!wq)
return;
- if (wq->high)
- __btrfs_destroy_workqueue(wq->high);
- __btrfs_destroy_workqueue(wq->normal);
+ destroy_workqueue(wq->normal_wq);
+ trace_btrfs_workqueue_destroy(wq);
kfree(wq);
}
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active)
{
- if (!wq)
- return;
- wq->normal->limit_active = limit_active;
- if (wq->high)
- wq->high->limit_active = limit_active;
-}
-
-void btrfs_set_work_high_priority(struct btrfs_work *work)
-{
- set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+ if (wq)
+ wq->limit_active = limit_active;
}
void btrfs_flush_workqueue(struct btrfs_workqueue *wq)
{
- if (wq->high)
- flush_workqueue(wq->high->normal_wq);
-
- flush_workqueue(wq->normal->normal_wq);
+ flush_workqueue(wq->normal_wq);
}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 3204daa51b95..07960529b360 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -11,8 +11,6 @@
struct btrfs_fs_info;
struct btrfs_workqueue;
-/* Internal use only */
-struct __btrfs_workqueue;
struct btrfs_work;
typedef void (*btrfs_func_t)(struct btrfs_work *arg);
typedef void (*btrfs_work_func_t)(struct work_struct *arg);
@@ -25,7 +23,7 @@ struct btrfs_work {
/* Don't touch things below */
struct work_struct normal_work;
struct list_head ordered_list;
- struct __btrfs_workqueue *wq;
+ struct btrfs_workqueue *wq;
unsigned long flags;
};
@@ -40,9 +38,8 @@ void btrfs_queue_work(struct btrfs_workqueue *wq,
struct btrfs_work *work);
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
-void btrfs_set_work_high_priority(struct btrfs_work *work);
struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work);
-struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq);
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
void btrfs_flush_workqueue(struct btrfs_workqueue *wq);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f735b8798ba1..ebc392ea1d74 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -789,11 +789,13 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
if (IS_ERR(eb)) {
free_pref(ref);
return PTR_ERR(eb);
- } else if (!extent_buffer_uptodate(eb)) {
+ }
+ if (!extent_buffer_uptodate(eb)) {
free_pref(ref);
free_extent_buffer(eb);
return -EIO;
}
+
if (lock)
btrfs_tree_read_lock(eb);
if (btrfs_header_level(eb) == 0)
@@ -950,7 +952,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
leaf = path->nodes[0];
slot = path->slots[0];
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
BUG_ON(item_size < sizeof(*ei));
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
@@ -1049,12 +1051,12 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
*
* Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED.
*/
-static int add_keyed_refs(struct btrfs_fs_info *fs_info,
+static int add_keyed_refs(struct btrfs_root *extent_root,
struct btrfs_path *path, u64 bytenr,
int info_level, struct preftrees *preftrees,
struct share_check *sc)
{
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_fs_info *fs_info = extent_root->fs_info;
int ret;
int slot;
struct extent_buffer *leaf;
@@ -1170,6 +1172,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct ulist *roots, const u64 *extent_item_pos,
struct share_check *sc, bool ignore_offset)
{
+ struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
struct btrfs_key key;
struct btrfs_path *path;
struct btrfs_delayed_ref_root *delayed_refs = NULL;
@@ -1203,28 +1206,26 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
if (time_seq == BTRFS_SEQ_LAST)
path->skip_locking = 1;
- /*
- * grab both a lock on the path and a lock on the delayed ref head.
- * We need both to get a consistent picture of how the refs look
- * at a specified point in time
- */
again:
head = NULL;
- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret == 0);
+ if (ret == 0) {
+ /* This shouldn't happen, indicates a bug or fs corruption. */
+ ASSERT(ret != 0);
+ ret = -EUCLEAN;
+ goto out;
+ }
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (trans && likely(trans->type != __TRANS_DUMMY) &&
time_seq != BTRFS_SEQ_LAST) {
-#else
- if (trans && time_seq != BTRFS_SEQ_LAST) {
-#endif
/*
- * look if there are updates for this ref queued and lock the
- * head
+ * We have a specific time_seq we care about and trans which
+ * means we have the path lock, we need to grab the ref head and
+ * lock it so we have a consistent view of the refs at the given
+ * time.
*/
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
@@ -1271,7 +1272,7 @@ again:
&info_level, &preftrees, sc);
if (ret)
goto out;
- ret = add_keyed_refs(fs_info, path, bytenr, info_level,
+ ret = add_keyed_refs(root, path, bytenr, info_level,
&preftrees, sc);
if (ret)
goto out;
@@ -1336,7 +1337,8 @@ again:
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
- } else if (!extent_buffer_uptodate(eb)) {
+ }
+ if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
ret = -EIO;
goto out;
@@ -1360,10 +1362,18 @@ again:
goto out;
if (!ret && extent_item_pos) {
/*
- * we've recorded that parent, so we must extend
- * its inode list here
+ * We've recorded that parent, so we must extend
+ * its inode list here.
+ *
+ * However if there was corruption we may not
+ * have found an eie, return an error in this
+ * case.
*/
- BUG_ON(!eie);
+ ASSERT(eie);
+ if (!eie) {
+ ret = -EUCLEAN;
+ goto out;
+ }
while (eie->next)
eie = eie->next;
eie->next = ref->inode_list;
@@ -1740,6 +1750,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
struct btrfs_path *path, struct btrfs_key *found_key,
u64 *flags_ret)
{
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
int ret;
u64 flags;
u64 size = 0;
@@ -1755,11 +1766,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
key.objectid = logical;
key.offset = (u64)-1;
- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
+ ret = btrfs_previous_extent_item(extent_root, path, 0);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -1779,7 +1790,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
}
eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, path->slots[0]);
+ item_size = btrfs_item_size(eb, path->slots[0]);
BUG_ON(item_size < sizeof(*ei));
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
@@ -1962,7 +1973,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
extent_item_objectid);
if (!search_commit_root) {
- trans = btrfs_attach_transaction(fs_info->extent_root);
+ trans = btrfs_attach_transaction(fs_info->tree_root);
if (IS_ERR(trans)) {
if (PTR_ERR(trans) != -ENOENT &&
PTR_ERR(trans) != -EROFS)
@@ -2058,7 +2069,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
u64 parent = 0;
int found = 0;
struct extent_buffer *eb;
- struct btrfs_item *item;
struct btrfs_inode_ref *iref;
struct btrfs_key found_key;
@@ -2084,10 +2094,9 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
}
btrfs_release_path(path);
- item = btrfs_item_nr(slot);
iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
- for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
+ for (cur = 0; cur < btrfs_item_size(eb, slot); cur += len) {
name_len = btrfs_inode_ref_name_len(eb, iref);
/* path must be released before calling iterate()! */
btrfs_debug(fs_root->fs_info,
@@ -2143,7 +2152,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
}
btrfs_release_path(path);
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
ptr = btrfs_item_ptr_offset(eb, slot);
cur_offset = 0;
@@ -2330,6 +2339,7 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc(
int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
{
struct btrfs_fs_info *fs_info = iter->fs_info;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
struct btrfs_path *path = iter->path;
struct btrfs_extent_item *ei;
struct btrfs_key key;
@@ -2340,7 +2350,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
key.offset = (u64)-1;
iter->bytenr = bytenr;
- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
if (ret == 0) {
@@ -2364,7 +2374,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]);
iter->end_ptr = (u32)(iter->item_ptr +
- btrfs_item_size_nr(path->nodes[0], path->slots[0]));
+ btrfs_item_size(path->nodes[0], path->slots[0]));
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_extent_item);
@@ -2383,7 +2393,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
/* If there is no inline backref, go search for keyed backref */
if (iter->cur_ptr >= iter->end_ptr) {
- ret = btrfs_next_item(fs_info->extent_root, path);
+ ret = btrfs_next_item(extent_root, path);
/* No inline nor keyed ref */
if (ret > 0) {
@@ -2404,7 +2414,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]);
iter->item_ptr = iter->cur_ptr;
- iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size_nr(
+ iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size(
path->nodes[0], path->slots[0]));
}
@@ -2427,6 +2437,7 @@ release:
int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
{
struct extent_buffer *eb = btrfs_backref_get_eb(iter);
+ struct btrfs_root *extent_root;
struct btrfs_path *path = iter->path;
struct btrfs_extent_inline_ref *iref;
int ret;
@@ -2457,7 +2468,8 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
}
/* We're at keyed items, there is no inline item, go to the next one */
- ret = btrfs_next_item(iter->fs_info->extent_root, iter->path);
+ extent_root = btrfs_extent_root(iter->fs_info, iter->bytenr);
+ ret = btrfs_next_item(extent_root, iter->path);
if (ret)
return ret;
@@ -2469,7 +2481,7 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]);
iter->cur_ptr = iter->item_ptr;
- iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size_nr(path->nodes[0],
+ iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size(path->nodes[0],
path->slots[0]);
return 0;
}
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 444e9c89ff3e..ede389f2602d 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -124,7 +124,16 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
{
if (refcount_dec_and_test(&cache->refs)) {
WARN_ON(cache->pinned > 0);
- WARN_ON(cache->reserved > 0);
+ /*
+ * If there was a failure to cleanup a log tree, very likely due
+ * to an IO failure on a writeback attempt of one or more of its
+ * extent buffers, we could not do proper (and cheap) unaccounting
+ * of their reserved space, so don't warn on reserved > 0 in that
+ * case.
+ */
+ if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
+ WARN_ON(cache->reserved > 0);
/*
* A block_group shouldn't be on the discard_list anymore.
@@ -159,11 +168,12 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
struct rb_node **p;
struct rb_node *parent = NULL;
struct btrfs_block_group *cache;
+ bool leftmost = true;
ASSERT(block_group->length != 0);
- spin_lock(&info->block_group_cache_lock);
- p = &info->block_group_cache_tree.rb_node;
+ write_lock(&info->block_group_cache_lock);
+ p = &info->block_group_cache_tree.rb_root.rb_node;
while (*p) {
parent = *p;
@@ -172,20 +182,18 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
p = &(*p)->rb_left;
} else if (block_group->start > cache->start) {
p = &(*p)->rb_right;
+ leftmost = false;
} else {
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
return -EEXIST;
}
}
rb_link_node(&block_group->cache_node, parent, p);
- rb_insert_color(&block_group->cache_node,
- &info->block_group_cache_tree);
+ rb_insert_color_cached(&block_group->cache_node,
+ &info->block_group_cache_tree, leftmost);
- if (info->first_logical_byte > block_group->start)
- info->first_logical_byte = block_group->start;
-
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
return 0;
}
@@ -201,8 +209,8 @@ static struct btrfs_block_group *block_group_cache_tree_search(
struct rb_node *n;
u64 end, start;
- spin_lock(&info->block_group_cache_lock);
- n = info->block_group_cache_tree.rb_node;
+ read_lock(&info->block_group_cache_lock);
+ n = info->block_group_cache_tree.rb_root.rb_node;
while (n) {
cache = rb_entry(n, struct btrfs_block_group, cache_node);
@@ -224,12 +232,9 @@ static struct btrfs_block_group *block_group_cache_tree_search(
break;
}
}
- if (ret) {
+ if (ret)
btrfs_get_block_group(ret);
- if (bytenr == 0 && info->first_logical_byte > ret->start)
- info->first_logical_byte = ret->start;
- }
- spin_unlock(&info->block_group_cache_lock);
+ read_unlock(&info->block_group_cache_lock);
return ret;
}
@@ -258,15 +263,15 @@ struct btrfs_block_group *btrfs_next_block_group(
struct btrfs_fs_info *fs_info = cache->fs_info;
struct rb_node *node;
- spin_lock(&fs_info->block_group_cache_lock);
+ read_lock(&fs_info->block_group_cache_lock);
/* If our block group was removed, we need a full search. */
if (RB_EMPTY_NODE(&cache->cache_node)) {
const u64 next_bytenr = cache->start + cache->length;
- spin_unlock(&fs_info->block_group_cache_lock);
+ read_unlock(&fs_info->block_group_cache_lock);
btrfs_put_block_group(cache);
- cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
+ return btrfs_lookup_first_block_group(fs_info, next_bytenr);
}
node = rb_next(&cache->cache_node);
btrfs_put_block_group(cache);
@@ -275,46 +280,70 @@ struct btrfs_block_group *btrfs_next_block_group(
btrfs_get_block_group(cache);
} else
cache = NULL;
- spin_unlock(&fs_info->block_group_cache_lock);
+ read_unlock(&fs_info->block_group_cache_lock);
return cache;
}
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+/**
+ * Check if we can do a NOCOW write for a given extent.
+ *
+ * @fs_info: The filesystem information object.
+ * @bytenr: Logical start address of the extent.
+ *
+ * Check if we can do a NOCOW write for the given extent, and increments the
+ * number of NOCOW writers in the block group that contains the extent, as long
+ * as the block group exists and it's currently not in read-only mode.
+ *
+ * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller
+ * is responsible for calling btrfs_dec_nocow_writers() later.
+ *
+ * Or NULL if we can not do a NOCOW write
+ */
+struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
+ u64 bytenr)
{
struct btrfs_block_group *bg;
- bool ret = true;
+ bool can_nocow = true;
bg = btrfs_lookup_block_group(fs_info, bytenr);
if (!bg)
- return false;
+ return NULL;
spin_lock(&bg->lock);
if (bg->ro)
- ret = false;
+ can_nocow = false;
else
atomic_inc(&bg->nocow_writers);
spin_unlock(&bg->lock);
- /* No put on block group, done by btrfs_dec_nocow_writers */
- if (!ret)
+ if (!can_nocow) {
btrfs_put_block_group(bg);
+ return NULL;
+ }
- return ret;
+ /* No put on block group, done by btrfs_dec_nocow_writers(). */
+ return bg;
}
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+/**
+ * Decrement the number of NOCOW writers in a block group.
+ *
+ * @bg: The block group.
+ *
+ * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
+ * and on the block group returned by that call. Typically this is called after
+ * creating an ordered extent for a NOCOW write, to prevent races with scrub and
+ * relocation.
+ *
+ * After this call, the caller should not use the block group anymore. It it wants
+ * to use it, then it should get a reference on it before calling this function.
+ */
+void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
{
- struct btrfs_block_group *bg;
-
- bg = btrfs_lookup_block_group(fs_info, bytenr);
- ASSERT(bg);
if (atomic_dec_and_test(&bg->nocow_writers))
wake_up_var(&bg->nocow_writers);
- /*
- * Once for our lookup and once for the lookup done by a previous call
- * to btrfs_inc_nocow_writers()
- */
- btrfs_put_block_group(bg);
+
+ /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
btrfs_put_block_group(bg);
}
@@ -514,7 +543,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group *block_group = caching_ctl->block_group;
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_root *extent_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -529,6 +558,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
return -ENOMEM;
last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
+ extent_root = btrfs_extent_root(fs_info, last);
#ifdef CONFIG_BTRFS_DEBUG
/*
@@ -762,10 +792,10 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
cache->has_caching_ctl = 1;
spin_unlock(&cache->lock);
- spin_lock(&fs_info->block_group_cache_lock);
+ write_lock(&fs_info->block_group_cache_lock);
refcount_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
- spin_unlock(&fs_info->block_group_cache_lock);
+ write_unlock(&fs_info->block_group_cache_lock);
btrfs_get_block_group(cache);
@@ -841,7 +871,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret;
- root = fs_info->extent_root;
+ root = btrfs_block_group_root(fs_info);
key.objectid = block_group->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
key.offset = block_group->length;
@@ -947,17 +977,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- spin_lock(&fs_info->block_group_cache_lock);
- rb_erase(&block_group->cache_node,
- &fs_info->block_group_cache_tree);
+ write_lock(&fs_info->block_group_cache_lock);
+ rb_erase_cached(&block_group->cache_node,
+ &fs_info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
/* Once for the block groups rbtree */
btrfs_put_block_group(block_group);
- if (fs_info->first_logical_byte == block_group->start)
- fs_info->first_logical_byte = (u64)-1;
- spin_unlock(&fs_info->block_group_cache_lock);
+ write_unlock(&fs_info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
/*
@@ -982,7 +1010,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (block_group->cached == BTRFS_CACHE_STARTED)
btrfs_wait_block_group_cache_done(block_group);
if (block_group->has_caching_ctl) {
- spin_lock(&fs_info->block_group_cache_lock);
+ write_lock(&fs_info->block_group_cache_lock);
if (!caching_ctl) {
struct btrfs_caching_control *ctl;
@@ -996,7 +1024,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
}
if (caching_ctl)
list_del_init(&caching_ctl->list);
- spin_unlock(&fs_info->block_group_cache_lock);
+ write_unlock(&fs_info->block_group_cache_lock);
if (caching_ctl) {
/* Once for the caching bgs list and once for us. */
btrfs_put_caching_control(caching_ctl);
@@ -1106,6 +1134,7 @@ out:
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
@@ -1139,8 +1168,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
num_items = 3 + map->num_stripes;
free_extent_map(em);
- return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
- num_items);
+ return btrfs_start_transaction_fallback_global_rsv(root, num_items);
}
/*
@@ -1357,6 +1385,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
goto next;
}
+ ret = btrfs_zone_finish(block_group);
+ if (ret < 0) {
+ btrfs_dec_block_group_ro(block_group);
+ if (ret == -EAGAIN)
+ ret = 0;
+ goto next;
+ }
+
/*
* Want to do this before we do anything else so we can recover
* properly if we fail to join the transaction.
@@ -1502,19 +1538,32 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
return bg1->used > bg2->used;
}
+static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_is_zoned(fs_info))
+ return btrfs_zoned_should_reclaim(fs_info);
+ return true;
+}
+
void btrfs_reclaim_bgs_work(struct work_struct *work)
{
struct btrfs_fs_info *fs_info =
container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
struct btrfs_block_group *bg;
struct btrfs_space_info *space_info;
- LIST_HEAD(again_list);
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
- if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
+ if (!btrfs_should_reclaim(fs_info))
+ return;
+
+ sb_start_write(fs_info->sb);
+
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
+ sb_end_write(fs_info->sb);
return;
+ }
/*
* Long running balances can keep us blocked here for eternity, so
@@ -1522,6 +1571,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
*/
if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
btrfs_exclop_finish(fs_info);
+ sb_end_write(fs_info->sb);
return;
}
@@ -1585,21 +1635,18 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
div64_u64(zone_unusable * 100, bg->length));
trace_btrfs_reclaim_block_group(bg);
ret = btrfs_relocate_chunk(fs_info, bg->start);
- if (ret && ret != -EAGAIN)
+ if (ret)
btrfs_err(fs_info, "error relocating chunk %llu",
bg->start);
next:
+ btrfs_put_block_group(bg);
spin_lock(&fs_info->unused_bgs_lock);
- if (ret == -EAGAIN && list_empty(&bg->bg_list))
- list_add_tail(&bg->bg_list, &again_list);
- else
- btrfs_put_block_group(bg);
}
- list_splice_tail(&again_list, &fs_info->reclaim_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
+ sb_end_write(fs_info->sb);
}
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
@@ -1678,38 +1725,16 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct btrfs_key *key)
{
- struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
int ret;
struct btrfs_key found_key;
- struct extent_buffer *leaf;
- int slot;
-
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
- return ret;
-
- while (1) {
- slot = path->slots[0];
- leaf = path->nodes[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto out;
- break;
- }
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ btrfs_for_each_slot(root, key, &found_key, path, ret) {
if (found_key.objectid >= key->objectid &&
found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
- ret = read_bg_from_eb(fs_info, &found_key, path);
- break;
+ return read_bg_from_eb(fs_info, &found_key, path);
}
-
- path->slots[0]++;
}
-out:
return ret;
}
@@ -2001,6 +2026,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
cache->length = key->offset;
cache->used = btrfs_stack_block_group_used(bgi);
cache->flags = btrfs_stack_block_group_flags(bgi);
+ cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
set_free_space_tree_thresholds(cache);
@@ -2165,6 +2191,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
int btrfs_read_block_groups(struct btrfs_fs_info *info)
{
+ struct btrfs_root *root = btrfs_block_group_root(info);
struct btrfs_path *path;
int ret;
struct btrfs_block_group *cache;
@@ -2173,7 +2200,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
int need_clear = 0;
u64 cache_gen;
- if (!info->extent_root)
+ if (!root)
return fill_dummy_bgs(info);
key.objectid = 0;
@@ -2276,20 +2303,19 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group_item bgi;
- struct btrfs_root *root;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
struct btrfs_key key;
spin_lock(&block_group->lock);
btrfs_set_stack_block_group_used(&bgi, block_group->used);
btrfs_set_stack_block_group_chunk_objectid(&bgi,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ block_group->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
key.objectid = block_group->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
key.offset = block_group->length;
spin_unlock(&block_group->lock);
- root = fs_info->extent_root;
return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
}
@@ -2439,6 +2465,27 @@ next:
btrfs_trans_release_chunk_metadata(trans);
}
+/*
+ * For extent tree v2 we use the block_group_item->chunk_offset to point at our
+ * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
+ */
+static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
+{
+ u64 div = SZ_1G;
+ u64 index;
+
+ if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+
+ /* If we have a smaller fs index based on 128MiB. */
+ if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
+ div = SZ_128M;
+
+ offset = div64_u64(offset, div);
+ div64_u64_rem(offset, fs_info->nr_global_roots, &index);
+ return index;
+}
+
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 bytes_used, u64 type,
u64 chunk_offset, u64 size)
@@ -2459,6 +2506,8 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
+ cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
+
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
cache->needs_free_space = 1;
@@ -2468,12 +2517,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
return ERR_PTR(ret);
}
- /*
- * New block group is likely to be used soon. Try to activate it now.
- * Failure is OK for now.
- */
- btrfs_zone_activate(cache);
-
ret = exclude_super_stripes(cache);
if (ret) {
/* We may have excluded something, so call this just in case */
@@ -2543,12 +2586,26 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
u64 alloc_flags;
int ret;
bool dirty_bg_running;
+ /*
+ * This can only happen when we are doing read-only scrub on read-only
+ * mount.
+ * In that case we should not start a new transaction on read-only fs.
+ * Thus here we skip all chunk allocations.
+ */
+ if (sb_rdonly(fs_info->sb)) {
+ mutex_lock(&fs_info->ro_block_group_mutex);
+ ret = inc_block_group_ro(cache, 0);
+ mutex_unlock(&fs_info->ro_block_group_mutex);
+ return ret;
+ }
+
do {
- trans = btrfs_join_transaction(fs_info->extent_root);
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -2653,7 +2710,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
- struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
unsigned long bi;
struct extent_buffer *leaf;
struct btrfs_block_group_item bgi;
@@ -2674,7 +2731,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
btrfs_set_stack_block_group_used(&bgi, cache->used);
btrfs_set_stack_block_group_chunk_objectid(&bgi,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ cache->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, cache->flags);
write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
btrfs_mark_buffer_dirty(leaf);
@@ -2897,7 +2954,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_path *path = NULL;
LIST_HEAD(dirty);
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
int loops = 0;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -2963,7 +3019,6 @@ again:
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
/*
@@ -3064,7 +3119,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
int should_put;
struct btrfs_path *path;
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
path = btrfs_alloc_path();
if (!path)
@@ -3122,7 +3176,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
list_add_tail(&cache->io_list, io);
} else {
@@ -3181,6 +3234,31 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
return ret;
}
+static inline bool should_reclaim_block_group(struct btrfs_block_group *bg,
+ u64 bytes_freed)
+{
+ const struct btrfs_space_info *space_info = bg->space_info;
+ const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
+ const u64 new_val = bg->used;
+ const u64 old_val = new_val + bytes_freed;
+ u64 thresh;
+
+ if (reclaim_thresh == 0)
+ return false;
+
+ thresh = div_factor_fine(bg->length, reclaim_thresh);
+
+ /*
+ * If we were below the threshold before don't reclaim, we are likely a
+ * brand new block group and we don't want to relocate new block groups.
+ */
+ if (old_val < thresh)
+ return false;
+ if (new_val >= thresh)
+ return false;
+ return true;
+}
+
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, bool alloc)
{
@@ -3203,6 +3281,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&info->delalloc_root_lock);
while (total) {
+ bool reclaim;
+
cache = btrfs_lookup_block_group(info, bytenr);
if (!cache) {
ret = -ENOENT;
@@ -3248,6 +3328,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
cache->space_info, num_bytes);
cache->space_info->bytes_used -= num_bytes;
cache->space_info->disk_used -= num_bytes * factor;
+
+ reclaim = should_reclaim_block_group(cache, num_bytes);
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
@@ -3274,6 +3356,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
if (!alloc && old_val == 0) {
if (!btrfs_test_opt(info, DISCARD_ASYNC))
btrfs_mark_bg_unused(cache);
+ } else if (!alloc && reclaim) {
+ btrfs_mark_bg_to_reclaim(cache);
}
btrfs_put_block_group(cache);
@@ -3406,7 +3490,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
}
-static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
+static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
{
struct btrfs_block_group *bg;
int ret;
@@ -3493,7 +3577,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
out:
btrfs_trans_release_chunk_metadata(trans);
- return ret;
+ if (ret)
+ return ERR_PTR(ret);
+
+ btrfs_get_block_group(bg);
+ return bg;
}
/*
@@ -3608,10 +3696,17 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *space_info;
+ struct btrfs_block_group *ret_bg;
bool wait_for_alloc = false;
bool should_alloc = false;
+ bool from_extent_allocation = false;
int ret = 0;
+ if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
+ from_extent_allocation = true;
+ force = CHUNK_ALLOC_FORCE;
+ }
+
/* Don't re-enter if we're already allocating a chunk */
if (trans->allocating_chunk)
return -ENOSPC;
@@ -3701,9 +3796,22 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
force_metadata_allocation(fs_info);
}
- ret = do_chunk_alloc(trans, flags);
+ ret_bg = do_chunk_alloc(trans, flags);
trans->allocating_chunk = false;
+ if (IS_ERR(ret_bg)) {
+ ret = PTR_ERR(ret_bg);
+ } else if (from_extent_allocation) {
+ /*
+ * New block group is likely to be used soon. Try to activate
+ * it now. Failure is OK for now.
+ */
+ btrfs_zone_activate(ret_bg);
+ }
+
+ if (!ret)
+ btrfs_put_block_group(ret_bg);
+
spin_lock(&space_info->lock);
if (ret < 0) {
if (ret == -ENOSPC)
@@ -3790,7 +3898,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans,
}
if (!ret) {
- ret = btrfs_block_rsv_add(fs_info->chunk_root,
+ ret = btrfs_block_rsv_add(fs_info,
&fs_info->chunk_block_rsv,
bytes, BTRFS_RESERVE_NO_FLUSH);
if (!ret)
@@ -3894,14 +4002,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
- spin_lock(&info->block_group_cache_lock);
+ write_lock(&info->block_group_cache_lock);
while (!list_empty(&info->caching_block_groups)) {
caching_ctl = list_entry(info->caching_block_groups.next,
struct btrfs_caching_control, list);
list_del(&caching_ctl->list);
btrfs_put_caching_control(caching_ctl);
}
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
spin_lock(&info->unused_bgs_lock);
while (!list_empty(&info->unused_bgs)) {
@@ -3911,9 +4019,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
list_del_init(&block_group->bg_list);
btrfs_put_block_group(block_group);
}
- spin_unlock(&info->unused_bgs_lock);
- spin_lock(&info->unused_bgs_lock);
while (!list_empty(&info->reclaim_bgs)) {
block_group = list_first_entry(&info->reclaim_bgs,
struct btrfs_block_group,
@@ -3933,14 +4039,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
spin_unlock(&info->zone_active_bgs_lock);
- spin_lock(&info->block_group_cache_lock);
- while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
+ write_lock(&info->block_group_cache_lock);
+ while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group,
cache_node);
- rb_erase(&block_group->cache_node,
- &info->block_group_cache_tree);
+ rb_erase_cached(&block_group->cache_node,
+ &info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
list_del(&block_group->list);
@@ -3963,9 +4069,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
ASSERT(block_group->swap_extents == 0);
btrfs_put_block_group(block_group);
- spin_lock(&info->block_group_cache_lock);
+ write_lock(&info->block_group_cache_lock);
}
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
btrfs_release_global_block_rsv(info);
@@ -3979,9 +4085,22 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
* important and indicates a real bug if this happens.
*/
if (WARN_ON(space_info->bytes_pinned > 0 ||
- space_info->bytes_reserved > 0 ||
space_info->bytes_may_use > 0))
btrfs_dump_space_info(info, space_info, 0, 0);
+
+ /*
+ * If there was a failure to cleanup a log tree, very likely due
+ * to an IO failure on a writeback attempt of one or more of its
+ * extent buffers, we could not do proper (and cheap) unaccounting
+ * of their reserved space, so don't warn on bytes_reserved > 0 in
+ * that case.
+ */
+ if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
+ if (WARN_ON(space_info->bytes_reserved > 0))
+ btrfs_dump_space_info(info, space_info, 0, 0);
+ }
+
WARN_ON(space_info->reclaim_size > 0);
list_del(&space_info->list);
btrfs_sysfs_remove_space_info(space_info);
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 5878b7ce3b78..35e0e860cc0b 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -35,11 +35,15 @@ enum btrfs_discard_state {
* the FS with empty chunks
*
* CHUNK_ALLOC_FORCE means it must try to allocate one
+ *
+ * CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from
+ * find_free_extent() that also activaes the zone
*/
enum btrfs_chunk_alloc_enum {
CHUNK_ALLOC_NO_FORCE,
CHUNK_ALLOC_LIMITED,
CHUNK_ALLOC_FORCE,
+ CHUNK_ALLOC_FORCE_FOR_EXTENT,
};
struct btrfs_caching_control {
@@ -68,6 +72,7 @@ struct btrfs_block_group {
u64 bytes_super;
u64 flags;
u64 cache_generation;
+ u64 global_root_id;
/*
* If the free space extent count exceeds this number, convert the block
@@ -99,6 +104,7 @@ struct btrfs_block_group {
unsigned int relocating_repair:1;
unsigned int chunk_item_inserted:1;
unsigned int zone_is_active:1;
+ unsigned int zoned_data_reloc_ongoing:1;
int disk_cache_state;
@@ -207,6 +213,8 @@ struct btrfs_block_group {
u64 meta_write_pointer;
struct map_lookup *physical_map;
struct list_head active_bg_list;
+ struct work_struct zone_finish_work;
+ struct extent_buffer *last_eb;
};
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
@@ -249,8 +257,9 @@ void btrfs_put_block_group(struct btrfs_block_group *cache);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start);
void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg);
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
+struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
+ u64 bytenr);
+void btrfs_dec_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes);
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 04a6226e0388..b3ee49b0b1e8 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -6,6 +6,7 @@
#include "space-info.h"
#include "transaction.h"
#include "block-group.h"
+#include "disk-io.h"
/*
* HOW DO BLOCK RESERVES WORK
@@ -208,7 +209,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
kfree(rsv);
}
-int btrfs_block_rsv_add(struct btrfs_root *root,
+int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush)
{
@@ -217,7 +218,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
if (num_bytes == 0)
return 0;
- ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
if (!ret)
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
@@ -241,7 +242,7 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
return ret;
}
-int btrfs_block_rsv_refill(struct btrfs_root *root,
+int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 min_reserved,
enum btrfs_reserve_flush_enum flush)
{
@@ -262,7 +263,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
if (!ret)
return 0;
- ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
if (!ret) {
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
return 0;
@@ -351,23 +352,29 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
{
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
struct btrfs_space_info *sinfo = block_rsv->space_info;
- u64 num_bytes;
- unsigned min_items;
+ struct btrfs_root *root, *tmp;
+ u64 num_bytes = btrfs_root_used(&fs_info->tree_root->root_item);
+ unsigned int min_items = 1;
/*
* The global block rsv is based on the size of the extent tree, the
* checksum tree and the root tree. If the fs is empty we want to set
* it to a minimal amount for safety.
+ *
+ * We also are going to need to modify the minimum of the tree root and
+ * any global roots we could touch.
*/
- num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
- btrfs_root_used(&fs_info->csum_root->root_item) +
- btrfs_root_used(&fs_info->tree_root->root_item);
-
- /*
- * We at a minimum are going to modify the csum root, the tree root, and
- * the extent root.
- */
- min_items = 3;
+ read_lock(&fs_info->global_root_lock);
+ rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree,
+ rb_node) {
+ if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID ||
+ root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID ||
+ root->root_key.objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
+ num_bytes += btrfs_root_used(&root->root_item);
+ min_items++;
+ }
+ }
+ read_unlock(&fs_info->global_root_lock);
/*
* But we also want to reserve enough space so we can do the fallback
@@ -412,6 +419,30 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
spin_unlock(&sinfo->lock);
}
+void btrfs_init_root_block_rsv(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ switch (root->root_key.objectid) {
+ case BTRFS_CSUM_TREE_OBJECTID:
+ case BTRFS_EXTENT_TREE_OBJECTID:
+ case BTRFS_FREE_SPACE_TREE_OBJECTID:
+ root->block_rsv = &fs_info->delayed_refs_rsv;
+ break;
+ case BTRFS_ROOT_TREE_OBJECTID:
+ case BTRFS_DEV_TREE_OBJECTID:
+ case BTRFS_QUOTA_TREE_OBJECTID:
+ root->block_rsv = &fs_info->global_block_rsv;
+ break;
+ case BTRFS_CHUNK_TREE_OBJECTID:
+ root->block_rsv = &fs_info->chunk_block_rsv;
+ break;
+ default:
+ root->block_rsv = NULL;
+ break;
+ }
+}
+
void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
{
struct btrfs_space_info *space_info;
@@ -426,22 +457,6 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
fs_info->delayed_block_rsv.space_info = space_info;
fs_info->delayed_refs_rsv.space_info = space_info;
- /*
- * Our various recovery options can leave us with NULL roots, so check
- * here and just bail before we go dereferencing NULLs everywhere.
- */
- if (!fs_info->extent_root || !fs_info->csum_root ||
- !fs_info->dev_root || !fs_info->chunk_root || !fs_info->tree_root)
- return;
-
- fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
- fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
- fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
- fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
- if (fs_info->quota_root)
- fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
- fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
-
btrfs_update_global_block_rsv(fs_info);
}
@@ -467,8 +482,9 @@ static struct btrfs_block_rsv *get_block_rsv(
struct btrfs_block_rsv *block_rsv = NULL;
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
- (root == fs_info->csum_root && trans->adding_csums) ||
- (root == fs_info->uuid_root))
+ (root == fs_info->uuid_root) ||
+ (trans->adding_csums &&
+ root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID))
block_rsv = trans->block_rsv;
if (!block_rsv)
@@ -523,7 +539,7 @@ again:
block_rsv->type, ret);
}
try_reserve:
- ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize,
BTRFS_RESERVE_NO_FLUSH);
if (!ret)
return block_rsv;
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 0b6ae5302837..3b67ff08d434 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -50,6 +50,7 @@ struct btrfs_block_rsv {
};
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+void btrfs_init_root_block_rsv(struct btrfs_root *root);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
unsigned short type);
void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
@@ -57,11 +58,11 @@ void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
unsigned short type);
void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
-int btrfs_block_rsv_add(struct btrfs_root *root,
+int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
-int btrfs_block_rsv_refill(struct btrfs_root *root,
+int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 min_reserved,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ab2a4a52e0bb..33811e896623 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -14,6 +14,13 @@
#include "delayed-inode.h"
/*
+ * Since we search a directory based on f_pos (struct dir_context::pos) we have
+ * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so
+ * everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()).
+ */
+#define BTRFS_DIR_START_INDEX 2
+
+/*
* ordered_data_close is set by truncate when a file that used
* to have good data has been truncated to zero. When it is set
* the btrfs file release call will add this inode to the
@@ -138,19 +145,11 @@ struct btrfs_inode {
/* a local copy of root's last_log_commit */
int last_log_commit;
- union {
- /*
- * Total number of bytes pending delalloc, used by stat to
- * calculate the real block usage of the file. This is used
- * only for files.
- */
- u64 delalloc_bytes;
- /*
- * The offset of the last dir item key that was logged.
- * This is used only for directories.
- */
- u64 last_dir_item_offset;
- };
+ /*
+ * Total number of bytes pending delalloc, used by stat to calculate the
+ * real block usage of the file. This is used only for files.
+ */
+ u64 delalloc_bytes;
union {
/*
@@ -181,8 +180,9 @@ struct btrfs_inode {
u64 disk_i_size;
/*
- * if this is a directory then index_cnt is the counter for the index
- * number for new files that are created
+ * If this is a directory then index_cnt is the counter for the index
+ * number for new files that are created. For an empty directory, this
+ * must be initialized to BTRFS_DIR_START_INDEX.
*/
u64 index_cnt;
@@ -341,6 +341,36 @@ static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
spin_unlock(&inode->lock);
}
+/*
+ * Should be called while holding the inode's VFS lock in exclusive mode or in a
+ * context where no one else can access the inode concurrently (during inode
+ * creation or when loading an inode from disk).
+ */
+static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode)
+{
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ /*
+ * The inode may have been part of a reflink operation in the last
+ * transaction that modified it, and then a fsync has reset the
+ * last_reflink_trans to avoid subsequent fsyncs in the same
+ * transaction to do unnecessary work. So update last_reflink_trans
+ * to the last_trans value (we have to be pessimistic and assume a
+ * reflink happened).
+ *
+ * The ->last_trans is protected by the inode's spinlock and we can
+ * have a concurrent ordered extent completion update it. Also set
+ * last_reflink_trans to ->last_trans only if the former is less than
+ * the later, because we can be called in a context where
+ * last_reflink_trans was set to the current transaction generation
+ * while ->last_trans was not yet updated in the current transaction,
+ * and therefore has a lower value.
+ */
+ spin_lock(&inode->lock);
+ if (inode->last_reflink_trans < inode->last_trans)
+ inode->last_reflink_trans = inode->last_trans;
+ spin_unlock(&inode->lock);
+}
+
static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
{
bool ret = false;
@@ -354,30 +384,16 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
return ret;
}
-struct btrfs_dio_private {
- struct inode *inode;
-
- /*
- * Since DIO can use anonymous page, we cannot use page_offset() to
- * grab the file offset, thus need a dedicated member for file offset.
- */
- u64 file_offset;
- u64 disk_bytenr;
- /* Used for bio::bi_size */
- u32 bytes;
-
- /*
- * References to this structure. There is one reference per in-flight
- * bio plus one while we're still setting up.
- */
- refcount_t refs;
-
- /* dio_bio came from fs/direct-io.c */
- struct bio *dio_bio;
-
- /* Array of checksums */
- u8 csums[];
-};
+/*
+ * Check if the inode has flags compatible with compression
+ */
+static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode)
+{
+ if (inode->flags & BTRFS_INODE_NODATACOW ||
+ inode->flags & BTRFS_INODE_NODATASUM)
+ return false;
+ return true;
+}
/*
* btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 7e9f90fa0388..5d20137b7b67 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -78,7 +78,6 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/mutex.h>
-#include <linux/genhd.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/string.h>
@@ -1553,21 +1552,18 @@ static int btrfsic_read_block(struct btrfsic_state *state,
return -ENOMEM;
block_ctx->datav = block_ctx->mem_to_free;
block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
- for (i = 0; i < num_pages; i++) {
- block_ctx->pagev[i] = alloc_page(GFP_NOFS);
- if (!block_ctx->pagev[i])
- return -1;
- }
+ ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev);
+ if (ret)
+ return ret;
dev_bytenr = block_ctx->dev_bytenr;
for (i = 0; i < num_pages;) {
struct bio *bio;
unsigned int j;
- bio = btrfs_bio_alloc(num_pages - i);
- bio_set_dev(bio, block_ctx->dev->bdev);
+ bio = bio_alloc(block_ctx->dev->bdev, num_pages - i,
+ REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = dev_bytenr >> 9;
- bio->bi_opf = REQ_OP_READ;
for (j = i; j < num_pages; j++) {
ret = bio_add_page(bio, block_ctx->pagev[j],
@@ -2034,7 +2030,7 @@ continue_loop:
static void btrfsic_bio_end_io(struct bio *bp)
{
- struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
+ struct btrfsic_block *block = bp->bi_private;
int iodone_w_error;
/* mutex is not held! This is not save if IO is not yet completed
@@ -2636,100 +2632,93 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev)
&btrfsic_dev_state_hashtable);
}
-static void __btrfsic_submit_bio(struct bio *bio)
+static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
{
- struct btrfsic_dev_state *dev_state;
+ unsigned int segs = bio_segments(bio);
+ u64 dev_bytenr = 512 * bio->bi_iter.bi_sector;
+ u64 cur_bytenr = dev_bytenr;
+ struct bvec_iter iter;
+ struct bio_vec bvec;
+ char **mapped_datav;
+ int bio_is_patched = 0;
+ int i = 0;
+
+ if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ pr_info(
+"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
+ bio_op(bio), bio->bi_opf, segs,
+ bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
- if (!btrfsic_is_initialized)
+ mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS);
+ if (!mapped_datav)
return;
- mutex_lock(&btrfsic_mutex);
- /* since btrfsic_submit_bio() is also called before
- * btrfsic_mount(), this might return NULL */
- dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
- if (NULL != dev_state &&
- (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
- int i = 0;
- u64 dev_bytenr;
- u64 cur_bytenr;
- struct bio_vec bvec;
- struct bvec_iter iter;
- int bio_is_patched;
- char **mapped_datav;
- unsigned int segs = bio_segments(bio);
-
- dev_bytenr = 512 * bio->bi_iter.bi_sector;
- bio_is_patched = 0;
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
- bio_op(bio), bio->bi_opf, segs,
- bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
-
- mapped_datav = kmalloc_array(segs,
- sizeof(*mapped_datav), GFP_NOFS);
- if (!mapped_datav)
- goto leave;
- cur_bytenr = dev_bytenr;
-
- bio_for_each_segment(bvec, bio, iter) {
- BUG_ON(bvec.bv_len != PAGE_SIZE);
- mapped_datav[i] = page_address(bvec.bv_page);
- i++;
-
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
- pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
- i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
- cur_bytenr += bvec.bv_len;
- }
- btrfsic_process_written_block(dev_state, dev_bytenr,
- mapped_datav, segs,
- bio, &bio_is_patched,
- bio->bi_opf);
- kfree(mapped_datav);
- } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
- bio_op(bio), bio->bi_opf, bio->bi_bdev);
- if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
- if ((dev_state->state->print_mask &
- (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
- BTRFSIC_PRINT_MASK_VERBOSE)))
- pr_info(
-"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
- dev_state->bdev);
- } else {
- struct btrfsic_block *const block =
- &dev_state->dummy_block_for_bio_bh_flush;
+ bio_for_each_segment(bvec, bio, iter) {
+ BUG_ON(bvec.bv_len != PAGE_SIZE);
+ mapped_datav[i] = page_address(bvec.bv_page);
+ i++;
- block->is_iodone = 0;
- block->never_written = 0;
- block->iodone_w_error = 0;
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = bio->bi_opf;
- block->orig_bio_private = bio->bi_private;
- block->orig_bio_end_io = bio->bi_end_io;
- block->next_in_same_bio = NULL;
- bio->bi_private = block;
- bio->bi_end_io = btrfsic_bio_end_io;
- }
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
+ pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
+ i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
+ cur_bytenr += bvec.bv_len;
}
-leave:
- mutex_unlock(&btrfsic_mutex);
+
+ btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs,
+ bio, &bio_is_patched, bio->bi_opf);
+ kfree(mapped_datav);
}
-void btrfsic_submit_bio(struct bio *bio)
+static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
{
- __btrfsic_submit_bio(bio);
- submit_bio(bio);
+ if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
+ bio_op(bio), bio->bi_opf, bio->bi_bdev);
+
+ if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+ struct btrfsic_block *const block =
+ &dev_state->dummy_block_for_bio_bh_flush;
+
+ block->is_iodone = 0;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = bio->bi_opf;
+ block->orig_bio_private = bio->bi_private;
+ block->orig_bio_end_io = bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ } else if ((dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE))) {
+ pr_info(
+"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
+ dev_state->bdev);
+ }
}
-int btrfsic_submit_bio_wait(struct bio *bio)
+void btrfsic_check_bio(struct bio *bio)
{
- __btrfsic_submit_bio(bio);
- return submit_bio_wait(bio);
+ struct btrfsic_dev_state *dev_state;
+
+ if (!btrfsic_is_initialized)
+ return;
+
+ /*
+ * We can be called before btrfsic_mount, so there might not be a
+ * dev_state.
+ */
+ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
+ mutex_lock(&btrfsic_mutex);
+ if (dev_state) {
+ if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio))
+ btrfsic_check_write_bio(bio, dev_state);
+ else if (bio->bi_opf & REQ_PREFLUSH)
+ btrfsic_check_flush_bio(bio, dev_state);
+ }
+ mutex_unlock(&btrfsic_mutex);
}
int btrfsic_mount(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
index bcc730a06cb5..e4c8aed7996f 100644
--- a/fs/btrfs/check-integrity.h
+++ b/fs/btrfs/check-integrity.h
@@ -7,11 +7,9 @@
#define BTRFS_CHECK_INTEGRITY_H
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-void btrfsic_submit_bio(struct bio *bio);
-int btrfsic_submit_bio_wait(struct bio *bio);
+void btrfsic_check_bio(struct bio *bio);
#else
-#define btrfsic_submit_bio submit_bio
-#define btrfsic_submit_bio_wait submit_bio_wait
+static inline void btrfsic_check_bio(struct bio *bio) { }
#endif
int btrfsic_mount(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 32da97c3c19d..f4564f32f6d9 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -96,10 +96,10 @@ static int compression_compress_pages(int type, struct list_head *ws,
}
}
-static int compression_decompress_bio(int type, struct list_head *ws,
- struct compressed_bio *cb)
+static int compression_decompress_bio(struct list_head *ws,
+ struct compressed_bio *cb)
{
- switch (type) {
+ switch (cb->compress_type) {
case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb);
case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
@@ -157,7 +157,8 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
struct compressed_bio *cb = bio->bi_private;
u8 *cb_sum = cb->sums;
- if (!fs_info->csum_root || (inode->flags & BTRFS_INODE_NODATASUM))
+ if ((inode->flags & BTRFS_INODE_NODATASUM) ||
+ test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
return 0;
shash->tfm = fs_info->csum_shash;
@@ -218,7 +219,7 @@ static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *b
bi_size += bvec->bv_len;
if (bio->bi_status)
- cb->errors = 1;
+ cb->status = bio->bi_status;
ASSERT(bi_size && bi_size <= cb->compressed_len);
last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
@@ -233,7 +234,7 @@ static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *b
return last_io;
}
-static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio)
+static void finish_compressed_bio_read(struct compressed_bio *cb)
{
unsigned int index;
struct page *page;
@@ -246,19 +247,18 @@ static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bi
}
/* Do io completion on the original bio */
- if (cb->errors) {
- bio_io_error(cb->orig_bio);
+ if (cb->status != BLK_STS_OK) {
+ cb->orig_bio->bi_status = cb->status;
+ bio_endio(cb->orig_bio);
} else {
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
- ASSERT(bio);
- ASSERT(!bio->bi_status);
/*
* We have verified the checksum already, set page checked so
* the end_io handlers know about it
*/
- ASSERT(!bio_flagged(bio, BIO_CLONED));
+ ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED));
bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
u64 bvec_start = page_offset(bvec->bv_page) +
bvec->bv_offset;
@@ -307,7 +307,7 @@ static void end_compressed_bio_read(struct bio *bio)
* Some IO in this cb have failed, just skip checksum as there
* is no way it could be correct.
*/
- if (cb->errors == 1)
+ if (cb->status != BLK_STS_OK)
goto csum_failed;
inode = cb->inode;
@@ -323,8 +323,8 @@ static void end_compressed_bio_read(struct bio *bio)
csum_failed:
if (ret)
- cb->errors = 1;
- finish_compressed_bio_read(cb, bio);
+ cb->status = errno_to_blk_status(ret);
+ finish_compressed_bio_read(cb);
out:
bio_put(bio);
}
@@ -341,11 +341,12 @@ static noinline void end_compressed_writeback(struct inode *inode,
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct page *pages[16];
unsigned long nr_pages = end_index - index + 1;
+ const int errno = blk_status_to_errno(cb->status);
int i;
int ret;
- if (cb->errors)
- mapping_set_error(inode->i_mapping, -EIO);
+ if (errno)
+ mapping_set_error(inode->i_mapping, errno);
while (nr_pages > 0) {
ret = find_get_pages_contig(inode->i_mapping, index,
@@ -357,7 +358,7 @@ static noinline void end_compressed_writeback(struct inode *inode,
continue;
}
for (i = 0; i < ret; i++) {
- if (cb->errors)
+ if (errno)
SetPageError(pages[i]);
btrfs_page_clamp_clear_writeback(fs_info, pages[i],
cb->start, cb->len);
@@ -380,9 +381,10 @@ static void finish_compressed_bio_write(struct compressed_bio *cb)
*/
btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
cb->start, cb->start + cb->len - 1,
- !cb->errors);
+ cb->status == BLK_STS_OK);
- end_compressed_writeback(inode, cb);
+ if (cb->writeback)
+ end_compressed_writeback(inode, cb);
/* Note, our inode could be gone now */
/*
@@ -423,7 +425,6 @@ out:
}
static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
- struct compressed_bio *cb,
struct bio *bio, int mirror_num)
{
blk_status_t ret;
@@ -505,7 +506,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
struct page **compressed_pages,
unsigned int nr_pages,
unsigned int write_flags,
- struct cgroup_subsys_state *blkcg_css)
+ struct cgroup_subsys_state *blkcg_css,
+ bool writeback)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = NULL;
@@ -523,16 +525,20 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
if (!cb)
return BLK_STS_RESOURCE;
refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
- cb->errors = 0;
+ cb->status = BLK_STS_OK;
cb->inode = &inode->vfs_inode;
cb->start = start;
cb->len = len;
cb->mirror_num = 0;
cb->compressed_pages = compressed_pages;
cb->compressed_len = compressed_len;
+ cb->writeback = writeback;
cb->orig_bio = NULL;
cb->nr_pages = nr_pages;
+ if (blkcg_css)
+ kthread_associate_blkcg(blkcg_css);
+
while (cur_disk_bytenr < disk_start + compressed_len) {
u64 offset = cur_disk_bytenr - disk_start;
unsigned int index = offset >> PAGE_SHIFT;
@@ -551,6 +557,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
bio = NULL;
goto finish_cb;
}
+ if (blkcg_css)
+ bio->bi_opf |= REQ_CGROUP_PUNT;
}
/*
* We should never reach next_stripe_start start as we will
@@ -590,12 +598,12 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
if (submit) {
if (!skip_sum) {
- ret = btrfs_csum_one_bio(inode, bio, start, 1);
+ ret = btrfs_csum_one_bio(inode, bio, start, true);
if (ret)
goto finish_cb;
}
- ret = submit_compressed_bio(fs_info, cb, bio, 0);
+ ret = submit_compressed_bio(fs_info, bio, 0);
if (ret)
goto finish_cb;
bio = NULL;
@@ -608,6 +616,9 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
return 0;
finish_cb:
+ if (blkcg_css)
+ kthread_associate_blkcg(NULL);
+
if (bio) {
bio->bi_status = ret;
bio_endio(bio);
@@ -790,15 +801,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* After the compressed pages are read, we copy the bytes into the
* bio we were passed and then call the bio end_io calls
*/
-blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map_tree *em_tree;
struct compressed_bio *cb;
unsigned int compressed_len;
- unsigned int nr_pages;
- unsigned int pg_index;
struct bio *comp_bio = NULL;
const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
u64 cur_disk_byte = disk_bytenr;
@@ -807,8 +816,9 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
u64 em_len;
u64 em_start;
struct extent_map *em;
- blk_status_t ret = BLK_STS_RESOURCE;
- int faili = 0;
+ blk_status_t ret;
+ int ret2;
+ int i;
u8 *sums;
em_tree = &BTRFS_I(inode)->extent_tree;
@@ -820,17 +830,21 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
read_unlock(&em_tree->lock);
- if (!em)
- return BLK_STS_IOERR;
+ if (!em) {
+ ret = BLK_STS_IOERR;
+ goto out;
+ }
ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
compressed_len = em->block_len;
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
- if (!cb)
+ if (!cb) {
+ ret = BLK_STS_RESOURCE;
goto out;
+ }
refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
- cb->errors = 0;
+ cb->status = BLK_STS_OK;
cb->inode = inode;
cb->mirror_num = mirror_num;
sums = cb->sums;
@@ -839,30 +853,26 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
em_len = em->len;
em_start = em->start;
- free_extent_map(em);
- em = NULL;
-
cb->len = bio->bi_iter.bi_size;
cb->compressed_len = compressed_len;
- cb->compress_type = extent_compress_type(bio_flags);
+ cb->compress_type = em->compress_type;
cb->orig_bio = bio;
- nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
- cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
- GFP_NOFS);
- if (!cb->compressed_pages)
- goto fail1;
-
- for (pg_index = 0; pg_index < nr_pages; pg_index++) {
- cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
- if (!cb->compressed_pages[pg_index]) {
- faili = pg_index - 1;
- ret = BLK_STS_RESOURCE;
- goto fail2;
- }
+ free_extent_map(em);
+ em = NULL;
+
+ cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
+ cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!cb->compressed_pages) {
+ ret = BLK_STS_RESOURCE;
+ goto fail;
+ }
+
+ ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages);
+ if (ret2) {
+ ret = BLK_STS_RESOURCE;
+ goto fail;
}
- faili = nr_pages - 1;
- cb->nr_pages = nr_pages;
add_ra_bio_pages(inode, em_start + em_len, cb);
@@ -931,26 +941,29 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
fs_info->sectorsize);
sums += fs_info->csum_size * nr_sectors;
- ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num);
+ ret = submit_compressed_bio(fs_info, comp_bio, mirror_num);
if (ret)
goto finish_cb;
comp_bio = NULL;
}
}
- return 0;
+ return;
-fail2:
- while (faili >= 0) {
- __free_page(cb->compressed_pages[faili]);
- faili--;
+fail:
+ if (cb->compressed_pages) {
+ for (i = 0; i < cb->nr_pages; i++) {
+ if (cb->compressed_pages[i])
+ __free_page(cb->compressed_pages[i]);
+ }
}
kfree(cb->compressed_pages);
-fail1:
kfree(cb);
out:
free_extent_map(em);
- return ret;
+ bio->bi_status = ret;
+ bio_endio(bio);
+ return;
finish_cb:
if (comp_bio) {
comp_bio->bi_status = ret;
@@ -958,7 +971,7 @@ finish_cb:
}
/* All bytes of @cb is submitted, endio will free @cb */
if (cur_disk_byte == disk_bytenr + compressed_len)
- return ret;
+ return;
wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
(disk_bytenr + compressed_len - cur_disk_byte) >>
@@ -969,8 +982,7 @@ finish_cb:
*/
ASSERT(refcount_read(&cb->pending_sectors));
/* Now we are the only one referring @cb, can finish it safely. */
- finish_compressed_bio_read(cb, NULL);
- return ret;
+ finish_compressed_bio_read(cb);
}
/*
@@ -1359,7 +1371,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
int type = cb->compress_type;
workspace = get_workspace(type, 0);
- ret = compression_decompress_bio(type, workspace, cb);
+ ret = compression_decompress_bio(workspace, cb);
put_workspace(type, workspace);
return ret;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 56eef0821e3e..2707404389a5 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -22,6 +22,8 @@ struct btrfs_inode;
/* Maximum length of compressed data stored on disk */
#define BTRFS_MAX_COMPRESSED (SZ_128K)
+static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
+
/* Maximum size of data before compression */
#define BTRFS_MAX_UNCOMPRESSED (SZ_128K)
@@ -52,8 +54,11 @@ struct compressed_bio {
/* The compression algorithm for this bio */
u8 compress_type;
+ /* Whether this is a write for writeback. */
+ bool writeback;
+
/* IO errors */
- u8 errors;
+ blk_status_t status;
int mirror_num;
/* for reads, this is the bio we are copying the data into */
@@ -95,9 +100,10 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
struct page **compressed_pages,
unsigned int nr_pages,
unsigned int write_flags,
- struct cgroup_subsys_state *blkcg_css);
-blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags);
+ struct cgroup_subsys_state *blkcg_css,
+ bool writeback);
+void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num);
unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c3983bdaf4b8..6e556031a8f3 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -16,6 +16,7 @@
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"
+#include "tree-checker.h"
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
@@ -342,7 +343,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
int level = btrfs_header_level(buf);
ret = btrfs_set_disk_extent_flags(trans, buf,
- new_flags, level, 0);
+ new_flags, level);
if (ret)
return ret;
}
@@ -463,8 +464,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
BUG_ON(ret < 0);
rcu_assign_pointer(root->node, cow);
- btrfs_free_tree_block(trans, root, buf, parent_start,
- last_ref);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
+ parent_start, last_ref);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
} else {
@@ -485,8 +486,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
return ret;
}
}
- btrfs_free_tree_block(trans, root, buf, parent_start,
- last_ref);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
+ parent_start, last_ref);
}
if (unlock_orig)
btrfs_tree_unlock(buf);
@@ -726,21 +727,23 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
}
/*
- * search for key in the extent_buffer. The items start at offset p,
- * and they are item_size apart.
+ * Search for a key in the given extent_buffer.
*
- * the slot in the array is returned via slot, and it points to
- * the place where you would insert key if it is not found in
- * the array.
+ * The lower boundary for the search is specified by the slot number @low. Use a
+ * value of 0 to search over the whole extent buffer.
*
- * Slot may point to total number of items if the key is bigger than
- * all of the keys
+ * The slot in the extent buffer is returned via @slot. If the key exists in the
+ * extent buffer, then @slot will point to the slot where the key is, otherwise
+ * it points to the slot where you would insert the key.
+ *
+ * Slot may point to the total number of items (i.e. one position beyond the last
+ * key) if the key is bigger than the last key in the extent buffer.
*/
-static noinline int generic_bin_search(struct extent_buffer *eb,
- unsigned long p, int item_size,
+static noinline int generic_bin_search(struct extent_buffer *eb, int low,
const struct btrfs_key *key, int *slot)
{
- int low = 0;
+ unsigned long p;
+ int item_size;
int high = btrfs_header_nritems(eb);
int ret;
const int key_size = sizeof(struct btrfs_disk_key);
@@ -753,6 +756,14 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
return -EINVAL;
}
+ if (btrfs_header_level(eb) == 0) {
+ p = offsetof(struct btrfs_leaf, items);
+ item_size = sizeof(struct btrfs_item);
+ } else {
+ p = offsetof(struct btrfs_node, ptrs);
+ item_size = sizeof(struct btrfs_key_ptr);
+ }
+
while (low < high) {
unsigned long oip;
unsigned long offset;
@@ -791,20 +802,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
}
/*
- * simple bin_search frontend that does the right thing for
- * leaves vs nodes
+ * Simple binary search on an extent buffer. Works for both leaves and nodes, and
+ * always searches over the whole range of keys (slot 0 to slot 'nritems - 1').
*/
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
int *slot)
{
- if (btrfs_header_level(eb) == 0)
- return generic_bin_search(eb,
- offsetof(struct btrfs_leaf, items),
- sizeof(struct btrfs_item), key, slot);
- else
- return generic_bin_search(eb,
- offsetof(struct btrfs_node, ptrs),
- sizeof(struct btrfs_key_ptr), key, slot);
+ return generic_bin_search(eb, 0, key, slot);
}
static void root_add_used(struct btrfs_root *root, u32 size)
@@ -843,9 +847,11 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
btrfs_header_owner(parent),
btrfs_node_ptr_generation(parent, slot),
level - 1, &first_key);
- if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb))
+ return eb;
+ if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
- eb = ERR_PTR(-EIO);
+ return ERR_PTR(-EIO);
}
return eb;
@@ -927,7 +933,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
free_extent_buffer(mid);
root_sub_used(root, mid->len);
- btrfs_free_tree_block(trans, root, mid, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
/* once for the root ptr */
free_extent_buffer_stale(mid);
return 0;
@@ -986,7 +992,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(right);
del_ptr(root, path, level + 1, pslot + 1);
root_sub_used(root, right->len);
- btrfs_free_tree_block(trans, root, right, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), right,
+ 0, 1);
free_extent_buffer_stale(right);
right = NULL;
} else {
@@ -1031,7 +1038,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(mid);
del_ptr(root, path, level + 1, pslot);
root_sub_used(root, mid->len);
- btrfs_free_tree_block(trans, root, mid, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL;
} else {
@@ -1345,33 +1352,34 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
{
int i;
int skip_level = level;
- int no_skips = 0;
- struct extent_buffer *t;
+ bool check_skip = true;
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
if (!path->nodes[i])
break;
if (!path->locks[i])
break;
- if (!no_skips && path->slots[i] == 0) {
- skip_level = i + 1;
- continue;
- }
- if (!no_skips && path->keep_locks) {
- u32 nritems;
- t = path->nodes[i];
- nritems = btrfs_header_nritems(t);
- if (nritems < 1 || path->slots[i] >= nritems - 1) {
+
+ if (check_skip) {
+ if (path->slots[i] == 0) {
skip_level = i + 1;
continue;
}
+
+ if (path->keep_locks) {
+ u32 nritems;
+
+ nritems = btrfs_header_nritems(path->nodes[i]);
+ if (nritems < 1 || path->slots[i] >= nritems - 1) {
+ skip_level = i + 1;
+ continue;
+ }
+ }
}
- if (skip_level < i && i >= lowest_unlock)
- no_skips = 1;
- t = path->nodes[i];
if (i >= lowest_unlock && i > skip_level) {
- btrfs_tree_unlock_rw(t, path->locks[i]);
+ check_skip = false;
+ btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
path->locks[i] = 0;
if (write_lock_level &&
i > min_write_lock_level &&
@@ -1383,12 +1391,13 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
}
/*
- * helper function for btrfs_search_slot. The goal is to find a block
- * in cache without setting the path to blocking. If we find the block
- * we return zero and the path is unchanged.
+ * Helper function for btrfs_search_slot() and other functions that do a search
+ * on a btree. The goal is to find a tree block in the cache (the radix tree at
+ * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
+ * its pages from disk.
*
- * If we can't find the block, we set the path blocking and do some
- * reada. -EAGAIN is returned and the search must be repeated.
+ * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
+ * whole btree search, starting again from the current root node.
*/
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
@@ -1402,12 +1411,21 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
struct btrfs_key first_key;
int ret;
int parent_level;
+ bool unlock_up;
+ unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
blocknr = btrfs_node_blockptr(*eb_ret, slot);
gen = btrfs_node_ptr_generation(*eb_ret, slot);
parent_level = btrfs_header_level(*eb_ret);
btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
+ /*
+ * If we need to read an extent buffer from disk and we are holding locks
+ * on upper level nodes, we unlock all the upper nodes before reading the
+ * extent buffer, and then return -EAGAIN to the caller as it needs to
+ * restart the search. We don't release the lock on the current level
+ * because we need to walk this node to figure out which blocks to read.
+ */
tmp = find_extent_buffer(fs_info, blocknr);
if (tmp) {
if (p->reada == READA_FORWARD_ALWAYS)
@@ -1429,47 +1447,61 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
return 0;
}
+ if (unlock_up)
+ btrfs_unlock_up_safe(p, level + 1);
+
/* now we're allowed to do a blocking uptodate check */
- ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
- if (!ret) {
- *eb_ret = tmp;
- return 0;
+ ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key);
+ if (ret) {
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ return -EIO;
}
- free_extent_buffer(tmp);
- btrfs_release_path(p);
- return -EIO;
+ if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ return -EUCLEAN;
+ }
+
+ if (unlock_up)
+ ret = -EAGAIN;
+
+ goto out;
}
- /*
- * reduce lock contention at high levels
- * of the btree by dropping locks before
- * we read. Don't release the lock on the current
- * level because we need to walk this node to figure
- * out which blocks to read.
- */
- btrfs_unlock_up_safe(p, level + 1);
+ if (unlock_up) {
+ btrfs_unlock_up_safe(p, level + 1);
+ ret = -EAGAIN;
+ } else {
+ ret = 0;
+ }
if (p->reada != READA_NONE)
reada_for_search(fs_info, p, level, slot, key->objectid);
- ret = -EAGAIN;
tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
gen, parent_level - 1, &first_key);
- if (!IS_ERR(tmp)) {
- /*
- * If the read above didn't mark this buffer up to date,
- * it will never end up being up to date. Set ret to EIO now
- * and give up so that our caller doesn't loop forever
- * on our EAGAINs.
- */
- if (!extent_buffer_uptodate(tmp))
- ret = -EIO;
- free_extent_buffer(tmp);
+ if (IS_ERR(tmp)) {
+ btrfs_release_path(p);
+ return PTR_ERR(tmp);
+ }
+ /*
+ * If the read above didn't mark this buffer up to date,
+ * it will never end up being up to date. Set ret to EIO now
+ * and give up so that our caller doesn't loop forever
+ * on our EAGAINs.
+ */
+ if (!extent_buffer_uptodate(tmp))
+ ret = -EIO;
+
+out:
+ if (ret == 0) {
+ *eb_ret = tmp;
} else {
- ret = PTR_ERR(tmp);
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
}
- btrfs_release_path(p);
return ret;
}
@@ -1567,35 +1599,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
struct btrfs_path *p,
int write_lock_level)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
- int root_lock;
+ int root_lock = 0;
int level = 0;
- /* We try very hard to do read locks on the root */
- root_lock = BTRFS_READ_LOCK;
-
if (p->search_commit_root) {
- /*
- * The commit roots are read only so we always do read locks,
- * and we always must hold the commit_root_sem when doing
- * searches on them, the only exception is send where we don't
- * want to block transaction commits for a long time, so
- * we need to clone the commit root in order to avoid races
- * with transaction commits that create a snapshot of one of
- * the roots used by a send operation.
- */
- if (p->need_commit_sem) {
- down_read(&fs_info->commit_root_sem);
- b = btrfs_clone_extent_buffer(root->commit_root);
- up_read(&fs_info->commit_root_sem);
- if (!b)
- return ERR_PTR(-ENOMEM);
-
- } else {
- b = root->commit_root;
- atomic_inc(&b->refs);
- }
+ b = root->commit_root;
+ atomic_inc(&b->refs);
level = btrfs_header_level(b);
/*
* Ensure that all callers have set skip_locking when
@@ -1612,6 +1622,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
goto out;
}
+ /* We try very hard to do read locks on the root */
+ root_lock = BTRFS_READ_LOCK;
+
/*
* If the level is set to maximum, we can skip trying to get the read
* lock.
@@ -1638,6 +1651,17 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
level = btrfs_header_level(b);
out:
+ /*
+ * The root may have failed to write out at some point, and thus is no
+ * longer valid, return an error in this case.
+ */
+ if (!extent_buffer_uptodate(b)) {
+ if (root_lock)
+ btrfs_tree_unlock_rw(b, root_lock);
+ free_extent_buffer(b);
+ return ERR_PTR(-EIO);
+ }
+
p->nodes[level] = b;
if (!p->skip_locking)
p->locks[level] = root_lock;
@@ -1647,6 +1671,191 @@ out:
return b;
}
+/*
+ * Replace the extent buffer at the lowest level of the path with a cloned
+ * version. The purpose is to be able to use it safely, after releasing the
+ * commit root semaphore, even if relocation is happening in parallel, the
+ * transaction used for relocation is committed and the extent buffer is
+ * reallocated in the next transaction.
+ *
+ * This is used in a context where the caller does not prevent transaction
+ * commits from happening, either by holding a transaction handle or holding
+ * some lock, while it's doing searches through a commit root.
+ * At the moment it's only used for send operations.
+ */
+static int finish_need_commit_sem_search(struct btrfs_path *path)
+{
+ const int i = path->lowest_level;
+ const int slot = path->slots[i];
+ struct extent_buffer *lowest = path->nodes[i];
+ struct extent_buffer *clone;
+
+ ASSERT(path->need_commit_sem);
+
+ if (!lowest)
+ return 0;
+
+ lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
+
+ clone = btrfs_clone_extent_buffer(lowest);
+ if (!clone)
+ return -ENOMEM;
+
+ btrfs_release_path(path);
+ path->nodes[i] = clone;
+ path->slots[i] = slot;
+
+ return 0;
+}
+
+static inline int search_for_key_slot(struct extent_buffer *eb,
+ int search_low_slot,
+ const struct btrfs_key *key,
+ int prev_cmp,
+ int *slot)
+{
+ /*
+ * If a previous call to btrfs_bin_search() on a parent node returned an
+ * exact match (prev_cmp == 0), we can safely assume the target key will
+ * always be at slot 0 on lower levels, since each key pointer
+ * (struct btrfs_key_ptr) refers to the lowest key accessible from the
+ * subtree it points to. Thus we can skip searching lower levels.
+ */
+ if (prev_cmp == 0) {
+ *slot = 0;
+ return 0;
+ }
+
+ return generic_bin_search(eb, search_low_slot, key, slot);
+}
+
+static int search_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const struct btrfs_key *key,
+ struct btrfs_path *path,
+ int ins_len,
+ int prev_cmp)
+{
+ struct extent_buffer *leaf = path->nodes[0];
+ int leaf_free_space = -1;
+ int search_low_slot = 0;
+ int ret;
+ bool do_bin_search = true;
+
+ /*
+ * If we are doing an insertion, the leaf has enough free space and the
+ * destination slot for the key is not slot 0, then we can unlock our
+ * write lock on the parent, and any other upper nodes, before doing the
+ * binary search on the leaf (with search_for_key_slot()), allowing other
+ * tasks to lock the parent and any other upper nodes.
+ */
+ if (ins_len > 0) {
+ /*
+ * Cache the leaf free space, since we will need it later and it
+ * will not change until then.
+ */
+ leaf_free_space = btrfs_leaf_free_space(leaf);
+
+ /*
+ * !path->locks[1] means we have a single node tree, the leaf is
+ * the root of the tree.
+ */
+ if (path->locks[1] && leaf_free_space >= ins_len) {
+ struct btrfs_disk_key first_key;
+
+ ASSERT(btrfs_header_nritems(leaf) > 0);
+ btrfs_item_key(leaf, &first_key, 0);
+
+ /*
+ * Doing the extra comparison with the first key is cheap,
+ * taking into account that the first key is very likely
+ * already in a cache line because it immediately follows
+ * the extent buffer's header and we have recently accessed
+ * the header's level field.
+ */
+ ret = comp_keys(&first_key, key);
+ if (ret < 0) {
+ /*
+ * The first key is smaller than the key we want
+ * to insert, so we are safe to unlock all upper
+ * nodes and we have to do the binary search.
+ *
+ * We do use btrfs_unlock_up_safe() and not
+ * unlock_up() because the later does not unlock
+ * nodes with a slot of 0 - we can safely unlock
+ * any node even if its slot is 0 since in this
+ * case the key does not end up at slot 0 of the
+ * leaf and there's no need to split the leaf.
+ */
+ btrfs_unlock_up_safe(path, 1);
+ search_low_slot = 1;
+ } else {
+ /*
+ * The first key is >= then the key we want to
+ * insert, so we can skip the binary search as
+ * the target key will be at slot 0.
+ *
+ * We can not unlock upper nodes when the key is
+ * less than the first key, because we will need
+ * to update the key at slot 0 of the parent node
+ * and possibly of other upper nodes too.
+ * If the key matches the first key, then we can
+ * unlock all the upper nodes, using
+ * btrfs_unlock_up_safe() instead of unlock_up()
+ * as stated above.
+ */
+ if (ret == 0)
+ btrfs_unlock_up_safe(path, 1);
+ /*
+ * ret is already 0 or 1, matching the result of
+ * a btrfs_bin_search() call, so there is no need
+ * to adjust it.
+ */
+ do_bin_search = false;
+ path->slots[0] = 0;
+ }
+ }
+ }
+
+ if (do_bin_search) {
+ ret = search_for_key_slot(leaf, search_low_slot, key,
+ prev_cmp, &path->slots[0]);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (ins_len > 0) {
+ /*
+ * Item key already exists. In this case, if we are allowed to
+ * insert the item (for example, in dir_item case, item key
+ * collision is allowed), it will be merged with the original
+ * item. Only the item size grows, no new btrfs item will be
+ * added. If search_for_extension is not set, ins_len already
+ * accounts the size btrfs_item, deduct it here so leaf space
+ * check will be correct.
+ */
+ if (ret == 0 && !path->search_for_extension) {
+ ASSERT(ins_len >= sizeof(struct btrfs_item));
+ ins_len -= sizeof(struct btrfs_item);
+ }
+
+ ASSERT(leaf_free_space >= 0);
+
+ if (leaf_free_space < ins_len) {
+ int err;
+
+ err = split_leaf(trans, root, key, path, ins_len,
+ (ret == 0));
+ ASSERT(err <= 0);
+ if (WARN_ON(err > 0))
+ err = -EUCLEAN;
+ if (err)
+ ret = err;
+ }
+ }
+
+ return ret;
+}
/*
* btrfs_search_slot - look for a key in a tree and perform necessary
@@ -1683,6 +1892,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, struct btrfs_path *p,
int ins_len, int cow)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
int slot;
int ret;
@@ -1724,6 +1934,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
min_write_lock_level = write_lock_level;
+ if (p->need_commit_sem) {
+ ASSERT(p->search_commit_root);
+ down_read(&fs_info->commit_root_sem);
+ }
+
again:
prev_cmp = -1;
b = btrfs_search_slot_get_root(root, p, write_lock_level);
@@ -1777,10 +1992,6 @@ again:
}
cow_done:
p->nodes[level] = b;
- /*
- * Leave path with blocking locks to avoid massive
- * lock context switch, this is made on purpose.
- */
/*
* we have a lock on b and as long as we aren't changing
@@ -1802,62 +2013,22 @@ cow_done:
}
}
- /*
- * If btrfs_bin_search returns an exact match (prev_cmp == 0)
- * we can safely assume the target key will always be in slot 0
- * on lower levels due to the invariants BTRFS' btree provides,
- * namely that a btrfs_key_ptr entry always points to the
- * lowest key in the child node, thus we can skip searching
- * lower levels
- */
- if (prev_cmp == 0) {
- slot = 0;
- ret = 0;
- } else {
- ret = btrfs_bin_search(b, key, &slot);
- prev_cmp = ret;
- if (ret < 0)
- goto done;
- }
-
if (level == 0) {
- p->slots[level] = slot;
- /*
- * Item key already exists. In this case, if we are
- * allowed to insert the item (for example, in dir_item
- * case, item key collision is allowed), it will be
- * merged with the original item. Only the item size
- * grows, no new btrfs item will be added. If
- * search_for_extension is not set, ins_len already
- * accounts the size btrfs_item, deduct it here so leaf
- * space check will be correct.
- */
- if (ret == 0 && ins_len > 0 && !p->search_for_extension) {
- ASSERT(ins_len >= sizeof(struct btrfs_item));
- ins_len -= sizeof(struct btrfs_item);
- }
- if (ins_len > 0 &&
- btrfs_leaf_free_space(b) < ins_len) {
- if (write_lock_level < 1) {
- write_lock_level = 1;
- btrfs_release_path(p);
- goto again;
- }
-
- err = split_leaf(trans, root, key,
- p, ins_len, ret == 0);
+ if (ins_len > 0)
+ ASSERT(write_lock_level >= 1);
- BUG_ON(err > 0);
- if (err) {
- ret = err;
- goto done;
- }
- }
+ ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
if (!p->search_for_split)
unlock_up(p, level, lowest_unlock,
min_write_lock_level, NULL);
goto done;
}
+
+ ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
+ if (ret < 0)
+ goto done;
+ prev_cmp = ret;
+
if (ret && slot > 0) {
dec = 1;
slot--;
@@ -1918,6 +2089,16 @@ cow_done:
done:
if (ret < 0 && !p->skip_release_on_error)
btrfs_release_path(p);
+
+ if (p->need_commit_sem) {
+ int ret2;
+
+ ret2 = finish_need_commit_sem_search(p);
+ up_read(&fs_info->commit_root_sem);
+ if (ret2)
+ ret = ret2;
+ }
+
return ret;
}
ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
@@ -2123,6 +2304,43 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
return ret;
}
+/**
+ * Search for a valid slot for the given path.
+ *
+ * @root: The root node of the tree.
+ * @key: Will contain a valid item if found.
+ * @path: The starting point to validate the slot.
+ *
+ * Return: 0 if the item is valid
+ * 1 if not found
+ * <0 if error.
+ */
+int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ while (1) {
+ int ret;
+ const int slot = path->slots[0];
+ const struct extent_buffer *leaf = path->nodes[0];
+
+ /* This is where we start walking the path. */
+ if (slot >= btrfs_header_nritems(leaf)) {
+ /*
+ * If we've reached the last slot in this leaf we need
+ * to go to the next leaf and reset the path.
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ return ret;
+ continue;
+ }
+ /* Store the found, valid item in @key. */
+ btrfs_item_key_to_cpu(leaf, key, slot);
+ break;
+ }
+ return 0;
+}
+
/*
* adjust the pointers going up the tree, starting at level
* making sure the right key of each node is points to 'key'.
@@ -2615,19 +2833,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
*/
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
{
- struct btrfs_item *start_item;
- struct btrfs_item *end_item;
int data_len;
int nritems = btrfs_header_nritems(l);
int end = min(nritems, start + nr) - 1;
if (!nr)
return 0;
- start_item = btrfs_item_nr(start);
- end_item = btrfs_item_nr(end);
- data_len = btrfs_item_offset(l, start_item) +
- btrfs_item_size(l, start_item);
- data_len = data_len - btrfs_item_offset(l, end_item);
+ data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
+ data_len = data_len - btrfs_item_offset(l, end);
data_len += sizeof(struct btrfs_item) * nr;
WARN_ON(data_len < 0);
return data_len;
@@ -2674,7 +2887,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
u32 i;
int push_space = 0;
int push_items = 0;
- struct btrfs_item *item;
u32 nr;
u32 right_nritems;
u32 data_end;
@@ -2691,8 +2903,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
slot = path->slots[1];
i = left_nritems - 1;
while (i >= nr) {
- item = btrfs_item_nr(i);
-
if (!empty && push_items > 0) {
if (path->slots[0] > i)
break;
@@ -2707,12 +2917,13 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
if (path->slots[0] == i)
push_space += data_size;
- this_item_size = btrfs_item_size(left, item);
- if (this_item_size + sizeof(*item) + push_space > free_space)
+ this_item_size = btrfs_item_size(left, i);
+ if (this_item_size + sizeof(struct btrfs_item) +
+ push_space > free_space)
break;
push_items++;
- push_space += this_item_size + sizeof(*item);
+ push_space += this_item_size + sizeof(struct btrfs_item);
if (i == 0)
break;
i--;
@@ -2726,7 +2937,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
/* push left to right */
right_nritems = btrfs_header_nritems(right);
- push_space = btrfs_item_end_nr(left, left_nritems - push_items);
+ push_space = btrfs_item_data_end(left, left_nritems - push_items);
push_space -= leaf_data_end(left);
/* make room in the right data area */
@@ -2757,9 +2968,8 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
- item = btrfs_item_nr(i);
- push_space -= btrfs_token_item_size(&token, item);
- btrfs_set_token_item_offset(&token, item, push_space);
+ push_space -= btrfs_token_item_size(&token, i);
+ btrfs_set_token_item_offset(&token, i, push_space);
}
left_nritems -= push_items;
@@ -2844,16 +3054,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (free_space < data_size)
goto out_unlock;
- /* cow and double check */
ret = btrfs_cow_block(trans, root, right, upper,
slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
goto out_unlock;
- free_space = btrfs_leaf_free_space(right);
- if (free_space < data_size)
- goto out_unlock;
-
left_nritems = btrfs_header_nritems(left);
if (left_nritems == 0)
goto out_unlock;
@@ -2904,7 +3109,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
int i;
int push_space = 0;
int push_items = 0;
- struct btrfs_item *item;
u32 old_left_nritems;
u32 nr;
int ret = 0;
@@ -2918,8 +3122,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
nr = min(right_nritems - 1, max_slot);
for (i = 0; i < nr; i++) {
- item = btrfs_item_nr(i);
-
if (!empty && push_items > 0) {
if (path->slots[0] < i)
break;
@@ -2934,12 +3136,13 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
if (path->slots[0] == i)
push_space += data_size;
- this_item_size = btrfs_item_size(right, item);
- if (this_item_size + sizeof(*item) + push_space > free_space)
+ this_item_size = btrfs_item_size(right, i);
+ if (this_item_size + sizeof(struct btrfs_item) + push_space >
+ free_space)
break;
push_items++;
- push_space += this_item_size + sizeof(*item);
+ push_space += this_item_size + sizeof(struct btrfs_item);
}
if (push_items == 0) {
@@ -2955,25 +3158,23 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
push_items * sizeof(struct btrfs_item));
push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
- btrfs_item_offset_nr(right, push_items - 1);
+ btrfs_item_offset(right, push_items - 1);
copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(left) - push_space,
BTRFS_LEAF_DATA_OFFSET +
- btrfs_item_offset_nr(right, push_items - 1),
+ btrfs_item_offset(right, push_items - 1),
push_space);
old_left_nritems = btrfs_header_nritems(left);
BUG_ON(old_left_nritems <= 0);
btrfs_init_map_token(&token, left);
- old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
+ old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
-
- ioff = btrfs_token_item_offset(&token, item);
- btrfs_set_token_item_offset(&token, item,
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i,
ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
}
btrfs_set_header_nritems(left, old_left_nritems + push_items);
@@ -2984,7 +3185,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
right_nritems);
if (push_items < right_nritems) {
- push_space = btrfs_item_offset_nr(right, push_items - 1) -
+ push_space = btrfs_item_offset(right, push_items - 1) -
leaf_data_end(right);
memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
@@ -3002,10 +3203,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
- item = btrfs_item_nr(i);
-
- push_space = push_space - btrfs_token_item_size(&token, item);
- btrfs_set_token_item_offset(&token, item, push_space);
+ push_space = push_space - btrfs_token_item_size(&token, i);
+ btrfs_set_token_item_offset(&token, i, push_space);
}
btrfs_mark_buffer_dirty(left);
@@ -3084,7 +3283,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- /* cow and double check */
ret = btrfs_cow_block(trans, root, left,
path->nodes[1], slot - 1, &left,
BTRFS_NESTING_LEFT_COW);
@@ -3095,12 +3293,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- free_space = btrfs_leaf_free_space(left);
- if (free_space < data_size) {
- ret = 1;
- goto out;
- }
-
if (check_sibling_keys(left, right)) {
ret = -EUCLEAN;
goto out;
@@ -3133,7 +3325,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
nritems = nritems - mid;
btrfs_set_header_nritems(right, nritems);
- data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
+ data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
btrfs_item_nr_offset(mid),
@@ -3144,15 +3336,14 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
data_copy_size, BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(l), data_copy_size);
- rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
+ rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
btrfs_init_map_token(&token, right);
for (i = 0; i < nritems; i++) {
- struct btrfs_item *item = btrfs_item_nr(i);
u32 ioff;
- ioff = btrfs_token_item_offset(&token, item);
- btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
}
btrfs_set_header_nritems(l, mid);
@@ -3268,7 +3459,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
l = path->nodes[0];
slot = path->slots[0];
- if (extend && data_size + btrfs_item_size_nr(l, slot) +
+ if (extend && data_size + btrfs_item_size(l, slot) +
sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
return -EOVERFLOW;
@@ -3437,7 +3628,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
if (btrfs_leaf_free_space(leaf) >= ins_len)
return 0;
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (key.type == BTRFS_EXTENT_DATA_KEY) {
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -3457,7 +3648,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
ret = -EAGAIN;
leaf = path->nodes[0];
/* if our item isn't there, return now */
- if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
+ if (item_size != btrfs_item_size(leaf, path->slots[0]))
goto err;
/* the leaf has changed, it now has room. return now */
@@ -3488,9 +3679,7 @@ static noinline int split_item(struct btrfs_path *path,
unsigned long split_offset)
{
struct extent_buffer *leaf;
- struct btrfs_item *item;
- struct btrfs_item *new_item;
- int slot;
+ int orig_slot, slot;
char *buf;
u32 nritems;
u32 item_size;
@@ -3500,9 +3689,9 @@ static noinline int split_item(struct btrfs_path *path,
leaf = path->nodes[0];
BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
- item = btrfs_item_nr(path->slots[0]);
- orig_offset = btrfs_item_offset(leaf, item);
- item_size = btrfs_item_size(leaf, item);
+ orig_slot = path->slots[0];
+ orig_offset = btrfs_item_offset(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
buf = kmalloc(item_size, GFP_NOFS);
if (!buf)
@@ -3523,14 +3712,12 @@ static noinline int split_item(struct btrfs_path *path,
btrfs_cpu_key_to_disk(&disk_key, new_key);
btrfs_set_item_key(leaf, &disk_key, slot);
- new_item = btrfs_item_nr(slot);
-
- btrfs_set_item_offset(leaf, new_item, orig_offset);
- btrfs_set_item_size(leaf, new_item, item_size - split_offset);
+ btrfs_set_item_offset(leaf, slot, orig_offset);
+ btrfs_set_item_size(leaf, slot, item_size - split_offset);
- btrfs_set_item_offset(leaf, item,
- orig_offset + item_size - split_offset);
- btrfs_set_item_size(leaf, item, split_offset);
+ btrfs_set_item_offset(leaf, orig_slot,
+ orig_offset + item_size - split_offset);
+ btrfs_set_item_size(leaf, orig_slot, split_offset);
btrfs_set_header_nritems(leaf, nritems + 1);
@@ -3591,7 +3778,6 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
{
int slot;
struct extent_buffer *leaf;
- struct btrfs_item *item;
u32 nritems;
unsigned int data_end;
unsigned int old_data_start;
@@ -3603,14 +3789,14 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
leaf = path->nodes[0];
slot = path->slots[0];
- old_size = btrfs_item_size_nr(leaf, slot);
+ old_size = btrfs_item_size(leaf, slot);
if (old_size == new_size)
return;
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(leaf);
- old_data_start = btrfs_item_offset_nr(leaf, slot);
+ old_data_start = btrfs_item_offset(leaf, slot);
size_diff = old_size - new_size;
@@ -3624,10 +3810,9 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(&token, item);
- btrfs_set_token_item_offset(&token, item, ioff + size_diff);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff + size_diff);
}
/* shift the data */
@@ -3670,8 +3855,7 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
fixup_low_keys(path, &disk_key, 1);
}
- item = btrfs_item_nr(slot);
- btrfs_set_item_size(leaf, item, new_size);
+ btrfs_set_item_size(leaf, slot, new_size);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
@@ -3687,7 +3871,6 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
{
int slot;
struct extent_buffer *leaf;
- struct btrfs_item *item;
u32 nritems;
unsigned int data_end;
unsigned int old_data;
@@ -3705,7 +3888,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
BUG();
}
slot = path->slots[0];
- old_data = btrfs_item_end_nr(leaf, slot);
+ old_data = btrfs_item_data_end(leaf, slot);
BUG_ON(slot < 0);
if (slot >= nritems) {
@@ -3722,10 +3905,9 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(&token, item);
- btrfs_set_token_item_offset(&token, item, ioff - data_size);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff - data_size);
}
/* shift the data */
@@ -3734,9 +3916,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
data_end, old_data - data_end);
data_end = old_data;
- old_size = btrfs_item_size_nr(leaf, slot);
- item = btrfs_item_nr(slot);
- btrfs_set_item_size(leaf, item, old_size + data_size);
+ old_size = btrfs_item_size(leaf, slot);
+ btrfs_set_item_size(leaf, slot, old_size + data_size);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
@@ -3758,7 +3939,6 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
const struct btrfs_item_batch *batch)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_item *item;
int i;
u32 nritems;
unsigned int data_end;
@@ -3795,7 +3975,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
btrfs_init_map_token(&token, leaf);
if (slot != nritems) {
- unsigned int old_data = btrfs_item_end_nr(leaf, slot);
+ unsigned int old_data = btrfs_item_data_end(leaf, slot);
if (old_data < data_end) {
btrfs_print_leaf(leaf);
@@ -3811,10 +3991,9 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(&token, item);
- btrfs_set_token_item_offset(&token, item,
- ioff - batch->total_data_size);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i,
+ ioff - batch->total_data_size);
}
/* shift the items */
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
@@ -3833,10 +4012,9 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
for (i = 0; i < batch->nr; i++) {
btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
btrfs_set_item_key(leaf, &disk_key, slot + i);
- item = btrfs_item_nr(slot + i);
data_end -= batch->data_sizes[i];
- btrfs_set_token_item_offset(&token, item, data_end);
- btrfs_set_token_item_size(&token, item, batch->data_sizes[i]);
+ btrfs_set_token_item_offset(&token, slot + i, data_end);
+ btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
}
btrfs_set_header_nritems(leaf, nritems + batch->nr);
@@ -3943,7 +4121,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
u32 item_size;
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
ret = setup_leaf_for_split(trans, root, path,
item_size + sizeof(struct btrfs_item));
if (ret)
@@ -4032,7 +4210,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used(root, leaf->len);
atomic_inc(&leaf->refs);
- btrfs_free_tree_block(trans, root, leaf, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
free_extent_buffer_stale(leaf);
}
/*
@@ -4044,25 +4222,22 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
- struct btrfs_item *item;
- u32 last_off;
- u32 dsize = 0;
int ret = 0;
int wret;
- int i;
u32 nritems;
leaf = path->nodes[0];
- last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
-
- for (i = 0; i < nr; i++)
- dsize += btrfs_item_size_nr(leaf, slot + i);
-
nritems = btrfs_header_nritems(leaf);
if (slot + nr != nritems) {
- int data_end = leaf_data_end(leaf);
+ const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
+ const int data_end = leaf_data_end(leaf);
struct btrfs_map_token token;
+ u32 dsize = 0;
+ int i;
+
+ for (i = 0; i < nr; i++)
+ dsize += btrfs_item_size(leaf, slot + i);
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
data_end + dsize,
@@ -4073,9 +4248,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
for (i = slot + nr; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(&token, item);
- btrfs_set_token_item_offset(&token, item, ioff + dsize);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff + dsize);
}
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
@@ -4103,24 +4277,50 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
fixup_low_keys(path, &disk_key, 1);
}
- /* delete the leaf if it is mostly empty */
+ /*
+ * Try to delete the leaf if it is mostly empty. We do this by
+ * trying to move all its items into its left and right neighbours.
+ * If we can't move all the items, then we don't delete it - it's
+ * not ideal, but future insertions might fill the leaf with more
+ * items, or items from other leaves might be moved later into our
+ * leaf due to deletions on those leaves.
+ */
if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
+ u32 min_push_space;
+
/* push_leaf_left fixes the path.
* make sure the path still points to our leaf
* for possible call to del_ptr below
*/
slot = path->slots[1];
atomic_inc(&leaf->refs);
-
- wret = push_leaf_left(trans, root, path, 1, 1,
- 1, (u32)-1);
+ /*
+ * We want to be able to at least push one item to the
+ * left neighbour leaf, and that's the first item.
+ */
+ min_push_space = sizeof(struct btrfs_item) +
+ btrfs_item_size(leaf, 0);
+ wret = push_leaf_left(trans, root, path, 0,
+ min_push_space, 1, (u32)-1);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
if (path->nodes[0] == leaf &&
btrfs_header_nritems(leaf)) {
- wret = push_leaf_right(trans, root, path, 1,
- 1, 1, 0);
+ /*
+ * If we were not able to push all items from our
+ * leaf to its left neighbour, then attempt to
+ * either push all the remaining items to the
+ * right neighbour or none. There's no advantage
+ * in pushing only some items, instead of all, as
+ * it's pointless to end up with a leaf having
+ * too few items while the neighbours can be full
+ * or nearly full.
+ */
+ nritems = btrfs_header_nritems(leaf);
+ min_push_space = leaf_space_used(leaf, 0, nritems);
+ wret = push_leaf_right(trans, root, path, 0,
+ min_push_space, 1, 0);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
}
@@ -4402,7 +4602,9 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
int level;
struct extent_buffer *c;
struct extent_buffer *next;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
+ bool need_commit_sem = false;
u32 nritems;
int ret;
int i;
@@ -4419,14 +4621,20 @@ again:
path->keep_locks = 1;
- if (time_seq)
+ if (time_seq) {
ret = btrfs_search_old_slot(root, &key, path, time_seq);
- else
+ } else {
+ if (path->need_commit_sem) {
+ path->need_commit_sem = 0;
+ need_commit_sem = true;
+ down_read(&fs_info->commit_root_sem);
+ }
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ }
path->keep_locks = 0;
if (ret < 0)
- return ret;
+ goto done;
nritems = btrfs_header_nritems(path->nodes[0]);
/*
@@ -4549,6 +4757,15 @@ again:
ret = 0;
done:
unlock_up(path, 0, 1, 0, NULL);
+ if (need_commit_sem) {
+ int ret2;
+
+ path->need_commit_sem = 1;
+ ret2 = finish_need_commit_sem_search(path);
+ up_read(&fs_info->commit_root_sem);
+ if (ret2)
+ ret = ret2;
+ }
return ret;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7553e9dc5f93..415bf1823fb3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -49,6 +49,7 @@ extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
struct btrfs_ordered_sum;
struct btrfs_ref;
struct btrfs_bio;
+struct btrfs_ioctl_encoded_io_args;
#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
@@ -143,6 +144,13 @@ enum {
BTRFS_FS_STATE_DEV_REPLACING,
/* The btrfs_fs_info created for self-tests */
BTRFS_FS_STATE_DUMMY_FS_INFO,
+
+ BTRFS_FS_STATE_NO_CSUMS,
+
+ /* Indicates there was an error cleaning up a log tree. */
+ BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
+
+ BTRFS_FS_STATE_COUNT
};
#define BTRFS_BACKREF_REV_MAX 256
@@ -269,8 +277,14 @@ struct btrfs_super_block {
/* the UUID written into btree blocks */
u8 metadata_uuid[BTRFS_FSID_SIZE];
+ /* Extent tree v2 */
+ __le64 block_group_root;
+ __le64 block_group_root_generation;
+ u8 block_group_root_level;
+
/* future expansion */
- __le64 reserved[28];
+ u8 reserved8[7];
+ __le64 reserved[25];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
@@ -295,6 +309,26 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG
+ */
+#define BTRFS_FEATURE_INCOMPAT_SUPP \
+ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
+ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
+ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
+ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
+ BTRFS_FEATURE_INCOMPAT_RAID56 | \
+ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
+ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
+ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
+ BTRFS_FEATURE_INCOMPAT_ZONED | \
+ BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
+#else
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
@@ -309,6 +343,7 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
BTRFS_FEATURE_INCOMPAT_ZONED)
+#endif
#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
@@ -511,11 +546,6 @@ struct btrfs_discard_ctl {
atomic64_t discard_bytes_saved;
};
-enum btrfs_orphan_cleanup_state {
- ORPHAN_CLEANUP_STARTED = 1,
- ORPHAN_CLEANUP_DONE = 2,
-};
-
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
/* fs_info */
@@ -553,7 +583,6 @@ struct btrfs_swapfile_pin {
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
enum {
- BTRFS_FS_BARRIER,
BTRFS_FS_CLOSING_START,
BTRFS_FS_CLOSING_DONE,
BTRFS_FS_LOG_RECOVERING,
@@ -576,7 +605,6 @@ enum {
/*
* Indicate that relocation of a chunk has started, it's set per chunk
* and is toggled between chunks.
- * Set, tested and cleared while holding fs_info::send_reloc_lock.
*/
BTRFS_FS_RELOC_RUNNING,
@@ -601,6 +629,12 @@ enum {
/* Indicate whether there are any tree modification log users */
BTRFS_FS_TREE_MOD_LOG_USERS,
+ /* Indicate that we want the transaction kthread to commit right now. */
+ BTRFS_FS_COMMIT_TRANS,
+
+ /* Indicate we have half completed snapshot deletions pending. */
+ BTRFS_FS_UNFINISHED_DROPS,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
@@ -613,6 +647,7 @@ enum {
*/
enum btrfs_exclusive_operation {
BTRFS_EXCLOP_NONE,
+ BTRFS_EXCLOP_BALANCE_PAUSED,
BTRFS_EXCLOP_BALANCE,
BTRFS_EXCLOP_DEV_ADD,
BTRFS_EXCLOP_DEV_REMOVE,
@@ -624,27 +659,29 @@ enum btrfs_exclusive_operation {
struct btrfs_fs_info {
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
- struct btrfs_root *extent_root;
struct btrfs_root *tree_root;
struct btrfs_root *chunk_root;
struct btrfs_root *dev_root;
struct btrfs_root *fs_root;
- struct btrfs_root *csum_root;
struct btrfs_root *quota_root;
struct btrfs_root *uuid_root;
- struct btrfs_root *free_space_root;
struct btrfs_root *data_reloc_root;
+ struct btrfs_root *block_group_root;
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
- spinlock_t fs_roots_radix_lock;
- struct radix_tree_root fs_roots_radix;
+ /* The tree that holds the global roots (csum, extent, etc) */
+ rwlock_t global_root_lock;
+ struct rb_root global_root_tree;
+
+ /* The xarray that holds all the FS roots */
+ spinlock_t fs_roots_lock;
+ struct xarray fs_roots;
/* block group cache stuff */
- spinlock_t block_group_cache_lock;
- u64 first_logical_byte;
- struct rb_root block_group_cache_tree;
+ rwlock_t block_group_cache_lock;
+ struct rb_root_cached block_group_cache_tree;
/* keep track of unallocated space */
atomic64_t free_chunk_space;
@@ -673,6 +710,12 @@ struct btrfs_fs_info {
u64 generation;
u64 last_trans_committed;
+ /*
+ * Generation of the last transaction used for block group relocation
+ * since the filesystem was last mounted (or 0 if none happened yet).
+ * Must be written and read while holding btrfs_fs_info::commit_root_sem.
+ */
+ u64 last_reloc_trans;
u64 avg_delayed_ref_runtime;
/*
@@ -805,17 +848,17 @@ struct btrfs_fs_info {
* two
*/
struct btrfs_workqueue *workers;
+ struct btrfs_workqueue *hipri_workers;
struct btrfs_workqueue *delalloc_workers;
struct btrfs_workqueue *flush_workers;
struct btrfs_workqueue *endio_workers;
struct btrfs_workqueue *endio_meta_workers;
struct btrfs_workqueue *endio_raid56_workers;
- struct btrfs_workqueue *rmw_workers;
+ struct workqueue_struct *rmw_workers;
struct btrfs_workqueue *endio_meta_write_workers;
struct btrfs_workqueue *endio_write_workers;
struct btrfs_workqueue *endio_freespace_worker;
struct btrfs_workqueue *caching_workers;
- struct btrfs_workqueue *readahead_workers;
/*
* fixup workers take dirty pages that didn't properly go through
@@ -904,9 +947,9 @@ struct btrfs_fs_info {
* running.
*/
refcount_t scrub_workers_refcnt;
- struct btrfs_workqueue *scrub_workers;
- struct btrfs_workqueue *scrub_wr_completion_workers;
- struct btrfs_workqueue *scrub_parity_workers;
+ struct workqueue_struct *scrub_workers;
+ struct workqueue_struct *scrub_wr_completion_workers;
+ struct workqueue_struct *scrub_parity_workers;
struct btrfs_subpage_info *subpage_info;
struct btrfs_discard_ctl discard_ctl;
@@ -952,17 +995,10 @@ struct btrfs_fs_info {
struct btrfs_delayed_root *delayed_root;
- /* readahead tree */
- spinlock_t reada_lock;
- struct radix_tree_root reada_tree;
-
- /* readahead works cnt */
- atomic_t reada_works_cnt;
-
- /* Extent buffer radix tree */
+ /* Extent buffer xarray */
spinlock_t buffer_lock;
/* Entries are eb->start / sectorsize */
- struct radix_tree_root buffer_radix;
+ struct xarray extent_buffers;
/* next backup root to be overwritten */
int backup_root_index;
@@ -1003,13 +1039,6 @@ struct btrfs_fs_info {
struct crypto_shash *csum_shash;
- spinlock_t send_reloc_lock;
- /*
- * Number of send operations in progress.
- * Updated while holding fs_info::send_reloc_lock.
- */
- int send_in_progress;
-
/* Type of exclusive operation running, protected by super_lock */
enum btrfs_exclusive_operation exclusive_operation;
@@ -1017,10 +1046,7 @@ struct btrfs_fs_info {
* Zone size > 0 when in ZONED mode, otherwise it's used for a check
* if the mode is enabled
*/
- union {
- u64 zone_size;
- u64 zoned;
- };
+ u64 zone_size;
struct mutex zoned_meta_io_lock;
spinlock_t treelog_bg_lock;
@@ -1032,6 +1058,9 @@ struct btrfs_fs_info {
*/
spinlock_t relocation_bg_lock;
u64 data_reloc_bg;
+ struct mutex zoned_data_reloc_io_lock;
+
+ u64 nr_global_roots;
spinlock_t zone_active_bgs_lock;
struct list_head zone_active_bgs;
@@ -1090,7 +1119,8 @@ enum {
*/
BTRFS_ROOT_SHAREABLE,
BTRFS_ROOT_TRACK_DIRTY,
- BTRFS_ROOT_IN_RADIX,
+ /* The root is tracked in fs_info::fs_roots */
+ BTRFS_ROOT_REGISTERED,
BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
BTRFS_ROOT_DEFRAG_RUNNING,
BTRFS_ROOT_FORCE_COW,
@@ -1110,8 +1140,17 @@ enum {
BTRFS_ROOT_HAS_LOG_TREE,
/* Qgroup flushing is in progress */
BTRFS_ROOT_QGROUP_FLUSHING,
+ /* We started the orphan cleanup for this root. */
+ BTRFS_ROOT_ORPHAN_CLEANUP,
+ /* This root has a drop operation that was started previously. */
+ BTRFS_ROOT_UNFINISHED_DROP,
};
+static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+ clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+}
+
/*
* Record swapped tree blocks of a subvolume tree for delayed subtree trace
* code. For detail check comment in fs/btrfs/qgroup.c.
@@ -1128,6 +1167,8 @@ struct btrfs_qgroup_swapped_blocks {
* and for the extent tree extent_root root.
*/
struct btrfs_root {
+ struct rb_node rb_node;
+
struct extent_buffer *node;
struct extent_buffer *commit_root;
@@ -1178,17 +1219,15 @@ struct btrfs_root {
spinlock_t log_extents_lock[2];
struct list_head logged_list[2];
- int orphan_cleanup_state;
-
spinlock_t inode_lock;
/* red-black tree that keeps track of in-memory inodes */
struct rb_root inode_tree;
/*
- * radix tree that keeps track of delayed nodes of every inode,
- * protected by inode_lock
+ * Xarray that keeps track of delayed nodes of every inode, protected
+ * by inode_lock
*/
- struct radix_tree_root delayed_nodes_tree;
+ struct xarray delayed_nodes;
/*
* right now this just gets used so that a root has its own devid
* for stat. It may be used for more later
@@ -1291,6 +1330,8 @@ struct btrfs_replace_extent_info {
* existing extent into a file range.
*/
bool is_new_extent;
+ /* Indicate if we should update the inode's mtime and ctime. */
+ bool update_times;
/* Meaningful only if is_new_extent is true. */
int qgroup_reserved;
/*
@@ -1603,25 +1644,25 @@ DECLARE_BTRFS_SETGET_BITS(64)
static inline u##bits btrfs_##name(const struct extent_buffer *eb, \
const type *s) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
return btrfs_get_##bits(eb, s, offsetof(type, member)); \
} \
static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
u##bits val) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
btrfs_set_##bits(eb, s, offsetof(type, member), val); \
} \
static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \
const type *s) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
return btrfs_get_token_##bits(token, s, offsetof(type, member));\
} \
static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
type *s, u##bits val) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
btrfs_set_token_##bits(token, s, offsetof(type, member), val); \
}
@@ -1652,8 +1693,8 @@ static inline void btrfs_set_##name(type *s, u##bits val) \
static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
struct btrfs_dev_item *s)
{
- BUILD_BUG_ON(sizeof(u64) !=
- sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ static_assert(sizeof(u64) ==
+ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item,
total_bytes));
}
@@ -1661,8 +1702,8 @@ static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb,
struct btrfs_dev_item *s,
u64 val)
{
- BUILD_BUG_ON(sizeof(u64) !=
- sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ static_assert(sizeof(u64) ==
+ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize));
btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val);
}
@@ -1960,8 +2001,8 @@ static inline void btrfs_set_node_key(const struct extent_buffer *eb,
}
/* struct btrfs_item */
-BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
-BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
+BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32);
BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32);
BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32);
@@ -1976,25 +2017,36 @@ static inline struct btrfs_item *btrfs_item_nr(int nr)
return (struct btrfs_item *)btrfs_item_nr_offset(nr);
}
-static inline u32 btrfs_item_end(const struct extent_buffer *eb,
- struct btrfs_item *item)
-{
- return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
+#define BTRFS_ITEM_SETGET_FUNCS(member) \
+static inline u32 btrfs_item_##member(const struct extent_buffer *eb, \
+ int slot) \
+{ \
+ return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \
+} \
+static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \
+ int slot, u32 val) \
+{ \
+ btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \
+} \
+static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \
+ int slot) \
+{ \
+ struct btrfs_item *item = btrfs_item_nr(slot); \
+ return btrfs_token_raw_item_##member(token, item); \
+} \
+static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \
+ int slot, u32 val) \
+{ \
+ struct btrfs_item *item = btrfs_item_nr(slot); \
+ btrfs_set_token_raw_item_##member(token, item, val); \
}
-static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr)
-{
- return btrfs_item_end(eb, btrfs_item_nr(nr));
-}
-
-static inline u32 btrfs_item_offset_nr(const struct extent_buffer *eb, int nr)
-{
- return btrfs_item_offset(eb, btrfs_item_nr(nr));
-}
+BTRFS_ITEM_SETGET_FUNCS(offset)
+BTRFS_ITEM_SETGET_FUNCS(size);
-static inline u32 btrfs_item_size_nr(const struct extent_buffer *eb, int nr)
+static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr)
{
- return btrfs_item_size(eb, btrfs_item_nr(nr));
+ return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr);
}
static inline void btrfs_item_key(const struct extent_buffer *eb,
@@ -2257,6 +2309,11 @@ static inline bool btrfs_root_dead(const struct btrfs_root *root)
return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
}
+static inline u64 btrfs_root_id(const struct btrfs_root *root)
+{
+ return root->root_key.objectid;
+}
+
/* struct btrfs_root_backup */
BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
tree_root, 64);
@@ -2306,6 +2363,17 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
num_devices, 64);
+/*
+ * For extent tree v2 we overload the extent root with the block group root, as
+ * we will have multiple extent roots.
+ */
+BTRFS_SETGET_STACK_FUNCS(backup_block_group_root, struct btrfs_root_backup,
+ extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_gen, struct btrfs_root_backup,
+ extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_level,
+ struct btrfs_root_backup, extent_root_level, 8);
+
/* struct btrfs_balance_item */
BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
@@ -2440,6 +2508,13 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
uuid_tree_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_block_group_root, struct btrfs_super_block,
+ block_group_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_block_group_root_generation,
+ struct btrfs_super_block,
+ block_group_root_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_block_group_root_level, struct btrfs_super_block,
+ block_group_root_level, 8);
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
@@ -2458,7 +2533,7 @@ static inline unsigned int leaf_data_end(const struct extent_buffer *leaf)
if (nr == 0)
return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
- return btrfs_item_offset_nr(leaf, nr - 1);
+ return btrfs_item_offset(leaf, nr - 1);
}
/* struct btrfs_file_extent_item */
@@ -2517,9 +2592,9 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
*/
static inline u32 btrfs_file_extent_inline_item_len(
const struct extent_buffer *eb,
- struct btrfs_item *e)
+ int nr)
{
- return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
+ return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
}
/* btrfs_qgroup_status_item */
@@ -2611,11 +2686,11 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
/* helper function to cast into the data area of the leaf. */
#define btrfs_item_ptr(leaf, slot, type) \
((type *)(BTRFS_LEAF_DATA_OFFSET + \
- btrfs_item_offset_nr(leaf, slot)))
+ btrfs_item_offset(leaf, slot)))
#define btrfs_item_ptr_offset(leaf, slot) \
((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \
- btrfs_item_offset_nr(leaf, slot)))
+ btrfs_item_offset(leaf, slot)))
static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length)
{
@@ -2710,7 +2785,8 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
- u64 objectid, u64 offset, u64 bytenr, bool strict);
+ u64 objectid, u64 offset, u64 bytenr, bool strict,
+ struct btrfs_path *path);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
@@ -2719,7 +2795,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
u64 empty_size,
enum btrfs_lock_nesting nest);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ u64 root_id,
struct extent_buffer *buf,
u64 parent, int last_ref);
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -2737,8 +2813,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
- struct extent_buffer *eb, u64 flags,
- int level, int is_data);
+ struct extent_buffer *eb, u64 flags, int level);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
@@ -2817,7 +2892,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ u64 disk_num_bytes, bool noflush);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end);
@@ -2964,6 +3040,35 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
struct btrfs_path *path);
+int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path);
+
+/*
+ * Search in @root for a given @key, and store the slot found in @found_key.
+ *
+ * @root: The root node of the tree.
+ * @key: The key we are looking for.
+ * @found_key: Will hold the found item.
+ * @path: Holds the current slot/leaf.
+ * @iter_ret: Contains the value returned from btrfs_search_slot or
+ * btrfs_get_next_valid_item, whichever was executed last.
+ *
+ * The @iter_ret is an output variable that will contain the return value of
+ * btrfs_search_slot, if it encountered an error, or the value returned from
+ * btrfs_get_next_valid_item otherwise. That return value can be 0, if a valid
+ * slot was found, 1 if there were no more leaves, and <0 if there was an error.
+ *
+ * It's recommended to use a separate variable for iter_ret and then use it to
+ * set the function return value so there's no confusion of the 0/1/errno
+ * values stemming from btrfs_search_slot.
+ */
+#define btrfs_for_each_slot(root, key, found_key, path, iter_ret) \
+ for (iter_ret = btrfs_search_slot(NULL, (root), (key), (path), 0, 0); \
+ (iter_ret) >= 0 && \
+ (iter_ret = btrfs_get_next_valid_item((root), (found_key), (path))) == 0; \
+ (path)->slots[0]++ \
+ )
+
static inline int btrfs_next_old_item(struct btrfs_root *root,
struct btrfs_path *p, u64 time_seq)
{
@@ -3114,38 +3219,7 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset);
int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
-/* inode-item.c */
-int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, u64 index);
-int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, u64 *index);
-int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 objectid);
-int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path,
- struct btrfs_key *location, int mod);
-
-struct btrfs_inode_extref *
-btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, int ins_len,
- int cow);
-
-struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
- int slot, const char *name,
- int name_len);
-struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
- struct extent_buffer *leaf, int slot, u64 ref_objectid,
- const char *name, int name_len);
/* file-item.c */
-struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst);
@@ -3163,7 +3237,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
- u64 file_start, int contig);
+ u64 offset, bool one_ordered);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit);
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
@@ -3179,8 +3253,8 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
-blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags);
+void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, enum btrfs_compression_type compress_type);
unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
u32 bio_offset, struct page *page,
u64 start, u64 end);
@@ -3203,10 +3277,6 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry);
int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
int front);
-int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *inode, u64 new_size,
- u32 min_type, u64 *extents_found);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
@@ -3214,10 +3284,28 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
-int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *new_root,
- struct btrfs_root *parent_root,
- struct user_namespace *mnt_userns);
+struct btrfs_new_inode_args {
+ /* Input */
+ struct inode *dir;
+ struct dentry *dentry;
+ struct inode *inode;
+ bool orphan;
+ bool subvol;
+
+ /*
+ * Output from btrfs_new_inode_prepare(), input to
+ * btrfs_create_new_inode().
+ */
+ struct posix_acl *default_acl;
+ struct posix_acl *acl;
+};
+int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
+ unsigned int *trans_num_items);
+int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_new_inode_args *args);
+void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args);
+struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
+ struct inode *dir);
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
unsigned *bits);
void btrfs_clear_delalloc_extent(struct inode *inode,
@@ -3228,7 +3316,6 @@ void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split);
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
-int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
struct inode *btrfs_alloc_inode(struct super_block *sb);
@@ -3268,9 +3355,14 @@ int btrfs_writepage_cow_fixup(struct page *page);
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
u64 end, bool uptodate);
+ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
+ struct btrfs_ioctl_encoded_io_args *encoded);
+ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded);
+
+ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before);
+
extern const struct dentry_operations btrfs_dentry_operations;
-extern const struct iomap_ops btrfs_dio_iomap_ops;
-extern const struct iomap_dio_ops btrfs_dio_ops;
/* Inode locking type flags, by default the exclusive lock is taken */
#define BTRFS_ILOCK_SHARED (1U << 0)
@@ -3282,6 +3374,7 @@ void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags);
void btrfs_update_inode_bytes(struct btrfs_inode *inode,
const u64 add_bytes,
const u64 del_bytes);
+void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end);
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -3305,12 +3398,15 @@ bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type);
void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation op);
+
/* file.c */
int __init btrfs_auto_defrag_init(void);
void __cold btrfs_auto_defrag_exit(void);
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode);
+ struct btrfs_inode *inode, u32 extent_thresh);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
@@ -3327,6 +3423,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
struct btrfs_trans_handle **trans_out);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end);
+ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded);
int btrfs_release_file(struct inode *inode, struct file *file);
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
@@ -3352,11 +3450,29 @@ void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
}
-#ifdef CONFIG_PRINTK
+#ifdef CONFIG_PRINTK_INDEX
+
+#define btrfs_printk(fs_info, fmt, args...) \
+do { \
+ printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \
+ _btrfs_printk(fs_info, fmt, ##args); \
+} while (0)
+
+__printf(2, 3)
+__cold
+void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+
+#elif defined(CONFIG_PRINTK)
+
+#define btrfs_printk(fs_info, fmt, args...) \
+ _btrfs_printk(fs_info, fmt, ##args)
+
__printf(2, 3)
__cold
-void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+
#else
+
#define btrfs_printk(fs_info, fmt, args...) \
btrfs_no_printk(fs_info, fmt, ##args)
#endif
@@ -3607,14 +3723,30 @@ do { \
__LINE__, (errno)); \
} while (0)
+#ifdef CONFIG_PRINTK_INDEX
+
#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
-do { \
- __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
- (errno), fmt, ##args); \
+do { \
+ printk_index_subsys_emit( \
+ "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \
+ KERN_CRIT, fmt); \
+ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args); \
} while (0)
+#else
+
+#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
+ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args)
+
+#endif
+
#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
&(fs_info)->fs_state)))
+#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \
+ (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \
+ &(fs_info)->fs_state)))
__printf(5, 6)
__cold
@@ -3762,15 +3894,16 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type);
-int btrfs_init_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir);
+int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct posix_acl *acl, int type);
#else
#define btrfs_get_acl NULL
#define btrfs_set_acl NULL
-static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir)
+static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct posix_acl *acl,
+ int type)
{
- return 0;
+ return -EOPNOTSUPP;
}
#endif
@@ -3780,7 +3913,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btrfs_recover_relocation(struct btrfs_root *root);
+int btrfs_recover_relocation(struct btrfs_fs_info *fs_info);
int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len);
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
@@ -3821,23 +3954,6 @@ static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
btrfs_bio_counter_sub(fs_info, 1);
}
-/* reada.c */
-struct reada_control {
- struct btrfs_fs_info *fs_info; /* tree to prefetch */
- struct btrfs_key key_start;
- struct btrfs_key key_end; /* exclusive */
- atomic_t elems;
- struct kref refcnt;
- wait_queue_head_t wait;
-};
-struct reada_control *btrfs_reada_add(struct btrfs_root *root,
- struct btrfs_key *start, struct btrfs_key *end);
-int btrfs_reada_wait(void *handle);
-void btrfs_reada_detach(void *handle);
-int btree_readahead_hook(struct extent_buffer *eb, int err);
-void btrfs_reada_remove_dev(struct btrfs_device *dev);
-void btrfs_reada_undo_remove_dev(struct btrfs_device *dev);
-
static inline int is_fstree(u64 rootid)
{
if (rootid == BTRFS_FS_TREE_OBJECTID ||
@@ -3892,7 +4008,7 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
{
- return fs_info->zoned != 0;
+ return fs_info->zone_size > 0;
}
static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
@@ -3909,5 +4025,8 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
#define PageOrdered(page) PagePrivate2(page)
#define SetPageOrdered(page) SetPagePrivate2(page)
#define ClearPageOrdered(page) ClearPagePrivate2(page)
+#define folio_test_ordered(folio) folio_test_private_2(folio)
+#define folio_set_ordered(folio) folio_set_private_2(folio)
+#define folio_clear_ordered(folio) folio_clear_private_2(folio)
#endif
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 40c4d6ba3fb9..36ab0859a263 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -270,11 +270,11 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
}
static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
- u64 num_bytes, u64 *meta_reserve,
- u64 *qgroup_reserve)
+ u64 num_bytes, u64 disk_num_bytes,
+ u64 *meta_reserve, u64 *qgroup_reserve)
{
u64 nr_extents = count_max_extents(num_bytes);
- u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
+ u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
@@ -288,7 +288,8 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
*qgroup_reserve = nr_extents * fs_info->nodesize;
}
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ u64 disk_num_bytes, bool noflush)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -307,7 +308,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
* If we have a transaction open (can happen if we call truncate_block
* from truncate), then we need FLUSH_LIMIT so we don't deadlock.
*/
- if (btrfs_is_free_space_inode(inode)) {
+ if (noflush || btrfs_is_free_space_inode(inode)) {
flush = BTRFS_RESERVE_NO_FLUSH;
} else {
if (current->journal_info)
@@ -318,6 +319,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
}
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
+ disk_num_bytes = ALIGN(disk_num_bytes, fs_info->sectorsize);
/*
* We always want to do it this way, every other way is wrong and ends
@@ -329,12 +331,13 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
* everything out and try again, which is bad. This way we just
* over-reserve slightly, and clean up the mess when we are done.
*/
- calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
- &qgroup_reserve);
- ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
+ calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
+ &meta_reserve, &qgroup_reserve);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
+ noflush);
if (ret)
return ret;
- ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
if (ret) {
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
return ret;
@@ -349,7 +352,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
spin_lock(&inode->lock);
nr_extents = count_max_extents(num_bytes);
btrfs_mod_outstanding_extents(inode, nr_extents);
- inode->csum_bytes += num_bytes;
+ inode->csum_bytes += disk_num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
@@ -454,7 +457,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
ret = btrfs_check_data_free_space(inode, reserved, start, len);
if (ret < 0)
return ret;
- ret = btrfs_delalloc_reserve_metadata(inode, len);
+ ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);
if (ret < 0) {
btrfs_free_reserved_data_space(inode, *reserved, start, len);
extent_changeset_free(*reserved);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index e164766dcc38..66779ab3ed4a 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -13,6 +13,7 @@
#include "ctree.h"
#include "qgroup.h"
#include "locking.h"
+#include "inode-item.h"
#define BTRFS_DELAYED_WRITEBACK 512
#define BTRFS_DELAYED_BACKGROUND 128
@@ -77,7 +78,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
}
spin_lock(&root->inode_lock);
- node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+ node = xa_load(&root->delayed_nodes, ino);
if (node) {
if (btrfs_inode->delayed_node) {
@@ -89,9 +90,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
/*
* It's possible that we're racing into the middle of removing
- * this node from the radix tree. In this case, the refcount
+ * this node from the xarray. In this case, the refcount
* was zero and it should never go back to one. Just return
- * NULL like it was never in the radix at all; our release
+ * NULL like it was never in the xarray at all; our release
* function is in the process of removing it.
*
* Some implementations of refcount_inc refuse to bump the
@@ -99,7 +100,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
* here, refcount_inc() may decide to just WARN_ONCE() instead
* of actually bumping the refcount.
*
- * If this node is properly in the radix, we want to bump the
+ * If this node is properly in the xarray, we want to bump the
* refcount twice, once for the inode and once for this get
* operation.
*/
@@ -127,36 +128,30 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
u64 ino = btrfs_ino(btrfs_inode);
int ret;
-again:
- node = btrfs_get_delayed_node(btrfs_inode);
- if (node)
- return node;
+ do {
+ node = btrfs_get_delayed_node(btrfs_inode);
+ if (node)
+ return node;
- node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
- if (!node)
- return ERR_PTR(-ENOMEM);
- btrfs_init_delayed_node(node, root, ino);
+ node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
+ if (!node)
+ return ERR_PTR(-ENOMEM);
+ btrfs_init_delayed_node(node, root, ino);
- /* cached in the btrfs inode and can be accessed */
- refcount_set(&node->refs, 2);
+ /* Cached in the inode and can be accessed */
+ refcount_set(&node->refs, 2);
- ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
- kmem_cache_free(delayed_node_cache, node);
- return ERR_PTR(ret);
- }
-
- spin_lock(&root->inode_lock);
- ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
- if (ret == -EEXIST) {
- spin_unlock(&root->inode_lock);
- kmem_cache_free(delayed_node_cache, node);
- radix_tree_preload_end();
- goto again;
- }
+ spin_lock(&root->inode_lock);
+ ret = xa_insert(&root->delayed_nodes, ino, node, GFP_NOFS);
+ if (ret) {
+ spin_unlock(&root->inode_lock);
+ kmem_cache_free(delayed_node_cache, node);
+ if (ret != -EBUSY)
+ return ERR_PTR(ret);
+ }
+ } while (ret);
btrfs_inode->delayed_node = node;
spin_unlock(&root->inode_lock);
- radix_tree_preload_end();
return node;
}
@@ -275,8 +270,7 @@ static void __btrfs_release_delayed_node(
* back up. We can delete it now.
*/
ASSERT(refcount_read(&delayed_node->refs) == 0);
- radix_tree_delete(&root->delayed_nodes_tree,
- delayed_node->inode_id);
+ xa_erase(&root->delayed_nodes, delayed_node->inode_id);
spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, delayed_node);
}
@@ -629,7 +623,7 @@ static int btrfs_delayed_inode_reserve_metadata(
BTRFS_QGROUP_RSV_META_PREALLOC, true);
if (ret < 0)
return ret;
- ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+ ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes,
BTRFS_RESERVE_NO_FLUSH);
/* NO_FLUSH could only fail with -ENOSPC */
ASSERT(ret == 0 || ret == -ENOSPC);
@@ -1869,34 +1863,35 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
- u64 inode_id = 0;
+ unsigned long index = 0;
+ struct btrfs_delayed_node *delayed_node;
struct btrfs_delayed_node *delayed_nodes[8];
- int i, n;
while (1) {
+ int n = 0;
+
spin_lock(&root->inode_lock);
- n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
- (void **)delayed_nodes, inode_id,
- ARRAY_SIZE(delayed_nodes));
- if (!n) {
+ if (xa_empty(&root->delayed_nodes)) {
spin_unlock(&root->inode_lock);
- break;
+ return;
}
- inode_id = delayed_nodes[n - 1]->inode_id + 1;
- for (i = 0; i < n; i++) {
+ xa_for_each_start(&root->delayed_nodes, index, delayed_node, index) {
/*
* Don't increase refs in case the node is dead and
* about to be removed from the tree in the loop below
*/
- if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
- delayed_nodes[i] = NULL;
+ if (refcount_inc_not_zero(&delayed_node->refs)) {
+ delayed_nodes[n] = delayed_node;
+ n++;
+ }
+ if (n >= ARRAY_SIZE(delayed_nodes))
+ break;
}
+ index++;
spin_unlock(&root->inode_lock);
- for (i = 0; i < n; i++) {
- if (!delayed_nodes[i])
- continue;
+ for (int i = 0; i < n; i++) {
__btrfs_kill_delayed_node(delayed_nodes[i]);
btrfs_release_delayed_node(delayed_nodes[i]);
}
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index cca7e85e32dd..99f37fca2e96 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -84,6 +84,17 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
u64 released = 0;
+ /*
+ * We have to check the mount option here because we could be enabling
+ * the free space tree for the first time and don't have the compat_ro
+ * option set yet.
+ *
+ * We need extra reservations if we have the free space tree because
+ * we'll have to modify that tree as well.
+ */
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+ num_bytes *= 2;
+
released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
if (released)
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
@@ -108,6 +119,17 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
num_bytes = btrfs_calc_insert_metadata_size(fs_info,
trans->delayed_ref_updates);
+ /*
+ * We have to check the mount option here because we could be enabling
+ * the free space tree for the first time and don't have the compat_ro
+ * option set yet.
+ *
+ * We need extra reservations if we have the free space tree because
+ * we'll have to modify that tree as well.
+ */
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+ num_bytes *= 2;
+
spin_lock(&delayed_rsv->lock);
delayed_rsv->size += num_bytes;
delayed_rsv->full = 0;
@@ -191,8 +213,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
if (!num_bytes)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv,
- num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
if (ret)
return ret;
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0);
@@ -909,7 +930,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);
ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
- BUG_ON(extent_op && extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
if (!ref)
return -ENOMEM;
@@ -1082,8 +1102,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
return -ENOMEM;
init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
- BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data,
- false);
+ BTRFS_UPDATE_DELAYED_HEAD, false, false);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 91a3aabad150..d6304b690ec4 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -58,7 +58,6 @@ struct btrfs_delayed_extent_op {
u8 level;
bool update_key;
bool update_flags;
- bool is_data;
u64 flags_to_set;
};
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index c85a7d44da79..a7dd6ba25e99 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -128,7 +128,7 @@ no_valid_dev_replace_entry_found:
}
slot = path->slots[0];
eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
if (item_size != sizeof(struct btrfs_dev_replace_item)) {
@@ -243,6 +243,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev,
struct btrfs_device **device_out)
{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
struct block_device *bdev;
struct rcu_string *name;
@@ -271,7 +272,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
sync_blockdev(bdev);
- list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
btrfs_err(fs_info,
"target device is in the filesystem!");
@@ -302,6 +303,9 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
goto error;
}
rcu_assign_pointer(device->name, name);
+ ret = lookup_bdev(device_path, &device->devt);
+ if (ret)
+ goto error;
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = 0;
@@ -320,17 +324,17 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
- device->fs_devices = fs_info->fs_devices;
+ device->fs_devices = fs_devices;
- ret = btrfs_get_dev_zone_info(device);
+ ret = btrfs_get_dev_zone_info(device, false);
if (ret)
goto error;
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- list_add(&device->dev_list, &fs_info->fs_devices->devices);
- fs_info->fs_devices->num_devices++;
- fs_info->fs_devices->open_devices++;
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_add(&device->dev_list, &fs_devices->devices);
+ fs_devices->num_devices++;
+ fs_devices->open_devices++;
+ mutex_unlock(&fs_devices->device_list_mutex);
*device_out = device;
return 0;
@@ -381,7 +385,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
}
if (ret == 0 &&
- btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
/*
* need to delete old one and insert a new one.
* Since no attempt is made to recover any old state, if the
@@ -470,6 +474,7 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_block_group *cache;
struct btrfs_trans_handle *trans;
+ int iter_ret = 0;
int ret = 0;
u64 chunk_offset;
@@ -520,29 +525,8 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
key.type = BTRFS_DEV_EXTENT_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto free_path;
- if (ret > 0) {
- if (path->slots[0] >=
- btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto free_path;
- if (ret > 0) {
- ret = 0;
- goto free_path;
- }
- } else {
- ret = 0;
- }
- }
-
- while (1) {
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct extent_buffer *leaf = path->nodes[0];
- int slot = path->slots[0];
-
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != src_dev->devid)
break;
@@ -553,30 +537,23 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
if (found_key.offset < key.offset)
break;
- dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
+ dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
if (!cache)
- goto skip;
+ continue;
spin_lock(&cache->lock);
cache->to_copy = 1;
spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
-
-skip:
- ret = btrfs_next_item(root, path);
- if (ret != 0) {
- if (ret > 0)
- ret = 0;
- break;
- }
}
+ if (iter_ret < 0)
+ ret = iter_ret;
-free_path:
btrfs_free_path(path);
unlock:
mutex_unlock(&fs_info->chunk_mutex);
@@ -730,7 +707,12 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- /* Commit dev_replace state and reserve 1 item for it. */
+ /*
+ * Commit dev_replace state and reserve 1 item for it.
+ * This is crucial to ensure we won't miss copying extents for new block
+ * groups that are allocated after we started the device replace, and
+ * must be done after setting up the device replace state.
+ */
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -872,6 +854,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *tgt_device;
struct btrfs_device *src_device;
struct btrfs_root *root = fs_info->tree_root;
@@ -906,9 +889,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
}
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- if (!scrub_ret)
- btrfs_reada_remove_dev(src_device);
-
/*
* We have to use this loop approach because at this point src_device
* has to be available for transaction commit to complete, yet new
@@ -917,7 +897,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
while (1) {
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
- btrfs_reada_undo_remove_dev(src_device);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return PTR_ERR(trans);
}
@@ -925,12 +904,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
WARN_ON(ret);
/* Prevent write_all_supers() during the finishing procedure */
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_devices->device_list_mutex);
/* Prevent new chunks being allocated on the source device */
mutex_lock(&fs_info->chunk_mutex);
if (!list_empty(&src_device->post_commit_list)) {
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
mutex_unlock(&fs_info->chunk_mutex);
} else {
break;
@@ -967,8 +946,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- btrfs_reada_undo_remove_dev(src_device);
+ mutex_unlock(&fs_devices->device_list_mutex);
btrfs_rm_dev_replace_blocked(fs_info);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(tgt_device);
@@ -997,8 +975,8 @@ error:
btrfs_assign_next_active_device(src_device, tgt_device);
- list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
- fs_info->fs_devices->rw_devices++;
+ list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list);
+ fs_devices->rw_devices++;
up_write(&dev_replace->rwsem);
btrfs_rm_dev_replace_blocked(fs_info);
@@ -1021,7 +999,7 @@ error:
* belong to this filesystem.
*/
mutex_unlock(&fs_info->chunk_mutex);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
/* replace the sysfs entry */
btrfs_sysfs_remove_device(src_device);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 7721ce0c0604..72fb2c518a2b 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
char *ptr;
- struct btrfs_item *item;
struct extent_buffer *leaf;
ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
@@ -41,10 +40,9 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
return ERR_PTR(ret);
WARN_ON(ret > 0);
leaf = path->nodes[0];
- item = btrfs_item_nr(path->slots[0]);
ptr = btrfs_item_ptr(leaf, path->slots[0], char);
- BUG_ON(data_size > btrfs_item_size(leaf, item));
- ptr += btrfs_item_size(leaf, item) - data_size;
+ ASSERT(data_size <= btrfs_item_size(leaf, path->slots[0]));
+ ptr += btrfs_item_size(leaf, path->slots[0]) - data_size;
return (struct btrfs_dir_item *)ptr;
}
@@ -271,7 +269,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
data_size = sizeof(*di) + name_len;
leaf = path->nodes[0];
slot = path->slots[0];
- if (data_size + btrfs_item_size_nr(leaf, slot) +
+ if (data_size + btrfs_item_size(leaf, slot) +
sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) {
ret = -EOVERFLOW;
} else {
@@ -327,36 +325,15 @@ btrfs_search_dir_index_item(struct btrfs_root *root,
struct btrfs_path *path, u64 dirid,
const char *name, int name_len)
{
- struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key key;
- u32 nritems;
int ret;
key.objectid = dirid;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- return ERR_PTR(ret);
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
-
- while (1) {
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
- break;
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_for_each_slot(root, &key, &key, path, ret) {
if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
break;
@@ -364,10 +341,12 @@ btrfs_search_dir_index_item(struct btrfs_root *root,
name, name_len);
if (di)
return di;
-
- path->slots[0]++;
}
- return NULL;
+ /* Adjust return code if the key was not found in the next leaf. */
+ if (ret > 0)
+ ret = 0;
+
+ return ERR_PTR(ret);
}
struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
@@ -409,7 +388,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
leaf = path->nodes[0];
dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
- total_len = btrfs_item_size_nr(leaf, path->slots[0]);
+ total_len = btrfs_item_size(leaf, path->slots[0]);
while (cur < total_len) {
this_len = sizeof(*dir_item) +
btrfs_dir_name_len(leaf, dir_item) +
@@ -445,7 +424,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
btrfs_dir_data_len(leaf, di);
- item_len = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_len = btrfs_item_size(leaf, path->slots[0]);
if (sub_item_len == item_len) {
ret = btrfs_del_item(trans, root, path);
} else {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 514ead6e93b6..4ba005c41983 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -5,7 +5,6 @@
#include <linux/fs.h>
#include <linux/blkdev.h>
-#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
@@ -374,9 +373,9 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
* @level: expected level, mandatory check
* @first_key: expected key of first slot, skip check if NULL
*/
-static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
- u64 parent_transid, int level,
- struct btrfs_key *first_key)
+int btrfs_read_extent_buffer(struct extent_buffer *eb,
+ u64 parent_transid, int level,
+ struct btrfs_key *first_key)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct extent_io_tree *io_tree;
@@ -441,17 +440,31 @@ static int csum_one_extent_buffer(struct extent_buffer *eb)
else
ret = btrfs_check_leaf_full(eb);
- if (ret < 0) {
- btrfs_print_tree(eb, 0);
+ if (ret < 0)
+ goto error;
+
+ /*
+ * Also check the generation, the eb reached here must be newer than
+ * last committed. Or something seriously wrong happened.
+ */
+ if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) {
+ ret = -EUCLEAN;
btrfs_err(fs_info,
- "block=%llu write time tree block corruption detected",
- eb->start);
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
- return ret;
+ "block=%llu bad generation, have %llu expect > %llu",
+ eb->start, btrfs_header_generation(eb),
+ fs_info->last_trans_committed);
+ goto error;
}
write_extent_buffer(eb, result, 0, fs_info->csum_size);
return 0;
+
+error:
+ btrfs_print_tree(eb, 0);
+ btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
+ eb->start);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ return ret;
}
/* Checksum all dirty extent buffers in one bio_vec */
@@ -472,7 +485,7 @@ static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
fs_info->nodesize);
- /* A dirty eb shouldn't disappear from buffer_radix */
+ /* A dirty eb shouldn't disappear from extent_buffers */
if (WARN_ON(!eb))
return -EUCLEAN;
@@ -505,7 +518,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
u64 found_start;
struct extent_buffer *eb;
- if (fs_info->sectorsize < PAGE_SIZE)
+ if (fs_info->nodesize < PAGE_SIZE)
return csum_dirty_subpage_buffers(fs_info, bvec);
eb = (struct extent_buffer *)page->private;
@@ -665,9 +678,6 @@ static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
if (ret < 0)
goto err;
- if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
- btree_readahead_hook(eb, ret);
-
set_extent_buffer_uptodate(eb);
free_extent_buffer(eb);
@@ -693,7 +703,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
ASSERT(page->private);
- if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+ if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
return validate_subpage_buffer(page, start, end, mirror);
eb = (struct extent_buffer *)page->private;
@@ -715,10 +725,6 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
}
ret = validate_extent_buffer(eb);
err:
- if (reads_done &&
- test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
- btree_readahead_hook(eb, ret);
-
if (ret) {
/*
* our io error hook is going to dec the io pages
@@ -843,8 +849,7 @@ static void run_one_async_free(struct btrfs_work *work)
}
blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 dio_file_offset,
+ int mirror_num, u64 dio_file_offset,
extent_submit_bio_start_t *submit_bio_start)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
@@ -867,9 +872,9 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
async->status = 0;
if (op_is_sync(bio->bi_opf))
- btrfs_set_work_high_priority(&async->work);
-
- btrfs_queue_work(fs_info->workers, &async->work);
+ btrfs_queue_work(fs_info->hipri_workers, &async->work);
+ else
+ btrfs_queue_work(fs_info->workers, &async->work);
return 0;
}
@@ -913,8 +918,7 @@ static bool should_async_write(struct btrfs_fs_info *fs_info,
return true;
}
-blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
blk_status_t ret;
@@ -926,31 +930,25 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
*/
ret = btrfs_bio_wq_end_io(fs_info, bio,
BTRFS_WQ_ENDIO_METADATA);
- if (ret)
- goto out_w_error;
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ if (!ret)
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
} else if (!should_async_write(fs_info, BTRFS_I(inode))) {
ret = btree_csum_one_bio(bio);
- if (ret)
- goto out_w_error;
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ if (!ret)
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
} else {
/*
* kthread helpers are used to submit writes so that
* checksumming can happen in parallel across all CPUs
*/
ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
- 0, btree_submit_bio_start);
+ btree_submit_bio_start);
}
- if (ret)
- goto out_w_error;
- return 0;
-
-out_w_error:
- bio->bi_status = ret;
- bio_endio(bio);
- return ret;
+ if (ret) {
+ bio->bi_status = ret;
+ bio_endio(bio);
+ }
}
#ifdef CONFIG_MIGRATION
@@ -998,49 +996,48 @@ static int btree_writepages(struct address_space *mapping,
return btree_write_cache_pages(mapping, wbc);
}
-static int btree_releasepage(struct page *page, gfp_t gfp_flags)
+static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
{
- if (PageWriteback(page) || PageDirty(page))
- return 0;
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return false;
- return try_release_extent_buffer(page);
+ return try_release_extent_buffer(&folio->page);
}
-static void btree_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void btree_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
struct extent_io_tree *tree;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- extent_invalidatepage(tree, page, offset);
- btree_releasepage(page, GFP_NOFS);
- if (PagePrivate(page)) {
- btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
- "page private not zero on page %llu",
- (unsigned long long)page_offset(page));
- detach_page_private(page);
+ tree = &BTRFS_I(folio->mapping->host)->io_tree;
+ extent_invalidate_folio(tree, folio, offset);
+ btree_release_folio(folio, GFP_NOFS);
+ if (folio_get_private(folio)) {
+ btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info,
+ "folio private not zero on folio %llu",
+ (unsigned long long)folio_pos(folio));
+ folio_detach_private(folio);
}
}
-static int btree_set_page_dirty(struct page *page)
-{
#ifdef DEBUG
- struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+static bool btree_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
struct btrfs_subpage *subpage;
struct extent_buffer *eb;
int cur_bit = 0;
- u64 page_start = page_offset(page);
+ u64 page_start = folio_pos(folio);
if (fs_info->sectorsize == PAGE_SIZE) {
- BUG_ON(!PagePrivate(page));
- eb = (struct extent_buffer *)page->private;
+ eb = folio_get_private(folio);
BUG_ON(!eb);
BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
BUG_ON(!atomic_read(&eb->refs));
btrfs_assert_tree_write_locked(eb);
- return __set_page_dirty_nobuffers(page);
+ return filemap_dirty_folio(mapping, folio);
}
- ASSERT(PagePrivate(page) && page->private);
- subpage = (struct btrfs_subpage *)page->private;
+ subpage = folio_get_private(folio);
ASSERT(subpage->dirty_bitmap);
while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
@@ -1066,18 +1063,20 @@ static int btree_set_page_dirty(struct page *page)
cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
}
-#endif
- return __set_page_dirty_nobuffers(page);
+ return filemap_dirty_folio(mapping, folio);
}
+#else
+#define btree_dirty_folio filemap_dirty_folio
+#endif
static const struct address_space_operations btree_aops = {
.writepages = btree_writepages,
- .releasepage = btree_releasepage,
- .invalidatepage = btree_invalidatepage,
+ .release_folio = btree_release_folio,
+ .invalidate_folio = btree_invalidate_folio,
#ifdef CONFIG_MIGRATION
.migratepage = btree_migratepage,
#endif
- .set_page_dirty = btree_set_page_dirty,
+ .dirty_folio = btree_dirty_folio,
};
struct extent_buffer *btrfs_find_create_tree_block(
@@ -1110,12 +1109,15 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
if (IS_ERR(buf))
return buf;
- ret = btree_read_extent_buffer_pages(buf, parent_transid,
- level, first_key);
+ ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key);
if (ret) {
free_extent_buffer_stale(buf);
return ERR_PTR(ret);
}
+ if (btrfs_check_eb_owner(buf, owner_root)) {
+ free_extent_buffer_stale(buf);
+ return ERR_PTR(-EUCLEAN);
+ }
return buf;
}
@@ -1140,19 +1142,25 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
u64 objectid)
{
bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+
+ memset(&root->root_key, 0, sizeof(root->root_key));
+ memset(&root->root_item, 0, sizeof(root->root_item));
+ memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
root->fs_info = fs_info;
+ root->root_key.objectid = objectid;
root->node = NULL;
root->commit_root = NULL;
root->state = 0;
- root->orphan_cleanup_state = 0;
+ RB_CLEAR_NODE(&root->rb_node);
root->last_trans = 0;
root->free_objectid = 0;
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
- INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
- root->block_rsv = NULL;
+ xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);
+
+ btrfs_init_root_block_rsv(root);
INIT_LIST_HEAD(&root->dirty_list);
INIT_LIST_HEAD(&root->root_list);
@@ -1190,6 +1198,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
+ root->anon_dev = 0;
if (!dummy) {
extent_io_tree_init(fs_info, &root->dirty_log_pages,
IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
@@ -1197,19 +1206,13 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
IO_TREE_LOG_CSUM_RANGE, NULL);
}
- memset(&root->root_key, 0, sizeof(root->root_key));
- memset(&root->root_item, 0, sizeof(root->root_item));
- memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
- root->root_key.objectid = objectid;
- root->anon_dev = 0;
-
spin_lock_init(&root->root_item_lock);
btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&root->leak_list);
- spin_lock(&fs_info->fs_roots_radix_lock);
+ spin_lock(&fs_info->fs_roots_lock);
list_add_tail(&root->leak_list, &fs_info->allocated_roots);
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_lock);
#endif
}
@@ -1242,6 +1245,102 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
}
#endif
+static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node)
+{
+ const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node);
+ const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node);
+
+ return btrfs_comp_cpu_keys(&a->root_key, &b->root_key);
+}
+
+static int global_root_key_cmp(const void *k, const struct rb_node *node)
+{
+ const struct btrfs_key *key = k;
+ const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node);
+
+ return btrfs_comp_cpu_keys(key, &root->root_key);
+}
+
+int btrfs_global_root_insert(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct rb_node *tmp;
+
+ write_lock(&fs_info->global_root_lock);
+ tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp);
+ write_unlock(&fs_info->global_root_lock);
+ ASSERT(!tmp);
+
+ return tmp ? -EEXIST : 0;
+}
+
+void btrfs_global_root_delete(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ write_lock(&fs_info->global_root_lock);
+ rb_erase(&root->rb_node, &fs_info->global_root_tree);
+ write_unlock(&fs_info->global_root_lock);
+}
+
+struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *key)
+{
+ struct rb_node *node;
+ struct btrfs_root *root = NULL;
+
+ read_lock(&fs_info->global_root_lock);
+ node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp);
+ if (node)
+ root = container_of(node, struct btrfs_root, rb_node);
+ read_unlock(&fs_info->global_root_lock);
+
+ return root;
+}
+
+static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_block_group *block_group;
+ u64 ret;
+
+ if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return 0;
+
+ if (bytenr)
+ block_group = btrfs_lookup_block_group(fs_info, bytenr);
+ else
+ block_group = btrfs_lookup_first_block_group(fs_info, bytenr);
+ ASSERT(block_group);
+ if (!block_group)
+ return 0;
+ ret = block_group->global_root_id;
+ btrfs_put_block_group(block_group);
+
+ return ret;
+}
+
+struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_key key = {
+ .objectid = BTRFS_CSUM_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = btrfs_global_root_id(fs_info, bytenr),
+ };
+
+ return btrfs_global_root(fs_info, &key);
+}
+
+struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_key key = {
+ .objectid = BTRFS_EXTENT_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = btrfs_global_root_id(fs_info, bytenr),
+ };
+
+ return btrfs_global_root(fs_info, &key);
+}
+
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid)
{
@@ -1453,10 +1552,28 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
ret = PTR_ERR(root->node);
root->node = NULL;
goto fail;
- } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+ }
+ if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
ret = -EIO;
goto fail;
}
+
+ /*
+ * For real fs, and not log/reloc trees, root owner must
+ * match its root node owner
+ */
+ if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) &&
+ root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
+ root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+ root->root_key.objectid != btrfs_header_owner(root->node)) {
+ btrfs_crit(fs_info,
+"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
+ root->root_key.objectid, root->node->start,
+ btrfs_header_owner(root->node),
+ root->root_key.objectid);
+ ret = -EUCLEAN;
+ goto fail;
+ }
root->commit_root = btrfs_root_node(root);
return root;
fail:
@@ -1542,37 +1659,44 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
{
struct btrfs_root *root;
- spin_lock(&fs_info->fs_roots_radix_lock);
- root = radix_tree_lookup(&fs_info->fs_roots_radix,
- (unsigned long)root_id);
+ spin_lock(&fs_info->fs_roots_lock);
+ root = xa_load(&fs_info->fs_roots, (unsigned long)root_id);
if (root)
root = btrfs_grab_root(root);
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_lock);
return root;
}
static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
u64 objectid)
{
+ struct btrfs_key key = {
+ .objectid = objectid,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+
if (objectid == BTRFS_ROOT_TREE_OBJECTID)
return btrfs_grab_root(fs_info->tree_root);
if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->extent_root);
+ return btrfs_grab_root(btrfs_global_root(fs_info, &key));
if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
return btrfs_grab_root(fs_info->chunk_root);
if (objectid == BTRFS_DEV_TREE_OBJECTID)
return btrfs_grab_root(fs_info->dev_root);
if (objectid == BTRFS_CSUM_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->csum_root);
+ return btrfs_grab_root(btrfs_global_root(fs_info, &key));
if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
return btrfs_grab_root(fs_info->quota_root) ?
fs_info->quota_root : ERR_PTR(-ENOENT);
if (objectid == BTRFS_UUID_TREE_OBJECTID)
return btrfs_grab_root(fs_info->uuid_root) ?
fs_info->uuid_root : ERR_PTR(-ENOENT);
- if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->free_space_root) ?
- fs_info->free_space_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
+ struct btrfs_root *root = btrfs_global_root(fs_info, &key);
+
+ return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
+ }
return NULL;
}
@@ -1581,20 +1705,14 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
{
int ret;
- ret = radix_tree_preload(GFP_NOFS);
- if (ret)
- return ret;
-
- spin_lock(&fs_info->fs_roots_radix_lock);
- ret = radix_tree_insert(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
- root);
+ spin_lock(&fs_info->fs_roots_lock);
+ ret = xa_insert(&fs_info->fs_roots, (unsigned long)root->root_key.objectid,
+ root, GFP_NOFS);
if (ret == 0) {
btrfs_grab_root(root);
- set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
+ set_bit(BTRFS_ROOT_REGISTERED, &root->state);
}
- spin_unlock(&fs_info->fs_roots_radix_lock);
- radix_tree_preload_end();
+ spin_unlock(&fs_info->fs_roots_lock);
return ret;
}
@@ -1619,6 +1737,18 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
#endif
}
+static void free_global_roots(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ struct rb_node *node;
+
+ while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) {
+ root = rb_entry(node, struct btrfs_root, rb_node);
+ rb_erase(&root->rb_node, &fs_info->global_root_tree);
+ btrfs_put_root(root);
+ }
+}
+
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
@@ -1630,16 +1760,15 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
btrfs_free_ref_cache(fs_info);
kfree(fs_info->balance_ctl);
kfree(fs_info->delayed_root);
- btrfs_put_root(fs_info->extent_root);
+ free_global_roots(fs_info);
btrfs_put_root(fs_info->tree_root);
btrfs_put_root(fs_info->chunk_root);
btrfs_put_root(fs_info->dev_root);
- btrfs_put_root(fs_info->csum_root);
btrfs_put_root(fs_info->quota_root);
btrfs_put_root(fs_info->uuid_root);
- btrfs_put_root(fs_info->free_space_root);
btrfs_put_root(fs_info->fs_root);
btrfs_put_root(fs_info->data_reloc_root);
+ btrfs_put_root(fs_info->block_group_root);
btrfs_check_leaked_roots(fs_info);
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->super_copy);
@@ -1725,13 +1854,22 @@ again:
ret = btrfs_insert_fs_root(fs_info, root);
if (ret) {
- btrfs_put_root(root);
- if (ret == -EEXIST)
+ if (ret == -EEXIST) {
+ btrfs_put_root(root);
goto again;
+ }
goto fail;
}
return root;
fail:
+ /*
+ * If our caller provided us an anonymous device, then it's his
+ * responsability to free it in case we fail. So we have to set our
+ * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
+ * and once again by our caller.
+ */
+ if (anon_dev)
+ root->anon_dev = 0;
btrfs_put_root(root);
return ERR_PTR(ret);
}
@@ -1830,8 +1968,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
static int cleaner_kthread(void *arg)
{
- struct btrfs_root *root = arg;
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = arg;
int again;
while (1) {
@@ -1864,7 +2001,7 @@ static int cleaner_kthread(void *arg)
btrfs_run_delayed_iputs(fs_info);
- again = btrfs_clean_one_deleted_snapshot(root);
+ again = btrfs_clean_one_deleted_snapshot(fs_info);
mutex_unlock(&fs_info->cleaner_mutex);
/*
@@ -1927,7 +2064,8 @@ static int transaction_kthread(void *arg)
}
delta = ktime_get_seconds() - cur->start_time;
- if (cur->state < TRANS_STATE_COMMIT_START &&
+ if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
+ cur->state < TRANS_STATE_COMMIT_START &&
delta < fs_info->commit_interval) {
spin_unlock(&fs_info->trans_lock);
delay -= msecs_to_jiffies((delta - 1) * 1000);
@@ -2023,11 +2161,30 @@ static void backup_super_roots(struct btrfs_fs_info *info)
btrfs_set_backup_chunk_root_level(root_backup,
btrfs_header_level(info->chunk_root->node));
- btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
- btrfs_set_backup_extent_root_gen(root_backup,
- btrfs_header_generation(info->extent_root->node));
- btrfs_set_backup_extent_root_level(root_backup,
- btrfs_header_level(info->extent_root->node));
+ if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) {
+ btrfs_set_backup_block_group_root(root_backup,
+ info->block_group_root->node->start);
+ btrfs_set_backup_block_group_root_gen(root_backup,
+ btrfs_header_generation(info->block_group_root->node));
+ btrfs_set_backup_block_group_root_level(root_backup,
+ btrfs_header_level(info->block_group_root->node));
+ } else {
+ struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
+ struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
+
+ btrfs_set_backup_extent_root(root_backup,
+ extent_root->node->start);
+ btrfs_set_backup_extent_root_gen(root_backup,
+ btrfs_header_generation(extent_root->node));
+ btrfs_set_backup_extent_root_level(root_backup,
+ btrfs_header_level(extent_root->node));
+
+ btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
+ btrfs_set_backup_csum_root_gen(root_backup,
+ btrfs_header_generation(csum_root->node));
+ btrfs_set_backup_csum_root_level(root_backup,
+ btrfs_header_level(csum_root->node));
+ }
/*
* we might commit during log recovery, which happens before we set
@@ -2048,12 +2205,6 @@ static void backup_super_roots(struct btrfs_fs_info *info)
btrfs_set_backup_dev_root_level(root_backup,
btrfs_header_level(info->dev_root->node));
- btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
- btrfs_set_backup_csum_root_gen(root_backup,
- btrfs_header_generation(info->csum_root->node));
- btrfs_set_backup_csum_root_level(root_backup,
- btrfs_header_level(info->csum_root->node));
-
btrfs_set_backup_total_bytes(root_backup,
btrfs_super_total_bytes(info->super_copy));
btrfs_set_backup_bytes_used(root_backup,
@@ -2119,15 +2270,16 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
{
btrfs_destroy_workqueue(fs_info->fixup_workers);
btrfs_destroy_workqueue(fs_info->delalloc_workers);
+ btrfs_destroy_workqueue(fs_info->hipri_workers);
btrfs_destroy_workqueue(fs_info->workers);
btrfs_destroy_workqueue(fs_info->endio_workers);
btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
- btrfs_destroy_workqueue(fs_info->rmw_workers);
+ if (fs_info->rmw_workers)
+ destroy_workqueue(fs_info->rmw_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
btrfs_destroy_workqueue(fs_info->delayed_workers);
btrfs_destroy_workqueue(fs_info->caching_workers);
- btrfs_destroy_workqueue(fs_info->readahead_workers);
btrfs_destroy_workqueue(fs_info->flush_workers);
btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
if (fs_info->discard_ctl.discard_workers)
@@ -2151,21 +2303,30 @@ static void free_root_extent_buffers(struct btrfs_root *root)
}
}
+static void free_global_root_pointers(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root, *tmp;
+
+ rbtree_postorder_for_each_entry_safe(root, tmp,
+ &fs_info->global_root_tree,
+ rb_node)
+ free_root_extent_buffers(root);
+}
+
/* helper to cleanup tree roots */
static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
{
free_root_extent_buffers(info->tree_root);
+ free_global_root_pointers(info);
free_root_extent_buffers(info->dev_root);
- free_root_extent_buffers(info->extent_root);
- free_root_extent_buffers(info->csum_root);
free_root_extent_buffers(info->quota_root);
free_root_extent_buffers(info->uuid_root);
free_root_extent_buffers(info->fs_root);
free_root_extent_buffers(info->data_reloc_root);
+ free_root_extent_buffers(info->block_group_root);
if (free_chunk_root)
free_root_extent_buffers(info->chunk_root);
- free_root_extent_buffers(info->free_space_root);
}
void btrfs_put_root(struct btrfs_root *root)
@@ -2181,9 +2342,9 @@ void btrfs_put_root(struct btrfs_root *root)
btrfs_drew_lock_destroy(&root->snapshot_lock);
free_root_extent_buffers(root);
#ifdef CONFIG_BTRFS_DEBUG
- spin_lock(&root->fs_info->fs_roots_radix_lock);
+ spin_lock(&root->fs_info->fs_roots_lock);
list_del_init(&root->leak_list);
- spin_unlock(&root->fs_info->fs_roots_radix_lock);
+ spin_unlock(&root->fs_info->fs_roots_lock);
#endif
kfree(root);
}
@@ -2191,28 +2352,21 @@ void btrfs_put_root(struct btrfs_root *root)
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
{
- int ret;
- struct btrfs_root *gang[8];
- int i;
+ struct btrfs_root *root;
+ unsigned long index = 0;
while (!list_empty(&fs_info->dead_roots)) {
- gang[0] = list_entry(fs_info->dead_roots.next,
- struct btrfs_root, root_list);
- list_del(&gang[0]->root_list);
+ root = list_entry(fs_info->dead_roots.next,
+ struct btrfs_root, root_list);
+ list_del(&root->root_list);
- if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
- btrfs_drop_and_free_fs_root(fs_info, gang[0]);
- btrfs_put_root(gang[0]);
+ if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
+ btrfs_drop_and_free_fs_root(fs_info, root);
+ btrfs_put_root(root);
}
- while (1) {
- ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
- (void **)gang, 0,
- ARRAY_SIZE(gang));
- if (!ret)
- break;
- for (i = 0; i < ret; i++)
- btrfs_drop_and_free_fs_root(fs_info, gang[i]);
+ xa_for_each(&fs_info->fs_roots, index, root) {
+ btrfs_drop_and_free_fs_root(fs_info, root);
}
}
@@ -2283,14 +2437,15 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
mutex_init(&fs_info->qgroup_rescan_lock);
}
-static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
- struct btrfs_fs_devices *fs_devices)
+static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
{
u32 max_active = fs_info->thread_pool_size;
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
fs_info->workers =
- btrfs_alloc_workqueue(fs_info, "worker",
+ btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
+ fs_info->hipri_workers =
+ btrfs_alloc_workqueue(fs_info, "worker-high",
flags | WQ_HIGHPRI, max_active, 16);
fs_info->delalloc_workers =
@@ -2322,8 +2477,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
fs_info->endio_raid56_workers =
btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
max_active, 4);
- fs_info->rmw_workers =
- btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
+ fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
fs_info->endio_write_workers =
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
max_active, 2);
@@ -2333,23 +2487,19 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
fs_info->delayed_workers =
btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
max_active, 0);
- fs_info->readahead_workers =
- btrfs_alloc_workqueue(fs_info, "readahead", flags,
- max_active, 2);
fs_info->qgroup_rescan_workers =
btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
fs_info->discard_ctl.discard_workers =
alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
- if (!(fs_info->workers && fs_info->delalloc_workers &&
- fs_info->flush_workers &&
+ if (!(fs_info->workers && fs_info->hipri_workers &&
+ fs_info->delalloc_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
fs_info->endio_meta_write_workers &&
fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
- fs_info->caching_workers && fs_info->readahead_workers &&
- fs_info->fixup_workers && fs_info->delayed_workers &&
- fs_info->qgroup_rescan_workers &&
+ fs_info->caching_workers && fs_info->fixup_workers &&
+ fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
fs_info->discard_ctl.discard_workers)) {
return -ENOMEM;
}
@@ -2404,11 +2554,13 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
log_tree_root->node = NULL;
btrfs_put_root(log_tree_root);
return ret;
- } else if (!extent_buffer_uptodate(log_tree_root->node)) {
+ }
+ if (!extent_buffer_uptodate(log_tree_root->node)) {
btrfs_err(fs_info, "failed to read log tree");
btrfs_put_root(log_tree_root);
return -EIO;
}
+
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
if (ret) {
@@ -2427,6 +2579,115 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
return 0;
}
+static int load_global_roots_objectid(struct btrfs_root *tree_root,
+ struct btrfs_path *path, u64 objectid,
+ const char *name)
+{
+ struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ struct btrfs_root *root;
+ u64 max_global_id = 0;
+ int ret;
+ struct btrfs_key key = {
+ .objectid = objectid,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+ bool found = false;
+
+ /* If we have IGNOREDATACSUMS skip loading these roots. */
+ if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
+ btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
+ set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+ return 0;
+ }
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ if (ret < 0)
+ break;
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(tree_root, path);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ break;
+ }
+ }
+ ret = 0;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != objectid)
+ break;
+ btrfs_release_path(path);
+
+ /*
+ * Just worry about this for extent tree, it'll be the same for
+ * everybody.
+ */
+ if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ max_global_id = max(max_global_id, key.offset);
+
+ found = true;
+ root = read_tree_root_path(tree_root, path, &key);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
+ ret = PTR_ERR(root);
+ break;
+ }
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ ret = btrfs_global_root_insert(root);
+ if (ret) {
+ btrfs_put_root(root);
+ break;
+ }
+ key.offset++;
+ }
+ btrfs_release_path(path);
+
+ if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ fs_info->nr_global_roots = max_global_id + 1;
+
+ if (!found || ret) {
+ if (objectid == BTRFS_CSUM_TREE_OBJECTID)
+ set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
+ ret = ret ? ret : -ENOENT;
+ else
+ ret = 0;
+ btrfs_err(fs_info, "failed to load root %s", name);
+ }
+ return ret;
+}
+
+static int load_global_roots(struct btrfs_root *tree_root)
+{
+ struct btrfs_path *path;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = load_global_roots_objectid(tree_root, path,
+ BTRFS_EXTENT_TREE_OBJECTID, "extent");
+ if (ret)
+ goto out;
+ ret = load_global_roots_objectid(tree_root, path,
+ BTRFS_CSUM_TREE_OBJECTID, "csum");
+ if (ret)
+ goto out;
+ if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
+ goto out;
+ ret = load_global_roots_objectid(tree_root, path,
+ BTRFS_FREE_SPACE_TREE_OBJECTID,
+ "free space");
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *tree_root = fs_info->tree_root;
@@ -2436,7 +2697,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
BUG_ON(!fs_info->tree_root);
- location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+ ret = load_global_roots(tree_root);
+ if (ret)
+ return ret;
+
+ location.objectid = BTRFS_DEV_TREE_OBJECTID;
location.type = BTRFS_ROOT_ITEM_KEY;
location.offset = 0;
@@ -2448,38 +2713,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
}
} else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->extent_root = root;
- }
-
- location.objectid = BTRFS_DEV_TREE_OBJECTID;
- root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root)) {
- if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
- ret = PTR_ERR(root);
- goto out;
- }
- } else {
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->dev_root = root;
}
/* Initialize fs_info for all devices in any case */
btrfs_init_devices_late(fs_info);
- /* If IGNOREDATACSUMS is set don't bother reading the csum root. */
- if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
- location.objectid = BTRFS_CSUM_TREE_OBJECTID;
- root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root)) {
- if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
- ret = PTR_ERR(root);
- goto out;
- }
- } else {
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->csum_root = root;
- }
- }
-
/*
* This tree can share blocks with some other fs tree during relocation
* and we need a proper setup by btrfs_get_fs_root
@@ -2517,20 +2755,6 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
fs_info->uuid_root = root;
}
- if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
- root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root)) {
- if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
- ret = PTR_ERR(root);
- goto out;
- }
- } else {
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->free_space_root = root;
- }
- }
-
return 0;
out:
btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
@@ -2591,12 +2815,14 @@ static int validate_super(struct btrfs_fs_info *fs_info,
}
/*
- * For 4K page size, we only support 4K sector size.
- * For 64K page size, we support 64K and 4K sector sizes.
+ * We only support at most two sectorsizes: 4K and PAGE_SIZE.
+ *
+ * We can support 16K sectorsize with 64K page size without problem,
+ * but such sectorsize/pagesize combination doesn't make much sense.
+ * 4K will be our future standard, PAGE_SIZE is supported from the very
+ * beginning.
*/
- if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
- (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
- sectorsize != SZ_64K))) {
+ if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
btrfs_err(fs_info,
"sectorsize %llu not yet supported for page size %lu",
sectorsize, PAGE_SIZE);
@@ -2769,6 +2995,56 @@ out:
return ret;
}
+static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
+{
+ int ret = 0;
+
+ root->node = read_tree_block(root->fs_info, bytenr,
+ root->root_key.objectid, gen, level, NULL);
+ if (IS_ERR(root->node)) {
+ ret = PTR_ERR(root->node);
+ root->node = NULL;
+ return ret;
+ }
+ if (!extent_buffer_uptodate(root->node)) {
+ free_extent_buffer(root->node);
+ root->node = NULL;
+ return -EIO;
+ }
+
+ btrfs_set_root_node(&root->root_item, root->node);
+ root->commit_root = btrfs_root_node(root);
+ btrfs_set_root_refs(&root->root_item, 1);
+ return ret;
+}
+
+static int load_important_roots(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_super_block *sb = fs_info->super_copy;
+ u64 gen, bytenr;
+ int level, ret;
+
+ bytenr = btrfs_super_root(sb);
+ gen = btrfs_super_generation(sb);
+ level = btrfs_super_root_level(sb);
+ ret = load_super_root(fs_info->tree_root, bytenr, gen, level);
+ if (ret) {
+ btrfs_warn(fs_info, "couldn't read tree root");
+ return ret;
+ }
+
+ if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return 0;
+
+ bytenr = btrfs_super_block_group_root(sb);
+ gen = btrfs_super_block_group_root_generation(sb);
+ level = btrfs_super_block_group_root_level(sb);
+ ret = load_super_root(fs_info->block_group_root, bytenr, gen, level);
+ if (ret)
+ btrfs_warn(fs_info, "couldn't read block group root");
+ return ret;
+}
+
static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
{
int backup_index = find_newest_super_backup(fs_info);
@@ -2778,10 +3054,17 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
int ret = 0;
int i;
- for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
- u64 generation;
- int level;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ struct btrfs_root *root;
+ root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID,
+ GFP_KERNEL);
+ if (!root)
+ return -ENOMEM;
+ fs_info->block_group_root = root;
+ }
+
+ for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
if (handle_error) {
if (!IS_ERR(tree_root->node))
free_extent_buffer(tree_root->node);
@@ -2806,29 +3089,13 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
if (ret < 0)
return ret;
}
- generation = btrfs_super_generation(sb);
- level = btrfs_super_root_level(sb);
- tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
- BTRFS_ROOT_TREE_OBJECTID,
- generation, level, NULL);
- if (IS_ERR(tree_root->node)) {
- handle_error = true;
- ret = PTR_ERR(tree_root->node);
- tree_root->node = NULL;
- btrfs_warn(fs_info, "couldn't read tree root");
- continue;
- } else if (!extent_buffer_uptodate(tree_root->node)) {
+ ret = load_important_roots(fs_info);
+ if (ret) {
handle_error = true;
- ret = -EIO;
- btrfs_warn(fs_info, "error while reading tree root");
continue;
}
- btrfs_set_root_node(&tree_root->root_item, tree_root->node);
- tree_root->commit_root = btrfs_root_node(tree_root);
- btrfs_set_root_refs(&tree_root->root_item, 1);
-
/*
* No need to hold btrfs_root::objectid_mutex since the fs
* hasn't been fully initialised and we are the only user
@@ -2848,8 +3115,9 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
}
/* All successful */
- fs_info->generation = generation;
- fs_info->last_trans_committed = generation;
+ fs_info->generation = btrfs_header_generation(tree_root->node);
+ fs_info->last_trans_committed = fs_info->generation;
+ fs_info->last_reloc_trans = 0;
/* Always begin writing backup roots after the one being used */
if (backup_index < 0) {
@@ -2866,8 +3134,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
- INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
- INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+ xa_init_flags(&fs_info->fs_roots, GFP_ATOMIC);
+ xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
@@ -2875,7 +3143,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->caching_block_groups);
spin_lock_init(&fs_info->delalloc_root_lock);
spin_lock_init(&fs_info->trans_lock);
- spin_lock_init(&fs_info->fs_roots_radix_lock);
+ spin_lock_init(&fs_info->fs_roots_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->super_lock);
@@ -2885,11 +3153,13 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->zone_active_bgs_lock);
spin_lock_init(&fs_info->relocation_bg_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
+ rwlock_init(&fs_info->global_root_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
mutex_init(&fs_info->reclaim_bgs_lock);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
mutex_init(&fs_info->zoned_meta_io_lock);
+ mutex_init(&fs_info->zoned_data_reloc_io_lock);
seqlock_init(&fs_info->profiles_lock);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2916,9 +3186,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->defrag_running, 0);
- atomic_set(&fs_info->reada_works_cnt, 0);
atomic_set(&fs_info->nr_delayed_iputs, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
+ fs_info->global_root_tree = RB_ROOT;
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
@@ -2926,9 +3196,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->tree_mod_log = RB_ROOT;
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
- /* readahead state */
- INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- spin_lock_init(&fs_info->reada_lock);
btrfs_init_ref_verify(fs_info);
fs_info->thread_pool_size = min_t(unsigned long,
@@ -2944,13 +3211,11 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
btrfs_init_balance(fs_info);
btrfs_init_async_reclaim_work(fs_info);
- spin_lock_init(&fs_info->block_group_cache_lock);
- fs_info->block_group_cache_tree = RB_ROOT;
- fs_info->first_logical_byte = (u64)-1;
+ rwlock_init(&fs_info->block_group_cache_lock);
+ fs_info->block_group_cache_tree = RB_ROOT_CACHED;
extent_io_tree_init(fs_info, &fs_info->excluded_extents,
IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
- set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
@@ -2985,9 +3250,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->swapfile_pins_lock);
fs_info->swapfile_pins = RB_ROOT;
- spin_lock_init(&fs_info->send_reloc_lock);
- fs_info->send_in_progress = 0;
-
fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
}
@@ -3034,7 +3296,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
static int btrfs_uuid_rescan_kthread(void *data)
{
- struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
+ struct btrfs_fs_info *fs_info = data;
int ret;
/*
@@ -3112,7 +3374,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
/*
* btrfs_find_orphan_roots() is responsible for finding all the dead
* roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
- * them into the fs_info->fs_roots_radix tree. This must be done before
+ * them into the fs_info->fs_roots. This must be done before
* calling btrfs_orphan_cleanup() on the tree root. If we don't do it
* first, then btrfs_orphan_cleanup() will delete a dead root's orphan
* item before the root's tree is deleted - this means that if we unmount
@@ -3137,7 +3399,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
up_read(&fs_info->cleanup_work_sem);
mutex_lock(&fs_info->cleaner_mutex);
- ret = btrfs_recover_relocation(fs_info->tree_root);
+ ret = btrfs_recover_relocation(fs_info);
mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
@@ -3350,7 +3612,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
~BTRFS_FEATURE_INCOMPAT_SUPP;
if (features) {
btrfs_err(fs_info,
- "cannot mount because of unsupported optional features (%llx)",
+ "cannot mount because of unsupported optional features (0x%llx)",
features);
err = -EINVAL;
goto fail_alloc;
@@ -3388,7 +3650,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
~BTRFS_FEATURE_COMPAT_RO_SUPP;
if (!sb_rdonly(sb) && features) {
btrfs_err(fs_info,
- "cannot mount read-write because of unsupported optional features (%llx)",
+ "cannot mount read-write because of unsupported optional features (0x%llx)",
features);
err = -EINVAL;
goto fail_alloc;
@@ -3397,17 +3659,20 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (sectorsize < PAGE_SIZE) {
struct btrfs_subpage_info *subpage_info;
+ /*
+ * V1 space cache has some hardcoded PAGE_SIZE usage, and is
+ * going to be deprecated.
+ *
+ * Force to use v2 cache for subpage case.
+ */
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
+ "forcing free space tree for sector size %u with page size %lu",
+ sectorsize, PAGE_SIZE);
+
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
- if (btrfs_super_incompat_flags(fs_info->super_copy) &
- BTRFS_FEATURE_INCOMPAT_RAID56) {
- btrfs_err(fs_info,
- "RAID56 is not yet supported for sector size %u with page size %lu",
- sectorsize, PAGE_SIZE);
- err = -EINVAL;
- goto fail_alloc;
- }
subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
if (!subpage_info)
goto fail_alloc;
@@ -3415,7 +3680,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->subpage_info = subpage_info;
}
- ret = btrfs_init_workqueues(fs_info, fs_devices);
+ ret = btrfs_init_workqueues(fs_info);
if (ret) {
err = ret;
goto fail_sb_buffer;
@@ -3438,21 +3703,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
generation = btrfs_super_chunk_root_generation(disk_super);
level = btrfs_super_chunk_root_level(disk_super);
-
- chunk_root->node = read_tree_block(fs_info,
- btrfs_super_chunk_root(disk_super),
- BTRFS_CHUNK_TREE_OBJECTID,
- generation, level, NULL);
- if (IS_ERR(chunk_root->node) ||
- !extent_buffer_uptodate(chunk_root->node)) {
+ ret = load_super_root(ch