aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c2
-rw-r--r--fs/9p/v9fs.c21
-rw-r--r--fs/9p/v9fs.h1
-rw-r--r--fs/9p/vfs_addr.c4
-rw-r--r--fs/9p/vfs_dir.c21
-rw-r--r--fs/9p/vfs_file.c24
-rw-r--r--fs/9p/xattr.c4
-rw-r--r--fs/afs/Kconfig12
-rw-r--r--fs/afs/Makefile7
-rw-r--r--fs/afs/addr_list.c300
-rw-r--r--fs/afs/afs.h50
-rw-r--r--fs/afs/cache.c2
-rw-r--r--fs/afs/callback.c17
-rw-r--r--fs/afs/cell.c82
-rw-r--r--fs/afs/cmservice.c287
-rw-r--r--fs/afs/dir.c75
-rw-r--r--fs/afs/dynroot.c6
-rw-r--r--fs/afs/file.c8
-rw-r--r--fs/afs/flock.c22
-rw-r--r--fs/afs/fs_probe.c270
-rw-r--r--fs/afs/fsclient.c583
-rw-r--r--fs/afs/inode.c37
-rw-r--r--fs/afs/internal.h332
-rw-r--r--fs/afs/main.c2
-rw-r--r--fs/afs/mntpt.c5
-rw-r--r--fs/afs/proc.c132
-rw-r--r--fs/afs/protocol_yfs.h163
-rw-r--r--fs/afs/rotate.c302
-rw-r--r--fs/afs/rxrpc.c117
-rw-r--r--fs/afs/security.c13
-rw-r--r--fs/afs/server.c145
-rw-r--r--fs/afs/server_list.c6
-rw-r--r--fs/afs/super.c5
-rw-r--r--fs/afs/vl_list.c340
-rw-r--r--fs/afs/vl_probe.c273
-rw-r--r--fs/afs/vl_rotate.c355
-rw-r--r--fs/afs/vlclient.c195
-rw-r--r--fs/afs/volume.c56
-rw-r--r--fs/afs/write.c30
-rw-r--r--fs/afs/xattr.c2
-rw-r--r--fs/afs/yfsclient.c2184
-rw-r--r--fs/aio.c8
-rw-r--r--fs/binfmt_elf.c6
-rw-r--r--fs/block_dev.c2
-rw-r--r--fs/btrfs/backref.c39
-rw-r--r--fs/btrfs/btrfs_inode.h8
-rw-r--r--fs/btrfs/check-integrity.c6
-rw-r--r--fs/btrfs/compression.c8
-rw-r--r--fs/btrfs/ctree.c85
-rw-r--r--fs/btrfs/ctree.h68
-rw-r--r--fs/btrfs/delayed-inode.c41
-rw-r--r--fs/btrfs/delayed-inode.h4
-rw-r--r--fs/btrfs/delayed-ref.c107
-rw-r--r--fs/btrfs/delayed-ref.h10
-rw-r--r--fs/btrfs/dev-replace.c64
-rw-r--r--fs/btrfs/dev-replace.h8
-rw-r--r--fs/btrfs/dir-item.c8
-rw-r--r--fs/btrfs/disk-io.c25
-rw-r--r--fs/btrfs/export.c4
-rw-r--r--fs/btrfs/extent-tree.c478
-rw-r--r--fs/btrfs/extent_io.c45
-rw-r--r--fs/btrfs/extent_io.h4
-rw-r--r--fs/btrfs/extent_map.c32
-rw-r--r--fs/btrfs/extent_map.h4
-rw-r--r--fs/btrfs/file.c45
-rw-r--r--fs/btrfs/free-space-cache.c48
-rw-r--r--fs/btrfs/inode.c264
-rw-r--r--fs/btrfs/ioctl.c53
-rw-r--r--fs/btrfs/qgroup.c460
-rw-r--r--fs/btrfs/qgroup.h8
-rw-r--r--fs/btrfs/ref-verify.c8
-rw-r--r--fs/btrfs/relocation.c74
-rw-r--r--fs/btrfs/scrub.c34
-rw-r--r--fs/btrfs/send.c24
-rw-r--r--fs/btrfs/super.c6
-rw-r--r--fs/btrfs/tests/extent-io-tests.c10
-rw-r--r--fs/btrfs/tests/extent-map-tests.c4
-rw-r--r--fs/btrfs/transaction.c40
-rw-r--r--fs/btrfs/tree-checker.c14
-rw-r--r--fs/btrfs/tree-log.c139
-rw-r--r--fs/btrfs/tree-log.h12
-rw-r--r--fs/btrfs/volumes.c124
-rw-r--r--fs/btrfs/volumes.h9
-rw-r--r--fs/buffer.c25
-rw-r--r--fs/cachefiles/namei.c2
-rw-r--r--fs/ceph/acl.c13
-rw-r--r--fs/ceph/addr.c2
-rw-r--r--fs/ceph/caps.c21
-rw-r--r--fs/ceph/file.c578
-rw-r--r--fs/ceph/inode.c13
-rw-r--r--fs/ceph/mds_client.c9
-rw-r--r--fs/ceph/super.c29
-rw-r--r--fs/ceph/super.h3
-rw-r--r--fs/ceph/xattr.c3
-rw-r--r--fs/cifs/Kconfig1
-rw-r--r--fs/cifs/cifs_debug.c17
-rw-r--r--fs/cifs/cifs_debug.h28
-rw-r--r--fs/cifs/cifs_dfs_ref.c7
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifs_ioctl.h11
-rw-r--r--fs/cifs/cifs_unicode.c3
-rw-r--r--fs/cifs/cifsfs.c30
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h15
-rw-r--r--fs/cifs/cifsproto.h2
-rw-r--r--fs/cifs/cifssmb.c34
-rw-r--r--fs/cifs/connect.c32
-rw-r--r--fs/cifs/file.c60
-rw-r--r--fs/cifs/inode.c75
-rw-r--r--fs/cifs/ioctl.c48
-rw-r--r--fs/cifs/misc.c14
-rw-r--r--fs/cifs/readdir.c11
-rw-r--r--fs/cifs/smb2glob.h2
-rw-r--r--fs/cifs/smb2inode.c332
-rw-r--r--fs/cifs/smb2maperror.c2
-rw-r--r--fs/cifs/smb2misc.c14
-rw-r--r--fs/cifs/smb2ops.c253
-rw-r--r--fs/cifs/smb2pdu.c292
-rw-r--r--fs/cifs/smb2pdu.h13
-rw-r--r--fs/cifs/smb2proto.h28
-rw-r--r--fs/cifs/smbdirect.c55
-rw-r--r--fs/cifs/trace.h109
-rw-r--r--fs/cifs/transport.c107
-rw-r--r--fs/compat_binfmt_elf.c2
-rw-r--r--fs/compat_ioctl.c369
-rw-r--r--fs/coredump.c2
-rw-r--r--fs/cramfs/inode.c12
-rw-r--r--fs/crypto/fscrypt_private.h4
-rw-r--r--fs/crypto/keyinfo.c10
-rw-r--r--fs/dax.c918
-rw-r--r--fs/dcache.c40
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/dlm/lowcomms.c2
-rw-r--r--fs/ecryptfs/inode.c11
-rw-r--r--fs/exec.c8
-rw-r--r--fs/ext2/acl.c4
-rw-r--r--fs/ext2/ext2.h4
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext2/super.c5
-rw-r--r--fs/ext4/acl.c4
-rw-r--r--fs/ext4/dir.c20
-rw-r--r--fs/ext4/ext4.h28
-rw-r--r--fs/ext4/ext4_extents.h13
-rw-r--r--fs/ext4/extents.c595
-rw-r--r--fs/ext4/extents_status.c654
-rw-r--r--fs/ext4/extents_status.h80
-rw-r--r--fs/ext4/inline.c6
-rw-r--r--fs/ext4/inode.c164
-rw-r--r--fs/ext4/ioctl.c97
-rw-r--r--fs/ext4/mballoc.c14
-rw-r--r--fs/ext4/mmp.c1
-rw-r--r--fs/ext4/move_extent.c8
-rw-r--r--fs/ext4/namei.c8
-rw-r--r--fs/ext4/page-io.c2
-rw-r--r--fs/ext4/resize.c23
-rw-r--r--fs/ext4/super.c85
-rw-r--r--fs/f2fs/acl.c12
-rw-r--r--fs/f2fs/acl.h5
-rw-r--r--fs/f2fs/checkpoint.c94
-rw-r--r--fs/f2fs/data.c226
-rw-r--r--fs/f2fs/debug.c35
-rw-r--r--fs/f2fs/dir.c32
-rw-r--r--fs/f2fs/extent_cache.c134
-rw-r--r--fs/f2fs/f2fs.h255
-rw-r--r--fs/f2fs/file.c196
-rw-r--r--fs/f2fs/gc.c115
-rw-r--r--fs/f2fs/gc.h5
-rw-r--r--fs/f2fs/hash.c5
-rw-r--r--fs/f2fs/inline.c10
-rw-r--r--fs/f2fs/inode.c28
-rw-r--r--fs/f2fs/namei.c57
-rw-r--r--fs/f2fs/node.c89
-rw-r--r--fs/f2fs/node.h5
-rw-r--r--fs/f2fs/recovery.c125
-rw-r--r--fs/f2fs/segment.c240
-rw-r--r--fs/f2fs/segment.h20
-rw-r--r--fs/f2fs/shrinker.c5
-rw-r--r--fs/f2fs/super.c415
-rw-r--r--fs/f2fs/sysfs.c17
-rw-r--r--fs/f2fs/trace.c5
-rw-r--r--fs/f2fs/trace.h5
-rw-r--r--fs/f2fs/xattr.c5
-rw-r--r--fs/f2fs/xattr.h5
-rw-r--r--fs/fat/dir.c6
-rw-r--r--fs/fat/fat.h4
-rw-r--r--fs/fat/fatent.c1
-rw-r--r--fs/fat/file.c17
-rw-r--r--fs/fat/inode.c9
-rw-r--r--fs/fat/misc.c91
-rw-r--r--fs/fat/namei_msdos.c17
-rw-r--r--fs/fat/namei_vfat.c15
-rw-r--r--fs/fcntl.c2
-rw-r--r--fs/fs-writeback.c25
-rw-r--r--fs/fscache/cookie.c31
-rw-r--r--fs/fscache/internal.h1
-rw-r--r--fs/fscache/main.c4
-rw-r--r--fs/fuse/Makefile2
-rw-r--r--fs/fuse/control.c34
-rw-r--r--fs/fuse/dev.c221
-rw-r--r--fs/fuse/dir.c381
-rw-r--r--fs/fuse/file.c160
-rw-r--r--fs/fuse/fuse_i.h124
-rw-r--r--fs/fuse/inode.c53
-rw-r--r--fs/fuse/readdir.c569
-rw-r--r--fs/gfs2/aops.c2
-rw-r--r--fs/gfs2/bmap.c6
-rw-r--r--fs/gfs2/dir.c28
-rw-r--r--fs/gfs2/file.c18
-rw-r--r--fs/gfs2/glock.c17
-rw-r--r--fs/gfs2/incore.h9
-rw-r--r--fs/gfs2/lock_dlm.c10
-rw-r--r--fs/gfs2/log.c11
-rw-r--r--fs/gfs2/lops.c2
-rw-r--r--fs/gfs2/main.c2
-rw-r--r--fs/gfs2/ops_fstype.c5
-rw-r--r--fs/gfs2/quota.c2
-rw-r--r--fs/gfs2/rgrp.c201
-rw-r--r--fs/gfs2/rgrp.h11
-rw-r--r--fs/gfs2/super.c4
-rw-r--r--fs/gfs2/trans.c15
-rw-r--r--fs/gfs2/util.c16
-rw-r--r--fs/gfs2/util.h2
-rw-r--r--fs/gfs2/xattr.c18
-rw-r--r--fs/hfs/brec.c5
-rw-r--r--fs/hfs/btree.c41
-rw-r--r--fs/hfs/btree.h1
-rw-r--r--fs/hfs/catalog.c16
-rw-r--r--fs/hfs/extent.c10
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfsplus/attributes.c10
-rw-r--r--fs/hfsplus/brec.c5
-rw-r--r--fs/hfsplus/btree.c44
-rw-r--r--fs/hfsplus/catalog.c24
-rw-r--r--fs/hfsplus/extents.c8
-rw-r--r--fs/hfsplus/hfsplus_fs.h2
-rw-r--r--fs/hfsplus/inode.c1
-rw-r--r--fs/inode.c4
-rw-r--r--fs/ioctl.c5
-rw-r--r--fs/iomap.c6
-rw-r--r--fs/isofs/dir.c2
-rw-r--r--fs/isofs/inode.c7
-rw-r--r--fs/jbd2/checkpoint.c4
-rw-r--r--fs/jffs2/background.c2
-rw-r--r--fs/jffs2/super.c4
-rw-r--r--fs/jfs/acl.c4
-rw-r--r--fs/jfs/inode.c1
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/kernfs/mount.c3
-rw-r--r--fs/kernfs/symlink.c5
-rw-r--r--fs/lockd/host.c2
-rw-r--r--fs/namespace.c4
-rw-r--r--fs/nfs/blocklayout/blocklayout.c2
-rw-r--r--fs/nfs/delegation.c17
-rw-r--r--fs/nfs/dir.c295
-rw-r--r--fs/nfs/dns_resolve.c15
-rw-r--r--fs/nfs/filelayout/filelayout.c1
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c1
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c2
-rw-r--r--fs/nfs/inode.c70
-rw-r--r--fs/nfs/nfs3proc.c5
-rw-r--r--fs/nfs/nfs3xdr.c10
-rw-r--r--fs/nfs/nfs4_fs.h3
-rw-r--r--fs/nfs/nfs4client.c16
-rw-r--r--fs/nfs/nfs4proc.c84
-rw-r--r--fs/nfs/nfs4state.c256
-rw-r--r--fs/nfs/nfs4trace.h4
-rw-r--r--fs/nfs/nfs4xdr.c2
-rw-r--r--fs/nfs/pagelist.c49
-rw-r--r--fs/nfs/pnfs.c42
-rw-r--r--fs/nfs/pnfs.h1
-rw-r--r--fs/nfs/read.c10
-rw-r--r--fs/nfsd/cache.h20
-rw-r--r--fs/nfsd/export.c14
-rw-r--r--fs/nfsd/export.h2
-rw-r--r--fs/nfsd/netns.h8
-rw-r--r--fs/nfsd/nfs4callback.c98
-rw-r--r--fs/nfsd/nfs4idmap.c11
-rw-r--r--fs/nfsd/nfs4proc.c289
-rw-r--r--fs/nfsd/nfs4state.c41
-rw-r--r--fs/nfsd/nfs4xdr.c50
-rw-r--r--fs/nfsd/nfscache.c142
-rw-r--r--fs/nfsd/nfsctl.c1
-rw-r--r--fs/nfsd/state.h10
-rw-r--r--fs/nfsd/vfs.c12
-rw-r--r--fs/nfsd/xdr4.h28
-rw-r--r--fs/nfsd/xdr4cb.h10
-rw-r--r--fs/nilfs2/alloc.c11
-rw-r--r--fs/nilfs2/alloc.h11
-rw-r--r--fs/nilfs2/bmap.c11
-rw-r--r--fs/nilfs2/bmap.h11
-rw-r--r--fs/nilfs2/btnode.c37
-rw-r--r--fs/nilfs2/btnode.h11
-rw-r--r--fs/nilfs2/btree.c11
-rw-r--r--fs/nilfs2/btree.h11
-rw-r--r--fs/nilfs2/cpfile.c11
-rw-r--r--fs/nilfs2/cpfile.h11
-rw-r--r--fs/nilfs2/dat.c11
-rw-r--r--fs/nilfs2/dat.h11
-rw-r--r--fs/nilfs2/dir.c11
-rw-r--r--fs/nilfs2/direct.c11
-rw-r--r--fs/nilfs2/direct.h11
-rw-r--r--fs/nilfs2/file.c11
-rw-r--r--fs/nilfs2/gcinode.c11
-rw-r--r--fs/nilfs2/ifile.c11
-rw-r--r--fs/nilfs2/ifile.h11
-rw-r--r--fs/nilfs2/inode.c11
-rw-r--r--fs/nilfs2/ioctl.c11
-rw-r--r--fs/nilfs2/mdt.c11
-rw-r--r--fs/nilfs2/mdt.h11
-rw-r--r--fs/nilfs2/namei.c11
-rw-r--r--fs/nilfs2/nilfs.h11
-rw-r--r--fs/nilfs2/page.c40
-rw-r--r--fs/nilfs2/page.h11
-rw-r--r--fs/nilfs2/recovery.c11
-rw-r--r--fs/nilfs2/segbuf.c11
-rw-r--r--fs/nilfs2/segbuf.h11
-rw-r--r--fs/nilfs2/segment.c11
-rw-r--r--fs/nilfs2/segment.h11
-rw-r--r--fs/nilfs2/sufile.c11
-rw-r--r--fs/nilfs2/sufile.h11
-rw-r--r--fs/nilfs2/super.c11
-rw-r--r--fs/nilfs2/sysfs.c11
-rw-r--r--fs/nilfs2/sysfs.h11
-rw-r--r--fs/nilfs2/the_nilfs.c11
-rw-r--r--fs/nilfs2/the_nilfs.h11
-rw-r--r--fs/notify/fanotify/fanotify.c17
-rw-r--r--fs/notify/fanotify/fanotify.h4
-rw-r--r--fs/notify/fanotify/fanotify_user.c103
-rw-r--r--fs/notify/fdinfo.c29
-rw-r--r--fs/notify/fsnotify.c55
-rw-r--r--fs/notify/fsnotify.h11
-rw-r--r--fs/notify/inotify/inotify_user.c2
-rw-r--r--fs/notify/mark.c49
-rw-r--r--fs/ocfs2/alloc.c4
-rw-r--r--fs/ocfs2/aops.c3
-rw-r--r--fs/ocfs2/buffer_head_io.c1
-rw-r--r--fs/ocfs2/cluster/tcp.c2
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c4
-rw-r--r--fs/ocfs2/dlm/dlmthread.c2
-rw-r--r--fs/ocfs2/dlmglue.c2
-rw-r--r--fs/ocfs2/refcounttree.c18
-rw-r--r--fs/orangefs/acl.c4
-rw-r--r--fs/orangefs/inode.c10
-rw-r--r--fs/orangefs/namei.c8
-rw-r--r--fs/orangefs/orangefs-sysfs.c2
-rw-r--r--fs/overlayfs/copy_up.c215
-rw-r--r--fs/overlayfs/dir.c34
-rw-r--r--fs/overlayfs/file.c25
-rw-r--r--fs/overlayfs/inode.c27
-rw-r--r--fs/overlayfs/namei.c6
-rw-r--r--fs/overlayfs/overlayfs.h18
-rw-r--r--fs/overlayfs/super.c94
-rw-r--r--fs/overlayfs/util.c49
-rw-r--r--fs/proc/base.c32
-rw-r--r--fs/proc/inode.c3
-rw-r--r--fs/proc/kcore.c3
-rw-r--r--fs/proc/loadavg.c3
-rw-r--r--fs/proc/meminfo.c16
-rw-r--r--fs/proc/page.c2
-rw-r--r--fs/proc/task_mmu.c6
-rw-r--r--fs/proc/vmcore.c38
-rw-r--r--fs/pstore/Kconfig1
-rw-r--r--fs/pstore/inode.c11
-rw-r--r--fs/pstore/internal.h5
-rw-r--r--fs/pstore/platform.c75
-rw-r--r--fs/pstore/ram.c47
-rw-r--r--fs/pstore/ram_core.c28
-rw-r--r--fs/quota/quota.c14
-rw-r--r--fs/read_write.c19
-rw-r--r--fs/reiserfs/Makefile9
-rw-r--r--fs/reiserfs/xattr.c7
-rw-r--r--fs/select.c20
-rw-r--r--fs/signalfd.c6
-rw-r--r--fs/splice.c7
-rw-r--r--fs/stat.c3
-rw-r--r--fs/super.c2
-rw-r--r--fs/timerfd.c12
-rw-r--r--fs/ubifs/super.c11
-rw-r--r--fs/ubifs/xattr.c24
-rw-r--r--fs/udf/balloc.c30
-rw-r--r--fs/udf/super.c232
-rw-r--r--fs/udf/udf_sb.h10
-rw-r--r--fs/userfaultfd.c8
-rw-r--r--fs/utimes.c73
-rw-r--r--fs/xattr.c24
-rw-r--r--fs/xfs/libxfs/xfs_attr.c264
-rw-r--r--fs/xfs/libxfs/xfs_attr.h (renamed from fs/xfs/xfs_attr.h)2
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c10
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c94
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h1
-rw-r--r--fs/xfs/libxfs/xfs_format.h10
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c30
-rw-r--r--fs/xfs/libxfs/xfs_sb.c5
-rw-r--r--fs/xfs/scrub/alloc.c1
-rw-r--r--fs/xfs/scrub/inode.c4
-rw-r--r--fs/xfs/scrub/repair.c128
-rw-r--r--fs/xfs/scrub/scrub.c13
-rw-r--r--fs/xfs/xfs_aops.c4
-rw-r--r--fs/xfs/xfs_aops.h14
-rw-r--r--fs/xfs/xfs_bmap_util.c81
-rw-r--r--fs/xfs/xfs_buf.c109
-rw-r--r--fs/xfs/xfs_buf.h2
-rw-r--r--fs/xfs/xfs_buf_item.c119
-rw-r--r--fs/xfs/xfs_buf_item.h1
-rw-r--r--fs/xfs/xfs_fsops.c50
-rw-r--r--fs/xfs/xfs_inode.c10
-rw-r--r--fs/xfs/xfs_ioctl.c8
-rw-r--r--fs/xfs/xfs_iomap.c53
-rw-r--r--fs/xfs/xfs_iops.c12
-rw-r--r--fs/xfs/xfs_log_recover.c10
-rw-r--r--fs/xfs/xfs_reflink.c362
-rw-r--r--fs/xfs/xfs_reflink.h4
-rw-r--r--fs/xfs/xfs_stats.c52
-rw-r--r--fs/xfs/xfs_stats.h28
-rw-r--r--fs/xfs/xfs_super.c38
-rw-r--r--fs/xfs/xfs_trace.h1
-rw-r--r--fs/xfs/xfs_trans.c10
-rw-r--r--fs/xfs/xfs_trans.h1
-rw-r--r--fs/xfs/xfs_trans_ail.c28
-rw-r--r--fs/xfs/xfs_trans_buf.c141
421 files changed, 16640 insertions, 7686 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 082d227fa56b..6261719f6f2a 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -276,7 +276,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
switch (handler->flags) {
case ACL_TYPE_ACCESS:
if (acl) {
- struct iattr iattr;
+ struct iattr iattr = { 0 };
struct posix_acl *old_acl = acl;
retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl);
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 89bac3d2f05b..619128b55837 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -61,6 +61,8 @@ enum {
Opt_cache_loose, Opt_fscache, Opt_mmap,
/* Access options */
Opt_access, Opt_posixacl,
+ /* Lock timeout option */
+ Opt_locktimeout,
/* Error token */
Opt_err
};
@@ -80,6 +82,7 @@ static const match_table_t tokens = {
{Opt_cachetag, "cachetag=%s"},
{Opt_access, "access=%s"},
{Opt_posixacl, "posixacl"},
+ {Opt_locktimeout, "locktimeout=%u"},
{Opt_err, NULL}
};
@@ -187,6 +190,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
#ifdef CONFIG_9P_FSCACHE
v9ses->cachetag = NULL;
#endif
+ v9ses->session_lock_timeout = P9_LOCK_TIMEOUT;
if (!opts)
return 0;
@@ -359,6 +363,23 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
#endif
break;
+ case Opt_locktimeout:
+ r = match_int(&args[0], &option);
+ if (r < 0) {
+ p9_debug(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
+ ret = r;
+ continue;
+ }
+ if (option < 1) {
+ p9_debug(P9_DEBUG_ERROR,
+ "locktimeout must be a greater than zero integer.\n");
+ ret = -EINVAL;
+ continue;
+ }
+ v9ses->session_lock_timeout = (long)option * HZ;
+ break;
+
default:
continue;
}
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 982e017acadb..129e5243a6bf 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -116,6 +116,7 @@ struct v9fs_session_info {
struct p9_client *clnt; /* 9p client */
struct list_head slist; /* list of sessions registered with v9fs */
struct rw_semaphore rename_sem;
+ long session_lock_timeout; /* retry interval for blocking locks */
};
/* cache_validity flags */
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index e1cbdfdb7c68..0bcbcc20f769 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -65,7 +65,7 @@ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
if (retval == 0)
return retval;
- iov_iter_bvec(&to, ITER_BVEC | READ, &bvec, 1, PAGE_SIZE);
+ iov_iter_bvec(&to, READ, &bvec, 1, PAGE_SIZE);
retval = p9_client_read(fid, page_offset(page), &to, &err);
if (err) {
@@ -175,7 +175,7 @@ static int v9fs_vfs_writepage_locked(struct page *page)
bvec.bv_page = page;
bvec.bv_offset = 0;
bvec.bv_len = len;
- iov_iter_bvec(&from, ITER_BVEC | WRITE, &bvec, 1, len);
+ iov_iter_bvec(&from, WRITE, &bvec, 1, len);
/* We should have writeback_fid always set */
BUG_ON(!v9inode->writeback_fid);
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index b0405d6aac85..00745147329d 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -76,15 +76,6 @@ static inline int dt_type(struct p9_wstat *mistat)
return rettype;
}
-static void p9stat_init(struct p9_wstat *stbuf)
-{
- stbuf->name = NULL;
- stbuf->uid = NULL;
- stbuf->gid = NULL;
- stbuf->muid = NULL;
- stbuf->extension = NULL;
-}
-
/**
* v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir
* @filp: opened file structure
@@ -114,7 +105,6 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
int err = 0;
struct p9_fid *fid;
int buflen;
- int reclen = 0;
struct p9_rdir *rdir;
struct kvec kvec;
@@ -133,7 +123,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
if (rdir->tail == rdir->head) {
struct iov_iter to;
int n;
- iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buflen);
+ iov_iter_kvec(&to, READ, &kvec, 1, buflen);
n = p9_client_read(file->private_data, ctx->pos, &to,
&err);
if (err)
@@ -145,15 +135,12 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
rdir->tail = n;
}
while (rdir->head < rdir->tail) {
- p9stat_init(&st);
err = p9stat_read(fid->clnt, rdir->buf + rdir->head,
rdir->tail - rdir->head, &st);
- if (err) {
+ if (err <= 0) {
p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
- p9stat_free(&st);
return -EIO;
}
- reclen = st.size+2;
over = !dir_emit(ctx, st.name, strlen(st.name),
v9fs_qid2ino(&st.qid), dt_type(&st));
@@ -161,8 +148,8 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
if (over)
return 0;
- rdir->head += reclen;
- ctx->pos += reclen;
+ rdir->head += err;
+ ctx->pos += err;
}
}
}
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 5f2e48d41d72..a25efa782fcc 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -154,6 +154,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
uint8_t status = P9_LOCK_ERROR;
int res = 0;
unsigned char fl_type;
+ struct v9fs_session_info *v9ses;
fid = filp->private_data;
BUG_ON(fid == NULL);
@@ -189,6 +190,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
if (IS_SETLKW(cmd))
flock.flags = P9_LOCK_FLAGS_BLOCK;
+ v9ses = v9fs_inode2v9ses(file_inode(filp));
+
/*
* if its a blocked request and we get P9_LOCK_BLOCKED as the status
* for lock request, keep on trying
@@ -202,8 +205,17 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
break;
if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd))
break;
- if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0)
+ if (schedule_timeout_interruptible(v9ses->session_lock_timeout)
+ != 0)
break;
+ /*
+ * p9_client_lock_dotl overwrites flock.client_id with the
+ * server message, free and reuse the client name
+ */
+ if (flock.client_id != fid->clnt->name) {
+ kfree(flock.client_id);
+ flock.client_id = fid->clnt->name;
+ }
}
/* map 9p status to VFS status */
@@ -216,7 +228,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
break;
default:
WARN_ONCE(1, "unknown lock status code: %d\n", status);
- /* fallthough */
+ /* fall through */
case P9_LOCK_ERROR:
case P9_LOCK_GRACE:
res = -ENOLCK;
@@ -235,6 +247,8 @@ out_unlock:
locks_lock_file_wait(filp, fl);
fl->fl_type = fl_type;
}
+ if (flock.client_id != fid->clnt->name)
+ kfree(flock.client_id);
out:
return res;
}
@@ -269,7 +283,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
res = p9_client_getlock_dotl(fid, &glock);
if (res < 0)
- return res;
+ goto out;
/* map 9p lock type to os lock type */
switch (glock.type) {
case P9_LOCK_TYPE_RDLCK:
@@ -290,7 +304,9 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
fl->fl_end = glock.start + glock.length - 1;
fl->fl_pid = -glock.proc_id;
}
- kfree(glock.client_id);
+out:
+ if (glock.client_id != fid->clnt->name)
+ kfree(glock.client_id);
return res;
}
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 352abc39e891..ac8ff8ca4c11 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -32,7 +32,7 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
struct iov_iter to;
int err;
- iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buffer_size);
+ iov_iter_kvec(&to, READ, &kvec, 1, buffer_size);
attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
if (IS_ERR(attr_fid)) {
@@ -107,7 +107,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
struct iov_iter from;
int retval, err;
- iov_iter_kvec(&from, WRITE | ITER_KVEC, &kvec, 1, value_len);
+ iov_iter_kvec(&from, WRITE, &kvec, 1, value_len);
p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n",
name, value_len, flags);
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index ebba3b18e5da..701aaa9b1899 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -27,3 +27,15 @@ config AFS_FSCACHE
help
Say Y here if you want AFS data to be cached locally on disk through
the generic filesystem cache manager
+
+config AFS_DEBUG_CURSOR
+ bool "AFS server cursor debugging"
+ depends on AFS_FS
+ help
+ Say Y here to cause the contents of a server cursor to be dumped to
+ the dmesg log if the server rotation algorithm fails to successfully
+ contact a server.
+
+ See <file:Documentation/filesystems/afs.txt> for more information.
+
+ If unsure, say N.
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 546874057bd3..0738e2bf5193 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -17,6 +17,7 @@ kafs-y := \
file.o \
flock.o \
fsclient.o \
+ fs_probe.o \
inode.o \
main.o \
misc.o \
@@ -29,9 +30,13 @@ kafs-y := \
super.o \
netdevices.o \
vlclient.o \
+ vl_list.o \
+ vl_probe.o \
+ vl_rotate.o \
volume.o \
write.o \
- xattr.o
+ xattr.o \
+ yfsclient.o
kafs-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_AFS_FS) := kafs.o
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
index 025a9a5e1c32..967db336d11a 100644
--- a/fs/afs/addr_list.c
+++ b/fs/afs/addr_list.c
@@ -17,11 +17,6 @@
#include "internal.h"
#include "afs_fs.h"
-//#define AFS_MAX_ADDRESSES
-// ((unsigned int)((PAGE_SIZE - sizeof(struct afs_addr_list)) /
-// sizeof(struct sockaddr_rxrpc)))
-#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
-
/*
* Release an address list.
*/
@@ -43,11 +38,15 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
_enter("%u,%u,%u", nr, service, port);
+ if (nr > AFS_MAX_ADDRESSES)
+ nr = AFS_MAX_ADDRESSES;
+
alist = kzalloc(struct_size(alist, addrs, nr), GFP_KERNEL);
if (!alist)
return NULL;
refcount_set(&alist->usage, 1);
+ alist->max_addrs = nr;
for (i = 0; i < nr; i++) {
struct sockaddr_rxrpc *srx = &alist->addrs[i];
@@ -65,19 +64,25 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
/*
* Parse a text string consisting of delimited addresses.
*/
-struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
- char delim,
- unsigned short service,
- unsigned short port)
+struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net,
+ const char *text, size_t len,
+ char delim,
+ unsigned short service,
+ unsigned short port)
{
+ struct afs_vlserver_list *vllist;
struct afs_addr_list *alist;
const char *p, *end = text + len;
+ const char *problem;
unsigned int nr = 0;
+ int ret = -ENOMEM;
_enter("%*.*s,%c", (int)len, (int)len, text, delim);
- if (!len)
+ if (!len) {
+ _leave(" = -EDESTADDRREQ [empty]");
return ERR_PTR(-EDESTADDRREQ);
+ }
if (delim == ':' && (memchr(text, ',', len) || !memchr(text, '.', len)))
delim = ',';
@@ -85,18 +90,24 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
/* Count the addresses */
p = text;
do {
- if (!*p)
- return ERR_PTR(-EINVAL);
+ if (!*p) {
+ problem = "nul";
+ goto inval;
+ }
if (*p == delim)
continue;
nr++;
if (*p == '[') {
p++;
- if (p == end)
- return ERR_PTR(-EINVAL);
+ if (p == end) {
+ problem = "brace1";
+ goto inval;
+ }
p = memchr(p, ']', end - p);
- if (!p)
- return ERR_PTR(-EINVAL);
+ if (!p) {
+ problem = "brace2";
+ goto inval;
+ }
p++;
if (p >= end)
break;
@@ -109,18 +120,27 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
} while (p < end);
_debug("%u/%u addresses", nr, AFS_MAX_ADDRESSES);
- if (nr > AFS_MAX_ADDRESSES)
- nr = AFS_MAX_ADDRESSES;
- alist = afs_alloc_addrlist(nr, service, port);
- if (!alist)
+ vllist = afs_alloc_vlserver_list(1);
+ if (!vllist)
return ERR_PTR(-ENOMEM);
+ vllist->nr_servers = 1;
+ vllist->servers[0].server = afs_alloc_vlserver("<dummy>", 7, AFS_VL_PORT);
+ if (!vllist->servers[0].server)
+ goto error_vl;
+
+ alist = afs_alloc_addrlist(nr, service, AFS_VL_PORT);
+ if (!alist)
+ goto error;
+
/* Extract the addresses */
p = text;
do {
- struct sockaddr_rxrpc *srx = &alist->addrs[alist->nr_addrs];
const char *q, *stop;
+ unsigned int xport = port;
+ __be32 x[4];
+ int family;
if (*p == delim) {
p++;
@@ -136,58 +156,74 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
break;
}
- if (in4_pton(p, q - p,
- (u8 *)&srx->transport.sin6.sin6_addr.s6_addr32[3],
- -1, &stop)) {
- srx->transport.sin6.sin6_addr.s6_addr32[0] = 0;
- srx->transport.sin6.sin6_addr.s6_addr32[1] = 0;
- srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff);
- } else if (in6_pton(p, q - p,
- srx->transport.sin6.sin6_addr.s6_addr,
- -1, &stop)) {
- /* Nothing to do */
+ if (in4_pton(p, q - p, (u8 *)&x[0], -1, &stop)) {
+ family = AF_INET;
+ } else if (in6_pton(p, q - p, (u8 *)x, -1, &stop)) {
+ family = AF_INET6;
} else {
+ problem = "family";
goto bad_address;
}
- if (stop != q)
+ p = q;
+ if (stop != p) {
+ problem = "nostop";
goto bad_address;
+ }
- p = q;
if (q < end && *q == ']')
p++;
if (p < end) {
if (*p == '+') {
/* Port number specification "+1234" */
- unsigned int xport = 0;
+ xport = 0;
p++;
- if (p >= end || !isdigit(*p))
+ if (p >= end || !isdigit(*p)) {
+ problem = "port";
goto bad_address;
+ }
do {
xport *= 10;
xport += *p - '0';
- if (xport > 65535)
+ if (xport > 65535) {
+ problem = "pval";
goto bad_address;
+ }
p++;
} while (p < end && isdigit(*p));
- srx->transport.sin6.sin6_port = htons(xport);
} else if (*p == delim) {
p++;
} else {
+ problem = "weird";
goto bad_address;
}
}
- alist->nr_addrs++;
- } while (p < end && alist->nr_addrs < AFS_MAX_ADDRESSES);
+ if (family == AF_INET)
+ afs_merge_fs_addr4(alist, x[0], xport);
+ else
+ afs_merge_fs_addr6(alist, x, xport);
+ } while (p < end);
+
+ rcu_assign_pointer(vllist->servers[0].server->addresses, alist);
_leave(" = [nr %u]", alist->nr_addrs);
- return alist;
+ return vllist;
-bad_address:
- kfree(alist);
+inval:
+ _leave(" = -EINVAL [%s %zu %*.*s]",
+ problem, p - text, (int)len, (int)len, text);
return ERR_PTR(-EINVAL);
+bad_address:
+ _leave(" = -EINVAL [%s %zu %*.*s]",
+ problem, p - text, (int)len, (int)len, text);
+ ret = -EINVAL;
+error:
+ afs_put_addrlist(alist);
+error_vl:
+ afs_put_vlserverlist(net, vllist);
+ return ERR_PTR(ret);
}
/*
@@ -206,30 +242,34 @@ static int afs_cmp_addr_list(const struct afs_addr_list *a1,
/*
* Perform a DNS query for VL servers and build a up an address list.
*/
-struct afs_addr_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry)
+struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry)
{
- struct afs_addr_list *alist;
- char *vllist = NULL;
+ struct afs_vlserver_list *vllist;
+ char *result = NULL;
int ret;
_enter("%s", cell->name);
- ret = dns_query("afsdb", cell->name, cell->name_len,
- "", &vllist, _expiry);
- if (ret < 0)
+ ret = dns_query("afsdb", cell->name, cell->name_len, "srv=1",
+ &result, _expiry);
+ if (ret < 0) {
+ _leave(" = %d [dns]", ret);
return ERR_PTR(ret);
-
- alist = afs_parse_text_addrs(vllist, strlen(vllist), ',',
- VL_SERVICE, AFS_VL_PORT);
- if (IS_ERR(alist)) {
- kfree(vllist);
- if (alist != ERR_PTR(-ENOMEM))
- pr_err("Failed to parse DNS data\n");
- return alist;
}
- kfree(vllist);
- return alist;
+ if (*_expiry == 0)
+ *_expiry = ktime_get_real_seconds() + 60;
+
+ if (ret > 1 && result[0] == 0)
+ vllist = afs_extract_vlserver_list(cell, result, ret);
+ else
+ vllist = afs_parse_text_addrs(cell->net, result, ret, ',',
+ VL_SERVICE, AFS_VL_PORT);
+ kfree(result);
+ if (IS_ERR(vllist) && vllist != ERR_PTR(-ENOMEM))
+ pr_err("Failed to parse DNS data %ld\n", PTR_ERR(vllist));
+
+ return vllist;
}
/*
@@ -237,19 +277,23 @@ struct afs_addr_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry)
*/
void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
{
- struct sockaddr_in6 *a;
- __be16 xport = htons(port);
+ struct sockaddr_rxrpc *srx;
+ u32 addr = ntohl(xdr);
int i;
+ if (alist->nr_addrs >= alist->max_addrs)
+ return;
+
for (i = 0; i < alist->nr_ipv4; i++) {
- a = &alist->addrs[i].transport.sin6;
- if (xdr == a->sin6_addr.s6_addr32[3] &&
- xport == a->sin6_port)
+ struct sockaddr_in *a = &alist->addrs[i].transport.sin;
+ u32 a_addr = ntohl(a->sin_addr.s_addr);
+ u16 a_port = ntohs(a->sin_port);
+
+ if (addr == a_addr && port == a_port)
return;
- if (xdr == a->sin6_addr.s6_addr32[3] &&
- (u16 __force)xport < (u16 __force)a->sin6_port)
+ if (addr == a_addr && port < a_port)
break;
- if ((u32 __force)xdr < (u32 __force)a->sin6_addr.s6_addr32[3])
+ if (addr < a_addr)
break;
}
@@ -258,12 +302,13 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
alist->addrs + i,
sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
- a = &alist->addrs[i].transport.sin6;
- a->sin6_port = xport;
- a->sin6_addr.s6_addr32[0] = 0;
- a->sin6_addr.s6_addr32[1] = 0;
- a->sin6_addr.s6_addr32[2] = htonl(0xffff);
- a->sin6_addr.s6_addr32[3] = xdr;
+ srx = &alist->addrs[i];
+ srx->srx_family = AF_RXRPC;
+ srx->transport_type = SOCK_DGRAM;
+ srx->transport_len = sizeof(srx->transport.sin);
+ srx->transport.sin.sin_family = AF_INET;
+ srx->transport.sin.sin_port = htons(port);
+ srx->transport.sin.sin_addr.s_addr = xdr;
alist->nr_ipv4++;
alist->nr_addrs++;
}
@@ -273,18 +318,20 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
*/
void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
{
- struct sockaddr_in6 *a;
- __be16 xport = htons(port);
+ struct sockaddr_rxrpc *srx;
int i, diff;
+ if (alist->nr_addrs >= alist->max_addrs)
+ return;
+
for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
- a = &alist->addrs[i].transport.sin6;
+ struct sockaddr_in6 *a = &alist->addrs[i].transport.sin6;
+ u16 a_port = ntohs(a->sin6_port);
+
diff = memcmp(xdr, &a->sin6_addr, 16);
- if (diff == 0 &&
- xport == a->sin6_port)
+ if (diff == 0 && port == a_port)
return;
- if (diff == 0 &&
- (u16 __force)xport < (u16 __force)a->sin6_port)
+ if (diff == 0 && port < a_port)
break;
if (diff < 0)
break;
@@ -295,12 +342,13 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
alist->addrs + i,
sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
- a = &alist->addrs[i].transport.sin6;
- a->sin6_port = xport;
- a->sin6_addr.s6_addr32[0] = xdr[0];
- a->sin6_addr.s6_addr32[1] = xdr[1];
- a->sin6_addr.s6_addr32[2] = xdr[2];
- a->sin6_addr.s6_addr32[3] = xdr[3];
+ srx = &alist->addrs[i];
+ srx->srx_family = AF_RXRPC;
+ srx->transport_type = SOCK_DGRAM;
+ srx->transport_len = sizeof(srx->transport.sin6);
+ srx->transport.sin6.sin6_family = AF_INET6;
+ srx->transport.sin6.sin6_port = htons(port);
+ memcpy(&srx->transport.sin6.sin6_addr, xdr, 16);
alist->nr_addrs++;
}
@@ -309,25 +357,33 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
*/
bool afs_iterate_addresses(struct afs_addr_cursor *ac)
{
- _enter("%hu+%hd", ac->start, (short)ac->index);
+ unsigned long set, failed;
+ int index;
if (!ac->alist)
return false;
- if (ac->begun) {
- ac->index++;
- if (ac->index == ac->alist->nr_addrs)
- ac->index = 0;
+ set = ac->alist->responded;
+ failed = ac->alist->failed;
+ _enter("%lx-%lx-%lx,%d", set, failed, ac->tried, ac->index);
- if (ac->index == ac->start) {
- ac->error = -EDESTADDRREQ;
- return false;
- }
- }
+ ac->nr_iterations++;
- ac->begun = true;
+ set &= ~(failed | ac->tried);
+
+ if (!set)
+ return false;
+
+ index = READ_ONCE(ac->alist->preferred);
+ if (test_bit(index, &set))
+ goto selected;
+
+ index = __ffs(set);
+
+selected:
+ ac->index = index;
+ set_bit(index, &ac->tried);
ac->responded = false;
- ac->addr = &ac->alist->addrs[ac->index];
return true;
}
@@ -340,53 +396,13 @@ int afs_end_cursor(struct afs_addr_cursor *ac)
alist = ac->alist;
if (alist) {
- if (ac->responded && ac->index != ac->start)
- WRITE_ONCE(alist->index, ac->index);
+ if (ac->responded &&
+ ac->index != alist->preferred &&
+ test_bit(ac->alist->preferred, &ac->tried))
+ WRITE_ONCE(alist->preferred, ac->index);
afs_put_addrlist(alist);
+ ac->alist = NULL;
}
- ac->addr = NULL;
- ac->alist = NULL;
- ac->begun = false;
return ac->error;
}
-
-/*
- * Set the address cursor for iterating over VL servers.
- */
-int afs_set_vl_cursor(struct afs_addr_cursor *ac, struct afs_cell *cell)
-{
- struct afs_addr_list *alist;
- int ret;
-
- if (!rcu_access_pointer(cell->vl_addrs)) {
- ret = wait_on_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET,
- TASK_INTERRUPTIBLE);
- if (ret < 0)
- return ret;
-
- if (!rcu_access_pointer(cell->vl_addrs) &&
- ktime_get_real_seconds() < cell->dns_expiry)
- return cell->error;
- }
-
- read_lock(&cell->vl_addrs_lock);
- alist = rcu_dereference_protected(cell->vl_addrs,
- lockdep_is_held(&cell->vl_addrs_lock));
- if (alist->nr_addrs > 0)
- afs_get_addrlist(alist);
- else
- alist = NULL;
- read_unlock(&cell->vl_addrs_lock);
-
- if (!alist)
- return -EDESTADDRREQ;
-
- ac->alist = alist;
- ac->addr = NULL;
- ac->start = READ_ONCE(alist->index);
- ac->index = ac->start;
- ac->error = 0;
- ac->begun = false;
- return 0;
-}
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index b4ff1f7ae4ab..d12ffb457e47 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -23,9 +23,9 @@
#define AFSPATHMAX 1024 /* Maximum length of a pathname plus NUL */
#define AFSOPAQUEMAX 1024 /* Maximum length of an opaque field */
-typedef unsigned afs_volid_t;
-typedef unsigned afs_vnodeid_t;
-typedef unsigned long long afs_dataversion_t;
+typedef u64 afs_volid_t;
+typedef u64 afs_vnodeid_t;
+typedef u64 afs_dataversion_t;
typedef enum {
AFSVL_RWVOL, /* read/write volume */
@@ -52,8 +52,9 @@ typedef enum {
*/
struct afs_fid {
afs_volid_t vid; /* volume ID */
- afs_vnodeid_t vnode; /* file index within volume */
- unsigned unique; /* unique ID number (file index version) */
+ afs_vnodeid_t vnode; /* Lower 64-bits of file index within volume */
+ u32 vnode_hi; /* Upper 32-bits of file index */
+ u32 unique; /* unique ID number (file index version) */
};
/*
@@ -67,14 +68,14 @@ typedef enum {
} afs_callback_type_t;
struct afs_callback {
+ time64_t expires_at; /* Time at which expires */
unsigned version; /* Callback version */
- unsigned expiry; /* Time at which expires */
afs_callback_type_t type; /* Type of callback */
};
struct afs_callback_break {
struct afs_fid fid; /* File identifier */
- struct afs_callback cb; /* Callback details */
+ //struct afs_callback cb; /* Callback details */
};
#define AFSCBMAX 50 /* maximum callbacks transferred per bulk op */
@@ -129,19 +130,18 @@ typedef u32 afs_access_t;
struct afs_file_status {
u64 size; /* file size */
afs_dataversion_t data_version; /* current data version */
- time_t mtime_client; /* last time client changed data */
- time_t mtime_server; /* last time server changed data */
- unsigned abort_code; /* Abort if bulk-fetching this failed */
-
- afs_file_type_t type; /* file type */
- unsigned nlink; /* link count */
- u32 author; /* author ID */
- u32 owner; /* owner ID */
- u32 group; /* group ID */
+ struct timespec64 mtime_client; /* Last time client changed data */
+ struct timespec64 mtime_server; /* Last time server changed data */
+ s64 author; /* author ID */
+ s64 owner; /* owner ID */
+ s64 group; /* group ID */
afs_access_t caller_access; /* access rights for authenticated caller */
afs_access_t anon_access; /* access rights for unauthenticated caller */
umode_t mode; /* UNIX mode */
+ afs_file_type_t type; /* file type */
+ u32 nlink; /* link count */
s32 lock_count; /* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */
+ u32 abort_code; /* Abort if bulk-fetching this failed */
};
/*
@@ -158,25 +158,27 @@ struct afs_file_status {
* AFS volume synchronisation information
*/
struct afs_volsync {
- time_t creation; /* volume creation time */
+ time64_t creation; /* volume creation time */
};
/*
* AFS volume status record
*/
struct afs_volume_status {
- u32 vid; /* volume ID */
- u32 parent_id; /* parent volume ID */
+ afs_volid_t vid; /* volume ID */
+ afs_volid_t parent_id; /* parent volume ID */
u8 online; /* true if volume currently online and available */
u8 in_service; /* true if volume currently in service */
u8 blessed; /* same as in_service */
u8 needs_salvage; /* true if consistency checking required */
u32 type; /* volume type (afs_voltype_t) */
- u32 min_quota; /* minimum space set aside (blocks) */
- u32 max_quota; /* maximum space this volume may occupy (blocks) */
- u32 blocks_in_use; /* space this volume currently occupies (blocks) */
- u32 part_blocks_avail; /* space available in volume's partition */
- u32 part_max_blocks; /* size of volume's partition */
+ u64 min_quota; /* minimum space set aside (blocks) */
+ u64 max_quota; /* maximum space this volume may occupy (blocks) */
+ u64 blocks_in_use; /* space this volume currently occupies (blocks) */
+ u64 part_blocks_avail; /* space available in volume's partition */
+ u64 part_max_blocks; /* size of volume's partition */
+ s64 vol_copy_date;
+ s64 vol_backup_date;
};
#define AFS_BLOCK_SIZE 1024
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index b1c31ec4523a..f6d0a21e8052 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -49,7 +49,7 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
struct afs_vnode *vnode = cookie_netfs_data;
struct afs_vnode_cache_aux aux;
- _enter("{%x,%x,%llx},%p,%u",
+ _enter("{%llx,%x,%llx},%p,%u",
vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
buffer, buflen);
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 5f261fbf2182..1c7955f5cdaf 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -210,12 +210,10 @@ void afs_init_callback_state(struct afs_server *server)
/*
* actually break a callback
*/
-void afs_break_callback(struct afs_vnode *vnode)
+void __afs_break_callback(struct afs_vnode *vnode)
{
_enter("");
- write_seqlock(&vnode->cb_lock);
-
clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
vnode->cb_break++;
@@ -230,7 +228,12 @@ void afs_break_callback(struct afs_vnode *vnode)
afs_lock_may_be_available(vnode);
spin_unlock(&vnode->lock);
}
+}
+void afs_break_callback(struct afs_vnode *vnode)
+{
+ write_seqlock(&vnode->cb_lock);
+ __afs_break_callback(vnode);
write_sequnlock(&vnode->cb_lock);
}
@@ -310,14 +313,10 @@ void afs_break_callbacks(struct afs_server *server, size_t count,
/* TODO: Sort the callback break list by volume ID */
for (; count > 0; callbacks++, count--) {
- _debug("- Fid { vl=%08x n=%u u=%u } CB { v=%u x=%u t=%u }",
+ _debug("- Fid { vl=%08llx n=%llu u=%u }",
callbacks->fid.vid,
callbacks->fid.vnode,
- callbacks->fid.unique,
- callbacks->cb.version,
- callbacks->cb.expiry,
- callbacks->cb.type
- );
+ callbacks->fid.unique);
afs_break_one_callback(server, &callbacks->fid);
}
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index f3d0bef16d78..cf445dbd5f2e 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -20,6 +20,8 @@
#include "internal.h"
static unsigned __read_mostly afs_cell_gc_delay = 10;
+static unsigned __read_mostly afs_cell_min_ttl = 10 * 60;
+static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60;
static void afs_manage_cell(struct work_struct *);
@@ -119,7 +121,7 @@ struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net,
*/
static struct afs_cell *afs_alloc_cell(struct afs_net *net,
const char *name, unsigned int namelen,
- const char *vllist)
+ const char *addresses)
{
struct afs_cell *cell;
int i, ret;
@@ -134,7 +136,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
if (namelen == 5 && memcmp(name, "@cell", 5) == 0)
return ERR_PTR(-EINVAL);
- _enter("%*.*s,%s", namelen, namelen, name, vllist);
+ _enter("%*.*s,%s", namelen, namelen, name, addresses);
cell = kzalloc(sizeof(struct afs_cell), GFP_KERNEL);
if (!cell) {
@@ -153,23 +155,26 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
(1 << AFS_CELL_FL_NO_LOOKUP_YET));
INIT_LIST_HEAD(&cell->proc_volumes);
rwlock_init(&cell->proc_lock);
- rwlock_init(&cell->vl_addrs_lock);
+ rwlock_init(&cell->vl_servers_lock);
/* Fill in the VL server list if we were given a list of addresses to
* use.
*/
- if (vllist) {
- struct afs_addr_list *alist;
-
- alist = afs_parse_text_addrs(vllist, strlen(vllist), ':',
- VL_SERVICE, AFS_VL_PORT);
- if (IS_ERR(alist)) {
- ret = PTR_ERR(alist);
+ if (addresses) {
+ struct afs_vlserver_list *vllist;
+
+ vllist = afs_parse_text_addrs(net,
+ addresses, strlen(addresses), ':',
+ VL_SERVICE, AFS_VL_PORT);
+ if (IS_ERR(vllist)) {
+ ret = PTR_ERR(vllist);
goto parse_failed;
}
- rcu_assign_pointer(cell->vl_addrs, alist);
+ rcu_assign_pointer(cell->vl_servers, vllist);
cell->dns_expiry = TIME64_MAX;
+ } else {
+ cell->dns_expiry = ktime_get_real_seconds();
}
_leave(" = %p", cell);
@@ -356,26 +361,40 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
*/
static void afs_update_cell(struct afs_cell *cell)
{
- struct afs_addr_list *alist, *old;
- time64_t now, expiry;
+ struct afs_vlserver_list *vllist, *old;
+ unsigned int min_ttl = READ_ONCE(afs_cell_min_ttl);
+ unsigned int max_ttl = READ_ONCE(afs_cell_max_ttl);
+ time64_t now, expiry = 0;
_enter("%s", cell->name);
- alist = afs_dns_query(cell, &expiry);
- if (IS_ERR(alist)) {
- switch (PTR_ERR(alist)) {
+ vllist = afs_dns_query(cell, &expiry);
+
+ now = ktime_get_real_seconds();
+ if (min_ttl > max_ttl)
+ max_ttl = min_ttl;
+ if (expiry < now + min_ttl)
+ expiry = now + min_ttl;
+ else if (expiry > now + max_ttl)
+ expiry = now + max_ttl;
+
+ if (IS_ERR(vllist)) {
+ switch (PTR_ERR(vllist)) {
case -ENODATA:
- /* The DNS said that the cell does not exist */
+ case -EDESTADDRREQ:
+ /* The DNS said that the cell does not exist or there
+ * weren't any addresses to be had.
+ */
set_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags);
clear_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags);
- cell->dns_expiry = ktime_get_real_seconds() + 61;
+ cell->dns_expiry = expiry;
break;
case -EAGAIN:
case -ECONNREFUSED:
default:
set_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags);
- cell->dns_expiry = ktime_get_real_seconds() + 10;
+ cell->dns_expiry = now + 10;
break;
}
@@ -387,12 +406,12 @@ static void afs_update_cell(struct afs_cell *cell)
/* Exclusion on changing vl_addrs is achieved by a
* non-reentrant work item.
*/
- old = rcu_dereference_protected(cell->vl_addrs, true);
- rcu_assign_pointer(cell->vl_addrs, alist);
+ old = rcu_dereference_protected(cell->vl_servers, true);
+ rcu_assign_pointer(cell->vl_servers, vllist);
cell->dns_expiry = expiry;
if (old)
- afs_put_addrlist(old);
+ afs_put_vlserverlist(cell->net, old);
}
if (test_and_clear_bit(AFS_CELL_FL_NO_LOOKUP_YET, &cell->flags))
@@ -414,7 +433,7 @@ static void afs_cell_destroy(struct rcu_head *rcu)
ASSERTCMP(atomic_read(&cell->usage), ==, 0);
- afs_put_addrlist(rcu_access_pointer(cell->vl_addrs));
+ afs_put_vlserverlist(cell->net, rcu_access_pointer(cell->vl_servers));
key_put(cell->anonymous_key);
kfree(cell);
@@ -514,6 +533,8 @@ static int afs_alloc_anon_key(struct afs_cell *cell)
*/
static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
{
+ struct hlist_node **p;
+ struct afs_cell *pcell;
int ret;
if (!cell->anonymous_key) {
@@ -534,7 +555,18 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
return ret;
mutex_lock(&net->proc_cells_lock);
- list_add_tail(&cell->proc_link, &net->proc_cells);
+ for (p = &net->proc_cells.first; *p; p = &(*p)->next) {
+ pcell = hlist_entry(*p, struct afs_cell, proc_link);
+ if (strcmp(cell->name, pcell->name) < 0)
+ break;
+ }
+
+ cell->proc_link.pprev = p;
+ cell->proc_link.next = *p;
+ rcu_assign_pointer(*p, &cell->proc_link.next);
+ if (cell->proc_link.next)
+ cell->proc_link.next->pprev = &cell->proc_link.next;
+
afs_dynroot_mkdir(net, cell);
mutex_unlock(&net->proc_cells_lock);
return 0;
@@ -550,7 +582,7 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell)
afs_proc_cell_remove(cell);
mutex_lock(&net->proc_cells_lock);
- list_del_init(&cell->proc_link);
+ hlist_del_rcu(&cell->proc_link);
afs_dynroot_rmdir(net, cell);
mutex_unlock(&net->proc_cells_lock);
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 9e51d6fe7e8f..8ee5972893ed 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -16,6 +16,7 @@
#include <linux/ip.h>
#include "internal.h"
#include "afs_cm.h"
+#include "protocol_yfs.h"
static int afs_deliver_cb_init_call_back_state(struct afs_call *);
static int afs_deliver_cb_init_call_back_state3(struct afs_call *);
@@ -30,6 +31,8 @@ static void SRXAFSCB_Probe(struct work_struct *);
static void SRXAFSCB_ProbeUuid(struct work_struct *);
static void SRXAFSCB_TellMeAboutYourself(struct work_struct *);
+static int afs_deliver_yfs_cb_callback(struct afs_call *);
+
#define CM_NAME(name) \
const char afs_SRXCB##name##_name[] __tracepoint_string = \
"CB." #name
@@ -101,12 +104,25 @@ static const struct afs_call_type afs_SRXCBTellMeAboutYourself = {
};
/*
+ * YFS CB.CallBack operation type
+ */
+static CM_NAME(YFS_CallBack);
+static const struct afs_call_type afs_SRXYFSCB_CallBack = {
+ .name = afs_SRXCBYFS_CallBack_name,
+ .deliver = afs_deliver_yfs_cb_callback,
+ .destructor = afs_cm_destructor,
+ .work = SRXAFSCB_CallBack,
+};
+
+/*
* route an incoming cache manager call
* - return T if supported, F if not
*/
bool afs_cm_incoming_call(struct afs_call *call)
{
- _enter("{CB.OP %u}", call->operation_ID);
+ _enter("{%u, CB.OP %u}", call->service_id, call->operation_ID);
+
+ call->epoch = rxrpc_kernel_get_epoch(call->net->socket, call->rxcall);
switch (call->operation_ID) {
case CBCallBack:
@@ -127,12 +143,102 @@ bool afs_cm_incoming_call(struct afs_call *call)
case CBTellMeAboutYourself:
call->type = &afs_SRXCBTellMeAboutYourself;
return true;
+ case YFSCBCallBack:
+ if (call->service_id != YFS_CM_SERVICE)
+ return false;
+ call->type = &afs_SRXYFSCB_CallBack;
+ return true;
default:
return false;
}
}
/*
+ * Record a probe to the cache manager from a server.
+ */
+static int afs_record_cm_probe(struct afs_call *call, struct afs_server *server)
+{
+ _enter("");
+
+ if (test_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags) &&
+ !test_bit(AFS_SERVER_FL_PROBING, &server->flags)) {
+ if (server->cm_epoch == call->epoch)
+ return 0;
+
+ if (!server->probe.said_rebooted) {
+ pr_notice("kAFS: FS rebooted %pU\n", &server->uuid);
+ server->probe.said_rebooted = true;
+ }
+ }
+
+ spin_lock(&server->probe_lock);
+
+ if (!test_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags)) {
+ server->cm_epoch = call->epoch;
+ server->probe.cm_epoch = call->epoch;
+ goto out;
+ }
+
+ if (server->probe.cm_probed &&
+ call->epoch != server->probe.cm_epoch &&
+ !server->probe.said_inconsistent) {
+ pr_notice("kAFS: FS endpoints inconsistent %pU\n",
+ &server->uuid);
+ server->probe.said_inconsistent = true;
+ }
+
+ if (!server->probe.cm_probed || call->epoch == server->cm_epoch)
+ server->probe.cm_epoch = server->cm_epoch;
+
+out:
+ server->probe.cm_probed = true;
+ spin_unlock(&server->probe_lock);
+ return 0;
+}
+
+/*
+ * Find the server record by peer address and record a probe to the cache
+ * manager from a server.
+ */
+static int afs_find_cm_server_by_peer(struct afs_call *call)
+{
+ struct sockaddr_rxrpc srx;
+ struct afs_server *server;
+
+ rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
+
+ server = afs_find_server(call->net, &srx);
+ if (!server) {
+ trace_afs_cm_no_server(call, &srx);
+ return 0;
+ }
+
+ call->cm_server = server;
+ return afs_record_cm_probe(call, server);
+}
+
+/*
+ * Find the server record by server UUID and record a probe to the cache
+ * manager from a server.
+ */
+static int afs_find_cm_server_by_uuid(struct afs_call *call,
+ struct afs_uuid *uuid)
+{
+ struct afs_server *server;
+
+ rcu_read_lock();
+ server = afs_find_server_by_uuid(call->net, call->request);
+ rcu_read_unlock();
+ if (!server) {
+ trace_afs_cm_no_server_u(call, call->request);
+ return 0;
+ }
+
+ call->cm_server = server;
+ return afs_record_cm_probe(call, server);
+}
+
+/*
* Clean up a cache manager call.
*/
static void afs_cm_destructor(struct afs_call *call)
@@ -168,7 +274,6 @@ static void SRXAFSCB_CallBack(struct work_struct *work)
static int afs_deliver_cb_callback(struct afs_call *call)
{
struct afs_callback_break *cb;
- struct sockaddr_rxrpc srx;
__be32 *bp;
int ret, loop;
@@ -176,32 +281,32 @@ static int afs_deliver_cb_callback(struct afs_call *call)
switch (call->unmarshall) {
case 0:
- call->offset = 0;
+ afs_extract_to_tmp(call);
call->unmarshall++;
/* extract the FID array and its count in two steps */
case 1:
_debug("extract FID count");
- ret = afs_extract_data(call, &call->tmp, 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
call->count = ntohl(call->tmp);
_debug("FID count: %u", call->count);
if (call->count > AFSCBMAX)
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_cb_fid_count);
call->buffer = kmalloc(array3_size(call->count, 3, 4),
GFP_KERNEL);
if (!call->buffer)
return -ENOMEM;
- call->offset = 0;
+ afs_extract_to_buf(call, call->count * 3 * 4);
call->unmarshall++;
case 2:
_debug("extract FID array");
- ret = afs_extract_data(call, call->buffer,
- call->count * 3 * 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
@@ -218,59 +323,46 @@ static int afs_deliver_cb_callback(struct afs_call *call)
cb->fid.vid = ntohl(*bp++);
cb->fid.vnode = ntohl(*bp++);
cb->fid.unique = ntohl(*bp++);
- cb->cb.type = AFSCM_CB_UNTYPED;
}
- call->offset = 0;
+ afs_extract_to_tmp(call);
call->unmarshall++;
/* extract the callback array and its count in two steps */
case 3:
_debug("extract CB count");
- ret = afs_extract_data(call, &call->tmp, 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
call->count2 = ntohl(call->tmp);
_debug("CB count: %u", call->count2);
if (call->count2 != call->count && call->count2 != 0)
- return afs_protocol_error(call, -EBADMSG);
- call->offset = 0;
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_cb_count);
+ call->_iter = &call->iter;
+ iov_iter_discard(&call->iter, READ, call->count2 * 3 * 4);
call->unmarshall++;
case 4:
- _debug("extract CB array");
- ret = afs_extract_data(call, call->buffer,
- call->count2 * 3 * 4, false);
+ _debug("extract discard %zu/%u",
+ iov_iter_count(&call->iter), call->count2 * 3 * 4);
+
+ ret = afs_extract_data(call, false);
if (ret < 0)
return ret;
- _debug("unmarshall CB array");
- cb = call->request;
- bp = call->buffer;
- for (loop = call->count2; loop > 0; loop--, cb++) {
- cb->cb.version = ntohl(*bp++);
- cb->cb.expiry = ntohl(*bp++);
- cb->cb.type = ntohl(*bp++);
- }
-
- call->offset = 0;
call->unmarshall++;
case 5:
break;
}
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
- return -EIO;
+ return afs_io_error(call, afs_io_error_cm_reply);
/* we'll need the file server record as that tells us which set of
* vnodes to operate upon */
- rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
- call->cm_server = afs_find_server(call->net, &srx);
- if (!call->cm_server)
- trace_afs_cm_no_server(call, &srx);
-
- return afs_queue_call_work(call);
+ return afs_find_cm_server_by_peer(call);
}
/*
@@ -294,24 +386,18 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work)
*/
static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
{
- struct sockaddr_rxrpc srx;
int ret;
_enter("");
- rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
-
- ret = afs_extract_data(call, NULL, 0, false);
+ afs_extract_discard(call, 0);
+ ret = afs_extract_data(call, false);
if (ret < 0)
return ret;
/* we'll need the file server record as that tells us which set of
* vnodes to operate upon */
- call->cm_server = afs_find_server(call->net, &srx);
- if (!call->cm_server)
- trace_afs_cm_no_server(call, &srx);
-
- return afs_queue_call_work(call);
+ return afs_find_cm_server_by_peer(call);
}
/*
@@ -330,16 +416,15 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
switch (call->unmarshall) {
case 0:
- call->offset = 0;
call->buffer = kmalloc_array(11, sizeof(__be32), GFP_KERNEL);
if (!call->buffer)
return -ENOMEM;
+ afs_extract_to_buf(call, 11 * sizeof(__be32));
call->unmarshall++;
case 1:
_debug("extract UUID");
- ret = afs_extract_data(call, call->buffer,
- 11 * sizeof(__be32), false);
+ ret = afs_extract_data(call, false);
switch (ret) {
case 0: break;
case -EAGAIN: return 0;
@@ -362,7 +447,6 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
for (loop = 0; loop < 6; loop++)
r->node[loop] = ntohl(b[loop + 5]);
- call->offset = 0;
call->unmarshall++;
case 2:
@@ -370,17 +454,11 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
}
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
- return -EIO;
+ return afs_io_error(call, afs_io_error_cm_reply);
/* we'll need the file server record as that tells us which set of
* vnodes to operate upon */
- rcu_read_lock();
- call->cm_server = afs_find_server_by_uuid(call->net, call->request);
- rcu_read_unlock();
- if (!call->cm_server)
- trace_afs_cm_no_server_u(call, call->request);
-
- return afs_queue_call_work(call);
+ return afs_find_cm_server_by_uuid(call, call->request);
}
/*
@@ -405,14 +483,14 @@ static int afs_deliver_cb_probe(struct afs_call *call)
_enter("");
- ret = afs_extract_data(call, NULL, 0, false);
+ afs_extract_discard(call, 0);
+ ret = afs_extract_data(call, false);
if (ret < 0)
return ret;
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
- return -EIO;
-
- return afs_queue_call_work(call);
+ return afs_io_error(call, afs_io_error_cm_reply);
+ return afs_find_cm_server_by_peer(call);
}
/*
@@ -453,16 +531,15 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
switch (call->unmarshall) {
case 0:
- call->offset = 0;
call->buffer = kmalloc_array(11, sizeof(__be32), GFP_KERNEL);
if (!call->buffer)
return -ENOMEM;
+ afs_extract_to_buf(call, 11 * sizeof(__be32));
call->unmarshall++;
case 1:
_debug("extract UUID");
- ret = afs_extract_data(call, call->buffer,
- 11 * sizeof(__be32), false);
+ ret = afs_extract_data(call, false);
switch (ret) {
case 0: break;
case -EAGAIN: return 0;
@@ -485,7 +562,6 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
for (loop = 0; loop < 6; loop++)
r->node[loop] = ntohl(b[loop + 5]);
- call->offset = 0;
call->unmarshall++;
case 2:
@@ -493,9 +569,8 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
}
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
- return -EIO;
-
- return afs_queue_call_work(call);
+ return afs_io_error(call, afs_io_error_cm_reply);
+ return afs_find_cm_server_by_uuid(call, call->request);
}
/*
@@ -570,12 +645,88 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call)
_enter("");
- ret = afs_extract_data(call, NULL, 0, false);
+ afs_extract_discard(call, 0);
+ ret = afs_extract_data(call, false);
if (ret < 0)
return ret;
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
- return -EIO;
+ return afs_io_error(call, afs_io_error_cm_reply);
+ return afs_find_cm_server_by_peer(call);
+}
+
+/*
+ * deliver request data to a YFS CB.CallBack call
+ */
+static int afs_deliver_yfs_cb_callback(struct afs_call *call)
+{
+ struct afs_callback_break *cb;
+ struct yfs_xdr_YFSFid *bp;
+ size_t size;
+ int ret, loop;
+
+ _enter("{%u}", call->unmarshall);
+
+ switch (call->unmarshall) {
+ case 0:
+ afs_extract_to_tmp(call);
+ call->unmarshall++;
+
+ /* extract the FID array and its count in two steps */
+ case 1:
+ _debug("extract FID count");
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ call->count = ntohl(call->tmp);
+ _debug("FID count: %u", call->count);
+ if (call->count > YFSCBMAX)
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_cb_fid_count);
+
+ size = array_size(call->count, sizeof(struct yfs_xdr_YFSFid));
+ call->buffer = kmalloc(size, GFP_KERNEL);
+ if (!call->buffer)
+ return -ENOMEM;
+ afs_extract_to_buf(call, size);
+ call->unmarshall++;
+
+ case 2:
+ _debug("extract FID array");
+ ret = afs_extract_data(call, false);
+ if (ret < 0)
+ return ret;
+
+ _debug("unmarshall FID array");
+ call->request = kcalloc(call->count,
+ sizeof(struct afs_callback_break),
+ GFP_KERNEL);
+ if (!call->request)
+ return -ENOMEM;
+
+ cb = call->request;
+ bp = call->buffer;
+ for (loop = call->count; loop > 0; loop--, cb++) {
+ cb->fid.vid = xdr_to_u64(bp->volume);
+ cb->fid.vnode = xdr_to_u64(bp->vnode.lo);
+ cb->fid.vnode_hi = ntohl(bp->vnode.hi);
+ cb->fid.unique = ntohl(bp->vnode.unique);
+ bp++;
+ }
+
+ afs_extract_to_tmp(call);
+ call->unmarshall++;
+
+ case 3:
+ break;
+ }
+
+ if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
+ return afs_io_error(call, afs_io_error_cm_reply);
- return afs_queue_call_work(call);
+ /* We'll need the file server record as that tells us which set of
+ * vnodes to operate upon.
+ */
+ return afs_find_cm_server_by_peer(call);
}
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 855bf2b79fed..43dea3b00c29 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -138,6 +138,7 @@ static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page,
ntohs(dbuf->blocks[tmp].hdr.magic));
trace_afs_dir_check_failed(dvnode, off, i_size);
kunmap(page);
+ trace_afs_file_error(dvnode, -EIO, afs_file_error_dir_bad_magic);
goto error;
}
@@ -190,9 +191,11 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
retry:
i_size = i_size_read(&dvnode->vfs_inode);
if (i_size < 2048)
- return ERR_PTR(-EIO);
- if (i_size > 2048 * 1024)
+ return ERR_PTR(afs_bad(dvnode, afs_file_error_dir_small));
+ if (i_size > 2048 * 1024) {
+ trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big);
return ERR_PTR(-EFBIG);
+ }
_enter("%llu", i_size);
@@ -315,7 +318,8 @@ content_has_grown:
/*
* deal with one block in an AFS directory
*/
-static int afs_dir_iterate_block(struct dir_context *ctx,
+static int afs_dir_iterate_block(struct afs_vnode *dvnode,
+ struct dir_context *ctx,
union afs_xdr_dir_block *block,
unsigned blkoff)
{
@@ -365,7 +369,7 @@ static int afs_dir_iterate_block(struct dir_context *ctx,
" (len %u/%zu)",
blkoff / sizeof(union afs_xdr_dir_block),
offset, next, tmp, nlen);
- return -EIO;
+ return afs_bad(dvnode, afs_file_error_dir_over_end);
}
if (!(block->hdr.bitmap[next / 8] &
(1 << (next % 8)))) {
@@ -373,7 +377,7 @@ static int afs_dir_iterate_block(struct dir_context *ctx,
" %u unmarked extension (len %u/%zu)",
blkoff / sizeof(union afs_xdr_dir_block),
offset, next, tmp, nlen);
- return -EIO;
+ return afs_bad(dvnode, afs_file_error_dir_unmarked_ext);
}
_debug("ENT[%zu.%u]: ext %u/%zu",
@@ -442,7 +446,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
*/
page = req->pages[blkoff / PAGE_SIZE];
if (!page) {
- ret = -EIO;
+ ret = afs_bad(dvnode, afs_file_error_dir_missing_page);
break;
}
mark_page_accessed(page);
@@ -455,7 +459,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
do {
dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) /
sizeof(union afs_xdr_dir_block)];
- ret = afs_dir_iterate_block(ctx, dblock, blkoff);
+ ret = afs_dir_iterate_block(dvnode, ctx, dblock, blkoff);
if (ret != 1) {
kunmap(page);
goto out;
@@ -548,7 +552,7 @@ static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry,
}
*fid = cookie.fid;
- _leave(" = 0 { vn=%u u=%u }", fid->vnode, fid->unique);
+ _leave(" = 0 { vn=%llu u=%u }", fid->vnode, fid->unique);
return 0;
}
@@ -826,7 +830,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
struct key *key;
int ret;
- _enter("{%x:%u},%p{%pd},",
+ _enter("{%llx:%llu},%p{%pd},",
dvnode->fid.vid, dvnode->fid.vnode, dentry, dentry);
ASSERTCMP(d_inode(dentry), ==, NULL);
@@ -896,7 +900,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
if (d_really_is_positive(dentry)) {
vnode = AFS_FS_I(d_inode(dentry));
- _enter("{v={%x:%u} n=%pd fl=%lx},",
+ _enter("{v={%llx:%llu} n=%pd fl=%lx},",
vnode->fid.vid, vnode->fid.vnode, dentry,
vnode->flags);
} else {
@@ -965,7 +969,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
/* if the vnode ID has changed, then the dirent points to a
* different file */
if (fid.vnode != vnode->fid.vnode) {
- _debug("%pd: dirent changed [%u != %u]",
+ _debug("%pd: dirent changed [%llu != %llu]",
dentry, fid.vnode,
vnode->fid.vnode);
goto not_found;
@@ -1085,6 +1089,7 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc,
vnode = AFS_FS_I(inode);
set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
+ afs_vnode_commit_status(fc, vnode, 0);
d_add(new_dentry, inode);
}
@@ -1104,7 +1109,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
mode |= S_IFDIR;
- _enter("{%x:%u},{%pd},%ho",
+ _enter("{%llx:%llu},{%pd},%ho",
dvnode->fid.vid, dvnode->fid.vnode, dentry, mode);
key = afs_request_key(dvnode->volume->cell);
@@ -1169,12 +1174,12 @@ static void afs_dir_remove_subdir(struct dentry *dentry)
static int afs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct afs_fs_cursor fc;
- struct afs_vnode *dvnode = AFS_FS_I(dir);
+ struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode = NULL;
struct key *key;
u64 data_version = dvnode->status.data_version;
int ret;
- _enter("{%x:%u},{%pd}",
+ _enter("{%llx:%llu},{%pd}",
dvnode->fid.vid, dvnode->fid.vnode, dentry);
key = afs_request_key(dvnode->volume->cell);
@@ -1183,11 +1188,19 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
goto error;
}
+ /* Try to make sure we have a callback promise on the victim. */
+ if (d_really_is_positive(dentry)) {
+ vnode = AFS_FS_I(d_inode(dentry));
+ ret = afs_validate(vnode, key);
+ if (ret < 0)
+ goto error_key;
+ }
+
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key)) {
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(dvnode);
- afs_fs_remove(&fc, dentry->d_name.name, true,
+ afs_fs_remove(&fc, vnode, dentry->d_name.name, true,
data_version);
}
@@ -1201,6 +1214,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
}
}
+error_key:
key_put(key);
error:
return ret;
@@ -1231,7 +1245,9 @@ static int afs_dir_remove_link(struct dentry *dentry, struct key *key,
if (d_really_is_positive(dentry)) {
struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
- if (dir_valid) {
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+ /* Already done */
+ } else if (dir_valid) {
drop_nlink(&vnode->vfs_inode);
if (vnode->vfs_inode.i_nlink == 0) {
set_bit(AFS_VNODE_DELETED, &vnode->flags);
@@ -1260,13 +1276,13 @@ static int afs_dir_remove_link(struct dentry *dentry, struct key *key,
static int afs_unlink(struct inode *dir, struct dentry *dentry)
{
struct afs_fs_cursor fc;
- struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
+ struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode = NULL;
struct key *key;
unsigned long d_version = (unsigned long)dentry->d_fsdata;
u64 data_version = dvnode->status.data_version;
int ret;
- _enter("{%x:%u},{%pd}",
+ _enter("{%llx:%llu},{%pd}",
dvnode->fid.vid, dvnode->fid.vnode, dentry);
if (dentry->d_name.len >= AFSNAMEMAX)
@@ -1290,7 +1306,18 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
if (afs_begin_vnode_operation(&fc, dvnode, key)) {
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(dvnode);
- afs_fs_remove(&fc, dentry->d_name.name, false,
+
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc.cbi->server->flags) &&
+ !test_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags)) {
+ yfs_fs_remove_file2(&fc, vnode, dentry->d_name.name,
+ data_version);
+ if (fc.ac.error != -ECONNABORTED ||
+ fc.ac.abort_code != RXGEN_OPCODE)
+ continue;
+ set_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags);
+ }
+
+ afs_fs_remove(&fc, vnode, dentry->d_name.name, false,
data_version);
}
@@ -1330,7 +1357,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
mode |= S_IFREG;
- _enter("{%x:%u},{%pd},%ho,",
+ _enter("{%llx:%llu},{%pd},%ho,",
dvnode->fid.vid, dvnode->fid.vnode, dentry, mode);
ret = -ENAMETOOLONG;
@@ -1393,7 +1420,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
dvnode = AFS_FS_I(dir);
data_version = dvnode->status.data_version;
- _enter("{%x:%u},{%x:%u},{%pd}",
+ _enter("{%llx:%llu},{%llx:%llu},{%pd}",
vnode->fid.vid, vnode->fid.vnode,
dvnode->fid.vid, dvnode->fid.vnode,
dentry);
@@ -1464,7 +1491,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
u64 data_version = dvnode->status.data_version;
int ret;
- _enter("{%x:%u},{%pd},%s",
+ _enter("{%llx:%llu},{%pd},%s",
dvnode->fid.vid, dvnode->fid.vnode, dentry,
content);
@@ -1540,7 +1567,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
orig_data_version = orig_dvnode->status.data_version;
new_data_version = new_dvnode->status.data_version;
- _enter("{%x:%u},{%x:%u},{%x:%u},{%pd}",
+ _enter("{%llx:%llu},{%llx:%llu},{%llx:%llu},{%pd}",
orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
vnode->fid.vid, vnode->fid.vnode,
new_dvnode->fid.vid, new_dvnode->fid.vnode,
@@ -1607,7 +1634,7 @@ static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags)
{
struct afs_vnode *dvnode = AFS_FS_I(page->mapping->host);
- _enter("{{%x:%u}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index);
+ _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index);
set_page_private(page, 0);
ClearPagePrivate(page);
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 1cde710a8013..a9ba81ddf154 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -46,7 +46,7 @@ static int afs_probe_cell_name(struct dentry *dentry)
return 0;
}
- ret = dns_query("afsdb", name, len, "", NULL, NULL);
+ ret = dns_query("afsdb", name, len, "srv=1", NULL, NULL);
if (ret == -ENODATA)
ret = -EDESTADDRREQ;
return ret;
@@ -62,7 +62,7 @@ struct inode *afs_try_auto_mntpt(struct dentry *dentry, struct inode *dir)
struct inode *inode;
int ret = -ENOENT;
- _enter("%p{%pd}, {%x:%u}",
+ _enter("%p{%pd}, {%llx:%llu}",
dentry, dentry, vnode->fid.vid, vnode->fid.vnode);
if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
@@ -265,7 +265,7 @@ int afs_dynroot_populate(struct super_block *sb)
return -ERESTARTSYS;
net->dynroot_sb = sb;
- list_for_each_entry(cell, &net->proc_cells, proc_link) {
+ hlist_for_each_entry(cell, &net->proc_cells, proc_link) {
ret = afs_dynroot_mkdir(net, cell);
if (ret < 0)
goto error;
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 7d4f26198573..d6bc3f5d784b 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -121,7 +121,7 @@ int afs_open(struct inode *inode, struct file *file)
struct key *key;
int ret;
- _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
+ _enter("{%llx:%llu},", vnode->fid.vid, vnode->fid.vnode);
key = afs_request_key(vnode->volume->cell);
if (IS_ERR(key)) {
@@ -170,7 +170,7 @@ int afs_release(struct inode *inode, struct file *file)
struct afs_vnode *vnode = AFS_FS_I(inode);
struct afs_file *af = file->private_data;
- _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
+ _enter("{%llx:%llu},", vnode->fid.vid, vnode->fid.vnode);
if ((file->f_mode & FMODE_WRITE))
return vfs_fsync(file, 0);
@@ -228,7 +228,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de
struct afs_fs_cursor fc;
int ret;
- _enter("%s{%x:%u.%u},%x,,,",
+ _enter("%s{%llx:%llu.%u},%x,,,",
vnode->volume->name,
vnode->fid.vid,
vnode->fid.vnode,
@@ -634,7 +634,7 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags)
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
unsigned long priv;
- _enter("{{%x:%u}[%lu],%lx},%x",
+ _enter("{{%llx:%llu}[%lu],%lx},%x",
vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
gfp_flags);
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index dc62d15a964b..0568fd986821 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -29,7 +29,7 @@ static const struct file_lock_operations afs_lock_ops = {
*/
void afs_lock_may_be_available(struct afs_vnode *vnode)
{
- _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+ _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
queue_delayed_work(afs_lock_manager, &vnode->lock_work, 0);
}
@@ -76,7 +76,7 @@ static int afs_set_lock(struct afs_vnode *vnode, struct key *key,
struct afs_fs_cursor fc;
int ret;
- _enter("%s{%x:%u.%u},%x,%u",
+ _enter("%s{%llx:%llu.%u},%x,%u",
vnode->volume->name,
vnode->fid.vid,
vnode->fid.vnode,
@@ -107,7 +107,7 @@ static int afs_extend_lock(struct afs_vnode *vnode, struct key *key)
struct afs_fs_cursor fc;
int ret;
- _enter("%s{%x:%u.%u},%x",
+ _enter("%s{%llx:%llu.%u},%x",
vnode->volume->name,
vnode->fid.vid,
vnode->fid.vnode,
@@ -138,7 +138,7 @@ static int afs_release_lock(struct afs_vnode *vnode, struct key *key)
struct afs_fs_cursor fc;
int ret;
- _enter("%s{%x:%u.%u},%x",
+ _enter("%s{%llx:%llu.%u},%x",
vnode->volume->name,
vnode->fid.vid,
vnode->fid.vnode,
@@ -175,7 +175,7 @@ void afs_lock_work(struct work_struct *work)
struct key *key;
int ret;
- _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+ _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
spin_lock(&vnode->lock);
@@ -192,7 +192,7 @@ again:
ret = afs_release_lock(vnode, vnode->lock_key);
if (ret < 0)
printk(KERN_WARNING "AFS:"
- " Failed to release lock on {%x:%x} error %d\n",
+ " Failed to release lock on {%llx:%llx} error %d\n",
vnode->fid.vid, vnode->fid.vnode, ret);
spin_lock(&vnode->lock);
@@ -229,7 +229,7 @@ again:
key_put(key);
if (ret < 0)
- pr_warning("AFS: Failed to extend lock on {%x:%x} error %d\n",
+ pr_warning("AFS: Failed to extend lock on {%llx:%llx} error %d\n",
vnode->fid.vid, vnode->fid.vnode, ret);
spin_lock(&vnode->lock);
@@ -430,7 +430,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
struct key *key = afs_file_key(file);
int ret;
- _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+ _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
/* only whole-file locks are supported */
if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
@@ -582,7 +582,7 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl)
struct afs_vnode *vnode = AFS_FS_I(locks_inode(file));
int ret;
- _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+ _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
/* Flush all pending writes before doing anything with locks. */
vfs_fsync(file, 0);
@@ -639,7 +639,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)
{
struct afs_vnode *vnode = AFS_FS_I(locks_inode(file));
- _enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
+ _enter("{%llx:%llu},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
vnode->fid.vid, vnode->fid.vnode, cmd,
fl->fl_type, fl->fl_flags,
(long long) fl->fl_start, (long long) fl->fl_end);
@@ -662,7 +662,7 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl)
{
struct afs_vnode *vnode = AFS_FS_I(locks_inode(file));
- _enter("{%x:%u},%d,{t=%x,fl=%x}",
+ _enter("{%llx:%llu},%d,{t=%x,fl=%x}",
vnode->fid.vid, vnode->fid.vnode, cmd,
fl->fl_type, fl->fl_flags);
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
new file mode 100644
index 000000000000..d049cb459742
--- /dev/null
+++ b/fs/afs/fs_probe.c
@@ -0,0 +1,270 @@
+/* AFS fileserver probing
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "afs_fs.h"
+#include "internal.h"
+#include "protocol_yfs.h"
+
+static bool afs_fs_probe_done(struct afs_server *server)
+{ /* Account for the completion of one probe of this server.  Returns true
+   * only for the call that drops server->probe_outstanding to zero, which
+   * also ends the probing round; every other call returns false.
+   */
+ if (!atomic_dec_and_test(&server->probe_outstanding))
+ return false; /* count not yet zero: other probes still outstanding */
+
+ wake_up_var(&server->probe_outstanding); /* wake anyone waiting on the counter */
+ clear_bit_unlock(AFS_SERVER_FL_PROBING, &server->flags); /* pairs with test_and_set_bit_lock() in afs_probe_fileservers() */
+ wake_up_bit(&server->flags, AFS_SERVER_FL_PROBING); /* wake anyone waiting on the bit */
+ return true;
+}
+
+/*
+ * Process the result of probing a fileserver. This is called after successful
+ * or failed delivery of an FS.GetCapabilities operation.
+ *
+ * The call supplies the address list (call->alist), the server (reply[0]),
+ * the server's index (reply[1], only used for the debug message here) and the
+ * index of the probed address (call->addr_ix).  Under server->probe_lock the
+ * outcome is recorded against that address in alist->responded/alist->failed
+ * and summarised in server->probe.
+ *
+ * After the lock is dropped, the probe is accounted for with
+ * afs_fs_probe_done().  Waiters on server->probe.have_result and on
+ * server->probe_wq are woken if this probe produced a new lowest RTT or was
+ * the last one outstanding for the server.
+ */
+void afs_fileserver_probe_result(struct afs_call *call)
+{
+ struct afs_addr_list *alist = call->alist;
+ struct afs_server *server = call->reply[0];
+ unsigned int server_index = (long)call->reply[1];
+ unsigned int index = call->addr_ix;
+ unsigned int rtt = UINT_MAX;
+ bool have_result = false;
+ u64 _rtt;
+ int ret = call->error;
+
+ _enter("%pU,%u", &server->uuid, index);
+
+ spin_lock(&server->probe_lock);
+
+ switch (ret) {
+ case 0:
+ server->probe.error = 0;
+ goto responded;
+ case -ECONNABORTED: /* An abort is still treated as a response. */
+ if (!server->probe.responded) {
+ server->probe.abort_code = call->abort_code;
+ server->probe.error = ret;
+ }
+ goto responded;
+ case -ENOMEM:
+ case -ENONET:
+ server->probe.local_failure = true; /* the address's responded/failed bits are left untouched */
+ afs_io_error(call, afs_io_error_fs_probe_fail);
+ goto out;
+ case -ECONNRESET: /* Responded, but call expired. */
+ case -ENETUNREACH:
+ case -EHOSTUNREACH:
+ case -ECONNREFUSED:
+ case -ETIMEDOUT:
+ case -ETIME:
+ default:
+ clear_bit(index, &alist->responded);
+ set_bit(index, &alist->failed);
+ if (!server->probe.responded &&
+ (server->probe.error == 0 ||
+ server->probe.error == -ETIMEDOUT ||
+ server->probe.error == -ETIME))
+ server->probe.error = ret; /* only replaces "no error" or a timeout, and only before any response */
+ afs_io_error(call, afs_io_error_fs_probe_fail);
+ goto out;
+ }
+
+responded:
+ set_bit(index, &alist->responded);
+ clear_bit(index, &alist->failed);
+
+ if (call->service_id == YFS_FS_SERVICE) {
+ server->probe.is_yfs = true;
+ set_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
+ alist->addrs[index].srx_service = call->service_id;
+ } else {
+ server->probe.not_yfs = true;
+ if (!server->probe.is_yfs) { /* keep the YFS marking if a YFS-service probe has already responded */
+ clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
+ alist->addrs[index].srx_service = call->service_id;
+ }
+ }
+
+ /* Get the RTT and scale it to fit into a 32-bit value that represents
+ * over a minute of time so that we can access it with one instruction
+ * on a 32-bit system.
+ */
+ _rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall);
+ _rtt /= 64;
+ rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt;
+ if (rtt < server->probe.rtt) { /* new lowest RTT: make this the preferred address */
+ server->probe.rtt = rtt;
+ alist->preferred = index;
+ have_result = true;
+ }
+
+ smp_wmb(); /* Set rtt before responded. */
+ server->probe.responded = true;
+ set_bit(AFS_SERVER_FL_PROBED, &server->flags);
+out:
+ spin_unlock(&server->probe_lock);
+
+ _debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
+ server_index, index, &alist->addrs[index].transport,
+ (unsigned int)rtt, ret);
+
+ have_result |= afs_fs_probe_done(server); /* also true if this was the last outstanding probe */
+ if (have_result) {
+ server->probe.have_result = true;
+ wake_up_var(&server->probe.have_result);
+ wake_up_all(&server->probe_wq);
+ }
+}
+
+/*
+ * Probe all of a fileserver's addresses to find out the best route and to
+ * query its capabilities.
+ *
+ * The server's current address list is read under server->fs_lock.  The
+ * outstanding-probe count is then set to the number of addresses and
+ * server->probe is reset (with rtt starting at UINT_MAX) before
+ * afs_fs_get_capabilities() is called for each address in turn, passing
+ * server_index through to it.  The final 'true' argument is presumably the
+ * "asynchronous" flag - confirm against afs_fs_get_capabilities().
+ *
+ * Returns 0 if every call returned -EINPROGRESS; otherwise stops at the first
+ * call that returned anything else, drops one outstanding count and returns
+ * that value.
+ *
+ * NOTE(review): on the early return only one count is dropped here, so the
+ * counts for the addresses not yet tried appear to stay outstanding unless
+ * something outside this function accounts for them - confirm.
+ *
+ * NOTE(review): the address list is used after fs_lock has been released;
+ * presumably something else keeps it alive for the duration - confirm.
+ */
+static int afs_do_probe_fileserver(struct afs_net *net,
+ struct afs_server *server,
+ struct key *key,
+ unsigned int server_index)
+{
+ struct afs_addr_cursor ac = {
+ .index = 0,
+ };
+ int ret;
+
+ _enter("%pU", &server->uuid);
+
+ read_lock(&server->fs_lock);
+ ac.alist = rcu_dereference_protected(server->addresses,
+ lockdep_is_held(&server->fs_lock));
+ read_unlock(&server->fs_lock);
+
+ atomic_set(&server->probe_outstanding, ac.alist->nr_addrs); /* one count per address to be probed */
+ memset(&server->probe, 0, sizeof(server->probe));
+ server->probe.rtt = UINT_MAX; /* so that any measured RTT compares lower */
+
+ for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) {
+ ret = afs_fs_get_capabilities(net, server, &ac, key, server_index,
+ true);
+ if (ret != -EINPROGRESS) {
+ afs_fs_probe_done(server);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Send off probes to all unprobed servers.
+ *
+ * Servers in the list that are already flagged AFS_SERVER_FL_PROBED are
+ * skipped.  For the rest, AFS_SERVER_FL_PROBING is used as a lock bit: only
+ * if this caller is the one to set it are the server's addresses probed; a
+ * server whose bit was already set is passed over.  The server's position in
+ * the list is handed down as its server_index.
+ *
+ * Returns 0, or the first non-zero value returned by
+ * afs_do_probe_fileserver(), in which case the remaining servers in the list
+ * are not visited.
+ */
+int afs_probe_fileservers(struct afs_net *net, struct key *key,
+ struct afs_server_list *list)
+{
+ struct afs_server *server;
+ int i, ret;
+
+ for (i = 0; i < list->nr_servers; i++) {
+ server = list->servers[i].server;
+ if (test_bit(AFS_SERVER_FL_PROBED, &server->flags))
+ continue;
+
+ if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &server->flags)) { /* we took the bit; released in afs_fs_probe_done() */
+ ret = afs_do_probe_fileserver(net, server, key, i);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Wait for the first as-yet untried fileserver to respond.
+ *
+ * @untried is a bitmask of indices into slist->servers.  Servers in it that
+ * have no probe in progress are dropped from the mask.  If any server that
+ * was in the mask has already responded, or nothing is left in the mask, 0 is
+ * returned at once without sleeping and without changing slist->preferred.
+ *
+ * Otherwise the caller sleeps interruptibly, queued on the probe_wq of each
+ * remaining server, until one of them has responded, none of them is still
+ * being probed or a signal is pending.  Afterwards slist->preferred is set to
+ * the remaining server that responded with the lowest probe RTT, if there is
+ * one.
+ *
+ * Returns 0, -ENOMEM if the wait queue entries could not be allocated, or
+ * -ERESTARTSYS if a signal is pending and no server was picked.
+ *
+ * NOTE(review): bit i of an unsigned long is tested for every i below
+ * slist->nr_servers, which assumes nr_servers does not exceed the number of
+ * bits in an unsigned long - confirm.
+ */
+int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried)
+{
+ struct wait_queue_entry *waits;
+ struct afs_server *server;
+ unsigned int rtt = UINT_MAX;
+ bool have_responders = false;
+ int pref = -1, i;
+
+ _enter("%u,%lx", slist->nr_servers, untried);
+
+ /* Only wait for servers that have a probe outstanding. */
+ for (i = 0; i < slist->nr_servers; i++) {
+ if (test_bit(i, &untried)) {
+ server = slist->servers[i].server;
+ if (!test_bit(AFS_SERVER_FL_PROBING, &server->flags))
+ __clear_bit(i, &untried);
+ if (server->probe.responded)
+ have_responders = true;
+ }
+ }
+ if (have_responders || !untried)
+ return 0;
+
+ waits = kmalloc(array_size(slist->nr_servers, sizeof(*waits)), GFP_KERNEL); /* one slot per server; only slots for servers still in the mask are used */
+ if (!waits)
+ return -ENOMEM;
+
+ for (i = 0; i < slist->nr_servers; i++) {
+ if (test_bit(i, &untried)) {
+ server = slist->servers[i].server;
+ init_waitqueue_entry(&waits[i], current);
+ add_wait_queue(&server->probe_wq, &waits[i]);
+ }
+ }
+
+ for (;;) {
+ bool still_probing = false;
+
+ set_current_state(TASK_INTERRUPTIBLE); /* set before the checks so that a wake-up between check and schedule() isn't lost */
+ for (i = 0; i < slist->nr_servers; i++) {
+ if (test_bit(i, &untried)) {
+ server = slist->servers[i].server;
+ if (server->probe.responded)
+ goto stop;
+ if (test_bit(AFS_SERVER_FL_PROBING, &server->flags))
+ still_probing = true;
+ }
+ }
+
+ if (!still_probing || unlikely(signal_pending(current)))
+ goto stop;
+ schedule();
+ }
+
+stop:
+ set_current_state(TASK_RUNNING);
+
+ for (i = 0; i < slist->nr_servers; i++) {
+ if (test_bit(i, &untried)) {
+ server = slist->servers[i].server;
+ if (server->probe.responded &&
+ server->probe.rtt < rtt) { /* track the lowest-RTT responder */
+ pref = i;
+ rtt = server->probe.rtt;
+ }
+
+ remove_wait_queue(&server->probe_wq, &waits[i]);
+ }
+ }
+
+ kfree(waits);
+
+ if (pref == -1 && signal_pending(current))
+ return -ERESTARTSYS;
+
+ if (pref >= 0)
+ slist->preferred = pref;
+ return 0;
+}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 50929cb91732..ca08c83168f5 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -17,15 +17,10 @@
#include "internal.h"
#include "afs_fs.h"
#include "xdr_fs.h"
+#include "protocol_yfs.h"
static const struct afs_fid afs_zero_fid;
-/*
- * We need somewhere to discard into in case the server helpfully returns more
- * than we asked for in FS.FetchData{,64}.
- */
-static u8 afs_discard_buffer[64];
-
static inline void afs_use_fs_server(struct afs_call *call, struct afs_cb_interest *cbi)
{
call->cbi = afs_get_cb_interest(cbi);
@@ -75,8 +70,7 @@ void afs_update_inode_from_status(struct afs_vnode *vnode,
struct timespec64 t;
umode_t mode;
- t.tv_sec = status->mtime_client;
- t.tv_nsec = 0;
+ t = status->mtime_client;
vnode->vfs_inode.i_ctime = t;
vnode->vfs_inode.i_mtime = t;
vnode->vfs_inode.i_atime = t;
@@ -96,7 +90,7 @@ void afs_update_inode_from_status(struct afs_vnode *vnode,
if (!(flags & AFS_VNODE_NOT_YET_SET)) {
if (expected_version &&
*expected_version != status->data_version) {
- _debug("vnode modified %llx on {%x:%u} [exp %llx]",
+ _debug("vnode modified %llx on {%llx:%llu} [exp %llx]",
(unsigned long long) status->data_version,
vnode->fid.vid, vnode->fid.vnode,
(unsigned long long) *expected_version);
@@ -170,7 +164,7 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call,
if (type != status->type &&
vnode &&
!test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
- pr_warning("Vnode %x:%x:%x changed type %u to %u\n",
+ pr_warning("Vnode %llx:%llx:%x changed type %u to %u\n",
vnode->fid.vid,
vnode->fid.vnode,
vnode->fid.unique,
@@ -200,8 +194,10 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call,
EXTRACT_M(mode);
EXTRACT_M(group);
- status->mtime_client = ntohl(xdr->mtime_client);
- status->mtime_server = ntohl(xdr->mtime_server);
+ status->mtime_client.tv_sec = ntohl(xdr->mtime_client);
+ status->mtime_client.tv_nsec = 0;
+ status->mtime_server.tv_sec = ntohl(xdr->mtime_server);
+ status->mtime_server.tv_nsec = 0;
status->lock_count = ntohl(xdr->lock_count);
size = (u64)ntohl(xdr->size_lo);
@@ -233,7 +229,7 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call,
bad:
xdr_dump_bad(*_bp);
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
}
/*
@@ -273,7 +269,7 @@ static void xdr_decode_AFSCallBack(struct afs_call *call,
write_seqlock(&vnode->cb_lock);
- if (call->cb_break == afs_cb_break_sum(vnode, cbi)) {
+ if (!afs_cb_is_broken(call->cb_break, vnode, cbi)) {
vnode->cb_version = ntohl(*bp++);
cb_expiry = ntohl(*bp++);
vnode->cb_type = ntohl(*bp++);
@@ -293,13 +289,19 @@ static void xdr_decode_AFSCallBack(struct afs_call *call,
*_bp = bp;
}
-static void xdr_decode_AFSCallBack_raw(const __be32 **_bp,
+/*
+ * Convert a relative expiry time in seconds, as decoded from a reply, into an
+ * absolute ktime by adding it to call->reply_time.
+ */
+static ktime_t xdr_decode_expiry(struct afs_call *call, u32 expiry)
+{
+ /* Widen before multiplying: NSEC_PER_SEC is a long constant, so where
+  * long is 32 bits a u32 * long product would wrap before it ever
+  * reached the 64-bit nanosecond argument of ktime_add_ns().
+  */
+ return ktime_add_ns(call->reply_time, (u64)expiry * NSEC_PER_SEC);
+}
+
+static void xdr_decode_AFSCallBack_raw(struct afs_call *call,
+ const __be32 **_bp,
struct afs_callback *cb)
{
const __be32 *bp = *_bp;
cb->version = ntohl(*bp++);
- cb->expiry = ntohl(*bp++);
+ cb->expires_at = xdr_decode_expiry(call, ntohl(*bp++));
cb->type = ntohl(*bp++);
*_bp = bp;
}
@@ -311,14 +313,18 @@ static void xdr_decode_AFSVolSync(const __be32 **_bp,
struct afs_volsync *volsync)
{
const __be32 *bp = *_bp;
+ u32 creation;
- volsync->creation = ntohl(*bp++);
+ creation = ntohl(*bp++);
bp++; /* spare2 */
bp++; /* spare3 */
bp++; /* spare4 */
bp++; /* spare5 */
bp++; /* spare6 */
*_bp = bp;
+
+ if (volsync)
+ volsync->creation = creation;
}
/*
@@ -379,6 +385,8 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp,
vs->blocks_in_use = ntohl(*bp++);
vs->part_blocks_avail = ntohl(*bp++);
vs->part_max_blocks = ntohl(*bp++);
+ vs->vol_copy_date = 0;
+ vs->vol_backup_date = 0;
*_bp = bp;
}
@@ -395,16 +403,16 @@ static int afs_deliver_fs_fetch_status_vnode(struct afs_call *call)
if (ret < 0)
return ret;
- _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+ _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- if (afs_decode_status(call, &bp, &vnode->status, vnode,
- &call->expected_version, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
+ ret = afs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
xdr_decode_AFSCallBack(call, vnode, &bp);
- if (call->reply[1])
- xdr_decode_AFSVolSync(&bp, call->reply[1]);
+ xdr_decode_AFSVolSync(&bp, call->reply[1]);
_leave(" = 0 [done]");
return 0;
@@ -431,7 +439,10 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy
struct afs_net *net = afs_v2net(vnode);
__be32 *bp;
- _enter(",%x,{%x:%u},,",
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_fetch_file_status(fc, volsync, new_inode);
+
+ _enter(",%x,{%llx:%llu},,",
key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus_vnode,
@@ -445,6 +456,7 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy
call->reply[0] = vnode;
call->reply[1] = volsync;
call->expected_version = new_inode ? 1 : vnode->status.data_version;
+ call->want_reply_time = true;
/* marshall the parameters */
bp = call->request;
@@ -468,139 +480,117 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
struct afs_read *req = call->reply[2];
const __be32 *bp;
unsigned int size;
- void *buffer;
int ret;
- _enter("{%u,%zu/%u;%llu/%llu}",
- call->unmarshall, call->offset, call->count,
- req->remain, req->actual_len);
+ _enter("{%u,%zu/%llu}",
+ call->unmarshall, iov_iter_count(&call->iter), req->actual_len);
switch (call->unmarshall) {
case 0:
req->actual_len = 0;
- call->offset = 0;
+ req->index = 0;
+ req->offset = req->pos & (PAGE_SIZE - 1);
call->unmarshall++;
- if (call->operation_ID != FSFETCHDATA64) {
- call->unmarshall++;
- goto no_msw;
+ if (call->operation_ID == FSFETCHDATA64) {
+ afs_extract_to_tmp64(call);
+ } else {
+ call->tmp_u = htonl(0);
+ afs_extract_to_tmp(call);
}
- /* extract the upper part of the returned data length of an
- * FSFETCHDATA64 op (which should always be 0 using this
- * client) */
- case 1:
- _debug("extract data length (MSW)");
- ret = afs_extract_data(call, &call->tmp, 4, true);
- if (ret < 0)
- return ret;
-
- req->actual_len = ntohl(call->tmp);
- req->actual_len <<= 32;
- call->offset = 0;
- call->unmarshall++;
-
- no_msw:
/* extract the returned data length */
- case 2:
+ case 1:
_debug("extract data length");
- ret = afs_extract_data(call, &call->tmp, 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
- req->actual_len |= ntohl(call->tmp);
+ req->actual_len = be64_to_cpu(call->tmp64);
_debug("DATA length: %llu", req->actual_len);
-
- req->remain = req->actual_len;
- call->offset = req->pos & (PAGE_SIZE - 1);
- req->index = 0;
- if (req->actual_len == 0)
+ req->remain = min(req->len, req->actual_len);
+ if (req->remain == 0)
goto no_more_data;
+
call->unmarshall++;
begin_page:
ASSERTCMP(req->index, <, req->nr_pages);
- if (req->remain > PAGE_SIZE - call->offset)
- size = PAGE_SIZE - call->offset;
+ if (req->remain > PAGE_SIZE - req->offset)
+ size = PAGE_SIZE - req->offset;
else
size = req->remain;
- call->count = call->offset + size;
- ASSERTCMP(call->count, <=, PAGE_SIZE);
- req->remain -= size;
+ call->bvec[0].bv_len = size;
+ call->bvec[0].bv_offset = req->offset;
+ call->bvec[0].bv_page = req->pages[req->index];
+ iov_iter_bvec(&call->iter, READ, call->bvec, 1, size);
+ ASSERTCMP(size, <=, PAGE_SIZE);
/* extract the returned data */
- case 3:
- _debug("extract data %llu/%llu %zu/%u",
- req->remain, req->actual_len, call->offset, call->count);
+ case 2:
+ _debug("extract data %zu/%llu",
+ iov_iter_count(&call->iter), req->remain);
- buffer = kmap(req->pages[req->index]);
- ret = afs_extract_data(call, buffer, call->count, true);
- kunmap(req->pages[req->index]);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
- if (call->offset == PAGE_SIZE) {
+ req->remain -= call->bvec[0].bv_len;
+ req->offset += call->bvec[0].bv_len;
+ ASSERTCMP(req->offset, <=, PAGE_SIZE);
+ if (req->offset == PAGE_SIZE) {
+ req->offset = 0;
if (req->page_done)
req->page_done(call, req);
req->index++;
- if (req->remain > 0) {
- call->offset = 0;
- if (req->index >= req->nr_pages) {
- call->unmarshall = 4;
- goto begin_discard;
- }
+ if (req->remain > 0)
goto begin_page;
- }
}
- goto no_more_data;
+
+ ASSERTCMP(req->remain, ==, 0);
+ if (req->actual_len <= req->len)
+ goto no_more_data;
/* Discard any excess data the server gave us */
- begin_discard:
- case 4:
- size = min_t(loff_t, sizeof(afs_discard_buffer), req->remain);
- call->count = size;
- _debug("extract discard %llu/%llu %zu/%u",
- req->remain, req->actual_len, call->offset, call->count);
-
- call->offset = 0;
- ret = afs_extract_data(call, afs_discard_buffer, call->count, true);
- req->remain -= call->offset;
+ iov_iter_discard(&call->iter, READ, req->actual_len - req->len);
+ call->unmarshall = 3;
+ case 3:
+ _debug("extract discard %zu/%llu",
+ iov_iter_count(&call->iter), req->actual_len - req->len);
+
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
- if (req->remain > 0)
- goto begin_discard;
no_more_data:
- call->offset = 0;
- call->unmarshall = 5;
+ call->unmarshall = 4;
+ afs_extract_to_buf(call, (21 + 3 + 6) * 4);
/* extract the metadata */
- case 5:
- ret = afs_extract_data(call, call->buffer,
- (21 + 3 + 6) * 4, false);
+ case 4:
+ ret = afs_extract_data(call, false);
if (ret < 0)
return ret;
bp = call->buffer;
- if (afs_decode_status(call, &bp, &vnode->status, vnode,
- &vnode->status.data_version, req) < 0)
- return afs_protocol_error(call, -EBADMSG);
+ ret = afs_decode_status(call, &bp, &vnode->status, vnode,
+ &vnode->status.data_version, req);
+ if (ret < 0)
+ return ret;
xdr_decode_AFSCallBack(call, vnode, &bp);
- if (call->reply[1])
- xdr_decode_AFSVolSync(&bp, call->reply[1]);
+ xdr_decode_AFSVolSync(&bp, call->reply[1]);
- call->offset = 0;
call->unmarshall++;
- case 6:
+ case 5:
break;
}
for (; req->index < req->nr_pages; req->index++) {
- if (call->count < PAGE_SIZE)
+ if (req->offset < PAGE_SIZE)
zero_user_segment(req->pages[req->index],
- call->count, PAGE_SIZE);
+ req->offset, PAGE_SIZE);
if (req->page_done)
req->page_done(call, req);
- call->count = 0;
+ req->offset = 0;
}
_leave(" = 0 [done]");
@@ -653,6 +643,7 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req)
call->reply[1] = NULL; /* volsync */
call->reply[2] = req;
call->expected_version = vnode->status.data_version;
+ call->want_reply_time = true;
/* marshall the parameters */
bp = call->request;
@@ -682,6 +673,9 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
struct afs_net *net = afs_v2net(vnode);
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_fetch_data(fc, req);
+
if (upper_32_bits(req->pos) ||
upper_32_bits(req->len) ||
upper_32_bits(req->pos + req->len))
@@ -698,6 +692,7 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
call->reply[1] = NULL; /* volsync */
call->reply[2] = req;
call->expected_version = vnode->status.data_version;
+ call->want_reply_time = true;
/* marshall the parameters */
bp = call->request;
@@ -733,11 +728,14 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
xdr_decode_AFSFid(&bp, call->reply[1]);
- if (afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL) < 0 ||
- afs_decode_status(call, &bp, &vnode->status, vnode,
- &call->expected_version, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
- xdr_decode_AFSCallBack_raw(&bp, call->reply[3]);
+ ret = afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL);
+ if (ret < 0)
+ return ret;
+ ret = afs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ xdr_decode_AFSCallBack_raw(call, &bp, call->reply[3]);
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
_leave(" = 0 [done]");
@@ -778,6 +776,15 @@ int afs_fs_create(struct afs_fs_cursor *fc,
size_t namesz, reqsz, padsz;
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)){
+ if (S_ISDIR(mode))
+ return yfs_fs_make_dir(fc, name, mode, current_data_version,
+ newfid, newstatus, newcb);
+ else
+ return yfs_fs_create_file(fc, name, mode, current_data_version,
+ newfid, newstatus, newcb);
+ }
+
_enter("");
namesz = strlen(name);
@@ -796,6 +803,7 @@ int afs_fs_create(struct afs_fs_cursor *fc,
call->reply[2] = newstatus;
call->reply[3] = newcb;
call->expected_version = current_data_version + 1;
+ call->want_reply_time = true;
/* marshall the parameters */
bp = call->request;
@@ -839,9 +847,10 @@ static int afs_deliver_fs_remove(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- if (afs_decode_status(call, &bp, &vnode->status, vnode,
- &call->expected_version, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
+ ret = afs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
_leave(" = 0 [done]");
@@ -868,15 +877,18 @@ static const struct afs_call_type afs_RXFSRemoveDir = {
/*
* remove a file or directory
*/
-int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir,
- u64 current_data_version)
+int afs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
+ const char *name, bool isdir, u64 current_data_version)
{
- struct afs_vnode *vnode = fc->vnode;
+ struct afs_vnode *dvnode = fc->vnode;
struct afs_call *call;
- struct afs_net *net = afs_v2net(vnode);
+ struct afs_net *net = afs_v2net(dvnode);
size_t namesz, reqsz, padsz;
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_remove(fc, vnode, name, isdir, current_data_version);
+
_enter("");
namesz = strlen(name);
@@ -890,15 +902,16 @@ int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir,
return -ENOMEM;
call->key = fc->key;
- call->reply[0] = vnode;
+ call->reply[0] = dvnode;
+ call->reply[1] = vnode;
call->expected_version = current_data_version + 1;
/* marshall the parameters */
bp = call->request;
*bp++ = htonl(isdir ? FSREMOVEDIR : FSREMOVEFILE);
- *bp++ = htonl(vnode->fid.vid);
- *bp++ = htonl(vnode->fid.vnode);
- *bp++ = htonl(vnode->fid.unique);
+ *bp++ = htonl(dvnode->fid.vid);
+ *bp++ = htonl(dvnode->fid.vnode);
+ *bp++ = htonl(dvnode->fid.unique);
*bp++ = htonl(namesz);
memcpy(bp, name, namesz);
bp = (void *) bp + namesz;
@@ -908,7 +921,7 @@ int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir,
}
afs_use_fs_server(call, fc->cbi);
- trace_afs_make_fs_call(call, &vnode->fid);
+ trace_afs_make_fs_call(call, &dvnode->fid);
return afs_make_call(&fc->ac, call, GFP_NOFS, false);
}
@@ -929,10 +942,13 @@ static int afs_deliver_fs_link(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- if (afs_decode_status(call, &bp, &vnode->status, vnode, NULL, NULL) < 0 ||
- afs_decode_status(call, &bp, &dvnode->status, dvnode,
- &call->expected_version, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
+ ret = afs_decode_status(call, &bp, &vnode->status, vnode, NULL, NULL);
+ if (ret < 0)
+ return ret;
+ ret = afs_decode_status(call, &bp, &dvnode->status, dvnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
_leave(" = 0 [done]");
@@ -961,6 +977,9 @@ int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
size_t namesz, reqsz, padsz;
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_link(fc, vnode, name, current_data_version);
+
_enter("");
namesz = strlen(name);
@@ -1016,10 +1035,13 @@ static int afs_deliver_fs_symlink(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
xdr_decode_AFSFid(&bp, call->reply[1]);
- if (afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL) ||
- afs_decode_status(call, &bp, &vnode->status, vnode,
- &call->expected_version, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
+ ret = afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL);
+ if (ret < 0)
+ return ret;
+ ret = afs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
_leave(" = 0 [done]");
@@ -1052,6 +1074,10 @@ int afs_fs_symlink(struct afs_fs_cursor *fc,
size_t namesz, reqsz, padsz, c_namesz, c_padsz;
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_symlink(fc, name, contents, current_data_version,
+ newfid, newstatus);
+
_enter("");
namesz = strlen(name);
@@ -1122,13 +1148,16 @@ static int afs_deliver_fs_rename(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- if (afs_decode_status(call, &bp, &orig_dvnode->status, orig_dvnode,
- &call->expected_version, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
- if (new_dvnode != orig_dvnode &&
- afs_decode_status(call, &bp, &new_dvnode->status, new_dvnode,
- &call->expected_version_2, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
+ ret = afs_decode_status(call, &bp, &orig_dvnode->status, orig_dvnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ if (new_dvnode != orig_dvnode) {
+ ret = afs_decode_status(call, &bp, &new_dvnode->status, new_dvnode,
+ &call->expected_version_2, NULL);
+ if (ret < 0)
+ return ret;
+ }
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
_leave(" = 0 [done]");
@@ -1161,6 +1190,12 @@ int afs_fs_rename(struct afs_fs_cursor *fc,
size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz;
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_rename(fc, orig_name,
+ new_dvnode, new_name,
+ current_orig_data_version,
+ current_new_data_version);
+
_enter("");
o_namesz = strlen(orig_name);
@@ -1231,9 +1266,10 @@ static int afs_deliver_fs_store_data(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- if (afs_decode_status(call, &bp, &vnode->status, vnode,
- &call->expected_version, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
+ ret = afs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
afs_pages_written_back(vnode, call);
@@ -1273,7 +1309,7 @@ static int afs_fs_store_data64(struct afs_fs_cursor *fc,
struct afs_net *net = afs_v2net(vnode);
__be32 *bp;
- _enter(",%x,{%x:%u},,",
+ _enter(",%x,{%llx:%llu},,",
key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
call = afs_alloc_flat_call(net, &afs_RXFSStoreData64,
@@ -1330,7 +1366,10 @@ int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
loff_t size, pos, i_size;
__be32 *bp;
- _enter(",%x,{%x:%u},,",
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_store_data(fc, mapping, first, last, offset, to);
+
+ _enter(",%x,{%llx:%llu},,",
key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
size = (loff_t)to - (loff_t)offset;
@@ -1407,9 +1446,10 @@ static int afs_deliver_fs_store_status(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- if (afs_decode_status(call, &bp, &vnode->status, vnode,
- &call->expected_version, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
+ ret = afs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
_leave(" = 0 [done]");
@@ -1451,7 +1491,7 @@ static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr)
struct afs_net *net = afs_v2net(vnode);
__be32 *bp;
- _enter(",%x,{%x:%u},,",
+ _enter(",%x,{%llx:%llu},,",
key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
ASSERT(attr->ia_valid & ATTR_SIZE);
@@ -1498,7 +1538,7 @@ static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
struct afs_net *net = afs_v2net(vnode);
__be32 *bp;
- _enter(",%x,{%x:%u},,",
+ _enter(",%x,{%llx:%llu},,",
key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
ASSERT(attr->ia_valid & ATTR_SIZE);
@@ -1544,10 +1584,13 @@ int afs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr)
struct afs_net *net = afs_v2net(vnode);
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_setattr(fc, attr);
+
if (attr->ia_valid & ATTR_SIZE)
return afs_fs_setattr_size(fc, attr);
- _enter(",%x,{%x:%u},,",
+ _enter(",%x,{%llx:%llu},,",
key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
call = afs_alloc_flat_call(net, &afs_RXFSStoreStatus,
@@ -1581,164 +1624,114 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
{
const __be32 *bp;
char *p;
+ u32 size;
int ret;
_enter("{%u}", call->unmarshall);
switch (call->unmarshall) {
case 0:
- call->offset = 0;
call->unmarshall++;
+ afs_extract_to_buf(call, 12 * 4);
/* extract the returned status record */
case 1:
_debug("extract status");
- ret = afs_extract_data(call, call->buffer,
- 12 * 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
bp = call->buffer;
xdr_decode_AFSFetchVolumeStatus(&bp, call->reply[1]);
- call->offset = 0;
call->unmarshall++;
+ afs_extract_to_tmp(call);
/* extract the volume name length */
case 2:
- ret = afs_extract_data(call, &call->tmp, 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
call->count = ntohl(call->tmp);
_debug("volname length: %u", call->count);
if (call->count >= AFSNAMEMAX)
- return afs_protocol_error(call, -EBADMSG);
- call->offset = 0;
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_volname_len);
+ size = (call->count + 3) & ~3; /* It's padded */
+ afs_extract_begin(call, call->reply[2], size);
call->unmarshall++;
/* extract the volume name */
case 3:
_debug("extract volname");
- if (call->count > 0) {
- ret = afs_extract_data(call, call->reply[2],
- call->count, true);
- if (ret < 0)
- return ret;
- }
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
p = call->reply[2];
p[call->count] = 0;
_debug("volname '%s'", p);
-
- call->offset = 0;
+ afs_extract_to_tmp(call);
call->unmarshall++;
- /* extract the volume name padding */
- if ((call->count & 3) == 0) {
- call->unmarshall++;
- goto no_volname_padding;
- }
- call->count = 4 - (call->count & 3);
-
- case 4:
- ret = afs_extract_data(call, call->buffer,
- call->count, true);
- if (ret < 0)
- return ret;
-
- call->offset = 0;
- call->unmarshall++;
- no_volname_padding:
-
/* extract the offline message length */
- case 5:
- ret = afs_extract_data(call, &call->tmp, 4, true);
+ case 4:
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
call->count = ntohl(call->tmp);
_debug("offline msg length: %u", call->count);
if (call->count >= AFSNAMEMAX)
- return afs_protocol_error(call, -EBADMSG);
- call->offset = 0;
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_offline_msg_len);
+ size = (call->count + 3) & ~3; /* It's padded */
+ afs_extract_begin(call, call->reply[2], size);
call->unmarshall++;
/* extract the offline message */
- case 6:
+ case 5:
_debug("extract offline");
- if (call->count > 0) {
- ret = afs_extract_data(call, call->reply[2],
- call->count, true);
- if (ret < 0)
- return ret;
- }
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
p = call->reply[2];
p[call->count] = 0;
_debug("offline '%s'", p);
- call->offset = 0;
+ afs_extract_to_tmp(call);
call->unmarshall++;
- /* extract the offline message padding */
- if ((call->count & 3) == 0) {
- call->unmarshall++;
- goto no_offline_padding;
- }
- call->count = 4 - (call->count & 3);
-
- case 7:
- ret = afs_extract_data(call, call->buffer,
- call->count, true);
- if (ret < 0)
- return ret;
-
- call->offset = 0;
- call->unmarshall++;
- no_offline_padding:
-
/* extract the message of the day length */
- case 8:
- ret = afs_extract_data(call, &call->tmp, 4, true);
+ case 6:
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
call->count = ntohl(call->tmp);
_debug("motd length: %u", call->count);
if (call->count >= AFSNAMEMAX)
- return afs_protocol_error(call, -EBADMSG);
- call->offset = 0;
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_motd_len);
+ size = (call->count + 3) & ~3; /* It's padded */
+ afs_extract_begin(call, call->reply[2], size);
call->unmarshall++;
/* extract the message of the day */
- case 9:
+ case 7:
_debug("extract motd");
- if (call->count > 0) {
- ret = afs_extract_data(call, call->reply[2],
- call->count, true);
- if (ret < 0)
- return ret;
- }
+ ret = afs_extract_data(call, false);
+ if (ret < 0)
+ return ret;
p = call->reply[2];
p[call->count] = 0;
_debug("motd '%s'", p);
- call->offset = 0;
call->unmarshall++;
- /* extract the message of the day padding */
- call->count = (4 - (call->count & 3)) & 3;
-
- case 10:
- ret = afs_extract_data(call, call->buffer,
- call->count, false);
- if (ret < 0)
- return ret;
-
- call->offset = 0;
- call->unmarshall++;
- case 11:
+ case 8:
break;
}
@@ -1778,6 +1771,9 @@ int afs_fs_get_volume_status(struct afs_fs_cursor *fc,
__be32 *bp;
void *tmpbuf;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_get_volume_status(fc, vs);
+
_enter("");
tmpbuf = kmalloc(AFSOPAQUEMAX, GFP_KERNEL);
@@ -1867,6 +1863,9 @@ int afs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type)
struct afs_net *net = afs_v2net(vnode);
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_set_lock(fc, type);
+
_enter("");
call = afs_alloc_flat_call(net, &afs_RXFSSetLock, 5 * 4, 6 * 4);
@@ -1899,6 +1898,9 @@ int afs_fs_extend_lock(struct afs_fs_cursor *fc)
struct afs_net *net = afs_v2net(vnode);
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_extend_lock(fc);
+
_enter("");
call = afs_alloc_flat_call(net, &afs_RXFSExtendLock, 4 * 4, 6 * 4);
@@ -1930,6 +1932,9 @@ int afs_fs_release_lock(struct afs_fs_cursor *fc)
struct afs_net *net = afs_v2net(vnode);
__be32 *bp;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_release_lock(fc);
+
_enter("");
call = afs_alloc_flat_call(net, &afs_RXFSReleaseLock, 4 * 4, 6 * 4);
@@ -2004,19 +2009,16 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call)
u32 count;
int ret;
- _enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count);
+ _enter("{%u,%zu}", call->unmarshall, iov_iter_count(&call->iter));
-again:
switch (call->unmarshall) {
case 0:
- call->offset = 0;
+ afs_extract_to_tmp(call);
call->unmarshall++;
/* Extract the capabilities word count */
case 1:
- ret = afs_extract_data(call, &call->tmp,
- 1 * sizeof(__be32),
- true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
@@ -2024,24 +2026,17 @@ again:
call->count = count;
call->count2 = count;
- call->offset = 0;
+ iov_iter_discard(&call->iter, READ, count * sizeof(__be32));
call->unmarshall++;
/* Extract capabilities words */
case 2:
- count = min(call->count, 16U);
- ret = afs_extract_data(call, call->buffer,
- count * sizeof(__be32),
- call->count > 16);
+ ret = afs_extract_data(call, false);
if (ret < 0)
return ret;
/* TODO: Examine capabilities */
- call->count -= count;
- if (call->count > 0)
- goto again;
- call->offset = 0;
call->unmarshall++;
break;
}
@@ -2050,6 +2045,14 @@ again:
return 0;
}
+static void afs_destroy_fs_get_capabilities(struct afs_call *call)
+{
+ struct afs_server *server = call->reply[0];
+
+ afs_put_server(call->net, server);
+ afs_flat_call_destructor(call);
+}
+
/*
* FS.GetCapabilities operation type
*/
@@ -2057,7 +2060,8 @@ static const struct afs_call_type afs_RXFSGetCapabilities = {
.name = "FS.GetCapabilities",
.op = afs_FS_GetCapabilities,
.deliver = afs_deliver_fs_get_capabilities,
- .destructor = afs_flat_call_destructor,
+ .done = afs_fileserver_probe_result,
+ .destructor = afs_destroy_fs_get_capabilities,
};
/*
@@ -2067,7 +2071,9 @@ static const struct afs_call_type afs_RXFSGetCapabilities = {
int afs_fs_get_capabilities(struct afs_net *net,
struct afs_server *server,
struct afs_addr_cursor *ac,
- struct key *key)
+ struct key *key,
+ unsigned int server_index,
+ bool async)
{
struct afs_call *call;
__be32 *bp;
@@ -2079,6 +2085,10 @@ int afs_fs_get_capabilities(struct afs_net *net,
return -ENOMEM;
call->key = key;
+ call->reply[0] = afs_get_server(server);
+ call->reply[1] = (void *)(long)server_index;
+ call->upgrade = true;
+ call->want_reply_time = true;
/* marshall the parameters */
bp = call->request;
@@ -2086,7 +2096,7 @@ int afs_fs_get_capabilities(struct afs_net *net,
/* Can't take a ref on server */
trace_afs_make_fs_call(call, NULL);
- return afs_make_call(ac, call, GFP_NOFS, false);
+ return afs_make_call(ac, call, GFP_NOFS, async);
}
/*
@@ -2097,7 +2107,7 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call)
struct afs_file_status *status = call->reply[1];
struct afs_callback *callback = call->reply[2];
struct afs_volsync *volsync = call->reply[3];
- struct afs_vnode *vnode = call->reply[0];
+ struct afs_fid *fid = call->reply[0];
const __be32 *bp;
int ret;
@@ -2105,21 +2115,16 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call)
if (ret < 0)
return ret;
- _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+ _enter("{%llx:%llu}", fid->vid, fid->vnode);
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- afs_decode_status(call, &bp, status, vnode,
- &call->expected_version, NULL);
- callback[call->count].version = ntohl(bp[0]);
- callback[call->count].expiry = ntohl(bp[1]);
- callback[call->count].type = ntohl(bp[2]);
- if (vnode)
- xdr_decode_AFSCallBack(call, vnode, &bp);
- else
- bp += 3;
- if (volsync)
- xdr_decode_AFSVolSync(&bp, volsync);
+ ret = afs_decode_status(call, &bp, status, NULL,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ xdr_decode_AFSCallBack_raw(call, &bp, callback);
+ xdr_decode_AFSVolSync(&bp, volsync);
_leave(" = 0 [done]");
return 0;
@@ -2148,7 +2153,10 @@ int afs_fs_fetch_status(struct afs_fs_cursor *fc,
struct afs_call *call;
__be32 *bp;
- _enter(",%x,{%x:%u},,",
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_fetch_status(fc, net, fid, status, callback, volsync);
+
+ _enter(",%x,{%llx:%llu},,",
key_serial(fc->key), fid->vid, fid->vnode);
call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
@@ -2158,11 +2166,12 @@ int afs_fs_fetch_status(struct afs_fs_cursor *fc,
}
call->key = fc->key;
- call->reply[0] = NULL; /* vnode for fid[0] */
+ call->reply[0] = fid;
call->reply[1] = status;
call->reply[2] = callback;
call->reply[3] = volsync;
call->expected_version = 1; /* vnode->status.data_version */
+ call->want_reply_time = true;
/* marshall the parameters */
bp = call->request;
@@ -2193,38 +2202,40 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
switch (call->unmarshall) {
case 0:
- call->offset = 0;
+ afs_extract_to_tmp(call);
call->unmarshall++;
/* Extract the file status count and array in two steps */
case 1:
_debug("extract status count");
- ret = afs_extract_data(call, &call->tmp, 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
tmp = ntohl(call->tmp);
_debug("status count: %u/%u", tmp, call->count2);
if (tmp != call->count2)
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_ibulkst_count);
call->count = 0;
call->unmarshall++;
more_counts:
- call->offset = 0;
+ afs_extract_to_buf(call, 21 * sizeof(__be32));
case 2:
_debug("extract status array %u", call->count);
- ret = afs_extract_data(call, call->buffer, 21 * 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
bp = call->buffer;
statuses = call->reply[1];
- if (afs_decode_status(call, &bp, &statuses[call->count],
- call->count == 0 ? vnode : NULL,
- NULL, NULL) < 0)
- return afs_protocol_error(call, -EBADMSG);
+ ret = afs_decode_status(call, &bp, &statuses[call->count],
+ call->count == 0 ? vnode : NULL,
+ NULL, NULL);
+ if (ret < 0)
+ return ret;
call->count++;
if (call->count < call->count2)
@@ -2232,27 +2243,28 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
call->count = 0;
call->unmarshall++;
- call->offset = 0;
+ afs_extract_to_tmp(call);
/* Extract the callback count and array in two steps */
case 3:
_debug("extract CB count");
- ret = afs_extract_data(call, &call->tmp, 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
tmp = ntohl(call->tmp);
_debug("CB count: %u", tmp);
if (tmp != call->count2)
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_ibulkst_cb_count);
call->count = 0;
call->unmarshall++;
more_cbs:
- call->offset = 0;
+ afs_extract_to_buf(call, 3 * sizeof(__be32));
case 4:
_debug("extract CB array");
- ret = afs_extract_data(call, call->buffer, 3 * 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
@@ -2260,7 +2272,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
bp = call->buffer;
callbacks = call->reply[2];
callbacks[call->count].version = ntohl(bp[0]);
- callbacks[call->count].expiry = ntohl(bp[1]);
+ callbacks[call->count].expires_at = xdr_decode_expiry(call, ntohl(bp[1]));
callbacks[call->count].type = ntohl(bp[2]);
statuses = call->reply[1];
if (call->count == 0 && vnode && statuses[0].abort_code == 0)
@@ -2269,19 +2281,17 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
if (call->count < call->count2)
goto more_cbs;
- call->offset = 0;
+ afs_extract_to_buf(call, 6 * sizeof(__be32));
call->unmarshall++;
case 5:
- ret = afs_extract_data(call, call->buffer, 6 * 4, false);
+ ret = afs_extract_data(call, false);
if (ret < 0)
return ret;
bp = call->buffer;
- if (call->reply[3])
- xdr_decode_AFSVolSync(&bp, call->reply[3]);
+ xdr_decode_AFSVolSync(&bp, call->reply[3]);
- call->offset = 0;
call->unmarshall++;
case 6:
@@ -2317,7 +2327,11 @@ int afs_fs_inline_bulk_status(struct afs_fs_cursor *fc,
__be32 *bp;
int i;
- _enter(",%x,{%x:%u},%u",
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
+ return yfs_fs_inline_bulk_status(fc, net, fids, statuses, callbacks,
+ nr_fids, volsync);
+
+ _enter(",%x,{%llx:%llu},%u",
key_serial(fc->key), fids[0].vid, fids[1].vnode, nr_fids);
call = afs_alloc_flat_call(net, &afs_RXFSInlineBulkStatus,
@@ -2334,6 +2348,7 @@ int afs_fs_inline_bulk_status(struct afs_fs_cursor *fc,
call->reply[2] = callbacks;
call->reply[3] = volsync;
call->count2 = nr_fids;
+ call->want_reply_time = true;
/* marshall the parameters */
bp = call->request;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 479b7fdda124..4c6d8e1112c2 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -82,7 +82,7 @@ static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key)
default:
printk("kAFS: AFS vnode with undefined type\n");
read_sequnlock_excl(&vnode->cb_lock);
- return afs_protocol_error(NULL, -EBADMSG);
+ return afs_protocol_error(NULL, -EBADMSG, afs_eproto_file_type);
}
inode->i_blocks = 0;
@@ -100,7 +100,7 @@ int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode)
struct afs_fs_cursor fc;
int ret;
- _enter("%s,{%x:%u.%u,S=%lx}",
+ _enter("%s,{%llx:%llu.%u,S=%lx}",
vnode->volume->name,
vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique,
vnode->flags);
@@ -127,9 +127,9 @@ int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode)
int afs_iget5_test(struct inode *inode, void *opaque)
{
struct afs_iget_data *data = opaque;
+ struct afs_vnode *vnode = AFS_FS_I(inode);
- return inode->i_ino == data->fid.vnode &&
- inode->i_generation == data->fid.unique;
+ return memcmp(&vnode->fid, &data->fid, sizeof(data->fid)) == 0;
}
/*
@@ -150,11 +150,14 @@ static int afs_iget5_set(struct inode *inode, void *opaque)
struct afs_iget_data *data = opaque;
struct afs_vnode *vnode = AFS_FS_I(inode);
- inode->i_ino = data->fid.vnode;
- inode->i_generation = data->fid.unique;
vnode->fid = data->fid;
vnode->volume = data->volume;
+ /* YFS supports 96-bit vnode IDs, but Linux only supports
+ * 64-bit inode numbers.
+ */
+ inode->i_ino = data->fid.vnode;
+ inode->i_generation = data->fid.unique;
return 0;
}
@@ -193,7 +196,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
return ERR_PTR(-ENOMEM);
}
- _debug("GOT INODE %p { ino=%lu, vl=%x, vn=%x, u=%x }",
+ _debug("GOT INODE %p { ino=%lu, vl=%llx, vn=%llx, u=%x }",
inode, inode->i_ino, data.fid.vid, data.fid.vnode,
data.fid.unique);
@@ -252,8 +255,8 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
key.vnode_id = vnode->fid.vnode;
key.unique = vnode->fid.unique;
- key.vnode_id_ext[0] = 0;
- key.vnode_id_ext[1] = 0;
+ key.vnode_id_ext[0] = vnode->fid.vnode >> 32;
+ key.vnode_id_ext[1] = vnode->fid.vnode_hi;
aux.data_version = vnode->status.data_version;
vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
@@ -277,7 +280,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
struct inode *inode;
int ret;
- _enter(",{%x:%u.%u},,", fid->vid, fid->vnode, fid->unique);
+ _enter(",{%llx:%llu.%u},,", fid->vid, fid->vnode, fid->unique);
as = sb->s_fs_info;
data.volume = as->volume;
@@ -289,7 +292,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
return ERR_PTR(-ENOMEM);
}
- _debug("GOT INODE %p { vl=%x vn=%x, u=%x }",
+ _debug("GOT INODE %p { vl=%llx vn=%llx, u=%x }",
inode, fid->vid, fid->vnode, fid->unique);
vnode = AFS_FS_I(inode);
@@ -314,11 +317,11 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
* didn't give us a callback) */
vnode->cb_version = 0;
vnode->cb_type = 0;
- vnode->cb_expires_at = 0;
+ vnode->cb_expires_at = ktime_get();
} else {
vnode->cb_version = cb->version;
vnode->cb_type = cb->type;
- vnode->cb_expires_at = cb->expiry;
+ vnode->cb_expires_at = cb->expires_at;
vnode->cb_interest = afs_get_cb_interest(cbi);
set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
}
@@ -352,7 +355,7 @@ bad_inode:
*/
void afs_zap_data(struct afs_vnode *vnode)
{
- _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+ _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
#ifdef CONFIG_AFS_FSCACHE
fscache_invalidate(vnode->cache);
@@ -382,7 +385,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
bool valid = false;
int ret;
- _enter("{v={%x:%u} fl=%lx},%x",
+ _enter("{v={%llx:%llu} fl=%lx},%x",
vnode->fid.vid, vnode->fid.vnode, vnode->flags,
key_serial(key));
@@ -501,7 +504,7 @@ void afs_evict_inode(struct inode *inode)
vnode = AFS_FS_I(inode);
- _enter("{%x:%u.%d}",
+ _enter("{%llx:%llu.%d}",
vnode->fid.vid,
vnode->fid.vnode,
vnode->fid.unique);
@@ -550,7 +553,7 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
struct key *key;
int ret;
- _enter("{%x:%u},{n=%pd},%x",
+ _enter("{%llx:%llu},{n=%pd},%x",
vnode->fid.vid, vnode->fid.vnode, dentry,
attr->ia_valid);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 871a228d7f37..5da3b09b7518 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -22,6 +22,7 @@
#include <linux/backing-dev.h>
#include <linux/uuid.h>
#include <linux/mm_types.h>
+#include <linux/dns_resolver.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/sock.h>
@@ -73,12 +74,17 @@ struct afs_addr_list {
struct rcu_head rcu; /* Must be first */
refcount_t usage;
u32 version; /* Version */
- unsigned short nr_addrs;
- unsigned short index; /* Address currently in use */
- unsigned short nr_ipv4; /* Number of IPv4 addresses */
+ unsigned char max_addrs;
+ unsigned char nr_addrs;
+ unsigned char preferred; /* Preferred address */
+ unsigned char nr_ipv4; /* Number of IPv4 addresses */
+ enum dns_record_source source:8;
+ enum dns_lookup_status status:8;
unsigned long probed; /* Mask of servers that have been probed */
- unsigned long yfs; /* Mask of servers that are YFS */
+ unsigned long failed; /* Mask of addrs that failed locally/ICMP */
+ unsigned long responded; /* Mask of addrs that responded */
struct sockaddr_rxrpc addrs[];
+#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
};
/*
@@ -86,6 +92,7 @@ struct afs_addr_list {
*/
struct afs_call {
const struct afs_call_type *type; /* type of call */
+ struct afs_addr_list *alist; /* Address is alist[addr_ix] */
wait_queue_head_t waitq; /* processes awaiting completion */
struct work_struct async_work; /* async I/O processor */
struct work_struct work; /* actual work processor */
@@ -96,16 +103,22 @@ struct afs_call {
struct afs_cb_interest *cbi; /* Callback interest for server used */
void *request; /* request data (first part) */
struct address_space *mapping; /* Pages being written from */
+ struct iov_iter iter; /* Buffer iterator */
+ struct iov_iter *_iter; /* Iterator currently in use */
+ union { /* Convenience for ->iter */
+ struct kvec kvec[1];
+ struct bio_vec bvec[1];
+ };
void *buffer; /* reply receive buffer */
void *reply[4]; /* Where to put the reply */
pgoff_t first; /* first page in mapping to deal with */
pgoff_t last; /* last page in mapping to deal with */
- size_t offset; /* offset into received data store */
atomic_t usage;
enum afs_call_state state;
spinlock_t state_lock;
int error; /* error code */
u32 abort_code; /* Remote abort ID or 0 */
+ u32 epoch;
unsigned request_size; /* size of request data */
unsigned reply_max; /* maximum size of reply */
unsigned first_offset; /* offset into mapping[first] */
@@ -115,19 +128,28 @@ struct afs_call {
unsigned count2; /* count used in unmarshalling */
};
unsigned char unmarshall; /* unmarshalling phase */
+ unsigned char addr_ix; /* Address in ->alist */
bool incoming; /* T if incoming call */
bool send_pages; /* T if data from mapping should be sent */
bool need_attention; /* T if RxRPC poked us */
bool async; /* T if asynchronous */
bool ret_reply0; /* T if should return reply[0] on success */
bool upgrade; /* T to request service upgrade */
+ bool want_reply_time; /* T if want reply_time */
u16 service_id; /* Actual service ID (after upgrade) */
unsigned int debug_id; /* Trace ID */
u32 operation_ID; /* operation ID for an incoming call */
u32 count; /* count for use in unmarshalling */
- __be32 tmp; /* place to extract temporary data */
+ union { /* place to extract temporary data */
+ struct {
+ __be32 tmp_u;
+ __be32 tmp;
+ } __attribute__((packed));
+ __be64 tmp64;
+ };
afs_dataversion_t expected_version; /* Updated version expected from store */
afs_dataversion_t expected_version_2; /* 2nd updated version expected from store */
+ ktime_t reply_time; /* Time of first reply packet */
};
struct afs_call_type {
@@ -144,6 +166,9 @@ struct afs_call_type {
/* Work function */
void (*work)(struct work_struct *work);
+
+ /* Call done function (gets called immediately on success or failure) */
+ void (*done)(struct afs_call *call);
};
/*
@@ -183,6 +208,7 @@ struct afs_read {
refcount_t usage;
unsigned int index; /* Which page we're reading into */
unsigned int nr_pages;
+ unsigned int offset; /* offset into current page */
void (*page_done)(struct afs_call *, struct afs_read *);
struct page **pages;
struct page *array[];
@@ -242,7 +268,7 @@ struct afs_net {
seqlock_t cells_lock;
struct mutex proc_cells_lock;
- struct list_head proc_cells;
+ struct hlist_head proc_cells;
/* Known servers. Theoretically each fileserver can only be in one
* cell, but in practice, people create aliases and subsets and there's
@@ -320,7 +346,7 @@ struct afs_cell {
struct afs_net *net;
struct key *anonymous_key; /* anonymous user key for this cell */
struct work_struct manager; /* Manager for init/deinit/dns */
- struct list_head proc_link; /* /proc cell list link */
+ struct hlist_node proc_link; /* /proc cell list link */
#ifdef CONFIG_AFS_FSCACHE
struct fscache_cookie *cache; /* caching cookie */
#endif
@@ -341,13 +367,70 @@ struct afs_cell {
rwlock_t proc_lock;
/* VL server list. */
- rwlock_t vl_addrs_lock; /* Lock on vl_addrs */
- struct afs_addr_list __rcu *vl_addrs; /* List of VL servers */
+ rwlock_t vl_servers_lock; /* Lock on vl_servers */
+ struct afs_vlserver_list __rcu *vl_servers;
+
u8 name_len; /* Length of name */
char name[64 + 1]; /* Cell name, case-flattened and NUL-padded */
};
/*
+ * Volume Location server record.
+ */
+struct afs_vlserver {
+ struct rcu_head rcu;
+ struct afs_addr_list __rcu *addresses; /* List of addresses for this VL server */
+ unsigned long flags;
+#define AFS_VLSERVER_FL_PROBED 0 /* The VL server has been probed */
+#define AFS_VLSERVER_FL_PROBING 1 /* VL server is being probed */
+#define AFS_VLSERVER_FL_IS_YFS 2 /* Server is YFS not AFS */
+ rwlock_t lock; /* Lock on addresses */
+ atomic_t usage;
+
+ /* Probe state */
+ wait_queue_head_t probe_wq;
+ atomic_t probe_outstanding;
+ spinlock_t probe_lock;
+ struct {
+ unsigned int rtt; /* RTT as ktime/64 */
+ u32 abort_code;
+ short error;
+ bool have_result;
+ bool responded:1;
+ bool is_yfs:1;
+ bool not_yfs:1;
+ bool local_failure:1;
+ } probe;
+
+ u16 port;
+ u16 name_len; /* Length of name */
+ char name[]; /* Server name, case-flattened */
+};
+
+/*
+ * Weighted list of Volume Location servers.
+ */
+struct afs_vlserver_entry {
+ u16 priority; /* Preference (as SRV) */
+ u16 weight; /* Weight (as SRV) */
+ enum dns_record_source source:8;
+ enum dns_lookup_status status:8;
+ struct afs_vlserver *server;
+};
+
+struct afs_vlserver_list {
+ struct rcu_head rcu;
+ atomic_t usage;
+ u8 nr_servers;
+ u8 index; /* Server currently in use */
+ u8 preferred; /* Preferred server */
+ enum dns_record_source source:8;
+ enum dns_lookup_status status:8;
+ rwlock_t lock;
+ struct afs_vlserver_entry servers[];
+};
+
+/*
* Cached VLDB entry.
*
* This is pointed to by cell->vldb_entries, indexed by name.
@@ -401,8 +484,12 @@ struct afs_server {
#define AFS_SERVER_FL_PROBING 6 /* Fileserver is being probed */
#define AFS_SERVER_FL_NO_IBULK 7 /* Fileserver doesn't support FS.InlineBulkStatus */
#define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */
+#define AFS_SERVER_FL_IS_YFS 9 /* Server is YFS not AFS */
+#define AFS_SERVER_FL_NO_RM2 10 /* Fileserver doesn't support YFS.RemoveFile2 */
+#define AFS_SERVER_FL_HAVE_EPOCH 11 /* ->epoch is valid */
atomic_t usage;
u32 addr_version; /* Address list version */
+ u32 cm_epoch; /* Server RxRPC epoch */
/* file service access */
rwlock_t fs_lock; /* access lock */
@@ -411,6 +498,26 @@ struct afs_server {
struct hlist_head cb_volumes; /* List of volume interests on this server */
unsigned cb_s_break; /* Break-everything counter. */
rwlock_t cb_break_lock; /* Volume finding lock */
+
+ /* Probe state */
+ wait_queue_head_t probe_wq;
+ atomic_t probe_outstanding;
+ spinlock_t probe_lock;
+ struct {
+ unsigned int rtt; /* RTT as ktime/64 */
+ u32 abort_code;
+ u32 cm_epoch;
+ short error;
+ bool have_result;
+ bool responded:1;
+ bool is_yfs:1;
+ bool not_yfs:1;
+ bool local_failure:1;
+ bool no_epoch:1;
+ bool cm_probed:1;
+ bool said_rebooted:1;
+ bool said_inconsistent:1;
+ } probe;
};
/*
@@ -445,8 +552,8 @@ struct afs_server_entry {
struct afs_server_list {
refcount_t usage;
- unsigned short nr_servers;
- unsigned short index; /* Server currently in use */
+ unsigned char nr_servers;
+ unsigned char preferred; /* Preferred server */
unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */
unsigned int seq; /* Set to ->servers_seq when installed */
rwlock_t lock;
@@ -548,6 +655,15 @@ struct afs_vnode {
afs_callback_type_t cb_type; /* type of callback */
};
+static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode)
+{
+#ifdef CONFIG_AFS_FSCACHE
+ return vnode->cache;
+#else
+ return NULL;
+#endif
+}
+
/*
* cached security record for one user's attempt to access a vnode
*/
@@ -584,13 +700,31 @@ struct afs_interface {
*/
struct afs_addr_cursor {
struct afs_addr_list *alist; /* Current address list (pins ref) */
- struct sockaddr_rxrpc *addr;
+ unsigned long tried; /* Tried addresses */
+ signed char index; /* Current address */
+ bool responded; /* T if the current address responded */
+ unsigned short nr_iterations; /* Number of address iterations */
+ short error;
u32 abort_code;
- unsigned short start; /* Starting point in alist->addrs[] */
- unsigned short index; /* Wrapping offset from start to current addr */
+};
+
+/*
+ * Cursor for iterating over a set of volume location servers.
+ */
+struct afs_vl_cursor {
+ struct afs_addr_cursor ac;
+ struct afs_cell *cell; /* The cell we're querying */
+ struct afs_vlserver_list *server_list; /* Current server list (pins ref) */
+ struct afs_vlserver *server; /* Server on which this resides */
+ struct key *key; /* Key for the server */
+ unsigned long untried; /* Bitmask of untried servers */
+ short index; /* Current server */
short error;
- bool begun; /* T if we've begun iteration */
- bool responded; /* T if the current address responded */
+ unsigned short flags;
+#define AFS_VL_CURSOR_STOP 0x0001 /* Set to cease iteration */
+#define AFS_VL_CURSOR_RETRY 0x0002 /* Set to do a retry */
+#define AFS_VL_CURSOR_RETRIED 0x0004 /* Set if started a retry */
+ unsigned short nr_iterations; /* Number of server iterations */
};
/*
@@ -602,10 +736,11 @@ struct afs_fs_cursor {
struct afs_server_list *server_list; /* Current server list (pins ref) */
struct afs_cb_interest *cbi; /* Server on which this resides (pins ref) */
struct key *key; /* Key for the server */
+ unsigned long untried; /* Bitmask of untried servers */
unsigned int cb_break; /* cb_break + cb_s_break before the call */
unsigned int cb_break_2; /* cb_break + cb_s_break (2nd vnode) */
- unsigned char start; /* Initial index in server list */
- unsigned char index; /* Number of servers tried beyond start */
+ short index; /* Current server */
+ short error;
unsigned short flags;
#define AFS_FS_CURSOR_STOP 0x0001 /* Set to cease iteration */
#define AFS_FS_CURSOR_VBUSY 0x0002 /* Set if seen VBUSY */
@@ -613,6 +748,7 @@ struct afs_fs_cursor {
#define AFS_FS_CURSOR_VNOVOL 0x0008 /* Set if seen VNOVOL */
#define AFS_FS_CURSOR_CUR_ONLY 0x0010 /* Set if current server only (file lock held) */
#define AFS_FS_CURSOR_NO_VSLEEP 0x0020 /* Set to prevent sleep on VBUSY, VOFFLINE, ... */
+ unsigned short nr_iterations; /* Number of server iterations */
};
/*
@@ -638,12 +774,12 @@ extern struct afs_addr_list *afs_alloc_addrlist(unsigned int,
unsigned short,
unsigned short);
extern void afs_put_addrlist(struct afs_addr_list *);
-extern struct afs_addr_list *afs_parse_text_addrs(const char *, size_t, char,
- unsigned short, unsigned short);
-extern struct afs_addr_list *afs_dns_query(struct afs_cell *, time64_t *);
+extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *,
+ const char *, size_t, char,
+ unsigned short, unsigned short);
+extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *);
extern bool afs_iterate_addresses(struct afs_addr_cursor *);
extern int afs_end_cursor(struct afs_addr_cursor *);
-extern int afs_set_vl_cursor(struct afs_addr_cursor *, struct afs_cell *);
extern void afs_merge_fs_addr4(struct afs_addr_list *, __be32, u16);
extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16);
@@ -666,6 +802,7 @@ extern struct fscache_cookie_def afs_vnode_cache_index_def;
* callback.c
*/
extern void afs_init_callback_state(struct afs_server *);
+extern void __afs_break_callback(struct afs_vnode *);
extern void afs_break_callback(struct afs_vnode *);
extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break*);
@@ -686,10 +823,13 @@ static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode)
return vnode->cb_break + vnode->cb_s_break + vnode->cb_v_break;
}
-static inline unsigned int afs_cb_break_sum(struct afs_vnode *vnode,
- struct afs_cb_interest *cbi)
+/*
+ * Return true if there is no callback interest, or if the vnode, server and
+ * volume break counters no longer add up to the @cb_break value sampled
+ * earlier.
+ */
+static inline bool afs_cb_is_broken(unsigned int cb_break,
+ const struct afs_vnode *vnode,
+ const struct afs_cb_interest *cbi)
{
- return vnode->cb_break + cbi->server->cb_s_break + vnode->volume->cb_v_break;
+ unsigned int current_sum;
+
+ /* Without an interest record there is no server counter to compare. */
+ if (!cbi)
+ return true;
+
+ current_sum = vnode->cb_break;
+ current_sum += cbi->server->cb_s_break;
+ current_sum += vnode->volume->cb_v_break;
+ return cb_break != current_sum;
}
/*
@@ -779,7 +919,7 @@ extern int afs_fs_give_up_callbacks(struct afs_net *, struct afs_server *);
extern int afs_fs_fetch_data(struct afs_fs_cursor *, struct afs_read *);
extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t, u64,
struct afs_fid *, struct afs_file_status *, struct afs_callback *);
-extern int afs_fs_remove(struct afs_fs_cursor *, const char *, bool, u64);
+extern int afs_fs_remove(struct afs_fs_cursor *, struct afs_vnode *, const char *, bool, u64);
extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64);
extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, u64,
struct afs_fid *, struct afs_file_status *);
@@ -795,7 +935,7 @@ extern int afs_fs_release_lock(struct afs_fs_cursor *);
extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *,
struct afs_addr_cursor *, struct key *);
extern int afs_fs_get_capabilities(struct afs_net *, struct afs_server *,
- struct afs_addr_cursor *, struct key *);
+ struct afs_addr_cursor *, struct key *, unsigned int, bool);
extern int afs_fs_inline_bulk_status(struct afs_fs_cursor *, struct afs_net *,
struct afs_fid *, struct afs_file_status *,
struct afs_callback *, unsigned int,
@@ -805,6 +945,13 @@ extern int afs_fs_fetch_status(struct afs_fs_cursor *, struct afs_net *,
struct afs_callback *, struct afs_volsync *);
/*
+ * fs_probe.c
+ */
+extern void afs_fileserver_probe_result(struct afs_call *);
+extern int afs_probe_fileservers(struct afs_net *, struct key *, struct afs_server_list *);
+extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long);
+
+/*
* inode.c
*/
extern int afs_fetch_status(struct afs_vnode *, struct key *, bool);
@@ -920,7 +1067,6 @@ extern int __net_init afs_open_socket(struct afs_net *);
extern void __net_exit afs_close_socket(struct afs_net *);
extern void afs_charge_preallocation(struct work_struct *);
extern void afs_put_call(struct afs_call *);
-extern int afs_queue_call_work(struct afs_call *);
extern long afs_make_call(struct afs_addr_cursor *, struct afs_call *, gfp_t, bool);
extern struct afs_call *afs_alloc_flat_call(struct afs_net *,
const struct afs_call_type *,
@@ -928,12 +1074,39 @@ extern struct afs_call *afs_alloc_flat_call(struct afs_net *,
extern void afs_flat_call_destructor(struct afs_call *);
extern void afs_send_empty_reply(struct afs_call *);
extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
-extern int afs_extract_data(struct afs_call *, void *, size_t, bool);
-extern int afs_protocol_error(struct afs_call *, int);
+extern int afs_extract_data(struct afs_call *, bool);
+extern int afs_protocol_error(struct afs_call *, int, enum afs_eproto_cause);
+
+/*
+ * Aim the call's receive iterator at a single flat buffer: kvec[0] is set to
+ * describe @buf/@size and call->iter is initialised over that one segment.
+ */
+static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size)
+{
+ struct kvec *seg = &call->kvec[0];
+
+ seg->iov_base = buf;
+ seg->iov_len = size;
+ iov_iter_kvec(&call->iter, READ, seg, 1, size);
+}
+
+/*
+ * Set up extraction of sizeof(call->tmp) bytes into call->tmp.
+ */
+static inline void afs_extract_to_tmp(struct afs_call *call)
+{
+ afs_extract_begin(call, &call->tmp, sizeof(call->tmp));
+}
+
+/*
+ * Set up extraction of sizeof(call->tmp64) bytes into call->tmp64.
+ */
+static inline void afs_extract_to_tmp64(struct afs_call *call)
+{
+ afs_extract_begin(call, &call->tmp64, sizeof(call->tmp64));
+}
+
+/*
+ * Initialise call->iter as a discard iterator covering @size bytes, so no
+ * destination buffer is involved.
+ */
+static inline void afs_extract_discard(struct afs_call *call, size_t size)
+{
+ iov_iter_discard(&call->iter, READ, size);
+}
+
+/*
+ * Set up extraction of @size bytes into call->buffer.
+ * NOTE(review): no check here that @size fits call->buffer - presumably the
+ * caller guarantees it; confirm.
+ */
+static inline void afs_extract_to_buf(struct afs_call *call, size_t size)
+{
+ afs_extract_begin(call, call->buffer, size);
+}
+/*
+ * Extract reply data via afs_extract_data(); the buffer and length are no
+ * longer passed in here.
+ * NOTE(review): presumably call->iter must already have been set up (e.g. by
+ * afs_extract_to_buf()) before this is called - confirm against callers.
+ */
static inline int afs_transfer_reply(struct afs_call *call)
{
- return afs_extract_data(call, call->buffer, call->reply_max, false);
+ return afs_extract_data(call, false);
}
static inline bool afs_check_call_state(struct afs_call *call,
@@ -1010,7 +1183,6 @@ extern void afs_put_server(struct afs_net *, struct afs_server *);
extern void afs_manage_servers(struct work_struct *);
extern void afs_servers_timer(struct timer_list *);
extern void __net_exit afs_purge_servers(struct afs_net *);
-extern bool afs_probe_fileserver(struct afs_fs_cursor *);
extern bool afs_check_server_record(struct afs_fs_cursor *, struct afs_server *);
/*
@@ -1037,14 +1209,51 @@ extern void afs_fs_exit(void);
/*
* vlclient.c
*/
-extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *,
- struct afs_addr_cursor *,
- struct key *, const char *, int);
-extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *, struct afs_addr_cursor *,
- struct key *, const uuid_t *);
-extern int afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *, struct key *);
-extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *, struct afs_addr_cursor *,
- struct key *, const uuid_t *);
+extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *,
+ const char *, int);
+extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *, const uuid_t *);
+extern int afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *, struct key *,
+ struct afs_vlserver *, unsigned int, bool);
+extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *, const uuid_t *);
+
+/*
+ * vl_probe.c
+ */
+extern void afs_vlserver_probe_result(struct afs_call *);
+extern int afs_send_vl_probes(struct afs_net *, struct key *, struct afs_vlserver_list *);
+extern int afs_wait_for_vl_probes(struct afs_vlserver_list *, unsigned long);
+
+/*
+ * vl_rotate.c
+ */
+extern bool afs_begin_vlserver_operation(struct afs_vl_cursor *,
+ struct afs_cell *, struct key *);
+extern bool afs_select_vlserver(struct afs_vl_cursor *);
+extern bool afs_select_current_vlserver(struct afs_vl_cursor *);
+extern int afs_end_vlserver_operation(struct afs_vl_cursor *);
+
+/*
+ * vl_list.c
+ */
+/*
+ * Take a reference on a VL server record by bumping ->usage and hand the same
+ * pointer back.  Unlike afs_get_vlserverlist(), @vlserver is dereferenced
+ * unconditionally, so it must not be NULL.
+ */
+static inline struct afs_vlserver *afs_get_vlserver(struct afs_vlserver *vlserver)
+{
+ atomic_inc(&vlserver->usage);
+ return vlserver;
+}
+
+/*
+ * Take a reference on a VL server list by bumping ->usage.  A NULL list is
+ * passed straight through.
+ */
+static inline struct afs_vlserver_list *afs_get_vlserverlist(struct afs_vlserver_list *vllist)
+{
+ if (!vllist)
+ return NULL;
+
+ atomic_inc(&vllist->usage);
+ return vllist;
+}
+
+extern struct afs_vlserver *afs_alloc_vlserver(const char *, size_t, unsigned short);
+extern void afs_put_vlserver(struct afs_net *, struct afs_vlserver *);
+extern struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int);
+extern void afs_put_vlserverlist(struct afs_net *, struct afs_vlserver_list *);
+extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *,
+ const void *, size_t);
/*
* volume.c
@@ -1087,6 +1296,36 @@ extern int afs_launder_page(struct page *);
extern const struct xattr_handler *afs_xattr_handlers[];
extern ssize_t afs_listxattr(struct dentry *, char *, size_t);
+/*
+ * yfsclient.c
+ */
+extern int yfs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_volsync *, bool);
+extern int yfs_fs_fetch_data(struct afs_fs_cursor *, struct afs_read *);
+extern int yfs_fs_create_file(struct afs_fs_cursor *, const char *, umode_t, u64,
+ struct afs_fid *, struct afs_file_status *, struct afs_callback *);
+extern int yfs_fs_make_dir(struct afs_fs_cursor *, const char *, umode_t, u64,
+ struct afs_fid *, struct afs_file_status *, struct afs_callback *);
+extern int yfs_fs_remove_file2(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64);
+extern int yfs_fs_remove(struct afs_fs_cursor *, struct afs_vnode *, const char *, bool, u64);
+extern int yfs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64);
+extern int yfs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, u64,
+ struct afs_fid *, struct afs_file_status *);
+extern int yfs_fs_rename(struct afs_fs_cursor *, const char *,
+ struct afs_vnode *, const char *, u64, u64);
+extern int yfs_fs_store_data(struct afs_fs_cursor *, struct address_space *,
+ pgoff_t, pgoff_t, unsigned, unsigned);
+extern int yfs_fs_setattr(struct afs_fs_cursor *, struct iattr *);
+extern int yfs_fs_get_volume_status(struct afs_fs_cursor *, struct afs_volume_status *);
+extern int yfs_fs_set_lock(struct afs_fs_cursor *, afs_lock_type_t);
+extern int yfs_fs_extend_lock(struct afs_fs_cursor *);
+extern int yfs_fs_release_lock(struct afs_fs_cursor *);
+extern int yfs_fs_fetch_status(struct afs_fs_cursor *, struct afs_net *,
+ struct afs_fid *, struct afs_file_status *,
+ struct afs_callback *, struct afs_volsync *);
+extern int yfs_fs_inline_bulk_status(struct afs_fs_cursor *, struct afs_net *,
+ struct afs_fid *, struct afs_file_status *,
+ struct afs_callback *, unsigned int,
+ struct afs_volsync *);
/*
* Miscellaneous inline functions.
@@ -1118,6 +1357,17 @@ static inline void afs_check_for_remote_deletion(struct afs_fs_cursor *fc,
}
}
+/*
+ * Log an I/O error against a call through the afs_io_error tracepoint and
+ * return -EIO so the caller can write "return afs_io_error(...)".
+ */
+static inline int afs_io_error(struct afs_call *call, enum afs_io_error where)
+{
+ const int ret = -EIO;
+
+ trace_afs_io_error(call->debug_id, ret, where);
+ return ret;
+}
+
+/*
+ * Log a file error against a vnode through the afs_file_error tracepoint and
+ * return -EIO so the caller can write "ret = afs_bad(...)".
+ */
+static inline int afs_bad(struct afs_vnode *vnode, enum afs_file_error where)
+{
+ const int ret = -EIO;
+
+ trace_afs_file_error(vnode, ret, where);
+ return ret;
+}
/*****************************************************************************/
/*
diff --git a/fs/afs/main.c b/fs/afs/main.c
index e84fe822a960..107427688edd 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -87,7 +87,7 @@ static int __net_init afs_net_init(struct net *net_ns)
timer_setup(&net->cells_timer, afs_cells_timer, 0);
mutex_init(&net->proc_cells_lock);
- INIT_LIST_HEAD(&net->proc_cells);
+ INIT_HLIST_HEAD(&net->proc_cells);
seqlock_init(&net->fs_lock);
net->fs_servers = RB_ROOT;
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 99fd13500a97..2e51c6994148 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -130,9 +130,10 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
goto error_no_page;
}
- ret = -EIO;
- if (PageError(page))
+ if (PageError(page)) {
+ ret = afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt);
goto error;
+ }
buf = kmap_atomic(page);
memcpy(devname, buf, size);
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 0c3285c8db95..be2ee3bbd0a9 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -17,6 +17,11 @@
#include <linux/uaccess.h>
#include "internal.h"
+struct afs_vl_seq_net_private {
+ struct seq_net_private seq; /* Must be first */
+ struct afs_vlserver_list *vllist; /* List sampled by the vlservers ->start(); read by ->next() and ->show() */
+};
+
static inline struct afs_net *afs_seq2net(struct seq_file *m)
{
return afs_net(seq_file_net(m));
@@ -32,17 +37,24 @@ static inline struct afs_net *afs_seq2net_single(struct seq_file *m)
*/
static int afs_proc_cells_show(struct seq_file *m, void *v)
{
- struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link);
- struct afs_net *net = afs_seq2net(m);
+ struct afs_vlserver_list *vllist;
+ struct afs_cell *cell;
- if (v == &net->proc_cells) {
+ if (v == SEQ_START_TOKEN) {
/* display header on line 1 */
- seq_puts(m, "USE NAME\n");
+ seq_puts(m, "USE TTL SV NAME\n");
return 0;
}
+ /* The RCU read lock taken in afs_proc_cells_start() covers this dereference. */
+ cell = list_entry(v, struct afs_cell, proc_link);
+ vllist = rcu_dereference(cell->vl_servers);
+
/* display one cell per line on subsequent lines */
+ /*
+ * TTL is dns_expiry minus the current real time in seconds, so it goes
+ * negative once the record has expired; SV is 0 for a cell that has no
+ * VL server list.
+ */
- seq_printf(m, "%3u %s\n", atomic_read(&cell->usage), cell->name);
+ seq_printf(m, "%3u %6lld %2u %s\n",
+ atomic_read(&cell->usage),
+ cell->dns_expiry - ktime_get_real_seconds(),
+ vllist ? vllist->nr_servers : 0,
+ cell->name);
return 0;
}
@@ -50,12 +62,12 @@ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos)
__acquires(rcu)
{
rcu_read_lock();
- return seq_list_start_head(&afs_seq2net(m)->proc_cells, *_pos);
+ return seq_hlist_start_head_rcu(&afs_seq2net(m)->proc_cells, *_pos);
}
static void *afs_proc_cells_next(struct seq_file *m, void *v, loff_t *pos)
{
+ /* proc_cells is an hlist, walked under the RCU read lock taken in ->start(). */
- return seq_list_next(v, &afs_seq2net(m)->proc_cells, pos);
+ return seq_hlist_next_rcu(v, &afs_seq2net(m)->proc_cells, pos);
}
static void afs_proc_cells_stop(struct seq_file *m, void *v)
@@ -98,13 +110,13 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size)
goto inval;
args = strchr(name, ' ');
- if (!args)
- goto inval;
- do {
- *args++ = 0;
- } while(*args == ' ');
- if (!*args)
- goto inval;
+ if (args) {
+ do {
+ *args++ = 0;
+ } while(*args == ' ');
+ if (!*args)
+ goto inval;
+ }
/* determine command to perform */
_debug("cmd=%s name=%s args=%s", buf, name, args);
@@ -120,7 +132,6 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size)
if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags))
afs_put_cell(net, cell);
- printk("kAFS: Added new cell '%s'\n", name);
} else {
goto inval;
}
@@ -210,7 +221,7 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
return 0;
}
- seq_printf(m, "%3d %08x %s\n",
+ seq_printf(m, "%3d %08llx %s\n",
atomic_read(&vol->usage), vol->vid,
afs_vol_types[vol->type]);
@@ -249,61 +260,102 @@ static const struct seq_operations afs_proc_cell_volumes_ops = {
.show = afs_proc_cell_volumes_show,
};
+/*
+ * Display names for DNS record sources, indexed by the DNS_RECORD_* value.
+ * The table has one slot more than NR__dns_record_source; that final slot is
+ * labelled "[weird]".
+ */
+static const char *const dns_record_sources[NR__dns_record_source + 1] = {
+ [DNS_RECORD_UNAVAILABLE] = "unav",
+ [DNS_RECORD_FROM_CONFIG] = "cfg",
+ [DNS_RECORD_FROM_DNS_A] = "A",
+ [DNS_RECORD_FROM_DNS_AFSDB] = "AFSDB",
+ [DNS_RECORD_FROM_DNS_SRV] = "SRV",
+ [DNS_RECORD_FROM_NSS] = "nss",
+ [NR__dns_record_source] = "[weird]"
+};
+
+/*
+ * Display names for DNS lookup outcomes, indexed by the DNS_LOOKUP_* value.
+ * The table has one slot more than NR__dns_lookup_status; that final slot is
+ * labelled "[weird]".
+ */
+static const char *const dns_lookup_statuses[NR__dns_lookup_status + 1] = {
+ [DNS_LOOKUP_NOT_DONE] = "no-lookup",
+ [DNS_LOOKUP_GOOD] = "good",
+ [DNS_LOOKUP_GOOD_WITH_BAD] = "good/bad",
+ [DNS_LOOKUP_BAD] = "bad",
+ [DNS_LOOKUP_GOT_NOT_FOUND] = "not-found",
+ [DNS_LOOKUP_GOT_LOCAL_FAILURE] = "local-failure",
+ [DNS_LOOKUP_GOT_TEMP_FAILURE] = "temp-failure",
+ [DNS_LOOKUP_GOT_NS_FAILURE] = "ns-failure",
+ [NR__dns_lookup_status] = "[weird]"
+};
+
/*
* Display the list of Volume Location servers we're using for a cell.
*/
static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
{
- struct sockaddr_rxrpc *addr = v;
+ const struct afs_vl_seq_net_private *priv = m->private;
+ const struct afs_vlserver_list *vllist = priv->vllist;
+ const struct afs_vlserver_entry *entry;
+ const struct afs_vlserver *vlserver;
+ const struct afs_addr_list *alist;
+ int i;
- /* display header on line 1 */
- if (v == (void *)1) {
- seq_puts(m, "ADDRESS\n");
+ /*
+ * Header record.  ->start() returns SEQ_START_TOKEN for position 0 before
+ * it tests whether the cell has a VL server list, so vllist can be NULL
+ * here.  Report the "unav"/"no-lookup" names in that case rather than
+ * dereferencing a NULL pointer.
+ */
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(m, "# source %s, status %s\n",
+ dns_record_sources[vllist ? vllist->source : DNS_RECORD_UNAVAILABLE],
+ dns_lookup_statuses[vllist ? vllist->status : DNS_LOOKUP_NOT_DONE]);
return 0;
}
- /* display one cell per line on subsequent lines */
- seq_printf(m, "%pISp\n", &addr->transport);
+ /*
+ * Every other record is an entry of vllist->servers[] handed over by
+ * ->start()/->next(): print the server line, then one line per address.
+ */
+ entry = v;
+ vlserver = entry->server;
+ alist = rcu_dereference(vlserver->addresses);
+
+ /* Prefer the source/status held on the address list when there is one. */
+ seq_printf(m, "%s [p=%hu w=%hu s=%s,%s]:\n",
+ vlserver->name, entry->priority, entry->weight,
+ dns_record_sources[alist ? alist->source : entry->source],
+ dns_lookup_statuses[alist ? alist->status : entry->status]);
+ if (alist) {
+ /* '>' flags the preferred address, '-' the others. */
+ for (i = 0; i < alist->nr_addrs; i++)
+ seq_printf(m, " %c %pISpc\n",
+ alist->preferred == i ? '>' : '-',
+ &alist->addrs[i].transport);
+ }
return 0;
}
static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos)
__acquires(rcu)
{
- struct afs_addr_list *alist;
+ struct afs_vl_seq_net_private *priv = m->private;
+ struct afs_vlserver_list *vllist;
struct afs_cell *cell = PDE_DATA(file_inode(m->file));
loff_t pos = *_pos;
rcu_read_lock();
- alist = rcu_dereference(cell->vl_addrs);
+ /* Sample the list once and stash it so ->next() and ->show() use the same pointer. */
+ vllist = rcu_dereference(cell->vl_servers);
+ priv->vllist = vllist;
- /* allow for the header line */
- if (!pos)
- return (void *) 1;
- pos--;
+ /* Position 0 is the header record; server N lives at position N + 1. */
+ if (pos < 0)
+ *_pos = pos = 0;
+ if (pos == 0)
+ return SEQ_START_TOKEN;
- if (!alist || pos >= alist->nr_addrs)
+ if (!vllist || pos - 1 >= vllist->nr_servers)
return NULL;
- return alist->addrs + pos;
+ return &vllist->servers[pos - 1];
}
static void *afs_proc_cell_vlservers_next(struct seq_file *m, void *v,
loff_t *_pos)
{
- struct afs_addr_list *alist;
- struct afs_cell *cell = PDE_DATA(file_inode(m->file));
+ struct afs_vl_seq_net_private *priv = m->private;
+ struct afs_vlserver_list *vllist = priv->vllist;
loff_t pos;
- alist = rcu_dereference(cell->vl_addrs);
-
+ /* Reuse the list stashed by ->start() instead of re-reading cell->vl_servers. */
pos = *_pos;
- (*_pos)++;
- if (!alist || pos >= alist->nr_addrs)
+ /* Advance first: position 0 was the header, so position N maps to servers[N - 1]. */
+ pos++;
+ *_pos = pos;
+ if (!vllist || pos - 1 >= vllist->nr_servers)
return NULL;
- return alist->addrs + pos;
+ return &vllist->servers[pos - 1];
}
static void afs_proc_cell_vlservers_stop(struct seq_file *m, void *v)
@@ -339,11 +391,11 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
&server->uuid,
atomic_read(&server->usage),
&alist->addrs[0].transport,
- alist->index == 0 ? "*" : "");
+ alist->preferred == 0 ? "*" : "");
for (i = 1; i < alist->nr_addrs; i++)
seq_printf(m, " %pISpc%s\n",
&alist->addrs[i].transport,
- alist->index == i ? "*" : "");
+ alist->preferred == i ? "*" : "");
return 0;
}
@@ -564,7 +616,7 @@ int afs_proc_cell_setup(struct afs_cell *cell)
if (!proc_create_net_data("vlservers", 0444, dir,
&afs_proc_cell_vlservers_ops,
- sizeof(struct seq_net_private),
+ sizeof(struct afs_vl_seq_net_private),
cell) ||
!proc_create_net_data("volumes", 0444, dir,
&afs_proc_cell_volumes_ops,
diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h
new file mode 100644
index 000000000000..07bc10f076aa
--- /dev/null
+++ b/fs/afs/protocol_yfs.h
@@ -0,0 +1,163 @@
+/* YFS protocol bits
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define YFS_FS_SERVICE 2500
+#define YFS_CM_SERVICE 2501
+
+#define YFSCBMAX 1024
+
+enum YFS_CM_Operations {
+ YFSCBProbe = 206, /* probe client */
+ YFSCBGetLock = 207, /* get contents of CM lock table */
+ YFSCBXStatsVersion = 209, /* get version of extended statistics */
+ YFSCBGetXStats = 210, /* get contents of extended statistics data */
+ YFSCBInitCallBackState3 = 213, /* initialise callback state, version 3 */
+ YFSCBProbeUuid = 214, /* check the client hasn't rebooted */
+ YFSCBGetServerPrefs = 215,
+ YFSCBGetCellServDV = 216,
+ YFSCBGetLocalCell = 217,
+ YFSCBGetCacheConfig = 218,
+ YFSCBGetCellByNum = 65537,
+ YFSCBTellMeAboutYourself = 65538, /* get client capabilities */
+ YFSCBCallBack = 64204,
+};
+
+enum YFS_FS_Operations {
+ YFSFETCHACL = 64131, /* YFS Fetch file ACL */
+ YFSFETCHSTATUS = 64132, /* YFS Fetch file status */
+ YFSSTOREACL = 64134, /* YFS Store file ACL */
+ YFSSTORESTATUS = 64135, /* YFS Store file status */
+ YFSREMOVEFILE = 64136, /* YFS Remove a file */
+ YFSCREATEFILE = 64137, /* YFS Create a file */
+ YFSRENAME = 64138, /* YFS Rename or move a file or directory */
+ YFSSYMLINK = 64139, /* YFS Create a symbolic link */
+ YFSLINK = 64140, /* YFS Create a hard link */
+ YFSMAKEDIR = 64141, /* YFS Create a directory */
+ YFSREMOVEDIR = 64142, /* YFS Remove a directory */
+ YFSGETVOLUMESTATUS = 64149, /* YFS Get volume status information */
+ YFSSETVOLUMESTATUS = 64150, /* YFS Set volume status information */
+ YFSSETLOCK = 64156, /* YFS Request a file lock */
+ YFSEXTENDLOCK = 64157, /* YFS Extend a file lock */
+ YFSRELEASELOCK = 64158, /* YFS Release a file lock */
+ YFSLOOKUP = 64161, /* YFS lookup file in directory */
+ YFSFLUSHCPS = 64165,
+ YFSFETCHOPAQUEACL = 64168,
+ YFSWHOAMI = 64170,
+ YFSREMOVEACL = 64171,
+ YFSREMOVEFILE2 = 64173,
+ YFSSTOREOPAQUEACL2 = 64174,
+ YFSINLINEBULKSTATUS = 64536, /* YFS Fetch multiple file statuses with errors */
+ YFSFETCHDATA64 = 64537, /* YFS Fetch file data */
+ YFSSTOREDATA64 = 64538, /* YFS Store file data */
+ YFSUPDATESYMLINK = 64540,
+};
+
+/* A 64-bit value carried as two big-endian 32-bit words, high word first. */
+struct yfs_xdr_u64 {
+ __be32 msw; /* Upper 32 bits (see xdr_to_u64()) */
+ __be32 lsw; /* Lower 32 bits */
+} __packed;
+
+/*
+ * Reassemble a host-order 64-bit value from its two big-endian halves.
+ */
+static inline u64 xdr_to_u64(const struct yfs_xdr_u64 x)
+{
+ u64 upper = ntohl(x.msw);
+ u64 lower = ntohl(x.lsw);
+
+ return (upper << 32) | lower;
+}
+
+/*
+ * Split a host-order 64-bit value into two big-endian 32-bit halves.
+ */
+static inline struct yfs_xdr_u64 u64_to_xdr(const u64 x)
+{
+ struct yfs_xdr_u64 out;
+
+ out.msw = htonl(x >> 32);
+ out.lsw = htonl(x);
+ return out;
+}
+
+struct yfs_xdr_vnode {
+ struct yfs_xdr_u64 lo;
+ __be32 hi;
+ __be32 unique;
+} __packed;
+
+struct yfs_xdr_YFSFid {
+ struct yfs_xdr_u64 volume;
+ struct yfs_xdr_vnode vnode;
+} __packed;
+
+
+struct yfs_xdr_YFSFetchStatus {
+ __be32 type;
+ __be32 nlink;
+ struct yfs_xdr_u64 size;
+ struct yfs_xdr_u64 data_version;
+ struct yfs_xdr_u64 author;
+ struct yfs_xdr_u64 owner;
+ struct yfs_xdr_u64 group;
+ __be32 mode;
+ __be32 caller_access;
+ __be32 anon_access;
+ struct yfs_xdr_vnode parent;
+ __be32 data_access_protocol;
+ struct yfs_xdr_u64 mtime_client;
+ struct yfs_xdr_u64 mtime_server;
+ __be32 lock_count;
+ __be32 abort_code;
+} __packed;
+
+struct yfs_xdr_YFSCallBack {
+ __be32 version;
+ struct yfs_xdr_u64 expiration_time;
+ __be32 type;
+} __packed;
+
+struct yfs_xdr_YFSStoreStatus {
+ __be32 mask;
+ __be32 mode;
+ struct yfs_xdr_u64 mtime_client;
+ struct yfs_xdr_u64 owner;
+ struct yfs_xdr_u64 group;
+} __packed;
+
+struct yfs_xdr_RPCFlags {
+ __be32 rpc_flags;
+} __packed;
+
+struct yfs_xdr_YFSVolSync {
+ struct yfs_xdr_u64 vol_creation_date;
+ struct yfs_xdr_u64 vol_update_date;
+ struct yfs_xdr_u64 max_quota;
+ struct yfs_xdr_u64 blocks_in_use;
+ struct yfs_xdr_u64 blocks_avail;
+} __packed;
+
+enum yfs_volume_type {
+ yfs_volume_type_ro = 0,
+ yfs_volume_type_rw = 1,
+};
+
+#define yfs_FVSOnline 0x1
+#define yfs_FVSInservice 0x2
+#define yfs_FVSBlessed 0x4
+#define yfs_FVSNeedsSalvage 0x8
+
+struct yfs_xdr_YFSFetchVolumeStatus {
+ struct yfs_xdr_u64 vid;
+ struct yfs_xdr_u64 parent_id;
+ __be32 flags;
+ __be32 type;
+ struct yfs_xdr_u64 max_quota;
+ struct yfs_xdr_u64 blocks_in_use;
+ struct yfs_xdr_u64 part_blocks_avail;
+ struct yfs_xdr_u64 part_max_blocks;
+ struct yfs_xdr_u64 vol_copy_date;
+ struct yfs_xdr_u64 vol_backup_date;
+} __packed;
+
+struct yfs_xdr_YFSStoreVolumeStatus {
+ __be32 mask;
+ struct yfs_xdr_u64 min_quota;
+ struct yfs_xdr_u64 max_quota;
+ struct yfs_xdr_u64 file_quota;
+} __packed;
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 1faef56b12bd..00504254c1c2 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -19,14 +19,6 @@
#include "afs_fs.h"
/*
- * Initialise a filesystem server cursor for iterating over FS servers.
- */
-static void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode)
-{
- memset(fc, 0, sizeof(*fc));
-}
-
-/*
* Begin an operation on the fileserver.
*
* Fileserver operations are serialised on the server by vnode, so we serialise
@@ -35,13 +27,14 @@ static void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode
bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
struct key *key)
{
- afs_init_fs_cursor(fc, vnode);
+ memset(fc, 0, sizeof(*fc));
fc->vnode = vnode;
fc->key = key;
fc->ac.error = SHRT_MAX;
+ fc->error = -EDESTADDRREQ;
if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
- fc->ac.error = -EINTR;
+ fc->error = -EINTR;
fc->flags |= AFS_FS_CURSOR_STOP;
return false;
}
@@ -65,12 +58,15 @@ static bool afs_start_fs_iteration(struct afs_fs_cursor *fc,
fc->server_list = afs_get_serverlist(vnode->volume->servers);
read_unlock(&vnode->volume->servers_lock);
+ fc->untried = (1UL << fc->server_list->nr_servers) - 1;
+ fc->index = READ_ONCE(fc->server_list->preferred);
+
cbi = vnode->cb_interest;
if (cbi) {
/* See if the vnode's preferred record is still available */
for (i = 0; i < fc->server_list->nr_servers; i++) {
if (fc->server_list->servers[i].cb_interest == cbi) {
- fc->start = i;
+ fc->index = i;
goto found_interest;
}
}
@@ -80,7 +76,7 @@ static bool afs_start_fs_iteration(struct afs_fs_cursor *fc,
* and have to return an error.
*/
if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
- fc->ac.error = -ESTALE;
+ fc->error = -ESTALE;
return false;
}
@@ -94,12 +90,9 @@ static bool afs_start_fs_iteration(struct afs_fs_cursor *fc,
afs_put_cb_interest(afs_v2net(vnode), cbi);
cbi = NULL;
- } else {
- fc->start = READ_ONCE(fc->server_list->index);
}
found_interest:
- fc->index = fc->start;
return true;
}
@@ -117,7 +110,7 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code)
default: m = "busy"; break;
}
- pr_notice("kAFS: Volume %u '%s' is %s\n", volume->vid, volume->name, m);
+ pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
}
/*
@@ -127,7 +120,7 @@ static bool afs_sleep_and_retry(struct afs_fs_cursor *fc)
{
msleep_interruptible(1000);
if (signal_pending(current)) {
- fc->ac.error = -ERESTARTSYS;
+ fc->error = -ERESTARTSYS;
return false;
}
@@ -143,27 +136,32 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
struct afs_addr_list *alist;
struct afs_server *server;
struct afs_vnode *vnode = fc->vnode;
+ u32 rtt, abort_code;
+ int error = fc->ac.error, i;
- _enter("%u/%u,%u/%u,%d,%d",
- fc->index, fc->start,
- fc->ac.index, fc->ac.start,
- fc->ac.error, fc->ac.abort_code);
+ _enter("%lx[%d],%lx[%d],%d,%d",
+ fc->untried, fc->index,
+ fc->ac.tried, fc->ac.index,
+ error, fc->ac.abort_code);
if (fc->flags & AFS_FS_CURSOR_STOP) {
_leave(" = f [stopped]");
return false;
}
+ fc->nr_iterations++;
+
/* Evaluate the result of the previous operation, if there was one. */
- switch (fc->ac.error) {
+ switch (error) {
case SHRT_MAX:
goto start;
case 0:
default:
/* Success or local failure. Stop. */
+ fc->error = error;
fc->flags |= AFS_FS_CURSOR_STOP;
- _leave(" = f [okay/local %d]", fc->ac.error);
+ _leave(" = f [okay/local %d]", error);
return false;
case -ECONNABORTED:
@@ -178,7 +176,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
* - May indicate that the fileserver couldn't attach to the vol.
*/
if (fc->flags & AFS_FS_CURSOR_VNOVOL) {
- fc->ac.error = -EREMOTEIO;
+ fc->error = -EREMOTEIO;
goto next_server;
}
@@ -187,12 +185,12 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
write_unlock(&vnode->volume->servers_lock);
set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
- fc->ac.error = afs_check_volume_status(vnode->volume, fc->key);
- if (fc->ac.error < 0)
- goto failed;
+ error = afs_check_volume_status(vnode->volume, fc->key);
+ if (error < 0)
+ goto failed_set_error;
if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) {
- fc->ac.error = -ENOMEDIUM;
+ fc->error = -ENOMEDIUM;
goto failed;
}
@@ -200,7 +198,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
* it's the fileserver having trouble.
*/
if (vnode->volume->servers == fc->server_list) {
- fc->ac.error = -EREMOTEIO;
+ fc->error = -EREMOTEIO;
goto next_server;
}
@@ -215,7 +213,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
case VONLINE:
case VDISKFULL:
case VOVERQUOTA:
- fc->ac.error = afs_abort_to_error(fc->ac.abort_code);
+ fc->error = afs_abort_to_error(fc->ac.abort_code);
goto next_server;
case VOFFLINE:
@@ -224,11 +222,11 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
}
if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
- fc->ac.error = -EADV;
+ fc->error = -EADV;
goto failed;
}
if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
- fc->ac.error = -ESTALE;
+ fc->error = -ESTALE;
goto failed;
}
goto busy;
@@ -240,7 +238,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
* have a file lock we need to maintain.
*/
if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
- fc->ac.error = -EBUSY;
+ fc->error = -EBUSY;
goto failed;
}
if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) {
@@ -269,16 +267,16 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
* honour, just in case someone sets up a loop.
*/
if (fc->flags & AFS_FS_CURSOR_VMOVED) {
- fc->ac.error = -EREMOTEIO;
+ fc->error = -EREMOTEIO;
goto failed;
}
fc->flags |= AFS_FS_CURSOR_VMOVED;
set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags);
set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
- fc->ac.error = afs_check_volume_status(vnode->volume, fc->key);
- if (fc->ac.error < 0)
- goto failed;
+ error = afs_check_volume_status(vnode->volume, fc->key);
+ if (error < 0)
+ goto failed_set_error;
/* If the server list didn't change, then the VLDB is
* out of sync with the fileservers. This is hopefully
@@ -290,7 +288,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
* TODO: Retry a few times with sleeps.
*/
if (vnode->volume->servers == fc->server_list) {
- fc->ac.error = -ENOMEDIUM;
+ fc->error = -ENOMEDIUM;
goto failed;
}
@@ -299,20 +297,25 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
default:
clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
- fc->ac.error = afs_abort_to_error(fc->ac.abort_code);
+ fc->error = afs_abort_to_error(fc->ac.abort_code);
goto failed;
}
+ case -ETIMEDOUT:
+ case -ETIME:
+ if (fc->error != -EDESTADDRREQ)
+ goto iterate_address;
+ /* Fall through */
case -ENETUNREACH:
case -EHOSTUNREACH:
case -ECONNREFUSED:
- case -ETIMEDOUT:
- case -ETIME:
_debug("no conn");
+ fc->error = error;
goto iterate_address;
case -ECONNRESET:
_debug("call reset");
+ fc->error = error;
goto failed;
}
@@ -328,15 +331,57 @@ start:
/* See if we need to do an update of the volume record. Note that the
* volume may have moved or even have been deleted.
*/
- fc->ac.error = afs_check_volume_status(vnode->volume, fc->key);
- if (fc->ac.error < 0)
- goto failed;
+ error = afs_check_volume_status(vnode->volume, fc->key);
+ if (error < 0)
+ goto failed_set_error;
if (!afs_start_fs_iteration(fc, vnode))
goto failed;
-use_server:
- _debug("use");
+ _debug("__ VOL %llx __", vnode->volume->vid);
+ error = afs_probe_fileservers(afs_v2net(vnode), fc->key, fc->server_list);
+ if (error < 0)
+ goto failed_set_error;
+
+pick_server:
+ _debug("pick [%lx]", fc->untried);
+
+ error = afs_wait_for_fs_probes(fc->server_list, fc->untried);
+ if (error < 0)
+ goto failed_set_error;
+
+ /* Pick the untried server with the lowest RTT. If we have outstanding
+ * callbacks, we stick with the server we're already using if we can.
+ */
+ if (fc->cbi) {
+ _debug("cbi %u", fc->index);
+ if (test_bit(fc->index, &fc->untried))
+ goto selected_server;
+ afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
+ fc->cbi = NULL;
+ _debug("nocbi");
+ }
+
+ fc->index = -1;
+ rtt = U32_MAX;
+ for (i = 0; i < fc->server_list->nr_servers; i++) {
+ struct afs_server *s = fc->server_list->servers[i].server;
+
+ if (!test_bit(i, &fc->untried) || !s->probe.responded)
+ continue;
+ if (s->probe.rtt < rtt) {
+ fc->index = i;
+ rtt = s->probe.rtt;
+ }
+ }
+
+ if (fc->index == -1)
+ goto no_more_servers;
+
+selected_server:
+ _debug("use %d", fc->index);
+ __clear_bit(fc->index, &fc->untried);
+
/* We're starting on a different fileserver from the list. We need to
* check it, create a callback intercept, find its address list and
* probe its capabilities before we use it.
@@ -354,10 +399,10 @@ use_server:
* break request before we've finished decoding the reply and
* installing the vnode.
*/
- fc->ac.error = afs_register_server_cb_interest(vnode, fc->server_list,
- fc->index);
- if (fc->ac.error < 0)
- goto failed;
+ error = afs_register_server_cb_interest(vnode, fc->server_list,
+ fc->index);
+ if (error < 0)
+ goto failed_set_error;
fc->cbi = afs_get_cb_interest(vnode->cb_interest);
@@ -369,66 +414,88 @@ use_server:
memset(&fc->ac, 0, sizeof(fc->ac));
- /* Probe the current fileserver if we haven't done so yet. */
- if (!test_bit(AFS_SERVER_FL_PROBED, &server->flags)) {
- fc->ac.alist = afs_get_addrlist(alist);
-
- if (!afs_probe_fileserver(fc)) {
- switch (fc->ac.error) {
- case -ENOMEM:
- case -ERESTARTSYS:
- case -EINTR:
- goto failed;
- default:
- goto next_server;
- }
- }
- }
-
if (!fc->ac.alist)
fc->ac.alist = alist;
else
afs_put_addrlist(alist);
- fc->ac.start = READ_ONCE(alist->index);
- fc->ac.index = fc->ac.start;
+ fc->ac.index = -1;
iterate_address:
ASSERT(fc->ac.alist);
- _debug("iterate %d/%d", fc->ac.index, fc->ac.alist->nr_addrs);
/* Iterate over the current server's address list to try and find an
* address on which it will respond to us.
*/
if (!afs_iterate_addresses(&fc->ac))
goto next_server;
+ _debug("address [%u] %u/%u", fc->index, fc->ac.index, fc->ac.alist->nr_addrs);
+
_leave(" = t");
return true;
next_server:
_debug("next");
afs_end_cursor(&fc->ac);
- afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
- fc->cbi = NULL;
- fc->index++;
- if (fc->index >= fc->server_list->nr_servers)
- fc->index = 0;
- if (fc->index != fc->start)
- goto use_server;
+ goto pick_server;
+no_more_servers:
/* That's all the servers poked to no good effect. Try again if some
* of them were busy.
*/
if (fc->flags & AFS_FS_CURSOR_VBUSY)
goto restart_from_beginning;
- fc->ac.error = -EDESTADDRREQ;
- goto failed;
+ abort_code = 0;
+ error = -EDESTADDRREQ;
+ for (i = 0; i < fc->server_list->nr_servers; i++) {
+ struct afs_server *s = fc->server_list->servers[i].server;
+ int probe_error = READ_ONCE(s->probe.error);
+
+ switch (probe_error) {
+ case 0:
+ continue;
+ default:
+ if (error == -ETIMEDOUT ||
+ error == -ETIME)
+ continue;
+ case -ETIMEDOUT:
+ case -ETIME:
+ if (error == -ENOMEM ||
+ error == -ENONET)
+ continue;
+ case -ENOMEM:
+ case -ENONET:
+ if (error == -ENETUNREACH)
+ continue;
+ case -ENETUNREACH:
+ if (error == -EHOSTUNREACH)
+ continue;
+ case -EHOSTUNREACH:
+ if (error == -ECONNREFUSED)
+ continue;
+ case -ECONNREFUSED:
+ if (error == -ECONNRESET)
+ continue;
+ case -ECONNRESET: /* Responded, but call expired. */
+ if (error == -ECONNABORTED)
+ continue;
+ case -ECONNABORTED:
+ abort_code = s->probe.abort_code;
+ error = probe_error;
+ continue;
+ }
+ }
+
+ if (error == -ECONNABORTED)
+ error = afs_abort_to_error(abort_code);
+failed_set_error:
+ fc->error = error;
failed:
fc->flags |= AFS_FS_CURSOR_STOP;
afs_end_cursor(&fc->ac);
- _leave(" = f [failed %d]", fc->ac.error);
+ _leave(" = f [failed %d]", fc->error);
return false;
}
@@ -442,13 +509,14 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc)
struct afs_vnode *vnode = fc->vnode;
struct afs_cb_interest *cbi = vnode->cb_interest;
struct afs_addr_list *alist;
+ int error = fc->ac.error;
_enter("");
- switch (fc->ac.error) {
+ switch (error) {
case SHRT_MAX:
if (!cbi) {
- fc->ac.error = -ESTALE;
+ fc->error = -ESTALE;
fc->flags |= AFS_FS_CURSOR_STOP;
return false;
}
@@ -461,25 +529,26 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc)
afs_get_addrlist(alist);
read_unlock(&cbi->server->fs_lock);
if (!alist) {
- fc->ac.error = -ESTALE;
+ fc->error = -ESTALE;
fc->flags |= AFS_FS_CURSOR_STOP;
return false;
}
memset(&fc->ac, 0, sizeof(fc->ac));
fc->ac.alist = alist;
- fc->ac.start = READ_ONCE(alist->index);
- fc->ac.index = fc->ac.start;
+ fc->ac.index = -1;
goto iterate_address;
case 0:
default:
/* Success or local failure. Stop. */
+ fc->error = error;
fc->flags |= AFS_FS_CURSOR_STOP;
- _leave(" = f [okay/local %d]", fc->ac.error);
+ _leave(" = f [okay/local %d]", error);
return false;
case -ECONNABORTED:
+ fc->error = afs_abort_to_error(fc->ac.abort_code);
fc->flags |= AFS_FS_CURSOR_STOP;
_leave(" = f [abort]");
return false;
@@ -490,6 +559,7 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc)
case -ETIMEDOUT:
case -ETIME:
_debug("no conn");
+ fc->error = error;
goto iterate_address;
}
@@ -507,12 +577,65 @@ iterate_address:
}
/*
+ * Dump cursor state in the case of the error being EDESTADDRREQ.
+ */
+static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc)
+{
+ static int count;
+ int i;
+
+ if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
+ return;
+ count++;
+
+ rcu_read_lock();
+
+ pr_notice("EDESTADDR occurred\n");
+ pr_notice("FC: cbb=%x cbb2=%x fl=%hx err=%hd\n",
+ fc->cb_break, fc->cb_break_2, fc->flags, fc->error);
+ pr_notice("FC: ut=%lx ix=%d ni=%u\n",
+ fc->untried, fc->index, fc->nr_iterations);
+
+ if (fc->server_list) {
+ const struct afs_server_list *sl = fc->server_list;
+ pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
+ sl->nr_servers, sl->preferred, sl->vnovol_mask);
+ for (i = 0; i < sl->nr_servers; i++) {
+ const struct afs_server *s = sl->servers[i].server;
+ pr_notice("FC: server fl=%lx av=%u %pU\n",
+ s->flags, s->addr_version, &s->uuid);
+ if (s->addresses) {
+ const struct afs_addr_list *a =
+ rcu_dereference(s->addresses);
+ pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n",
+ a->version,
+ a->nr_ipv4, a->nr_addrs, a->max_addrs,
+ a->preferred);
+ pr_notice("FC: - pr=%lx R=%lx F=%lx\n",
+ a->probed, a->responded, a->failed);
+ if (a == fc->ac.alist)
+ pr_notice("FC: - current\n");
+ }
+ }
+ }
+
+ pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
+ fc->ac.tried, fc->ac.index, fc->ac.abort_code, fc->ac.error,
+ fc->ac.responded, fc->ac.nr_iterations);
+ rcu_read_unlock();
+}
+
+/*
* Tidy up a filesystem cursor and unlock the vnode.
*/
int afs_end_vnode_operation(struct afs_fs_cursor *fc)
{
struct afs_net *net = afs_v2net(fc->vnode);
- int ret;
+
+ if (fc->error == -EDESTADDRREQ ||
+ fc->error == -ENETUNREACH ||
+ fc->error == -EHOSTUNREACH)
+ afs_dump_edestaddrreq(fc);
mutex_unlock(&fc->vnode->io_lock);
@@ -520,9 +643,8 @@ int afs_end_vnode_operation(struct afs_fs_cursor *fc)
afs_put_cb_interest(net, fc->cbi);
afs_put_serverlist(net, fc->server_list);
- ret = fc->ac.error;
- if (ret == -ECONNABORTED)
- afs_abort_to_error(fc->ac.abort_code);
+ if (fc->error == -ECONNABORTED)
+ fc->error = afs_abort_to_error(fc->ac.abort_code);
- return fc->ac.error;
+ return fc->error;
}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 35f2ae30f31f..59970886690f 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -16,6 +16,7 @@
#include <net/af_rxrpc.h>
#include "internal.h"
#include "afs_cm.h"
+#include "protocol_yfs.h"
struct workqueue_struct *afs_async_calls;
@@ -75,6 +76,18 @@ int afs_open_socket(struct afs_net *net)
if (ret < 0)
goto error_2;
+ srx.srx_service = YFS_CM_SERVICE;
+ ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
+ if (ret < 0)
+ goto error_2;
+
+ /* Ideally, we'd turn on service upgrade here, but we can't because
+ * OpenAFS is buggy and leaks the userStatus field from packet to
+ * packet and between FS packets and CB packets - so if we try to do an
+ * upgrade on an FS packet, OpenAFS will leak that into the CB packet
+ * it sends back to us.
+ */
+
rxrpc_kernel_new_call_notification(socket, afs_rx_new_call,
afs_rx_discard_new_call);
@@ -143,6 +156,7 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
INIT_WORK(&call->async_work, afs_process_async_call);
init_waitqueue_head(&call->waitq);
spin_lock_init(&call->state_lock);
+ call->_iter = &call->iter;
o = atomic_inc_return(&net->nr_outstanding_calls);
trace_afs_call(call, afs_call_trace_alloc, 1, o,
@@ -176,6 +190,7 @@ void afs_put_call(struct afs_call *call)
afs_put_server(call->net, call->cm_server);
afs_put_cb_interest(call->net, call->cbi);
+ afs_put_addrlist(call->alist);
kfree(call->request);
trace_afs_call(call, afs_call_trace_free, 0, o,
@@ -189,21 +204,22 @@ void afs_put_call(struct afs_call *call)
}
/*
- * Queue the call for actual work. Returns 0 unconditionally for convenience.
+ * Queue the call for actual work.
*/
-int afs_queue_call_work(struct afs_call *call)
+static void afs_queue_call_work(struct afs_call *call)
{
- int u = atomic_inc_return(&call->usage);
+ if (call->type->work) {
+ int u = atomic_inc_return(&call->usage);
- trace_afs_call(call, afs_call_trace_work, u,
- atomic_read(&call->net->nr_outstanding_calls),
- __builtin_return_address(0));
+ trace_afs_call(call, afs_call_trace_work, u,
+ atomic_read(&call->net->nr_outstanding_calls),
+ __builtin_return_address(0));
- INIT_WORK(&call->work, call->type->work);
+ INIT_WORK(&call->work, call->type->work);
- if (!queue_work(afs_wq, &call->work))
- afs_put_call(call);
- return 0;
+ if (!queue_work(afs_wq, &call->work))
+ afs_put_call(call);
+ }
}
/*
@@ -233,6 +249,7 @@ struct afs_call *afs_alloc_flat_call(struct afs_net *net,
goto nomem_free;
}
+ afs_extract_to_buf(call, call->reply_max);
call->operation_ID = type->op;
init_waitqueue_head(&call->waitq);
return call;
@@ -286,7 +303,7 @@ static void afs_load_bvec(struct afs_call *call, struct msghdr *msg,
offset = 0;
}
- iov_iter_bvec(&msg->msg_iter, WRITE | ITER_BVEC, bv, nr, bytes);
+ iov_iter_bvec(&msg->msg_iter, WRITE, bv, nr, bytes);
}
/*
@@ -342,7 +359,7 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg)
long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call,
gfp_t gfp, bool async)
{
- struct sockaddr_rxrpc *srx = ac->addr;
+ struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index];
struct rxrpc_call *rxcall;
struct msghdr msg;
struct kvec iov[1];
@@ -359,6 +376,8 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call,
atomic_read(&call->net->nr_outstanding_calls));
call->async = async;
+ call->addr_ix = ac->index;
+ call->alist = afs_get_addrlist(ac->alist);
/* Work out the length we're going to transmit. This is awkward for
* calls such as FS.StoreData where there's an extra injection of data
@@ -390,6 +409,7 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call,
call->debug_id);
if (IS_ERR(rxcall)) {
ret = PTR_ERR(rxcall);
+ call->error = ret;
goto error_kill_call;
}
@@ -401,8 +421,7 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call,
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1,
- call->request_size);
+ iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = MSG_WAITALL | (call->send_pages ? MSG_MORE : 0);
@@ -432,7 +451,7 @@ error_do_abort:
rxrpc_kernel_abort_call(call->net->socket, rxcall,
RX_USER_ABORT, ret, "KSD");
} else {
- iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, NULL, 0, 0);
+ iov_iter_kvec(&msg.msg_iter, READ, NULL, 0, 0);
rxrpc_kernel_recv_data(call->net->socket, rxcall,
&msg.msg_iter, false,
&call->abort_code, &call->service_id);
@@ -442,6 +461,8 @@ error_do_abort:
call->error = ret;
trace_afs_call_done(call);
error_kill_call:
+ if (call->type->done)
+ call->type->done(call);
afs_put_call(call);
ac->error = ret;
_leave(" = %d", ret);
@@ -466,14 +487,12 @@ static void afs_deliver_to_call(struct afs_call *call)
state == AFS_CALL_SV_AWAIT_ACK
) {
if (state == AFS_CALL_SV_AWAIT_ACK) {
- struct iov_iter iter;
-
- iov_iter_kvec(&iter, READ | ITER_KVEC, NULL, 0, 0);
+ iov_iter_kvec(&call->iter, READ, NULL, 0, 0);
ret = rxrpc_kernel_recv_data(call->net->socket,
- call->rxcall, &iter, false,
- &remote_abort,
+ call->rxcall, &call->iter,
+ false, &remote_abort,
&call->service_id);
- trace_afs_recv_data(call, 0, 0, false, ret);
+ trace_afs_receive_data(call, &call->iter, false, ret);
if (ret == -EINPROGRESS || ret == -EAGAIN)
return;
@@ -485,10 +504,17 @@ static void afs_deliver_to_call(struct afs_call *call)
return;
}
+ if (call->want_reply_time &&
+ rxrpc_kernel_get_reply_time(call->net->socket,
+ call->rxcall,
+ &call->reply_time))
+ call->want_reply_time = false;
+
ret = call->type->deliver(call);
state = READ_ONCE(call->state);
switch (ret) {
case 0:
+ afs_queue_call_work(call);
if (state == AFS_CALL_CL_PROC_REPLY) {
if (call->cbi)
set_bit(AFS_SERVER_FL_MAY_HAVE_CB,
@@ -500,7 +526,6 @@ static void afs_deliver_to_call(struct afs_call *call)
case -EINPROGRESS:
case -EAGAIN:
goto out;
- case -EIO:
case -ECONNABORTED:
ASSERTCMP(state, ==, AFS_CALL_COMPLETE);
goto done;
@@ -509,6 +534,10 @@ static void afs_deliver_to_call(struct afs_call *call)
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
abort_code, ret, "KIV");
goto local_abort;
+ case -EIO:
+ pr_err("kAFS: Call %u in bad state %u\n",
+ call->debug_id, state);
+ /* Fall through */
case -ENODATA:
case -EBADMSG:
case -EMSGSIZE:
@@ -517,12 +546,14 @@ static void afs_deliver_to_call(struct afs_call *call)
if (state != AFS_CALL_CL_AWAIT_REPLY)
abort_code = RXGEN_SS_UNMARSHAL;
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
- abort_code, -EBADMSG, "KUM");
+ abort_code, ret, "KUM");
goto local_abort;
}
}
done:
+ if (call->type->done)
+ call->type->done(call);
if (state == AFS_CALL_COMPLETE && call->incoming)
afs_put_call(call);
out:
@@ -690,8 +721,6 @@ static void afs_process_async_call(struct work_struct *work)
}
if (call->state == AFS_CALL_COMPLETE) {
- call->reply[0] = NULL;
-
/* We have two refs to release - one from the alloc and one
* queued with the work item - and we can't just deallocate the
* call because the work item may be queued again.
@@ -730,6 +759,7 @@ void afs_charge_preallocation(struct work_struct *work)
call->async = true;
call->state = AFS_CALL_SV_AWAIT_OP_ID;
init_waitqueue_head(&call->waitq);
+ afs_extract_to_tmp(call);
}
if (rxrpc_kernel_charge_accept(net->socket,
@@ -775,18 +805,15 @@ static int afs_deliver_cm_op_id(struct afs_call *call)
{
int ret;
- _enter("{%zu}", call->offset);
-
- ASSERTCMP(call->offset, <, 4);
+ _enter("{%zu}", iov_iter_count(call->_iter));
/* the operation ID forms the first four bytes of the request data */
- ret = afs_extract_data(call, &call->tmp, 4, true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
call->operation_ID = ntohl(call->tmp);
afs_set_call_state(call, AFS_CALL_SV_AWAIT_OP_ID, AFS_CALL_SV_AWAIT_REQUEST);
- call->offset = 0;
/* ask the cache manager to route the call (it'll change the call type
* if successful) */
@@ -827,7 +854,7 @@ void afs_send_empty_reply(struct afs_call *call)
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, NULL, 0, 0);
+ iov_iter_kvec(&msg.msg_iter, WRITE, NULL, 0, 0);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
@@ -866,7 +893,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
iov[0].iov_len = len;
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, len);
+ iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
@@ -890,30 +917,19 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
/*
* Extract a piece of data from the received data socket buffers.
*/
-int afs_extract_data(struct afs_call *call, void *buf, size_t count,
- bool want_more)
+int afs_extract_data(struct afs_call *call, bool want_more)
{
struct afs_net *net = call->net;
- struct iov_iter iter;
- struct kvec iov;
+ struct iov_iter *iter = call->_iter;
enum afs_call_state state;
u32 remote_abort = 0;
int ret;
- _enter("{%s,%zu},,%zu,%d",
- call->type->name, call->offset, count, want_more);
-
- ASSERTCMP(call->offset, <=, count);
-
- iov.iov_base = buf + call->offset;
- iov.iov_len = count - call->offset;
- iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, count - call->offset);
+ _enter("{%s,%zu},%d", call->type->name, iov_iter_count(iter), want_more);
- ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, &iter,
+ ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, iter,
want_more, &remote_abort,
&call->service_id);
- call->offset += (count - call->offset) - iov_iter_count(&iter);
- trace_afs_recv_data(call, count, call->offset, want_more, ret);
if (ret == 0 || ret == -EAGAIN)
return ret;
@@ -928,7 +944,7 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count,
break;
case AFS_CALL_COMPLETE:
kdebug("prem complete %d", call->error);
- return -EIO;
+ return afs_io_error(call, afs_io_error_extract);
default:
break;
}
@@ -942,8 +958,9 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count,
/*
* Log protocol error production.
*/
-noinline int afs_protocol_error(struct afs_call *call, int error)
+noinline int afs_protocol_error(struct afs_call *call, int error,
+ enum afs_eproto_cause cause)
{
- trace_afs_protocol_error(call, error, __builtin_return_address(0));
+ trace_afs_protocol_error(call, error, cause);
return error;
}
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 81dfedb7879f..5f58a9a17e69 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -126,7 +126,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
bool changed = false;
int i, j;
- _enter("{%x:%u},%x,%x",
+ _enter("{%llx:%llu},%x,%x",
vnode->fid.vid, vnode->fid.vnode, key_serial(key), caller_access);
rcu_read_lock();
@@ -147,7 +147,8 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
break;
}
- if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest)) {
+ if (afs_cb_is_broken(cb_break, vnode,
+ vnode->cb_interest)) {
changed = true;
break;
}
@@ -177,7 +178,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
}
}
- if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest))
+ if (afs_cb_is_broken(cb_break, vnode, vnode->cb_interest))
goto someone_else_changed_it;
/* We need a ref on any permits list we want to copy as we'll have to
@@ -256,7 +257,7 @@ found:
spin_lock(&vnode->lock);
zap = rcu_access_pointer(vnode->permit_cache);
- if (cb_break == afs_cb_break_sum(vnode, vnode->cb_interest) &&
+ if (!afs_cb_is_broken(cb_break, vnode, vnode->cb_interest) &&
zap == permits)
rcu_assign_pointer(vnode->permit_cache, replacement);
else
@@ -289,7 +290,7 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key,
bool valid = false;
int i, ret;
- _enter("{%x:%u},%x",
+ _enter("{%llx:%llu},%x",
vnode->fid.vid, vnode->fid.vnode, key_serial(key));
/* check the permits to see if we've got one yet */
@@ -349,7 +350,7 @@ int afs_permission(struct inode *inode, int mask)
if (mask & MAY_NOT_BLOCK)
return -ECHILD;
- _enter("{{%x:%u},%lx},%x,",
+ _enter("{{%llx:%llu},%lx},%x,",
vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask);
key = afs_request_key(vnode->volume->cell);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 1d329e6981d5..642afa2e9783 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -13,6 +13,7 @@
#include <linux/slab.h>
#include "afs_fs.h"
#include "internal.h"
+#include "protocol_yfs.h"
static unsigned afs_server_gc_delay = 10; /* Server record timeout in seconds */
static unsigned afs_server_update_delay = 30; /* Time till VLDB recheck in secs */
@@ -230,6 +231,8 @@ static struct afs_server *afs_alloc_server(struct afs_net *net,
rwlock_init(&server->fs_lock);
INIT_HLIST_HEAD(&server->cb_volumes);
rwlock_init(&server->cb_break_lock);
+ init_waitqueue_head(&server->probe_wq);
+ spin_lock_init(&server->probe_lock);
afs_inc_servers_outstanding(net);
_leave(" = %p", server);
@@ -246,41 +249,23 @@ enomem:
static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell,
struct key *key, const uuid_t *uuid)
{
- struct afs_addr_cursor ac;
- struct afs_addr_list *alist;
+ struct afs_vl_cursor vc;
+ struct afs_addr_list *alist = NULL;
int ret;
- ret = afs_set_vl_cursor(&ac, cell);
- if (ret < 0)
- return ERR_PTR(ret);
-
- while (afs_iterate_addresses(&ac)) {
- if (test_bit(ac.index, &ac.alist->yfs))
- alist = afs_yfsvl_get_endpoints(cell->net, &ac, key, uuid);
- else
- alist = afs_vl_get_addrs_u(cell->net, &ac, key, uuid);
- switch (ac.error) {
- case 0:
- afs_end_cursor(&ac);
- return alist;
- case -ECONNABORTED:
- ac.error = afs_abort_to_error(ac.abort_code);
- goto error;
- case -ENOMEM:
- case -ENONET:
- goto error;
- case -ENETUNREACH:
- case -EHOSTUNREACH:
- case -ECONNREFUSED:
- break;
- default:
- ac.error = -EIO;
- goto error;
+ ret = -ERESTARTSYS;
+ if (afs_begin_vlserver_operation(&vc, cell, key)) {
+ while (afs_select_vlserver(&vc)) {
+ if (test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags))
+ alist = afs_yfsvl_get_endpoints(&vc, uuid);
+ else
+ alist = afs_vl_get_addrs_u(&vc, uuid);
}
+
+ ret = afs_end_vlserver_operation(&vc);
}
-error:
- return ERR_PTR(afs_end_cursor(&ac));
+ return ret < 0 ? ERR_PTR(ret) : alist;
}
/*
@@ -382,9 +367,7 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
struct afs_addr_list *alist = rcu_access_pointer(server->addresses);
struct afs_addr_cursor ac = {
.alist = alist,
- .start = alist->index,
- .index = 0,
- .addr = &alist->addrs[alist->index],
+ .index = alist->preferred,
.error = 0,
};
_enter("%p", server);
@@ -392,6 +375,9 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
afs_fs_give_up_all_callbacks(net, server, &ac, NULL);
+ wait_var_event(&server->probe_outstanding,
+ atomic_read(&server->probe_outstanding) == 0);
+
call_rcu(&server->rcu, afs_server_rcu);
afs_dec_servers_outstanding(net);
}
@@ -525,99 +511,6 @@ void afs_purge_servers(struct afs_net *net)
}
/*
- * Probe a fileserver to find its capabilities.
- *
- * TODO: Try service upgrade.
- */
-static bool afs_do_probe_fileserver(struct afs_fs_cursor *fc)
-{
- _enter("");
-
- fc->ac.addr = NULL;
- fc->ac.start = READ_ONCE(fc->ac.alist->index);
- fc->ac.index = fc->ac.start;
- fc->ac.error = 0;
- fc->ac.begun = false;
-
- while (afs_iterate_addresses(&fc->ac)) {
- afs_fs_get_capabilities(afs_v2net(fc->vnode), fc->cbi->server,
- &fc->ac, fc->key);
- switch (fc->ac.error) {
- case 0:
- afs_end_cursor(&fc->ac);
- set_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags);
- return true;
- case -ECONNABORTED:
- fc->ac.error = afs_abort_to_error(fc->ac.abort_code);
- goto error;
- case -ENOMEM:
- case -ENONET:
- goto error;
- case -ENETUNREACH:
- case -EHOSTUNREACH:
- case -ECONNREFUSED:
- case -ETIMEDOUT:
- case -ETIME:
- break;
- default:
- fc->ac.error = -EIO;
- goto error;
- }
- }
-
-error:
- afs_end_cursor(&fc->ac);
- return false;
-}
-
-/*
- * If we haven't already, try probing the fileserver to get its capabilities.
- * We try not to instigate parallel probes, but it's possible that the parallel
- * probes will fail due to authentication failure when ours would succeed.
- *
- * TODO: Try sending an anonymous probe if an authenticated probe fails.
- */
-bool afs_probe_fileserver(struct afs_fs_cursor *fc)
-{
- bool success;
- int ret, retries = 0;
-
- _enter("");
-
-retry:
- if (test_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags)) {
- _leave(" = t");
- return true;
- }
-
- if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags)) {
- success = afs_do_probe_fileserver(fc);
- clear_bit_unlock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags);
- wake_up_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING);
- _leave(" = t");
- return success;
- }
-
- _debug("wait");
- ret = wait_on_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING,
- TASK_INTERRUPTIBLE);
- if (ret == -ERESTARTSYS) {
- fc->ac.error = ret;
- _leave(" = f [%d]", ret);
- return false;
- }
-
- retries++;
- if (retries == 4) {
- fc->ac.error = -ESTALE;
- _leave(" = f [stale]");
- return false;
- }
- _debug("retry");
- goto retry;
-}
-
-/*
* Get an update for a server's address list.
*/
static noinline bool afs_update_server_record(struct afs_fs_cursor *fc, struct afs_server *server)
diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c
index 8a5760aa5832..95d0761cdb34 100644
--- a/fs/afs/server_list.c
+++ b/fs/afs/server_list.c
@@ -118,11 +118,11 @@ bool afs_annotate_server_list(struct afs_server_list *new,
return false;
changed:
- /* Maintain the same current server as before if possible. */
- cur = old->servers[old->index].server;
+ /* Maintain the same preferred server as before if possible. */
+ cur = old->servers[old->preferred].server;
for (j = 0; j < new->nr_servers; j++) {
if (new->servers[j].server == cur) {
- new->index = j;
+ new->preferred = j;
break;
}
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 4d3e274207fb..dcd07fe99871 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -406,10 +406,11 @@ static int afs_fill_super(struct super_block *sb,
inode = afs_iget_pseudo_dir(sb, true);
sb->s_flags |= SB_RDONLY;
} else {
- sprintf(sb->s_id, "%u", as->volume->vid);
+ sprintf(sb->s_id, "%llu", as->volume->vid);
afs_activate_volume(as->volume);
fid.vid = as->volume->vid;
fid.vnode = 1;
+ fid.vnode_hi = 0;
fid.unique = 1;
inode = afs_iget(sb, params->key, &fid, NULL, NULL, NULL);
}
@@ -663,7 +664,7 @@ static void afs_destroy_inode(struct inode *inode)
{
struct afs_vnode *vnode = AFS_FS_I(inode);
- _enter("%p{%x:%u}", inode, vnode->fid.vid, vnode->fid.vnode);
+ _enter("%p{%llx:%llu}", inode, vnode->fid.vid, vnode->fid.vnode);
_debug("DESTROY INODE %p", inode);
diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c
new file mode 100644
index 000000000000..b4f1a84519b9
--- /dev/null
+++ b/fs/afs/vl_list.c
@@ -0,0 +1,340 @@
+/* AFS vlserver list management.
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len,
+ unsigned short port)
+{
+ struct afs_vlserver *vlserver;
+
+ vlserver = kzalloc(struct_size(vlserver, name, name_len + 1),
+ GFP_KERNEL);
+ if (vlserver) {
+ atomic_set(&vlserver->usage, 1);
+ rwlock_init(&vlserver->lock);
+ init_waitqueue_head(&vlserver->probe_wq);
+ spin_lock_init(&vlserver->probe_lock);
+ vlserver->name_len = name_len;
+ vlserver->port = port;
+ memcpy(vlserver->name, name, name_len);
+ }
+ return vlserver;
+}
+
+static void afs_vlserver_rcu(struct rcu_head *rcu)
+{
+ struct afs_vlserver *vlserver = container_of(rcu, struct afs_vlserver, rcu);
+
+ afs_put_addrlist(rcu_access_pointer(vlserver->addresses));
+ kfree_rcu(vlserver, rcu);
+}
+
+void afs_put_vlserver(struct afs_net *net, struct afs_vlserver *vlserver)
+{
+ if (vlserver) {
+ unsigned int u = atomic_dec_return(&vlserver->usage);
+ //_debug("VL PUT %p{%u}", vlserver, u);
+
+ if (u == 0)
+ call_rcu(&vlserver->rcu, afs_vlserver_rcu);
+ }
+}
+
+struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int nr_servers)
+{
+ struct afs_vlserver_list *vllist;
+
+ vllist = kzalloc(struct_size(vllist, servers, nr_servers), GFP_KERNEL);
+ if (vllist) {
+ atomic_set(&vllist->usage, 1);
+ rwlock_init(&vllist->lock);
+ }
+
+ return vllist;
+}
+
+void afs_put_vlserverlist(struct afs_net *net, struct afs_vlserver_list *vllist)
+{
+ if (vllist) {
+ unsigned int u = atomic_dec_return(&vllist->usage);
+
+ //_debug("VLLS PUT %p{%u}", vllist, u);
+ if (u == 0) {
+ int i;
+
+ for (i = 0; i < vllist->nr_servers; i++) {
+ afs_put_vlserver(net, vllist->servers[i].server);
+ }
+ kfree_rcu(vllist, rcu);
+ }
+ }
+}
+
+static u16 afs_extract_le16(const u8 **_b)
+{
+ u16 val;
+
+ val = (u16)*(*_b)++ << 0;
+ val |= (u16)*(*_b)++ << 8;
+ return val;
+}
+
+/*
+ * Build a VL server address list from a DNS queried server list.
+ */
+static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
+ u8 nr_addrs, u16 port)
+{
+ struct afs_addr_list *alist;
+ const u8 *b = *_b;
+ int ret = -EINVAL;
+
+ alist = afs_alloc_addrlist(nr_addrs, VL_SERVICE, port);
+ if (!alist)
+ return ERR_PTR(-ENOMEM);
+ if (nr_addrs == 0)
+ return alist;
+
+ for (; nr_addrs > 0 && end - b >= nr_addrs; nr_addrs--) {
+ struct dns_server_list_v1_address hdr;
+ __be32 x[4];
+
+ hdr.address_type = *b++;
+
+ switch (hdr.address_type) {
+ case DNS_ADDRESS_IS_IPV4:
+ if (end - b < 4) {
+ _leave(" = -EINVAL [short inet]");
+ goto error;
+ }
+ memcpy(x, b, 4);
+ afs_merge_fs_addr4(alist, x[0], port);
+ b += 4;
+ break;
+
+ case DNS_ADDRESS_IS_IPV6:
+ if (end - b < 16) {
+ _leave(" = -EINVAL [short inet6]");
+ goto error;
+ }
+ memcpy(x, b, 16);
+ afs_merge_fs_addr6(alist, x, port);
+ b += 16;
+ break;
+
+ default:
+ _leave(" = -EADDRNOTAVAIL [unknown af %u]",
+ hdr.address_type);
+ ret = -EADDRNOTAVAIL;
+ goto error;
+ }
+ }
+
+ /* Start with IPv6 if available. */
+ if (alist->nr_ipv4 < alist->nr_addrs)
+ alist->preferred = alist->nr_ipv4;
+
+ *_b = b;
+ return alist;
+
+error:
+ *_b = b;
+ afs_put_addrlist(alist);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Build a VL server list from a DNS queried server list.
+ */
+struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
+ const void *buffer,
+ size_t buffer_size)
+{
+ const struct dns_server_list_v1_header *hdr = buffer;
+ struct dns_server_list_v1_server bs;
+ struct afs_vlserver_list *vllist, *previous;
+ struct afs_addr_list *addrs;
+ struct afs_vlserver *server;
+ const u8 *b = buffer, *end = buffer + buffer_size;
+ int ret = -ENOMEM, nr_servers, i, j;
+
+ _enter("");
+
+ /* Check that it's a server list, v1 */
+ if (end - b < sizeof(*hdr) ||
+ hdr->hdr.content != DNS_PAYLOAD_IS_SERVER_LIST ||
+ hdr->hdr.version != 1) {
+ pr_notice("kAFS: Got DNS record [%u,%u] len %zu\n",
+ hdr->hdr.content, hdr->hdr.version, end - b);
+ ret = -EDESTADDRREQ;
+ goto dump;
+ }
+
+ nr_servers = hdr->nr_servers;
+
+ vllist = afs_alloc_vlserver_list(nr_servers);
+ if (!vllist)
+ return ERR_PTR(-ENOMEM);
+
+ vllist->source = (hdr->source < NR__dns_record_source) ?
+ hdr->source : NR__dns_record_source;
+ vllist->status = (hdr->status < NR__dns_lookup_status) ?
+ hdr->status : NR__dns_lookup_status;
+
+ read_lock(&cell->vl_servers_lock);
+ previous = afs_get_vlserverlist(
+ rcu_dereference_protected(cell->vl_servers,
+ lockdep_is_held(&cell->vl_servers_lock)));
+ read_unlock(&cell->vl_servers_lock);
+
+ b += sizeof(*hdr);
+ while (end - b >= sizeof(bs)) {
+ bs.name_len = afs_extract_le16(&b);
+ bs.priority = afs_extract_le16(&b);
+ bs.weight = afs_extract_le16(&b);
+ bs.port = afs_extract_le16(&b);
+ bs.source = *b++;
+ bs.status = *b++;
+ bs.protocol = *b++;
+ bs.nr_addrs = *b++;
+
+ _debug("extract %u %u %u %u %u %u %*.*s",
+ bs.name_len, bs.priority, bs.weight,
+ bs.port, bs.protocol, bs.nr_addrs,
+ bs.name_len, bs.name_len, b);
+
+ if (end - b < bs.name_len)
+ break;
+
+ ret = -EPROTONOSUPPORT;
+ if (bs.protocol == DNS_SERVER_PROTOCOL_UNSPECIFIED) {
+ bs.protocol = DNS_SERVER_PROTOCOL_UDP;
+ } else if (bs.protocol != DNS_SERVER_PROTOCOL_UDP) {
+ _leave(" = [proto %u]", bs.protocol);
+ goto error;
+ }
+
+ if (bs.port == 0)
+ bs.port = AFS_VL_PORT;
+ if (bs.source > NR__dns_record_source)
+ bs.source = NR__dns_record_source;
+ if (bs.status > NR__dns_lookup_status)
+ bs.status = NR__dns_lookup_status;
+
+ server = NULL;
+ if (previous) {
+ /* See if we can update an old server record */
+ for (i = 0; i < previous->nr_servers; i++) {
+ struct afs_vlserver *p = previous->servers[i].server;
+
+ if (p->name_len == bs.name_len &&
+ p->port == bs.port &&
+ strncasecmp(b, p->name, bs.name_len) == 0) {
+ server = afs_get_vlserver(p);
+ break;
+ }
+ }
+ }
+
+ if (!server) {
+ ret = -ENOMEM;
+ server = afs_alloc_vlserver(b, bs.name_len, bs.port);
+ if (!server)
+ goto error;
+ }
+
+ b += bs.name_len;
+
+ /* Extract the addresses - note that we can't skip this as we
+ * have to advance the payload pointer.
+ */
+ addrs = afs_extract_vl_addrs(&b, end, bs.nr_addrs, bs.port);
+ if (IS_ERR(addrs)) {
+ ret = PTR_ERR(addrs);
+ goto error_2;
+ }
+
+ if (vllist->nr_servers >= nr_servers) {
+ _debug("skip %u >= %u", vllist->nr_servers, nr_servers);
+ afs_put_addrlist(addrs);
+ afs_put_vlserver(cell->net, server);
+ continue;
+ }
+
+ addrs->source = bs.source;
+ addrs->status = bs.status;
+
+ if (addrs->nr_addrs == 0) {
+ afs_put_addrlist(addrs);
+ if (!rcu_access_pointer(server->addresses)) {
+ afs_put_vlserver(cell->net, server);
+ continue;
+ }
+ } else {
+ struct afs_addr_list *old = addrs;
+
+ write_lock(&server->lock);
+ rcu_swap_protected(server->addresses, old,
+ lockdep_is_held(&server->lock));
+ write_unlock(&server->lock);
+ afs_put_addrlist(old);
+ }
+
+
+ /* TODO: Might want to check for duplicates */
+
+ /* Insertion-sort by priority and weight */
+ for (j = 0; j < vllist->nr_servers; j++) {
+ if (bs.priority < vllist->servers[j].priority)
+ break; /* Lower preferable */
+ if (bs.priority == vllist->servers[j].priority &&
+ bs.weight > vllist->servers[j].weight)
+ break; /* Higher preferable */
+ }
+
+ if (j < vllist->nr_servers) {
+ memmove(vllist->servers + j + 1,
+ vllist->servers + j,
+ (vllist->nr_servers - j) * sizeof(struct afs_vlserver_entry));
+ }
+
+ clear_bit(AFS_VLSERVER_FL_PROBED, &server->flags);
+
+ vllist->servers[j].priority = bs.priority;
+ vllist->servers[j].weight = bs.weight;
+ vllist->servers[j].server = server;
+ vllist->nr_servers++;
+ }
+
+ if (b != end) {
+ _debug("parse error %zd", b - end);
+ goto error;
+ }
+
+ afs_put_vlserverlist(cell->net, previous);
+ _leave(" = ok [%u]", vllist->nr_servers);
+ return vllist;
+
+error_2:
+ afs_put_vlserver(cell->net, server);
+error:
+ afs_put_vlserverlist(cell->net, vllist);
+ afs_put_vlserverlist(cell->net, previous);
+dump:
+ if (ret != -ENOMEM) {
+ printk(KERN_DEBUG "DNS: at %zu\n", (const void *)b - buffer);
+ print_hex_dump_bytes("DNS: ", DUMP_PREFIX_NONE, buffer, buffer_size);
+ }
+ return ERR_PTR(ret);
+}
diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
new file mode 100644
index 000000000000..c0f616bd70cb
--- /dev/null
+++ b/fs/afs/vl_probe.c
@@ -0,0 +1,273 @@
+/* AFS vlserver probing
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "afs_fs.h"
+#include "internal.h"
+#include "protocol_yfs.h"
+
+static bool afs_vl_probe_done(struct afs_vlserver *server)
+{ /* Account for one finished probe of @server.  Returns false while other
+   * probes of the server are still outstanding; returns true for the last
+   * one, after releasing AFS_VLSERVER_FL_PROBING and waking anyone waiting
+   * on the outstanding count or on the flag bit.
+   */
+ if (!atomic_dec_and_test(&server->probe_outstanding))
+ return false;
+
+ wake_up_var(&server->probe_outstanding);
+ clear_bit_unlock(AFS_VLSERVER_FL_PROBING, &server->flags); /* Taken with test_and_set_bit_lock() in afs_send_vl_probes(). */
+ wake_up_bit(&server->flags, AFS_VLSERVER_FL_PROBING);
+ return true;
+}
+
+/*
+ * Process the result of probing a vlserver. This is called after successful
+ * or failed delivery of a VL.GetCapabilities operation.
+ *
+ * The server is taken from call->reply[0], its index in the server list from
+ * call->reply[1] and the index of the probed address from call->addr_ix.
+ *
+ * Under server->probe_lock the outcome in call->error is classified:
+ *  - 0 and -ECONNABORTED count as a response from the address;
+ *  - -ENOMEM and -ENONET are recorded as a local failure;
+ *  - anything else marks the address as failed.
+ *
+ * For a response, the YFS/non-YFS state is noted (the address's service ID
+ * is updated too, unless this is a non-YFS reply and a YFS reply has already
+ * been seen).  If the scaled RTT is the lowest seen so far it is stored and
+ * this address becomes the address list's preferred one.
+ *
+ * Finally the probe is accounted for with afs_vl_probe_done(); waiters are
+ * woken if a better RTT was recorded or this was the last outstanding probe.
+ */
+void afs_vlserver_probe_result(struct afs_call *call)
+{
+ struct afs_addr_list *alist = call->alist;
+ struct afs_vlserver *server = call->reply[0];
+ unsigned int server_index = (long)call->reply[1];
+ unsigned int index = call->addr_ix;
+ unsigned int rtt = UINT_MAX;
+ bool have_result = false;
+ u64 _rtt;
+ int ret = call->error;
+
+ _enter("%s,%u,%u,%d,%d", server->name, server_index, index, ret, call->abort_code);
+
+ spin_lock(&server->probe_lock);
+
+ switch (ret) {
+ case 0:
+ server->probe.error = 0;
+ goto responded;
+ case -ECONNABORTED:
+ if (!server->probe.responded) { /* Keep the abort only if nothing has responded yet. */
+ server->probe.abort_code = call->abort_code;
+ server->probe.error = ret;
+ }
+ goto responded;
+ case -ENOMEM:
+ case -ENONET:
+ server->probe.local_failure = true;
+ afs_io_error(call, afs_io_error_vl_probe_fail);
+ goto out;
+ case -ECONNRESET: /* Responded, but call expired. */
+ case -ENETUNREACH:
+ case -EHOSTUNREACH:
+ case -ECONNREFUSED:
+ case -ETIMEDOUT:
+ case -ETIME:
+ default:
+ clear_bit(index, &alist->responded);
+ set_bit(index, &alist->failed);
+ if (!server->probe.responded &&
+ (server->probe.error == 0 ||
+ server->probe.error == -ETIMEDOUT ||
+ server->probe.error == -ETIME))
+ server->probe.error = ret; /* Only replaces no error or a timeout, and only before any response. */
+ afs_io_error(call, afs_io_error_vl_probe_fail);
+ goto out;
+ }
+
+responded:
+ set_bit(index, &alist->responded);
+ clear_bit(index, &alist->failed);
+
+ if (call->service_id == YFS_VL_SERVICE) {
+ server->probe.is_yfs = true;
+ set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
+ alist->addrs[index].srx_service = call->service_id;
+ } else {
+ server->probe.not_yfs = true;
+ if (!server->probe.is_yfs) { /* A YFS reply already seen in this probe round takes precedence. */
+ clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
+ alist->addrs[index].srx_service = call->service_id;
+ }
+ }
+
+ /* Get the RTT and scale it to fit into a 32-bit value that represents
+ * over a minute of time so that we can access it with one instruction
+ * on a 32-bit system.
+ */
+ _rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall);
+ _rtt /= 64;
+ rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt; /* Clamp rather than truncate. */
+ if (rtt < server->probe.rtt) {
+ server->probe.rtt = rtt;
+ alist->preferred = index;
+ have_result = true;
+ }
+
+ smp_wmb(); /* Set rtt before responded. */
+ server->probe.responded = true;
+ set_bit(AFS_VLSERVER_FL_PROBED, &server->flags);
+out:
+ spin_unlock(&server->probe_lock);
+
+ _debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
+ server_index, index, &alist->addrs[index].transport,
+ (unsigned int)rtt, ret);
+
+ have_result |= afs_vl_probe_done(server); /* Also report when this was the last outstanding probe. */
+ if (have_result) {
+ server->probe.have_result = true;
+ wake_up_var(&server->probe.have_result);
+ wake_up_all(&server->probe_wq);
+ }
+}
+
+/*
+ * Probe all of a vlserver's addresses to find out the best route and to
+ * query its capabilities.
+ *
+ * The server's current address list is sampled under server->lock, the
+ * outstanding-probe count is set to the number of addresses and the probe
+ * record is reset (with rtt at UINT_MAX).  An asynchronous
+ * VL.GetCapabilities call is then dispatched to each address in turn.
+ *
+ * Returns 0 if every dispatch returned -EINPROGRESS (or the list had no
+ * addresses).  Otherwise dispatching stops at the first other return value,
+ * one probe is marked done and that value is returned.
+ *
+ * NOTE(review): no reference is taken on the address list here before the
+ * lock is dropped - confirm what keeps it alive for the loop.
+ * NOTE(review): on early return the addresses that were never tried are
+ * still counted in probe_outstanding - confirm that the count is drained
+ * elsewhere, otherwise AFS_VLSERVER_FL_PROBING would never be cleared.
+ */
+static int afs_do_probe_vlserver(struct afs_net *net,
+ struct afs_vlserver *server,
+ struct key *key,
+ unsigned int server_index)
+{
+ struct afs_addr_cursor ac = {
+ .index = 0,
+ };
+ int ret;
+
+ _enter("%s", server->name);
+
+ read_lock(&server->lock);
+ ac.alist = rcu_dereference_protected(server->addresses,
+ lockdep_is_held(&server->lock));
+ read_unlock(&server->lock);
+
+ atomic_set(&server->probe_outstanding, ac.alist->nr_addrs); /* One outstanding probe per address. */
+ memset(&server->probe, 0, sizeof(server->probe));
+ server->probe.rtt = UINT_MAX; /* So that any measured RTT compares lower. */
+
+ for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) {
+ ret = afs_vl_get_capabilities(net, &ac, key, server,
+ server_index, true);
+ if (ret != -EINPROGRESS) {
+ afs_vl_probe_done(server);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Send off probes to all unprobed servers.
+ *
+ * Servers in @vllist that already have AFS_VLSERVER_FL_PROBED set are
+ * skipped.  For the others, the AFS_VLSERVER_FL_PROBING bit is used as a
+ * lock: only the caller that manages to set it starts the probes; if the
+ * bit was already set the server is left to whoever set it.
+ *
+ * Returns 0, or the first non-zero value returned by
+ * afs_do_probe_vlserver(), in which case the remaining servers in the list
+ * are not visited.
+ */
+int afs_send_vl_probes(struct afs_net *net, struct key *key,
+ struct afs_vlserver_list *vllist)
+{
+ struct afs_vlserver *server;
+ int i, ret;
+
+ for (i = 0; i < vllist->nr_servers; i++) {
+ server = vllist->servers[i].server;
+ if (test_bit(AFS_VLSERVER_FL_PROBED, &server->flags))
+ continue;
+
+ if (!test_and_set_bit_lock(AFS_VLSERVER_FL_PROBING, &server->flags)) { /* Released in afs_vl_probe_done(). */
+ ret = afs_do_probe_vlserver(net, server, key, i);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Wait for the first as-yet untried server to respond.
+ *
+ * @untried is a bitmask of indices into @vllist->servers; it is passed by
+ * value, so the bits cleared here do not affect the caller's copy.
+ *
+ * Servers with no probe in progress are dropped from the mask.  If one of
+ * the untried servers has already responded, or nothing is left to wait
+ * for, 0 is returned straight away.  Otherwise the task sleeps
+ * interruptibly on the probe waitqueue of every remaining server until one
+ * of them has responded, none of them is probing any more, or a signal is
+ * pending.
+ *
+ * Afterwards, the responding server with the lowest RTT among those waited
+ * on (if any) is recorded in vllist->preferred.
+ *
+ * Returns 0 normally, -ENOMEM if the wait entries cannot be allocated, or
+ * -ERESTARTSYS if a signal is pending and no waited-on server responded.
+ */
+int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist,
+ unsigned long untried)
+{
+ struct wait_queue_entry *waits;
+ struct afs_vlserver *server;
+ unsigned int rtt = UINT_MAX;
+ bool have_responders = false;
+ int pref = -1, i;
+
+ _enter("%u,%lx", vllist->nr_servers, untried);
+
+ /* Only wait for servers that have a probe outstanding. */
+ for (i = 0; i < vllist->nr_servers; i++) {
+ if (test_bit(i, &untried)) {
+ server = vllist->servers[i].server;
+ if (!test_bit(AFS_VLSERVER_FL_PROBING, &server->flags))
+ __clear_bit(i, &untried);
+ if (server->probe.responded)
+ have_responders = true;
+ }
+ }
+ if (have_responders || !untried)
+ return 0;
+
+ waits = kmalloc(array_size(vllist->nr_servers, sizeof(*waits)), GFP_KERNEL); /* Indexed by server; only the untried slots get used. */
+ if (!waits)
+ return -ENOMEM;
+
+ for (i = 0; i < vllist->nr_servers; i++) {
+ if (test_bit(i, &untried)) {
+ server = vllist->servers[i].server;
+ init_waitqueue_entry(&waits[i], current);
+ add_wait_queue(&server->probe_wq, &waits[i]);
+ }
+ }
+
+ for (;;) {
+ bool still_probing = false;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ for (i = 0; i < vllist->nr_servers; i++) {
+ if (test_bit(i, &untried)) {
+ server = vllist->servers[i].server;
+ if (server->probe.responded)
+ goto stop;
+ if (test_bit(AFS_VLSERVER_FL_PROBING, &server->flags))
+ still_probing = true;
+ }
+ }
+
+ if (!still_probing || unlikely(signal_pending(current)))
+ goto stop;
+ schedule();
+ }
+
+stop:
+ set_current_state(TASK_RUNNING);
+
+ for (i = 0; i < vllist->nr_servers; i++) {
+ if (test_bit(i, &untried)) {
+ server = vllist->servers[i].server;
+ if (server->probe.responded &&
+ server->probe.rtt < rtt) {
+ pref = i;
+ rtt = server->probe.rtt;
+ }
+
+ remove_wait_queue(&server->probe_wq, &waits[i]);
+ }
+ }
+
+ kfree(waits);
+
+ if (pref == -1 && signal_pending(current))
+ return -ERESTARTSYS;
+
+ if (pref >= 0)
+ vllist->preferred = pref;
+
+ _leave(" = 0 [%u]", pref); /* NOTE(review): pref is a signed int and may still be -1 here. */
+ return 0;
+}
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
new file mode 100644
index 000000000000..b64a284b99d2
--- /dev/null
+++ b/fs/afs/vl_rotate.c
@@ -0,0 +1,355 @@
+/* Handle vlserver selection and rotation.
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include "internal.h"
+#include "afs_vl.h"
+
+/*
+ * Begin an operation on a volume location server.
+ *
+ * The cursor @vc is zeroed and then primed with @cell and @key.  vc->error
+ * starts out as -EDESTADDRREQ and vc->ac.error as SHRT_MAX, the value that
+ * afs_select_vlserver() treats as "no previous operation to evaluate".
+ *
+ * Returns true if the caller may go on to afs_select_vlserver().  Returns
+ * false if a signal is already pending, in which case vc->error is set to
+ * -EINTR and the cursor is flagged AFS_VL_CURSOR_STOP.
+ */
+bool afs_begin_vlserver_operation(struct afs_vl_cursor *vc, struct afs_cell *cell,
+ struct key *key)
+{
+ memset(vc, 0, sizeof(*vc));
+ vc->cell = cell;
+ vc->key = key;
+ vc->error = -EDESTADDRREQ;
+ vc->ac.error = SHRT_MAX;
+
+ if (signal_pending(current)) {
+ vc->error = -EINTR;
+ vc->flags |= AFS_VL_CURSOR_STOP;
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Begin iteration through the cell's vlserver list.
+ *
+ * Waits (interruptibly) for the cell's AFS_CELL_FL_NO_LOOKUP_YET bit, then
+ * takes a reference on the cell's current server list under
+ * cell->vl_servers_lock and stores it in vc->server_list.  Every server in
+ * the list is marked untried and vc->index is reset to -1; the choice of
+ * which server to use is made afterwards in afs_select_vlserver().
+ *
+ * Returns true on success.  Returns false with vc->error set to
+ * -ERESTARTSYS if the wait is interrupted, or false with vc->error left
+ * untouched if there is no server list or the list is empty.
+ *
+ * NOTE(review): an earlier version of this comment said iteration starts
+ * with the last used or last recorded good server; nothing in this function
+ * does that.
+ * NOTE(review): the untried mask is built with a shift by nr_servers, which
+ * assumes nr_servers is less than the number of bits in an unsigned long -
+ * confirm the list size is bounded accordingly.
+ */
+static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
+{
+ struct afs_cell *cell = vc->cell;
+
+ if (wait_on_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET,
+ TASK_INTERRUPTIBLE)) {
+ vc->error = -ERESTARTSYS;
+ return false;
+ }
+
+ read_lock(&cell->vl_servers_lock);
+ vc->server_list = afs_get_vlserverlist(
+ rcu_dereference_protected(cell->vl_servers,
+ lockdep_is_held(&cell->vl_servers_lock)));
+ read_unlock(&cell->vl_servers_lock);
+ if (!vc->server_list || !vc->server_list->nr_servers)
+ return false;
+
+ vc->untried = (1UL << vc->server_list->nr_servers) - 1; /* One bit per server, all set. */
+ vc->index = -1;
+ return true;
+}
+
+/*
+ * Select the vlserver to use. May be called multiple times to rotate
+ * through the vlservers.
+ *
+ * Returns true when vc->server and vc->ac have been set up with a server
+ * and an address for the caller to try.  Returns false when the caller
+ * should stop; vc->error then holds the outcome and AFS_VL_CURSOR_STOP is
+ * set in most cases so that further calls return false immediately.
+ *
+ * Each call first evaluates vc->ac.error from the previous attempt:
+ *  - SHRT_MAX: nothing attempted yet, so start iterating;
+ *  - 0 or any error not listed below: stop and report it;
+ *  - -ECONNABORTED: for the AFSVL_IO, AFSVL_BADVOLOPER and AFSVL_NOMEM abort
+ *    codes move to the next server, otherwise translate the abort code and
+ *    fail;
+ *  - -ENETUNREACH, -EHOSTUNREACH, -ECONNREFUSED, -ETIMEDOUT, -ETIME: try
+ *    the next address of the same server;
+ *  - -ECONNRESET: flag a retry and move to the next server.
+ *
+ * Picking a server means waiting for probes, then using the list's
+ * preferred server if it is still untried, or else the untried responder
+ * with the lowest probe RTT.  When no candidate is left, the whole list is
+ * retried once if a retry was flagged; otherwise the servers' probe errors
+ * are combined into a single error to report.
+ */
+bool afs_select_vlserver(struct afs_vl_cursor *vc)
+{
+ struct afs_addr_list *alist;
+ struct afs_vlserver *vlserver;
+ u32 rtt;
+ int error = vc->ac.error, abort_code, i;
+
+ _enter("%lx[%d],%lx[%d],%d,%d",
+ vc->untried, vc->index,
+ vc->ac.tried, vc->ac.index,
+ error, vc->ac.abort_code);
+
+ if (vc->flags & AFS_VL_CURSOR_STOP) {
+ _leave(" = f [stopped]");
+ return false;
+ }
+
+ vc->nr_iterations++;
+
+ /* Evaluate the result of the previous operation, if there was one. */
+ switch (error) {
+ case SHRT_MAX: /* Initial value set by afs_begin_vlserver_operation(). */
+ goto start;
+
+ default:
+ case 0:
+ /* Success or local failure. Stop. */
+ vc->error = error;
+ vc->flags |= AFS_VL_CURSOR_STOP;
+ _leave(" = f [okay/local %d]", vc->ac.error);
+ return false;
+
+ case -ECONNABORTED:
+ /* The far side rejected the operation on some grounds. This
+ * might involve the server being busy or the volume having been moved.
+ */
+ switch (vc->ac.abort_code) {
+ case AFSVL_IO:
+ case AFSVL_BADVOLOPER:
+ case AFSVL_NOMEM:
+ /* The server went weird. */
+ vc->error = -EREMOTEIO;
+ //write_lock(&vc->cell->vl_servers_lock);
+ //vc->server_list->weird_mask |= 1 << vc->index;
+ //write_unlock(&vc->cell->vl_servers_lock);
+ goto next_server;
+
+ default:
+ vc->error = afs_abort_to_error(vc->ac.abort_code);
+ goto failed;
+ }
+
+ case -ENETUNREACH:
+ case -EHOSTUNREACH:
+ case -ECONNREFUSED:
+ case -ETIMEDOUT:
+ case -ETIME:
+ _debug("no conn %d", error);
+ vc->error = error;
+ goto iterate_address;
+
+ case -ECONNRESET:
+ _debug("call reset");
+ vc->error = error;
+ vc->flags |= AFS_VL_CURSOR_RETRY;
+ goto next_server;
+ }
+
+restart_from_beginning: /* Only reached by goto from no_more_servers; every switch case above jumps or returns. */
+ _debug("restart");
+ afs_end_cursor(&vc->ac);
+ afs_put_vlserverlist(vc->cell->net, vc->server_list);
+ vc->server_list = NULL;
+ if (vc->flags & AFS_VL_CURSOR_RETRIED) /* Restart at most once. */
+ goto failed;
+ vc->flags |= AFS_VL_CURSOR_RETRIED;
+start:
+ _debug("start");
+
+ if (!afs_start_vl_iteration(vc))
+ goto failed;
+
+ error = afs_send_vl_probes(vc->cell->net, vc->key, vc->server_list);
+ if (error < 0)
+ goto failed_set_error;
+
+pick_server:
+ _debug("pick [%lx]", vc->untried);
+
+ error = afs_wait_for_vl_probes(vc->server_list, vc->untried);
+ if (error < 0)
+ goto failed_set_error;
+
+ /* Pick the untried server with the lowest RTT. */
+ vc->index = vc->server_list->preferred;
+ if (test_bit(vc->index, &vc->untried))
+ goto selected_server;
+
+ vc->index = -1;
+ rtt = U32_MAX;
+ for (i = 0; i < vc->server_list->nr_servers; i++) {
+ struct afs_vlserver *s = vc->server_list->servers[i].server;
+
+ if (!test_bit(i, &vc->untried) || !s->probe.responded)
+ continue;
+ if (s->probe.rtt < rtt) {
+ vc->index = i;
+ rtt = s->probe.rtt;
+ }
+ }
+
+ if (vc->index == -1)
+ goto no_more_servers;
+
+selected_server:
+ _debug("use %d", vc->index);
+ __clear_bit(vc->index, &vc->untried);
+
+ /* We're starting on a different vlserver from the list. We need to
+ * check it, find its address list and probe its capabilities before we
+ * use it.
+ */
+ ASSERTCMP(vc->ac.alist, ==, NULL);
+ vlserver = vc->server_list->servers[vc->index].server;
+ vc->server = vlserver;
+
+ _debug("USING VLSERVER: %s", vlserver->name);
+
+ read_lock(&vlserver->lock);
+ alist = rcu_dereference_protected(vlserver->addresses,
+ lockdep_is_held(&vlserver->lock));
+ afs_get_addrlist(alist);
+ read_unlock(&vlserver->lock);
+
+ memset(&vc->ac, 0, sizeof(vc->ac));
+
+ if (!vc->ac.alist)
+ vc->ac.alist = alist;
+ else /* NOTE(review): looks unreachable - vc->ac was zeroed by the memset just above. */
+ afs_put_addrlist(alist);
+
+ vc->ac.index = -1;
+
+iterate_address:
+ ASSERT(vc->ac.alist);
+ /* Iterate over the current server's address list to try and find an
+ * address on which it will respond to us.
+ */
+ if (!afs_iterate_addresses(&vc->ac))
+ goto next_server;
+
+ _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs);
+
+ _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].transport);
+ return true;
+
+next_server:
+ _debug("next");
+ afs_end_cursor(&vc->ac);
+ goto pick_server;
+
+no_more_servers:
+ /* That's all the servers poked to no good effect. Try again if some
+ * of them were busy.
+ */
+ if (vc->flags & AFS_VL_CURSOR_RETRY)
+ goto restart_from_beginning;
+
+ abort_code = 0;
+ error = -EDESTADDRREQ;
+ for (i = 0; i < vc->server_list->nr_servers; i++) {
+ struct afs_vlserver *s = vc->server_list->servers[i].server;
+ int probe_error = READ_ONCE(s->probe.error);
+
+ switch (probe_error) { /* Deliberate fall-through: the cases run from least to
+   * most significant.  Entering at the case for probe_error, the error
+   * gathered so far is kept if it matches any later case; otherwise it
+   * is replaced by probe_error at the bottom.
+   */
+ case 0:
+ continue;
+ default:
+ if (error == -ETIMEDOUT ||
+ error == -ETIME)
+ continue;
+ case -ETIMEDOUT:
+ case -ETIME:
+ if (error == -ENOMEM ||
+ error == -ENONET)
+ continue;
+ case -ENOMEM:
+ case -ENONET:
+ if (error == -ENETUNREACH)
+ continue;
+ case -ENETUNREACH:
+ if (error == -EHOSTUNREACH)
+ continue;
+ case -EHOSTUNREACH:
+ if (error == -ECONNREFUSED)
+ continue;
+ case -ECONNREFUSED:
+ if (error == -ECONNRESET)
+ continue;
+ case -ECONNRESET: /* Responded, but call expired. */
+ if (error == -ECONNABORTED)
+ continue;
+ case -ECONNABORTED:
+ abort_code = s->probe.abort_code;
+ error = probe_error;
+ continue;
+ }
+ }
+
+ if (error == -ECONNABORTED)
+ error = afs_abort_to_error(abort_code);
+
+failed_set_error:
+ vc->error = error;
+failed:
+ vc->flags |= AFS_VL_CURSOR_STOP;
+ afs_end_cursor(&vc->ac);
+ _leave(" = f [failed %d]", vc->error);
+ return false;
+}
+
+/*
+ * Dump cursor state in the case of the error being EDESTADDRREQ.
+ *
+ * Despite the name, afs_end_vlserver_operation() also calls this for
+ * -ENETUNREACH and -EHOSTUNREACH.
+ *
+ * Nothing is printed unless CONFIG_AFS_DEBUG_CURSOR is enabled, and the
+ * function-local static counter stops the output once it exceeds 3.  The
+ * dump covers the vl cursor, each server in its server list together with
+ * that server's address list, and finally the address cursor.
+ *
+ * NOTE(review): the counter is a plain static int updated without any
+ * locking visible here - presumably acceptable for debug output; confirm.
+ */
+static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
+{
+ static int count;
+ int i;
+
+ if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
+ return;
+ count++;
+
+ rcu_read_lock(); /* Held across the rcu_dereference() of each server's address list below. */
+ pr_notice("EDESTADDR occurred\n");
+ pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n",
+ vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error);
+
+ if (vc->server_list) {
+ const struct afs_vlserver_list *sl = vc->server_list;
+ pr_notice("VC: SL nr=%u ix=%u\n",
+ sl->nr_servers, sl->index);
+ for (i = 0; i < sl->nr_servers; i++) {
+ const struct afs_vlserver *s = sl->servers[i].server;
+ pr_notice("VC: server %s+%hu fl=%lx E=%hd\n",
+ s->name, s->port, s->flags, s->probe.error);
+ if (s->addresses) {
+ const struct afs_addr_list *a =
+ rcu_dereference(s->addresses);
+ pr_notice("VC: - nr=%u/%u/%u pf=%u\n",
+ a->nr_ipv4, a->nr_addrs, a->max_addrs,
+ a->preferred);
+ pr_notice("VC: - pr=%lx R=%lx F=%lx\n",
+ a->probed, a->responded, a->failed);
+ if (a == vc->ac.alist) /* Mark the list the address cursor is currently using. */
+ pr_notice("VC: - current\n");
+ }
+ }
+ }
+
+ pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
+ vc->ac.tried, vc->ac.index, vc->ac.abort_code, vc->ac.error,
+ vc->ac.responded, vc->ac.nr_iterations);
+ rcu_read_unlock();
+}
+
+/*
+ * Tidy up a volume location server cursor.
+ *
+ * Dumps the cursor state (subject to the limits in
+ * afs_vl_dump_edestaddrreq()) if the operation ended with -EDESTADDRREQ,
+ * -ENETUNREACH or -EHOSTUNREACH, then ends the address cursor and drops the
+ * cursor's reference on the server list.
+ *
+ * Returns vc->error; an -ECONNABORTED result is first translated from the
+ * recorded abort code with afs_abort_to_error().
+ */
+int afs_end_vlserver_operation(struct afs_vl_cursor *vc)
+{
+ struct afs_net *net = vc->cell->net;
+
+ if (vc->error == -EDESTADDRREQ ||
+ vc->error == -ENETUNREACH ||
+ vc->error == -EHOSTUNREACH)
+ afs_vl_dump_edestaddrreq(vc);
+
+ afs_end_cursor(&vc->ac);
+ afs_put_vlserverlist(net, vc->server_list);
+
+ if (vc->error == -ECONNABORTED)
+ vc->error = afs_abort_to_error(vc->ac.abort_code);
+
+ return vc->error;
+}
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index c3b740813fc7..c3d9e5a5f67e 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -128,14 +128,13 @@ static const struct afs_call_type afs_RXVLGetEntryByNameU = {
* Dispatch a get volume entry by name or ID operation (uuid variant). If the
* volname is a decimal number then it's a volume ID not a volume name.
*/
-struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *net,
- struct afs_addr_cursor *ac,
- struct key *key,
+struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc,
const char *volname,
int volnamesz)
{
struct afs_vldb_entry *entry;
struct afs_call *call;
+ struct afs_net *net = vc->cell->net;
size_t reqsz, padsz;
__be32 *bp;
@@ -155,7 +154,7 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *net,
return ERR_PTR(-ENOMEM);
}
- call->key = key;
+ call->key = vc->key;
call->reply[0] = entry;
call->ret_reply0 = true;
@@ -168,7 +167,7 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *net,
memset((void *)bp + volnamesz, 0, padsz);
trace_afs_make_vl_call(call);
- return (struct afs_vldb_entry *)afs_make_call(ac, call, GFP_KERNEL, false);
+ return (struct afs_vldb_entry *)afs_make_call(&vc->ac, call, GFP_KERNEL, false);
}
/*
@@ -187,19 +186,18 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
u32 uniquifier, nentries, count;
int i, ret;
- _enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count);
+ _enter("{%u,%zu/%u}",
+ call->unmarshall, iov_iter_count(call->_iter), call->count);
-again:
switch (call->unmarshall) {
case 0:
- call->offset = 0;
+ afs_extract_to_buf(call,
+ sizeof(struct afs_uuid__xdr) + 3 * sizeof(__be32));
call->unmarshall++;
/* Extract the returned uuid, uniquifier, nentries and blkaddrs size */
case 1:
- ret = afs_extract_data(call, call->buffer,
- sizeof(struct afs_uuid__xdr) + 3 * sizeof(__be32),
- true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
@@ -216,28 +214,28 @@ again:
call->reply[0] = alist;
call->count = count;
call->count2 = nentries;
- call->offset = 0;
call->unmarshall++;
+ more_entries:
+ count = min(call->count, 4U);
+ afs_extract_to_buf(call, count * sizeof(__be32));
+
/* Extract entries */
case 2:
- count = min(call->count, 4U);
- ret = afs_extract_data(call, call->buffer,
- count * sizeof(__be32),
- call->count > 4);
+ ret = afs_extract_data(call, call->count > 4);
if (ret < 0)
return ret;
alist = call->reply[0];
bp = call->buffer;
+ count = min(call->count, 4U);
for (i = 0; i < count; i++)
if (alist->nr_addrs < call->count2)
afs_merge_fs_addr4(alist, *bp++, AFS_FS_PORT);
call->count -= count;
if (call->count > 0)
- goto again;
- call->offset = 0;
+ goto more_entries;
call->unmarshall++;
break;
}
@@ -267,14 +265,13 @@ static const struct afs_call_type afs_RXVLGetAddrsU = {
* Dispatch an operation to get the addresses for a server, where the server is
* nominated by UUID.
*/
-struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net,
- struct afs_addr_cursor *ac,
- struct key *key,
+struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc,
const uuid_t *uuid)
{
struct afs_ListAddrByAttributes__xdr *r;
const struct afs_uuid *u = (const struct afs_uuid *)uuid;
struct afs_call *call;
+ struct afs_net *net = vc->cell->net;
__be32 *bp;
int i;
@@ -286,7 +283,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net,
if (!call)
return ERR_PTR(-ENOMEM);
- call->key = key;
+ call->key = vc->key;
call->reply[0] = NULL;
call->ret_reply0 = true;
@@ -307,7 +304,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net,
r->uuid.node[i] = htonl(u->node[i]);
trace_afs_make_vl_call(call);
- return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false);
+ return (struct afs_addr_list *)afs_make_call(&vc->ac, call, GFP_KERNEL, false);
}
/*
@@ -318,54 +315,51 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call)
u32 count;
int ret;
- _enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count);
+ _enter("{%u,%zu/%u}",
+ call->unmarshall, iov_iter_count(call->_iter), call->count);
-again:
switch (call->unmarshall) {
case 0:
- call->offset = 0;
+ afs_extract_to_tmp(call);
call->unmarshall++;
/* Extract the capabilities word count */
case 1:
- ret = afs_extract_data(call, &call->tmp,
- 1 * sizeof(__be32),
- true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
count = ntohl(call->tmp);
-
call->count = count;
call->count2 = count;
- call->offset = 0;
+
call->unmarshall++;
+ afs_extract_discard(call, count * sizeof(__be32));
/* Extract capabilities words */
case 2:
- count = min(call->count, 16U);
- ret = afs_extract_data(call, call->buffer,
- count * sizeof(__be32),
- call->count > 16);
+ ret = afs_extract_data(call, false);
if (ret < 0)
return ret;
/* TODO: Examine capabilities */
- call->count -= count;
- if (call->count > 0)
- goto again;
- call->offset = 0;
call->unmarshall++;
break;
}
- call->reply[0] = (void *)(unsigned long)call->service_id;
-
_leave(" = 0 [done]");
return 0;
}
+static void afs_destroy_vl_get_capabilities(struct afs_call *call)
+{ /* Destructor for VL.GetCapabilities calls: drop the vlserver reference
+   * held in call->reply[0], then do the normal flat-call teardown.
+   */
+ struct afs_vlserver *server = call->reply[0]; /* Stored via afs_get_vlserver() in afs_vl_get_capabilities(). */
+
+ afs_put_vlserver(call->net, server);
+ afs_flat_call_destructor(call);
+}
+
/*
* VL.GetCapabilities operation type
*/
@@ -373,11 +367,12 @@ static const struct afs_call_type afs_RXVLGetCapabilities = {
.name = "VL.GetCapabilities",
.op = afs_VL_GetCapabilities,
.deliver = afs_deliver_vl_get_capabilities,
- .destructor = afs_flat_call_destructor,
+ .done = afs_vlserver_probe_result,
+ .destructor = afs_destroy_vl_get_capabilities,
};
/*
- * Probe a fileserver for the capabilities that it supports. This can
+ * Probe a volume server for the capabilities that it supports. This can
* return up to 196 words.
*
* We use this to probe for service upgrade to determine what the server at the
@@ -385,7 +380,10 @@ static const struct afs_call_type afs_RXVLGetCapabilities = {
*/
int afs_vl_get_capabilities(struct afs_net *net,
struct afs_addr_cursor *ac,
- struct key *key)
+ struct key *key,
+ struct afs_vlserver *server,
+ unsigned int server_index,
+ bool async)
{
struct afs_call *call;
__be32 *bp;
@@ -397,9 +395,10 @@ int afs_vl_get_capabilities(struct afs_net *net,
return -ENOMEM;
call->key = key;
- call->upgrade = true; /* Let's see if this is a YFS server */
- call->reply[0] = (void *)VLGETCAPABILITIES;
- call->ret_reply0 = true;
+ call->reply[0] = afs_get_vlserver(server);
+ call->reply[1] = (void *)(long)server_index;
+ call->upgrade = true;
+ call->want_reply_time = true;
/* marshall the parameters */
bp = call->request;
@@ -407,7 +406,7 @@ int afs_vl_get_capabilities(struct afs_net *net,
/* Can't take a ref on server */
trace_afs_make_vl_call(call);
- return afs_make_call(ac, call, GFP_KERNEL, false);
+ return afs_make_call(ac, call, GFP_KERNEL, async);
}
/*
@@ -426,22 +425,19 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
u32 uniquifier, size;
int ret;
- _enter("{%u,%zu/%u,%u}", call->unmarshall, call->offset, call->count, call->count2);
+ _enter("{%u,%zu,%u}",
+ call->unmarshall, iov_iter_count(call->_iter), call->count2);
-again:
switch (call->unmarshall) {
case 0:
- call->offset = 0;
+ afs_extract_to_buf(call, sizeof(uuid_t) + 3 * sizeof(__be32));
call->unmarshall = 1;
/* Extract the returned uuid, uniquifier, fsEndpoints count and
* either the first fsEndpoint type or the volEndpoints
* count if there are no fsEndpoints. */
case 1:
- ret = afs_extract_data(call, call->buffer,
- sizeof(uuid_t) +
- 3 * sizeof(__be32),
- true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
@@ -451,22 +447,19 @@ again:
call->count2 = ntohl(*bp); /* Type or next count */
if (call->count > YFS_MAXENDPOINTS)
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_fsendpt_num);
alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT);
if (!alist)
return -ENOMEM;
alist->version = uniquifier;
call->reply[0] = alist;
- call->offset = 0;
if (call->count == 0)
goto extract_volendpoints;
- call->unmarshall = 2;
-
- /* Extract fsEndpoints[] entries */
- case 2:
+ next_fsendpoint:
switch (call->count2) {
case YFS_ENDPOINT_IPV4:
size = sizeof(__be32) * (1 + 1 + 1);
@@ -475,11 +468,17 @@ again:
size = sizeof(__be32) * (1 + 4 + 1);
break;
default:
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_fsendpt_type);
}
size += sizeof(__be32);
- ret = afs_extract_data(call, call->buffer, size, true);
+ afs_extract_to_buf(call, size);
+ call->unmarshall = 2;
+
+ /* Extract fsEndpoints[] entries */
+ case 2:
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
@@ -488,18 +487,21 @@ again:
switch (call->count2) {
case YFS_ENDPOINT_IPV4:
if (ntohl(bp[0]) != sizeof(__be32) * 2)
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_fsendpt4_len);
afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2]));
bp += 3;
break;
case YFS_ENDPOINT_IPV6:
if (ntohl(bp[0]) != sizeof(__be32) * 5)
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_fsendpt6_len);
afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5]));
bp += 6;
break;
default:
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_fsendpt_type);
}
/* Got either the type of the next entry or the count of
@@ -507,10 +509,9 @@ again:
*/
call->count2 = ntohl(*bp++);
- call->offset = 0;
call->count--;
if (call->count > 0)
- goto again;
+ goto next_fsendpoint;
extract_volendpoints:
/* Extract the list of volEndpoints. */
@@ -518,8 +519,10 @@ again:
if (!call->count)
goto end;
if (call->count > YFS_MAXENDPOINTS)
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_vlendpt_type);
+ afs_extract_to_buf(call, 1 * sizeof(__be32));
call->unmarshall = 3;
/* Extract the type of volEndpoints[0]. Normally we would
@@ -527,17 +530,14 @@ again:
* data of the current one, but this is the first...
*/
case 3:
- ret = afs_extract_data(call, call->buffer, sizeof(__be32), true);
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
bp = call->buffer;
- call->count2 = ntohl(*bp++);
- call->offset = 0;
- call->unmarshall = 4;
- /* Extract volEndpoints[] entries */
- case 4:
+ next_volendpoint:
+ call->count2 = ntohl(*bp++);
switch (call->count2) {
case YFS_ENDPOINT_IPV4:
size = sizeof(__be32) * (1 + 1 + 1);
@@ -546,12 +546,18 @@ again:
size = sizeof(__be32) * (1 + 4 + 1);
break;
default:
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_vlendpt_type);
}
if (call->count > 1)
- size += sizeof(__be32);
- ret = afs_extract_data(call, call->buffer, size, true);
+ size += sizeof(__be32); /* Get next type too */
+ afs_extract_to_buf(call, size);
+ call->unmarshall = 4;
+
+ /* Extract volEndpoints[] entries */
+ case 4:
+ ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
@@ -559,34 +565,35 @@ again:
switch (call->count2) {
case YFS_ENDPOINT_IPV4:
if (ntohl(bp[0]) != sizeof(__be32) * 2)
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_vlendpt4_len);
bp += 3;
break;
case YFS_ENDPOINT_IPV6:
if (ntohl(bp[0]) != sizeof(__be32) * 5)
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_vlendpt6_len);
bp += 6;
break;
default:
- return afs_protocol_error(call, -EBADMSG);
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_yvl_vlendpt_type);
}
/* Got either the type of the next entry or the count of
* volEndpoints if no more fsEndpoints.
*/
- call->offset = 0;
call->count--;
- if (call->count > 0) {
- call->count2 = ntohl(*bp++);
- goto again;
- }
+ if (call->count > 0)
+ goto next_volendpoint;
end:
+ afs_extract_discard(call, 0);
call->unmarshall = 5;
/* Done */
case 5:
- ret = afs_extract_data(call, call->buffer, 0, false);
+ ret = afs_extract_data(call, false);
if (ret < 0)
return ret;
call->unmarshall = 6;
@@ -596,11 +603,6 @@ again:
}
alist = call->reply[0];
-
- /* Start with IPv6 if available. */
- if (alist->nr_ipv4 < alist->nr_addrs)
- alist->index = alist->nr_ipv4;
-
_leave(" = 0 [done]");
return 0;
}
@@ -619,12 +621,11 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = {
* Dispatch an operation to get the addresses for a server, where the server is
* nominated by UUID.
*/
-struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *net,
- struct afs_addr_cursor *ac,
- struct key *key,
+struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc,
const uuid_t *uuid)
{
struct afs_call *call;
+ struct afs_net *net = vc->cell->net;
__be32 *bp;
_enter("");
@@ -635,7 +636,7 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *net,
if (!call)
return ERR_PTR(-ENOMEM);
- call->key = key;
+ call->key = vc->key;
call->reply[0] = NULL;
call->ret_reply0 = true;
@@ -646,5 +647,5 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *net,
memcpy(bp, uuid, sizeof(*uuid)); /* Type opr_uuid */
trace_afs_make_vl_call(call);
- return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false);
+ return (struct afs_addr_list *)afs_make_call(&vc->ac, call, GFP_KERNEL, false);
}
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 3037bd01f617..00975ed3640f 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -74,55 +74,19 @@ static struct afs_vldb_entry *afs_vl_lookup_vldb(struct afs_cell *cell,
const char *volname,
size_t volnamesz)
{
- struct afs_addr_cursor ac;
- struct afs_vldb_entry *vldb;
+ struct afs_vldb_entry *vldb = ERR_PTR(-EDESTADDRREQ);
+ struct afs_vl_cursor vc;
int ret;
- ret = afs_set_vl_cursor(&ac, cell);
- if (ret < 0)
- return ERR_PTR(ret);
-
- while (afs_iterate_addresses(&ac)) {
- if (!test_bit(ac.index, &ac.alist->probed)) {
- ret = afs_vl_get_capabilities(cell->net, &ac, key);
- switch (ret) {
- case VL_SERVICE:
- clear_bit(ac.index, &ac.alist->yfs);
- set_bit(ac.index, &ac.alist->probed);
- ac.addr->srx_service = ret;
- break;
- case YFS_VL_SERVICE:
- set_bit(ac.index, &ac.alist->yfs);
- set_bit(ac.index, &ac.alist->probed);
- ac.addr->srx_service = ret;
- break;
- }
- }
-
- vldb = afs_vl_get_entry_by_name_u(cell->net, &ac, key,
- volname, volnamesz);
- switch (ac.error) {
- case 0:
- afs_end_cursor(&ac);
- return vldb;
- case -ECONNABORTED:
- ac.error = afs_abort_to_error(ac.abort_code);
- goto error;
- case -ENOMEM:
- case -ENONET:
- goto error;
- case -ENETUNREACH:
- case -EHOSTUNREACH:
- case -ECONNREFUSED:
- break;
- default:
- ac.error = -EIO;
- goto error;
- }
+ if (!afs_begin_vlserver_operation(&vc, cell, key))
+ return ERR_PTR(-ERESTARTSYS);
+
+ while (afs_select_vlserver(&vc)) {
+ vldb = afs_vl_get_entry_by_name_u(&vc, volname, volnamesz);
}
-error:
- return ERR_PTR(afs_end_cursor(&ac));
+ ret = afs_end_vlserver_operation(&vc);
+ return ret < 0 ? ERR_PTR(ret) : vldb;
}
/*
@@ -270,7 +234,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
/* We look up an ID by passing it as a decimal string in the
* operation's name parameter.
*/
- idsz = sprintf(idbuf, "%u", volume->vid);
+ idsz = sprintf(idbuf, "%llu", volume->vid);
vldb = afs_vl_lookup_vldb(volume->cell, key, idbuf, idsz);
if (IS_ERR(vldb)) {
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 19c04caf3c01..72efcfcf9f95 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -33,10 +33,21 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
loff_t pos, unsigned int len, struct page *page)
{
struct afs_read *req;
+ size_t p;
+ void *data;
int ret;
_enter(",,%llu", (unsigned long long)pos);
+ if (pos >= vnode->vfs_inode.i_size) {
+ p = pos & ~PAGE_MASK;
+ ASSERTCMP(p + len, <=, PAGE_SIZE);
+ data = kmap(page);
+ memset(data + p, 0, len);
+ kunmap(page);
+ return 0;
+ }
+
req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *),
GFP_KERNEL);
if (!req)
@@ -81,7 +92,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index = pos >> PAGE_SHIFT;
int ret;
- _enter("{%x:%u},{%lx},%u,%u",
+ _enter("{%llx:%llu},{%lx},%u,%u",
vnode->fid.vid, vnode->fid.vnode, index, from, to);
/* We want to store information about how much of a page is altered in
@@ -181,7 +192,7 @@ int afs_write_end(struct file *file, struct address_space *mapping,
loff_t i_size, maybe_i_size;
int ret;
- _enter("{%x:%u},{%lx}",
+ _enter("{%llx:%llu},{%lx}",
vnode->fid.vid, vnode->fid.vnode, page->index);
maybe_i_size = pos + copied;
@@ -230,7 +241,7 @@ static void afs_kill_pages(struct address_space *mapping,
struct pagevec pv;
unsigned count, loop;
- _enter("{%x:%u},%lx-%lx",
+ _enter("{%llx:%llu},%lx-%lx",
vnode->fid.vid, vnode->fid.vnode, first, last);
pagevec_init(&pv);
@@ -272,7 +283,7 @@ static void afs_redirty_pages(struct writeback_control *wbc,
struct pagevec pv;
unsigned count, loop;
- _enter("{%x:%u},%lx-%lx",
+ _enter("{%llx:%llu},%lx-%lx",
vnode->fid.vid, vnode->fid.vnode, first, last);
pagevec_init(&pv);
@@ -314,7 +325,7 @@ static int afs_store_data(struct address_space *mapping,
struct list_head *p;
int ret = -ENOKEY, ret2;
- _enter("%s{%x:%u.%u},%lx,%lx,%x,%x",
+ _enter("%s{%llx:%llu.%u},%lx,%lx,%x,%x",
vnode->volume->name,
vnode->fid.vid,
vnode->fid.vnode,
@@ -533,6 +544,7 @@ no_more:
case -ENOENT:
case -ENOMEDIUM:
case -ENXIO:
+ trace_afs_file_error(vnode, ret, afs_file_error_writeback_fail);
afs_kill_pages(mapping, first, last);
mapping_set_error(mapping, ret);
break;
@@ -675,7 +687,7 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
unsigned count, loop;
pgoff_t first = call->first, last = call->last;
- _enter("{%x:%u},{%lx-%lx}",
+ _enter("{%llx:%llu},{%lx-%lx}",
vnode->fid.vid, vnode->fid.vnode, first, last);
pagevec_init(&pv);
@@ -714,7 +726,7 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
ssize_t result;
size_t count = iov_iter_count(from);
- _enter("{%x.%u},{%zu},",
+ _enter("{%llx:%llu},{%zu},",
vnode->fid.vid, vnode->fid.vnode, count);
if (IS_SWAPFILE(&vnode->vfs_inode)) {
@@ -742,7 +754,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
struct inode *inode = file_inode(file);
struct afs_vnode *vnode = AFS_FS_I(inode);
- _enter("{%x:%u},{n=%pD},%d",
+ _enter("{%llx:%llu},{n=%pD},%d",
vnode->fid.vid, vnode->fid.vnode, file,
datasync);
@@ -760,7 +772,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
struct afs_vnode *vnode = AFS_FS_I(inode);
unsigned long priv;
- _enter("{{%x:%u}},{%lx}",
+ _enter("{{%llx:%llu}},{%lx}",
vnode->fid.vid, vnode->fid.vnode, vmf->page->index);
sb_start_pagefault(inode->i_sb);
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
index cfcc674e64a5..a2cdf25573e2 100644
--- a/fs/afs/xattr.c
+++ b/fs/afs/xattr.c
@@ -72,7 +72,7 @@ static int afs_xattr_get_fid(const struct xattr_handler *handler,
char text[8 + 1 + 8 + 1 + 8 + 1];
size_t len;
- len = sprintf(text, "%x:%x:%x",
+ len = sprintf(text, "%llx:%llx:%x",
vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
if (size == 0)
return len;
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
new file mode 100644
index 000000000000..12658c1363ae
--- /dev/null
+++ b/fs/afs/yfsclient.c
@@ -0,0 +1,2184 @@
+/* YFS File Server client stubs
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/circ_buf.h>
+#include <linux/iversion.h>
+#include "internal.h"
+#include "afs_fs.h"
+#include "xdr_fs.h"
+#include "protocol_yfs.h"
+
+/* File-scope fid with no initialiser, so every member starts out as zero. */
+static const struct afs_fid afs_zero_fid;
+
+/*
+ * Record the callback interest to be used for a call: call->cbi is set to
+ * whatever afs_get_cb_interest() hands back for @cbi.
+ */
+static inline void afs_use_fs_server(struct afs_call *call, struct afs_cb_interest *cbi)
+{
+	struct afs_cb_interest *interest = afs_get_cb_interest(cbi);
+
+	call->cbi = interest;
+}
+
+/*
+ * Size of the object that pointer x refers to, measured in 32-bit XDR words.
+ * The argument is parenthesised so that an expression such as "p + 1" is
+ * dereferenced as a whole rather than being parsed as "*p + 1".
+ */
+#define xdr_size(x) (sizeof(*(x)) / sizeof(__be32))
+
+/*
+ * Decode a YFSFid from the XDR stream at *_bp into @fid and step *_bp past
+ * the record.
+ */
+static void xdr_decode_YFSFid(const __be32 **_bp, struct afs_fid *fid)
+{
+	const struct yfs_xdr_YFSFid *wire = (const void *)*_bp;
+
+	*_bp += xdr_size(wire);
+
+	fid->unique	= ntohl(wire->vnode.unique);
+	fid->vnode_hi	= ntohl(wire->vnode.hi);
+	fid->vnode	= xdr_to_u64(wire->vnode.lo);
+	fid->vid	= xdr_to_u64(wire->volume);
+}
+
+/*
+ * Store a 32-bit value at @bp in network byte order and return the address of
+ * the next word.
+ */
+static __be32 *xdr_encode_u32(__be32 *bp, u32 n)
+{
+	bp[0] = htonl(n);
+	return bp + 1;
+}
+
+/*
+ * Store a 64-bit value at @bp in its XDR form (as produced by u64_to_xdr())
+ * and return the address just past it.
+ */
+static __be32 *xdr_encode_u64(__be32 *bp, u64 n)
+{
+	struct yfs_xdr_u64 *slot = (void *)bp;
+	__be32 *next = bp + xdr_size(slot);
+
+	*slot = u64_to_xdr(n);
+	return next;
+}
+
+/*
+ * Encode @fid as a YFSFid at @bp and return the address just past it.
+ */
+static __be32 *xdr_encode_YFSFid(__be32 *bp, struct afs_fid *fid)
+{
+	struct yfs_xdr_YFSFid *wire = (void *)bp;
+	__be32 *next = bp + xdr_size(wire);
+
+	wire->vnode.unique	= htonl(fid->unique);
+	wire->vnode.hi		= htonl(fid->vnode_hi);
+	wire->vnode.lo		= u64_to_xdr(fid->vnode);
+	wire->volume		= u64_to_xdr(fid->vid);
+	return next;
+}
+
+/*
+ * Number of bytes an XDR string of @len characters occupies: one length word
+ * plus the character data rounded up to a whole number of words.
+ */
+static size_t xdr_strlen(unsigned int len)
+{
+	size_t padded = round_up(len, sizeof(__be32));
+
+	return sizeof(__be32) + padded;
+}
+
+/*
+ * Encode a counted string at @bp: a length word, the @len bytes of @p and then
+ * zero padding up to the next 4-byte boundary.  Returns the address just past
+ * the padded string.
+ */
+static __be32 *xdr_encode_string(__be32 *bp, const char *p, unsigned int len)
+{
+	unsigned int tail = len & 3;
+	u8 *payload;
+
+	bp = xdr_encode_u32(bp, len);
+	payload = (u8 *)bp;
+	memcpy(payload, p, len);
+
+	if (tail != 0) {
+		unsigned int pad = 4 - tail;
+
+		/* Zero the slack so the final word is fully defined. */
+		memset(payload + len, 0, pad);
+		len += pad;
+	}
+
+	return bp + len / sizeof(__be32);
+}
+
+/*
+ * Convert a timespec into a count of 100ns intervals.
+ */
+static s64 linux_to_yfs_time(const struct timespec64 *t)
+{
+	/* Seconds scaled to 100ns units; the sub-second part is added below. */
+	u64 whole = (u64)t->tv_sec * 10000000;
+
+	return whole + t->tv_nsec / 100;
+}
+
+/*
+ * Encode a YFSStoreStatus block that sets only the mode bits (masked with
+ * S_IALLUGO); the other fields are written as zero.  Returns the address just
+ * past the block.
+ */
+static __be32 *xdr_encode_YFSStoreStatus_mode(__be32 *bp, mode_t mode)
+{
+	struct yfs_xdr_YFSStoreStatus *store = (void *)bp;
+	__be32 *next = bp + xdr_size(store);
+
+	store->group		= u64_to_xdr(0);
+	store->owner		= u64_to_xdr(0);
+	store->mtime_client	= u64_to_xdr(0);
+	store->mode		= htonl(mode & S_IALLUGO);
+	store->mask		= htonl(AFS_SET_MODE);
+	return next;
+}
+
+/*
+ * Encode a YFSStoreStatus block that sets only the client mtime, converted
+ * from @t to 100ns units; the other fields are written as zero.  Returns the
+ * address just past the block.
+ */
+static __be32 *xdr_encode_YFSStoreStatus_mtime(__be32 *bp, const struct timespec64 *t)
+{
+	struct yfs_xdr_YFSStoreStatus *store = (void *)bp;
+	__be32 *next = bp + xdr_size(store);
+	s64 stamp = linux_to_yfs_time(t);
+
+	store->group		= u64_to_xdr(0);
+	store->owner		= u64_to_xdr(0);
+	store->mtime_client	= u64_to_xdr(stamp);
+	store->mode		= htonl(0);
+	store->mask		= htonl(AFS_SET_MTIME);
+	return next;
+}
+
+/*
+ * Convert a signed 100ns-resolution 64-bit time into a timespec.
+ *
+ * The result is normalised, i.e. tv_nsec is always in the range 0..999999999.
+ * A time before the epoch that has a fractional part is therefore expressed
+ * by rounding tv_sec towards minus infinity and storing the remainder as a
+ * positive tv_nsec (e.g. -1 becomes { -1, 999999900 }).
+ */
+static struct timespec64 yfs_time_to_linux(s64 t)
+{
+	struct timespec64 ts;
+	u64 abs_t;
+	u32 frac;
+
+	/*
+	 * Unfortunately can not use normal 64 bit division on 32 bit arch, but
+	 * the alternative, do_div, does not work with negative numbers so have
+	 * to special case them
+	 */
+	if (t < 0) {
+		/* Negate as unsigned so the most negative value can't overflow. */
+		abs_t = -(u64)t;
+		frac = do_div(abs_t, 10000000);
+		ts.tv_sec = -(time64_t)abs_t;
+		ts.tv_nsec = 0;
+		if (frac) {
+			/* Borrow a second so the nanosecond part stays positive. */
+			ts.tv_sec--;
+			ts.tv_nsec = 1000000000 - (long)frac * 100;
+		}
+	} else {
+		abs_t = t;
+		ts.tv_nsec = (time64_t)do_div(abs_t, 10000000) * 100;
+		ts.tv_sec = abs_t;
+	}
+
+	return ts;
+}
+
+/*
+ * Convert an XDR-encoded 64-bit time, interpreted as a signed count of 100ns
+ * units, into a timespec.
+ */
+static struct timespec64 xdr_to_time(const struct yfs_xdr_u64 xdr)
+{
+	s64 stamp = xdr_to_u64(xdr);
+	struct timespec64 ts = yfs_time_to_linux(stamp);
+
+	return ts;
+}
+
+/*
+ * Compare the amount of request data actually marshalled (the distance from
+ * call->request to @bp) with call->request_size and log any mismatch.
+ */
+static void yfs_check_req(struct afs_call *call, __be32 *bp)
+{
+	size_t used = (void *)bp - call->request;
+
+	if (used == call->request_size)
+		return;
+
+	if (used > call->request_size)
+		pr_err("kAFS: %s: Request buffer overflow (%zu>%u)\n",
+		       call->type->name, used, call->request_size);
+	else
+		pr_warning("kAFS: %s: Request buffer underflow (%zu<%u)\n",
+			   call->type->name, used, call->request_size);
+}
+
+/*
+ * Dump a bad file status record.
+ *
+ * Prints five rows of four 32-bit words starting at @bp (offsets 0x00-0x4f)
+ * followed by one more word, i.e. 84 bytes in total, each converted from
+ * network byte order.
+ *
+ * NOTE(review): whether 84 bytes matches sizeof(struct yfs_xdr_YFSFetchStatus)
+ * cannot be told from here - confirm against the protocol header.
+ */
+static void xdr_dump_bad(const __be32 *bp)
+{
+ __be32 x[4];
+ int i;
+
+ pr_notice("YFS XDR: Bad status record\n");
+ for (i = 0; i < 5 * 4 * 4; i += 16) {
+ /* Copy one 16-byte row into a local array before printing it. */
+ memcpy(x, bp, 16);
+ bp += 4;
+ pr_notice("%03x: %08x %08x %08x %08x\n",
+ i, ntohl(x[0]), ntohl(x[1]), ntohl(x[2]), ntohl(x[3]));
+ }
+
+ /* The single word that follows the five full rows. */
+ memcpy(x, bp, 4);
+ pr_notice("0x50: %08x\n", ntohl(x[0]));
+}
+
+/*
+ * Decode a YFSFetchStatus block
+ *
+ * The record at *_bp is decoded into @status.  If @vnode is given, the inode is
+ * updated from the decoded status as well; if @read_req is given, the data
+ * version and file size are copied into it.
+ *
+ * If the record carries a non-zero abort code, nothing but the abort code is
+ * decoded (a VNOVNODE abort additionally marks @vnode deleted, zeroes the link
+ * count and breaks the callback).
+ *
+ * *_bp is advanced past the record in both of those cases, so that whatever
+ * follows it in the reply is decoded from the right place.  It is left
+ * untouched only when the record is rejected.
+ *
+ * Returns 0 on success or the result of afs_protocol_error() if the file type
+ * is unrecognised or differs from the type already recorded for the vnode.
+ */
+static int xdr_decode_YFSFetchStatus(struct afs_call *call,
+				     const __be32 **_bp,
+				     struct afs_file_status *status,
+				     struct afs_vnode *vnode,
+				     const afs_dataversion_t *expected_version,
+				     struct afs_read *read_req)
+{
+	const struct yfs_xdr_YFSFetchStatus *xdr = (const void *)*_bp;
+	u32 type;
+	u8 flags = 0;
+
+	status->abort_code = ntohl(xdr->abort_code);
+	if (status->abort_code != 0) {
+		if (vnode && status->abort_code == VNOVNODE) {
+			set_bit(AFS_VNODE_DELETED, &vnode->flags);
+			status->nlink = 0;
+			__afs_break_callback(vnode);
+		}
+		/* The aborted record still occupies its slot in the reply, so
+		 * step over it before handing back to the caller.
+		 */
+		*_bp += xdr_size(xdr);
+		return 0;
+	}
+
+	type = ntohl(xdr->type);
+	switch (type) {
+	case AFS_FTYPE_FILE:
+	case AFS_FTYPE_DIR:
+	case AFS_FTYPE_SYMLINK:
+		/* A vnode that has already been set up must not change type. */
+		if (type != status->type &&
+		    vnode &&
+		    !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
+			pr_warning("Vnode %llx:%llx:%x changed type %u to %u\n",
+				   vnode->fid.vid,
+				   vnode->fid.vnode,
+				   vnode->fid.unique,
+				   status->type, type);
+			goto bad;
+		}
+		status->type = type;
+		break;
+	default:
+		goto bad;
+	}
+
+	/* Each EXTRACT_* helper copies one field into *status and notes in
+	 * flags whether metadata (M) or data (D) changed; 4/8 is the field
+	 * width in bytes.
+	 */
+#define EXTRACT_M4(FIELD)					\
+	do {							\
+		u32 x = ntohl(xdr->FIELD);			\
+		if (status->FIELD != x) {			\
+			flags |= AFS_VNODE_META_CHANGED;	\
+			status->FIELD = x;			\
+		}						\
+	} while (0)
+
+#define EXTRACT_M8(FIELD)					\
+	do {							\
+		u64 x = xdr_to_u64(xdr->FIELD);			\
+		if (status->FIELD != x) {			\
+			flags |= AFS_VNODE_META_CHANGED;	\
+			status->FIELD = x;			\
+		}						\
+	} while (0)
+
+#define EXTRACT_D8(FIELD)					\
+	do {							\
+		u64 x = xdr_to_u64(xdr->FIELD);			\
+		if (status->FIELD != x) {			\
+			flags |= AFS_VNODE_DATA_CHANGED;	\
+			status->FIELD = x;			\
+		}						\
+	} while (0)
+
+	EXTRACT_M4(nlink);
+	EXTRACT_D8(size);
+	EXTRACT_D8(data_version);
+	EXTRACT_M8(author);
+	EXTRACT_M8(owner);
+	EXTRACT_M8(group);
+	EXTRACT_M4(mode);
+	EXTRACT_M4(caller_access); /* call ticket dependent */
+	EXTRACT_M4(anon_access);
+
+	status->mtime_client = xdr_to_time(xdr->mtime_client);
+	status->mtime_server = xdr_to_time(xdr->mtime_server);
+	status->lock_count = ntohl(xdr->lock_count);
+
+	if (read_req) {
+		read_req->data_version = status->data_version;
+		read_req->file_size = status->size;
+	}
+
+	*_bp += xdr_size(xdr);
+
+	if (vnode) {
+		if (test_bit(AFS_VNODE_UNSET, &vnode->flags))
+			flags |= AFS_VNODE_NOT_YET_SET;
+		afs_update_inode_from_status(vnode, status, expected_version,
+					     flags);
+	}
+
+	return 0;
+
+bad:
+	xdr_dump_bad(*_bp);
+	return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
+}
+
+/*
+ * Decode a file status record.  When a target vnode is supplied, the decode is
+ * done with the vnode's cb_lock write-locked so that stat() sees the
+ * attributes update atomically; without a vnode no locking is needed.
+ */
+static int yfs_decode_status(struct afs_call *call,
+			     const __be32 **_bp,
+			     struct afs_file_status *status,
+			     struct afs_vnode *vnode,
+			     const afs_dataversion_t *expected_version,
+			     struct afs_read *read_req)
+{
+	int ret;
+
+	if (vnode)
+		write_seqlock(&vnode->cb_lock);
+
+	ret = xdr_decode_YFSFetchStatus(call, _bp, status, vnode,
+					expected_version, read_req);
+
+	if (vnode)
+		write_sequnlock(&vnode->cb_lock);
+
+	return ret;
+}
+
+/*
+ * Decode a YFSCallBack block
+ *
+ * Under the vnode's cb_lock, and only if afs_cb_is_broken() says the callback
+ * has not been broken since call->cb_break was sampled, the callback version,
+ * type and expiry are stored in the vnode and AFS_VNODE_CB_PROMISED is set.
+ * *_bp is always advanced past the block.
+ */
+static void xdr_decode_YFSCallBack(struct afs_call *call,
+ struct afs_vnode *vnode,
+ const __be32 **_bp)
+{
+ struct yfs_xdr_YFSCallBack *xdr = (void *)*_bp;
+ struct afs_cb_interest *old, *cbi = call->cbi;
+ u64 cb_expiry;
+
+ write_seqlock(&vnode->cb_lock);
+
+ if (!afs_cb_is_broken(call->cb_break, vnode, cbi)) {
+ /* The expiry arrives in 100ns units; reduce it to seconds and add the
+ * current real time to get the stored expiry.
+ */
+ cb_expiry = xdr_to_u64(xdr->expiration_time);
+ do_div(cb_expiry, 10 * 1000 * 1000);
+ vnode->cb_version = ntohl(xdr->version);
+ vnode->cb_type = ntohl(xdr->type);
+ vnode->cb_expires_at = cb_expiry + ktime_get_real_seconds();
+ /* If the vnode was using a different interest, install the call's
+ * one and keep the old one in cbi so that it ends up in call->cbi
+ * below.  NOTE(review): presumably so the call's cleanup drops the
+ * displaced reference - confirm against the call destructor.
+ */
+ old = vnode->cb_interest;
+ if (old != call->cbi) {
+ vnode->cb_interest = cbi;
+ cbi = old;
+ }
+ set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ }
+
+ write_sequnlock(&vnode->cb_lock);
+ call->cbi = cbi;
+ *_bp += xdr_size(xdr);
+}
+
+/*
+ * Decode a YFSCallBack block into a caller-supplied afs_callback record
+ * without touching any vnode, then step *_bp past the block.
+ */
+static void xdr_decode_YFSCallBack_raw(const __be32 **_bp,
+				       struct afs_callback *cb)
+{
+	struct yfs_xdr_YFSCallBack *wire = (void *)*_bp;
+	u64 expiry = xdr_to_u64(wire->expiration_time);
+
+	/* Reduce the 100ns-unit expiry to seconds. */
+	do_div(expiry, 10 * 1000 * 1000);
+
+	cb->version	= ntohl(wire->version);
+	cb->type	= ntohl(wire->type);
+	cb->expires_at	= expiry + ktime_get_real_seconds();
+
+	*_bp += xdr_size(wire);
+}
+
+/*
+ * Decode a YFSVolSync block.  The block is always stepped over; the volume
+ * creation date (reduced from 100ns units to seconds) is only recorded if the
+ * caller passed somewhere to put it.
+ */
+static void xdr_decode_YFSVolSync(const __be32 **_bp,
+				  struct afs_volsync *volsync)
+{
+	struct yfs_xdr_YFSVolSync *wire = (void *)*_bp;
+	u64 created;
+
+	*_bp += xdr_size(wire);
+
+	if (!volsync)
+		return;
+
+	created = xdr_to_u64(wire->vol_creation_date);
+	do_div(created, 10 * 1000 * 1000);
+	volsync->creation = created;
+}
+
+/*
+ * Encode the requested attributes into a YFSStoreStatus block.  Only the
+ * attributes flagged in attr->ia_valid (mtime, uid, gid, mode) are filled in
+ * and flagged in the mask; the rest are sent as zero.  Returns the address
+ * just past the block.
+ */
+static __be32 *xdr_encode_YFS_StoreStatus(__be32 *bp, struct iattr *attr)
+{
+	struct yfs_xdr_YFSStoreStatus *store = (void *)bp;
+	const unsigned int valid = attr->ia_valid;
+	s64 mtime = 0, owner = 0, group = 0;
+	u32 mask = 0, mode = 0;
+
+	if (valid & ATTR_MTIME) {
+		mtime = linux_to_yfs_time(&attr->ia_mtime);
+		mask |= AFS_SET_MTIME;
+	}
+	if (valid & ATTR_UID) {
+		owner = from_kuid(&init_user_ns, attr->ia_uid);
+		mask |= AFS_SET_OWNER;
+	}
+	if (valid & ATTR_GID) {
+		group = from_kgid(&init_user_ns, attr->ia_gid);
+		mask |= AFS_SET_GROUP;
+	}
+	if (valid & ATTR_MODE) {
+		mode = attr->ia_mode & S_IALLUGO;
+		mask |= AFS_SET_MODE;
+	}
+
+	store->mask		= htonl(mask);
+	store->mode		= htonl(mode);
+	store->mtime_client	= u64_to_xdr(mtime);
+	store->owner		= u64_to_xdr(owner);
+	store->group		= u64_to_xdr(group);
+	return bp + xdr_size(store);
+}
+
+/*
+ * Decode a YFSFetchVolumeStatus block into @vs and step *_bp past it.  The
+ * four state fields are taken from the corresponding bits of the flags word
+ * and min_quota is always reported as zero.
+ */
+static void xdr_decode_YFSFetchVolumeStatus(const __be32 **_bp,
+					    struct afs_volume_status *vs)
+{
+	const struct yfs_xdr_YFSFetchVolumeStatus *wire = (const void *)*_bp;
+	const u32 flags = ntohl(wire->flags);
+
+	*_bp += xdr_size(wire);
+
+	vs->vid			= xdr_to_u64(wire->vid);
+	vs->parent_id		= xdr_to_u64(wire->parent_id);
+
+	vs->online		= flags & yfs_FVSOnline;
+	vs->in_service		= flags & yfs_FVSInservice;
+	vs->blessed		= flags & yfs_FVSBlessed;
+	vs->needs_salvage	= flags & yfs_FVSNeedsSalvage;
+
+	vs->type		= ntohl(wire->type);
+	vs->min_quota		= 0;
+	vs->max_quota		= xdr_to_u64(wire->max_quota);
+	vs->blocks_in_use	= xdr_to_u64(wire->blocks_in_use);
+	vs->part_blocks_avail	= xdr_to_u64(wire->part_blocks_avail);
+	vs->part_max_blocks	= xdr_to_u64(wire->part_max_blocks);
+	vs->vol_copy_date	= xdr_to_u64(wire->vol_copy_date);
+	vs->vol_backup_date	= xdr_to_u64(wire->vol_backup_date);
+}
+
+/*
+ * Deliver reply data to a YFS.FetchStatus call: once the whole reply has been
+ * transferred, decode the status, callback and volsync records in that order.
+ */
+static int yfs_deliver_fs_fetch_status_vnode(struct afs_call *call)
+{
+	struct afs_vnode *vnode = call->reply[0];
+	const __be32 *cursor;
+	int ret = afs_transfer_reply(call);
+
+	if (ret < 0)
+		return ret;
+
+	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
+
+	/* The reply is complete, so it can be unmarshalled in one go. */
+	cursor = call->buffer;
+	ret = yfs_decode_status(call, &cursor, &vnode->status, vnode,
+				&call->expected_version, NULL);
+	if (ret < 0)
+		return ret;
+
+	xdr_decode_YFSCallBack(call, vnode, &cursor);
+	xdr_decode_YFSVolSync(&cursor, call->reply[1]);
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * YFS.FetchStatus operation type: gives the call its name and opcode and
+ * selects the reply-delivery and destructor handlers.
+ */
+static const struct afs_call_type yfs_RXYFSFetchStatus_vnode = {
+ .name = "YFS.FetchStatus(vnode)",
+ .op = yfs_FS_FetchStatus,
+ .deliver = yfs_deliver_fs_fetch_status_vnode,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Fetch the status information for a file.
+ *
+ * The request carries the opcode, an RPC flags word and the vnode's fid; the
+ * reply buffer is sized for a status, a callback and a volsync record.  On
+ * allocation failure -ENOMEM is both stored in fc->ac.error and returned.
+ */
+int yfs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsync,
+			     bool new_inode)
+{
+	struct afs_vnode *vnode = fc->vnode;
+	struct afs_net *net = afs_v2net(vnode);
+	struct afs_call *call;
+	const size_t reqsz = sizeof(__be32) * 2 +
+			     sizeof(struct yfs_xdr_YFSFid);
+	const size_t rplsz = sizeof(struct yfs_xdr_YFSFetchStatus) +
+			     sizeof(struct yfs_xdr_YFSCallBack) +
+			     sizeof(struct yfs_xdr_YFSVolSync);
+	__be32 *bp;
+
+	_enter(",%x,{%llx:%llu},,",
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
+
+	call = afs_alloc_flat_call(net, &yfs_RXYFSFetchStatus_vnode,
+				   reqsz, rplsz);
+	if (!call) {
+		fc->ac.error = -ENOMEM;
+		return -ENOMEM;
+	}
+
+	call->key = fc->key;
+	call->reply[0] = vnode;
+	call->reply[1] = volsync;
+	/* A brand new inode is expected to be at data version 1. */
+	if (new_inode)
+		call->expected_version = 1;
+	else
+		call->expected_version = vnode->status.data_version;
+
+	/* Marshal the request parameters. */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSFETCHSTATUS);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &vnode->fid);
+	yfs_check_req(call, bp);
+
+	call->cb_break = fc->cb_break;
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to an YFS.FetchData64.
+ *
+ * This is a resumable state machine keyed on call->unmarshall; each case falls
+ * through into the next once its data has been extracted:
+ *   0 - reset the read request and arrange to extract the 64-bit length
+ *   1 - extract the data length and set up the first page
+ *   2 - extract data into the request's pages, one page segment at a time
+ *   3 - discard any data beyond what was asked for
+ *   4 - extract and decode the status, callback and volsync records
+ *   5 - done
+ * A negative result from afs_extract_data() is returned to the caller as is.
+ */
+static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
+{
+ struct afs_vnode *vnode = call->reply[0];
+ struct afs_read *req = call->reply[2];
+ const __be32 *bp;
+ unsigned int size;
+ int ret;
+
+ _enter("{%u,%zu/%llu}",
+ call->unmarshall, iov_iter_count(&call->iter), req->actual_len);
+
+ switch (call->unmarshall) {
+ case 0:
+ req->actual_len = 0;
+ req->index = 0;
+ /* Start at the offset of the read position within its page. */
+ req->offset = req->pos & (PAGE_SIZE - 1);
+ afs_extract_to_tmp64(call);
+ call->unmarshall++;
+
+ /* extract the returned data length */
+ case 1:
+ _debug("extract data length");
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ req->actual_len = be64_to_cpu(call->tmp64);
+ _debug("DATA length: %llu", req->actual_len);
+ /* Never store more than was asked for, whatever the server sends. */
+ req->remain = min(req->len, req->actual_len);
+ if (req->remain == 0)
+ goto no_more_data;
+
+ call->unmarshall++;
+
+ begin_page:
+ /* Point the iterator at the unfilled part of the current page, or at
+ * as much of it as there is data remaining.
+ */
+ ASSERTCMP(req->index, <, req->nr_pages);
+ if (req->remain > PAGE_SIZE - req->offset)
+ size = PAGE_SIZE - req->offset;
+ else
+ size = req->remain;
+ call->bvec[0].bv_len = size;
+ call->bvec[0].bv_offset = req->offset;
+ call->bvec[0].bv_page = req->pages[req->index];
+ iov_iter_bvec(&call->iter, READ, call->bvec, 1, size);
+ ASSERTCMP(size, <=, PAGE_SIZE);
+
+ /* extract the returned data */
+ case 2:
+ _debug("extract data %zu/%llu",
+ iov_iter_count(&call->iter), req->remain);
+
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+ req->remain -= call->bvec[0].bv_len;
+ req->offset += call->bvec[0].bv_len;
+ ASSERTCMP(req->offset, <=, PAGE_SIZE);
+ /* A completely filled page is handed to page_done (if set) and the
+ * next page is started if there is more data to store.
+ */
+ if (req->offset == PAGE_SIZE) {
+ req->offset = 0;
+ if (req->page_done)
+ req->page_done(call, req);
+ req->index++;
+ if (req->remain > 0)
+ goto begin_page;
+ }
+
+ ASSERTCMP(req->remain, ==, 0);
+ if (req->actual_len <= req->len)
+ goto no_more_data;
+
+ /* Discard any excess data the server gave us */
+ iov_iter_discard(&call->iter, READ, req->actual_len - req->len);
+ call->unmarshall = 3;
+ case 3:
+ _debug("extract discard %zu/%llu",
+ iov_iter_count(&call->iter), req->actual_len - req->len);
+
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ no_more_data:
+ call->unmarshall = 4;
+ afs_extract_to_buf(call,
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSCallBack) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+
+ /* extract the metadata */
+ case 4:
+ ret = afs_extract_data(call, false);
+ if (ret < 0)
+ return ret;
+
+ bp = call->buffer;
+ ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
+ &vnode->status.data_version, req);
+ if (ret < 0)
+ return ret;
+ xdr_decode_YFSCallBack(call, vnode, &bp);
+ xdr_decode_YFSVolSync(&bp, call->reply[1]);
+
+ call->unmarshall++;
+
+ case 5:
+ break;
+ }
+
+ /* Zero the unfilled tail of the current page and the whole of any pages
+ * that received no data, handing each one to page_done (if set).
+ */
+ for (; req->index < req->nr_pages; req->index++) {
+ if (req->offset < PAGE_SIZE)
+ zero_user_segment(req->pages[req->index],
+ req->offset, PAGE_SIZE);
+ if (req->page_done)
+ req->page_done(call, req);
+ req->offset = 0;
+ }
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * Destructor for fetch-data calls: put the read request held in reply[2], then
+ * run the ordinary flat-call destructor.
+ */
+static void yfs_fetch_data_destructor(struct afs_call *call)
+{
+	afs_put_read(call->reply[2]);
+	afs_flat_call_destructor(call);
+}
+
+/*
+ * YFS.FetchData64 operation type.  Uses its own destructor so that the read
+ * request attached to the call is put when the call is destroyed.
+ */
+static const struct afs_call_type yfs_RXYFSFetchData64 = {
+ .name = "YFS.FetchData64",
+ .op = yfs_FS_FetchData64,
+ .deliver = yfs_deliver_fs_fetch_data64,
+ .destructor = yfs_fetch_data_destructor,
+};
+
+/*
+ * Fetch data from a file.
+ *
+ * The request carries the opcode, an RPC flags word, the vnode's fid and the
+ * 64-bit position and length from @req.  The usage count on @req is
+ * incremented before the call is made.  Returns -ENOMEM if the call can't be
+ * allocated, otherwise the result of afs_make_call().
+ */
+int yfs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
+{
+	struct afs_vnode *vnode = fc->vnode;
+	struct afs_net *net = afs_v2net(vnode);
+	struct afs_call *call;
+	const size_t reqsz = sizeof(__be32) * 2 +
+			     sizeof(struct yfs_xdr_YFSFid) +
+			     sizeof(struct yfs_xdr_u64) * 2;
+	const size_t rplsz = sizeof(struct yfs_xdr_YFSFetchStatus) +
+			     sizeof(struct yfs_xdr_YFSCallBack) +
+			     sizeof(struct yfs_xdr_YFSVolSync);
+	__be32 *bp;
+
+	_enter(",%x,{%llx:%llu},%llx,%llx",
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode,
+	       req->pos, req->len);
+
+	call = afs_alloc_flat_call(net, &yfs_RXYFSFetchData64, reqsz, rplsz);
+	if (!call)
+		return -ENOMEM;
+
+	call->key		= fc->key;
+	call->reply[0]		= vnode;
+	call->reply[1]		= NULL; /* volsync */
+	call->reply[2]		= req;
+	call->expected_version	= vnode->status.data_version;
+	call->want_reply_time	= true;
+
+	/* Marshal the request parameters. */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSFETCHDATA64);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &vnode->fid);
+	bp = xdr_encode_u64(bp, req->pos);
+	bp = xdr_encode_u64(bp, req->len);
+	yfs_check_req(call, bp);
+
+	refcount_inc(&req->usage);
+	call->cb_break = fc->cb_break;
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data for YFS.CreateFile or YFS.MakeDir.  The reply holds, in
+ * order: the new fid, the new object's status, the status of the vnode in
+ * reply[0], a callback and a volsync record.
+ */
+static int yfs_deliver_fs_create_vnode(struct afs_call *call)
+{
+	struct afs_vnode *vnode = call->reply[0];
+	const __be32 *cursor;
+	int ret;
+
+	_enter("{%u}", call->unmarshall);
+
+	ret = afs_transfer_reply(call);
+	if (ret < 0)
+		return ret;
+
+	/* The reply is complete, so it can be unmarshalled in one go. */
+	cursor = call->buffer;
+	xdr_decode_YFSFid(&cursor, call->reply[1]);
+
+	ret = yfs_decode_status(call, &cursor, call->reply[2], NULL, NULL, NULL);
+	if (ret >= 0)
+		ret = yfs_decode_status(call, &cursor, &vnode->status, vnode,
+					&call->expected_version, NULL);
+	if (ret < 0)
+		return ret;
+
+	xdr_decode_YFSCallBack_raw(&cursor, call->reply[3]);
+	xdr_decode_YFSVolSync(&cursor, NULL);
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * YFS.CreateFile operation type.  The reply-delivery handler is the same one
+ * that YFS.MakeDir uses.
+ *
+ * NOTE(review): the identifier carries an afs_ prefix, unlike the other call
+ * types in this file, which are prefixed yfs_.
+ */
+static const struct afs_call_type afs_RXFSCreateFile = {
+ .name = "YFS.CreateFile",
+ .op = yfs_FS_CreateFile,
+ .deliver = yfs_deliver_fs_create_vnode,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Create a file.
+ *
+ * The request carries the opcode, an RPC flags word, the fid of fc->vnode,
+ * the new name, a store-status block holding the mode, and a lock type word.
+ * The new fid, status and callback are decoded into @newfid, @newstatus and
+ * @newcb by the delivery handler.  Returns -ENOMEM if the call can't be
+ * allocated, otherwise the result of afs_make_call().
+ */
+int yfs_fs_create_file(struct afs_fs_cursor *fc,
+		       const char *name,
+		       umode_t mode,
+		       u64 current_data_version,
+		       struct afs_fid *newfid,
+		       struct afs_file_status *newstatus,
+		       struct afs_callback *newcb)
+{
+	struct afs_vnode *vnode = fc->vnode;
+	struct afs_net *net = afs_v2net(vnode);
+	struct afs_call *call;
+	size_t namesz, reqsz, rplsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+
+	/* opcode + RPC flags + fid + name + store status + lock type */
+	reqsz  = sizeof(__be32);
+	reqsz += sizeof(__be32);
+	reqsz += sizeof(struct yfs_xdr_YFSFid);
+	reqsz += xdr_strlen(namesz);
+	reqsz += sizeof(struct yfs_xdr_YFSStoreStatus);
+	reqsz += sizeof(__be32);
+
+	/* new fid + two status records + callback + volsync */
+	rplsz  = sizeof(struct yfs_xdr_YFSFid);
+	rplsz += sizeof(struct yfs_xdr_YFSFetchStatus);
+	rplsz += sizeof(struct yfs_xdr_YFSFetchStatus);
+	rplsz += sizeof(struct yfs_xdr_YFSCallBack);
+	rplsz += sizeof(struct yfs_xdr_YFSVolSync);
+
+	call = afs_alloc_flat_call(net, &afs_RXFSCreateFile, reqsz, rplsz);
+	if (!call)
+		return -ENOMEM;
+
+	call->key		= fc->key;
+	call->reply[0]		= vnode;
+	call->reply[1]		= newfid;
+	call->reply[2]		= newstatus;
+	call->reply[3]		= newcb;
+	call->expected_version	= current_data_version + 1;
+
+	/* Marshal the request parameters. */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSCREATEFILE);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &vnode->fid);
+	bp = xdr_encode_string(bp, name, namesz);
+	bp = xdr_encode_YFSStoreStatus_mode(bp, mode);
+	bp = xdr_encode_u32(bp, 0); /* ViceLockType */
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * YFS.MakeDir operation type.  The reply-delivery handler is the same one that
+ * YFS.CreateFile uses.
+ */
+static const struct afs_call_type yfs_RXFSMakeDir = {
+ .name = "YFS.MakeDir",
+ .op = yfs_FS_MakeDir,
+ .deliver = yfs_deliver_fs_create_vnode,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Make a directory.
+ *
+ * The request carries the opcode, an RPC flags word, the fid of fc->vnode,
+ * the new name and a store-status block holding the mode.  The new fid,
+ * status and callback are decoded into @newfid, @newstatus and @newcb by the
+ * delivery handler.  Returns -ENOMEM if the call can't be allocated,
+ * otherwise the result of afs_make_call().
+ */
+int yfs_fs_make_dir(struct afs_fs_cursor *fc,
+		    const char *name,
+		    umode_t mode,
+		    u64 current_data_version,
+		    struct afs_fid *newfid,
+		    struct afs_file_status *newstatus,
+		    struct afs_callback *newcb)
+{
+	struct afs_vnode *vnode = fc->vnode;
+	struct afs_net *net = afs_v2net(vnode);
+	struct afs_call *call;
+	size_t namesz, reqsz, rplsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+
+	/* opcode + RPC flags + fid + name + store status */
+	reqsz  = sizeof(__be32);
+	reqsz += sizeof(struct yfs_xdr_RPCFlags);
+	reqsz += sizeof(struct yfs_xdr_YFSFid);
+	reqsz += xdr_strlen(namesz);
+	reqsz += sizeof(struct yfs_xdr_YFSStoreStatus);
+
+	/* new fid + two status records + callback + volsync */
+	rplsz  = sizeof(struct yfs_xdr_YFSFid);
+	rplsz += sizeof(struct yfs_xdr_YFSFetchStatus);
+	rplsz += sizeof(struct yfs_xdr_YFSFetchStatus);
+	rplsz += sizeof(struct yfs_xdr_YFSCallBack);
+	rplsz += sizeof(struct yfs_xdr_YFSVolSync);
+
+	call = afs_alloc_flat_call(net, &yfs_RXFSMakeDir, reqsz, rplsz);
+	if (!call)
+		return -ENOMEM;
+
+	call->key		= fc->key;
+	call->reply[0]		= vnode;
+	call->reply[1]		= newfid;
+	call->reply[2]		= newstatus;
+	call->reply[3]		= newcb;
+	call->expected_version	= current_data_version + 1;
+
+	/* Marshal the request parameters. */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSMAKEDIR);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &vnode->fid);
+	bp = xdr_encode_string(bp, name, namesz);
+	bp = xdr_encode_YFSStoreStatus_mode(bp, mode);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.RemoveFile2 operation.  The reply holds, in
+ * order: the directory's status, a fid (decoded into a scratch variable and
+ * otherwise unused here), the removed vnode's status and a volsync record.
+ */
+static int yfs_deliver_fs_remove_file2(struct afs_call *call)
+{
+	struct afs_vnode *dvnode = call->reply[0];
+	struct afs_vnode *vnode = call->reply[1];
+	struct afs_fid scratch_fid;
+	const __be32 *cursor;
+	int ret;
+
+	_enter("{%u}", call->unmarshall);
+
+	ret = afs_transfer_reply(call);
+	if (ret < 0)
+		return ret;
+
+	/* The reply is complete, so it can be unmarshalled in one go. */
+	cursor = call->buffer;
+	ret = yfs_decode_status(call, &cursor, &dvnode->status, dvnode,
+				&call->expected_version, NULL);
+	if (ret < 0)
+		return ret;
+
+	xdr_decode_YFSFid(&cursor, &scratch_fid);
+
+	/* Was deleted if vnode->status.abort_code == VNOVNODE. */
+	ret = yfs_decode_status(call, &cursor, &vnode->status, vnode, NULL, NULL);
+	if (ret < 0)
+		return ret;
+
+	xdr_decode_YFSVolSync(&cursor, NULL);
+	return 0;
+}
+
+/*
+ * YFS.RemoveFile2 operation type.  Its delivery handler decodes the removed
+ * vnode's status as well as the directory's.
+ */
+static const struct afs_call_type yfs_RXYFSRemoveFile2 = {
+ .name = "YFS.RemoveFile2",
+ .op = yfs_FS_RemoveFile2,
+ .deliver = yfs_deliver_fs_remove_file2,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Remove a file and retrieve new file status.
+ *
+ * The request carries the opcode, an RPC flags word, the directory's fid and
+ * the name to remove.  Returns -ENOMEM if the call can't be allocated,
+ * otherwise the result of afs_make_call().
+ */
+int yfs_fs_remove_file2(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
+			const char *name, u64 current_data_version)
+{
+	struct afs_vnode *dvnode = fc->vnode;
+	struct afs_net *net = afs_v2net(dvnode);
+	struct afs_call *call;
+	size_t namesz, reqsz, rplsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+
+	/* opcode + RPC flags + directory fid + name */
+	reqsz = sizeof(__be32) +
+		sizeof(struct yfs_xdr_RPCFlags) +
+		sizeof(struct yfs_xdr_YFSFid) +
+		xdr_strlen(namesz);
+	/* directory status + fid + file status + volsync */
+	rplsz = sizeof(struct yfs_xdr_YFSFetchStatus) +
+		sizeof(struct yfs_xdr_YFSFid) +
+		sizeof(struct yfs_xdr_YFSFetchStatus) +
+		sizeof(struct yfs_xdr_YFSVolSync);
+
+	call = afs_alloc_flat_call(net, &yfs_RXYFSRemoveFile2, reqsz, rplsz);
+	if (!call)
+		return -ENOMEM;
+
+	call->key		= fc->key;
+	call->reply[0]		= dvnode;
+	call->reply[1]		= vnode;
+	call->expected_version	= current_data_version + 1;
+
+	/* Marshal the request parameters. */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSREMOVEFILE2);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &dvnode->fid);
+	bp = xdr_encode_string(bp, name, namesz);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &dvnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.RemoveFile or YFS.RemoveDir operation.  The
+ * reply holds the directory's status followed by a volsync record.
+ */
+static int yfs_deliver_fs_remove(struct afs_call *call)
+{
+	struct afs_vnode *dvnode = call->reply[0];
+	const __be32 *cursor;
+	int ret;
+
+	_enter("{%u}", call->unmarshall);
+
+	ret = afs_transfer_reply(call);
+	if (ret < 0)
+		return ret;
+
+	/* The reply is complete, so it can be unmarshalled in one go. */
+	cursor = call->buffer;
+	ret = yfs_decode_status(call, &cursor, &dvnode->status, dvnode,
+				&call->expected_version, NULL);
+	if (ret < 0)
+		return ret;
+
+	xdr_decode_YFSVolSync(&cursor, NULL);
+	return 0;
+}
+
+/*
+ * YFS.RemoveFile and YFS.RemoveDir operation types.  The two differ only in
+ * name and opcode; both share one reply-delivery handler.
+ */
+static const struct afs_call_type yfs_RXYFSRemoveFile = {
+ .name = "YFS.RemoveFile",
+ .op = yfs_FS_RemoveFile,
+ .deliver = yfs_deliver_fs_remove,
+ .destructor = afs_flat_call_destructor,
+};
+
+static const struct afs_call_type yfs_RXYFSRemoveDir = {
+ .name = "YFS.RemoveDir",
+ .op = yfs_FS_RemoveDir,
+ .deliver = yfs_deliver_fs_remove,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Remove a file or directory.
+ *
+ * @isdir selects between the RemoveDir and RemoveFile call type and opcode.
+ * The request carries the opcode, an RPC flags word, the directory's fid and
+ * the name to remove.  Returns -ENOMEM if the call can't be allocated,
+ * otherwise the result of afs_make_call().
+ */
+int yfs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
+		  const char *name, bool isdir, u64 current_data_version)
+{
+	struct afs_vnode *dvnode = fc->vnode;
+	struct afs_net *net = afs_v2net(dvnode);
+	const struct afs_call_type *type;
+	struct afs_call *call;
+	size_t namesz, reqsz, rplsz;
+	u32 opcode;
+	__be32 *bp;
+
+	_enter("");
+
+	if (isdir) {
+		type = &yfs_RXYFSRemoveDir;
+		opcode = YFSREMOVEDIR;
+	} else {
+		type = &yfs_RXYFSRemoveFile;
+		opcode = YFSREMOVEFILE;
+	}
+
+	namesz = strlen(name);
+
+	/* opcode + RPC flags + directory fid + name */
+	reqsz = sizeof(__be32) +
+		sizeof(struct yfs_xdr_RPCFlags) +
+		sizeof(struct yfs_xdr_YFSFid) +
+		xdr_strlen(namesz);
+	/* directory status + volsync */
+	rplsz = sizeof(struct yfs_xdr_YFSFetchStatus) +
+		sizeof(struct yfs_xdr_YFSVolSync);
+
+	call = afs_alloc_flat_call(net, type, reqsz, rplsz);
+	if (!call)
+		return -ENOMEM;
+
+	call->key		= fc->key;
+	call->reply[0]		= dvnode;
+	call->reply[1]		= vnode;
+	call->expected_version	= current_data_version + 1;
+
+	/* Marshal the request parameters. */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, opcode);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &dvnode->fid);
+	bp = xdr_encode_string(bp, name, namesz);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &dvnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.Link operation.  The reply holds, in order: the
+ * linked vnode's status, the directory's status and a volsync record.
+ */
+static int yfs_deliver_fs_link(struct afs_call *call)
+{
+	struct afs_vnode *dvnode = call->reply[0];
+	struct afs_vnode *vnode = call->reply[1];
+	const __be32 *cursor;
+	int ret;
+
+	_enter("{%u}", call->unmarshall);
+
+	ret = afs_transfer_reply(call);
+	if (ret < 0)
+		return ret;
+
+	/* The reply is complete, so it can be unmarshalled in one go. */
+	cursor = call->buffer;
+	ret = yfs_decode_status(call, &cursor, &vnode->status, vnode, NULL, NULL);
+	if (ret >= 0)
+		ret = yfs_decode_status(call, &cursor, &dvnode->status, dvnode,
+					&call->expected_version, NULL);
+	if (ret < 0)
+		return ret;
+
+	xdr_decode_YFSVolSync(&cursor, NULL);
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * YFS.Link operation type: gives the call its name and opcode and selects the
+ * reply-delivery and destructor handlers.
+ */
+static const struct afs_call_type yfs_RXYFSLink = {
+ .name = "YFS.Link",
+ .op = yfs_FS_Link,
+ .deliver = yfs_deliver_fs_link,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Make a hard link.
+ *
+ * The request carries the opcode, an RPC flags word, the directory's fid, the
+ * link name and the fid of the vnode being linked.  Returns -ENOMEM if the
+ * call can't be allocated, otherwise the result of afs_make_call().
+ */
+int yfs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
+		const char *name, u64 current_data_version)
+{
+	struct afs_vnode *dvnode = fc->vnode;
+	struct afs_net *net = afs_v2net(vnode);
+	struct afs_call *call;
+	size_t namesz, reqsz, rplsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+
+	/* opcode + RPC flags + directory fid + name + target fid */
+	reqsz = sizeof(__be32) +
+		sizeof(struct yfs_xdr_RPCFlags) +
+		sizeof(struct yfs_xdr_YFSFid) +
+		xdr_strlen(namesz) +
+		sizeof(struct yfs_xdr_YFSFid);
+	/* two status records + volsync */
+	rplsz = sizeof(struct yfs_xdr_YFSFetchStatus) +
+		sizeof(struct yfs_xdr_YFSFetchStatus) +
+		sizeof(struct yfs_xdr_YFSVolSync);
+
+	call = afs_alloc_flat_call(net, &yfs_RXYFSLink, reqsz, rplsz);
+	if (!call)
+		return -ENOMEM;
+
+	call->key		= fc->key;
+	call->reply[0]		= dvnode;
+	call->reply[1]		= vnode;
+	call->expected_version	= current_data_version + 1;
+
+	/* Marshal the request parameters. */
+	bp = call->request;
+	bp = xdr_encode_u32(bp, YFSLINK);
+	bp = xdr_encode_u32(bp, 0); /* RPC flags */
+	bp = xdr_encode_YFSFid(bp, &dvnode->fid);
+	bp = xdr_encode_string(bp, name, namesz);
+	bp = xdr_encode_YFSFid(bp, &vnode->fid);
+	yfs_check_req(call, bp);
+
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.Symlink operation.
+ */
+static int yfs_deliver_fs_symlink(struct afs_call *call)
+{
+ struct afs_vnode *vnode = call->reply[0];
+ const __be32 *bp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ xdr_decode_YFSFid(&bp, call->reply[1]);
+ ret = yfs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL);
+ if (ret < 0)
+ return ret;
+ ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ xdr_decode_YFSVolSync(&bp, NULL);
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * YFS.Symlink operation type
+ */
+static const struct afs_call_type yfs_RXYFSSymlink = {
+ .name = "YFS.Symlink",
+ .op = yfs_FS_Symlink,
+ .deliver = yfs_deliver_fs_symlink,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Create a symbolic link.
+ */
+int yfs_fs_symlink(struct afs_fs_cursor *fc,
+ const char *name,
+ const char *contents,
+ u64 current_data_version,
+ struct afs_fid *newfid,
+ struct afs_file_status *newstatus)
+{
+ struct afs_vnode *dvnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(dvnode);
+ size_t namesz, contents_sz;
+ __be32 *bp;
+
+ _enter("");
+
+ namesz = strlen(name);
+ contents_sz = strlen(contents);
+ call = afs_alloc_flat_call(net, &yfs_RXYFSSymlink,
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(namesz) +
+ xdr_strlen(contents_sz) +
+ sizeof(struct yfs_xdr_YFSStoreStatus),
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = dvnode;
+ call->reply[1] = newfid;
+ call->reply[2] = newstatus;
+ call->expected_version = current_data_version + 1;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSSYMLINK);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &dvnode->fid);
+ bp = xdr_encode_string(bp, name, namesz);
+ bp = xdr_encode_string(bp, contents, contents_sz);
+ bp = xdr_encode_YFSStoreStatus_mode(bp, S_IRWXUGO);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &dvnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.Rename operation.
+ *
+ * call->reply[0] and call->reply[1] hold the original and the new directory
+ * vnodes, as set up by yfs_fs_rename().  The original directory's status is
+ * always decoded (checked against call->expected_version); a second status
+ * is decoded into the new directory (checked against
+ * call->expected_version_2) only when it is a different vnode.  The trailing
+ * YFSVolSync record is decoded but discarded (NULL destination).
+ *
+ * Returns 0 on success, or the negative value returned by
+ * afs_transfer_reply() or yfs_decode_status().
+ */
+static int yfs_deliver_fs_rename(struct afs_call *call)
+{
+ struct afs_vnode *orig_dvnode = call->reply[0];
+ struct afs_vnode *new_dvnode = call->reply[1];
+ const __be32 *bp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ ret = yfs_decode_status(call, &bp, &orig_dvnode->status, orig_dvnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ if (new_dvnode != orig_dvnode) {
+ ret = yfs_decode_status(call, &bp, &new_dvnode->status, new_dvnode,
+ &call->expected_version_2, NULL);
+ if (ret < 0)
+ return ret;
+ }
+
+ xdr_decode_YFSVolSync(&bp, NULL);
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * YFS.Rename operation type.  The name carries the "YFS." prefix used by
+ * the other YFS call types in this file.
+ */
+static const struct afs_call_type yfs_RXYFSRename = {
+ .name = "YFS.Rename",
+ .op = yfs_FS_Rename,
+ .deliver = yfs_deliver_fs_rename,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Rename a file or directory.
+ */
+int yfs_fs_rename(struct afs_fs_cursor *fc,
+ const char *orig_name,
+ struct afs_vnode *new_dvnode,
+ const char *new_name,
+ u64 current_orig_data_version,
+ u64 current_new_data_version)
+{
+ struct afs_vnode *orig_dvnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(orig_dvnode);
+ size_t o_namesz, n_namesz;
+ __be32 *bp;
+
+ _enter("");
+
+ o_namesz = strlen(orig_name);
+ n_namesz = strlen(new_name);
+ call = afs_alloc_flat_call(net, &yfs_RXYFSRename,
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(o_namesz) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(n_namesz),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = orig_dvnode;
+ call->reply[1] = new_dvnode;
+ call->expected_version = current_orig_data_version + 1;
+ call->expected_version_2 = current_new_data_version + 1;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSRENAME);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &orig_dvnode->fid);
+ bp = xdr_encode_string(bp, orig_name, o_namesz);
+ bp = xdr_encode_YFSFid(bp, &new_dvnode->fid);
+ bp = xdr_encode_string(bp, new_name, n_namesz);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &orig_dvnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.StoreData64 operation.
+ */
+static int yfs_deliver_fs_store_data(struct afs_call *call)
+{
+ struct afs_vnode *vnode = call->reply[0];
+ const __be32 *bp;
+ int ret;
+
+ _enter("");
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ xdr_decode_YFSVolSync(&bp, NULL);
+
+ afs_pages_written_back(vnode, call);
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * YFS.StoreData64 operation type.
+ */
+static const struct afs_call_type yfs_RXYFSStoreData64 = {
+ .name = "YFS.StoreData64",
+ .op = yfs_FS_StoreData64,
+ .deliver = yfs_deliver_fs_store_data,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Store a set of pages to a large file.
+ *
+ * Issues a YFS.StoreData64 call covering pages @first..@last of @mapping.
+ * The byte range starts @offset bytes into page @first and ends @to bytes
+ * into page @last:
+ *
+ *	size = to - offset, plus (last - first) whole pages if first != last
+ *	pos  = (first << PAGE_SHIFT) + offset
+ *
+ * The file length sent to the server is the inode's current i_size, raised
+ * to pos + size when the write extends beyond it.
+ *
+ * Returns -ENOMEM if the call cannot be allocated, otherwise the result of
+ * afs_make_call().
+ */
+int yfs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
+ pgoff_t first, pgoff_t last,
+ unsigned offset, unsigned to)
+{
+ struct afs_vnode *vnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(vnode);
+ loff_t size, pos, i_size;
+ __be32 *bp;
+
+ _enter(",%x,{%llx:%llu},,",
+ key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
+
+ size = (loff_t)to - (loff_t)offset;
+ if (first != last)
+ size += (loff_t)(last - first) << PAGE_SHIFT;
+ pos = (loff_t)first << PAGE_SHIFT;
+ pos += offset;
+
+ i_size = i_size_read(&vnode->vfs_inode);
+ if (pos + size > i_size)
+ i_size = size + pos; /* the write extends the file */
+
+ _debug("size %llx, at %llx, i_size %llx",
+ (unsigned long long)size, (unsigned long long)pos,
+ (unsigned long long)i_size);
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSStoreData64,
+ sizeof(__be32) +
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSStoreStatus) +
+ sizeof(struct yfs_xdr_u64) * 3,
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->mapping = mapping;
+ call->reply[0] = vnode;
+ call->first = first;
+ call->last = last;
+ call->first_offset = offset;
+ call->last_to = to;
+ call->send_pages = true;
+ call->expected_version = vnode->status.data_version + 1;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSSTOREDATA64);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &vnode->fid);
+ bp = xdr_encode_YFSStoreStatus_mtime(bp, &vnode->vfs_inode.i_mtime);
+ bp = xdr_encode_u64(bp, pos);
+ bp = xdr_encode_u64(bp, size);
+ bp = xdr_encode_u64(bp, i_size);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &vnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.StoreStatus operation (also used for a YFS.StoreData64 issued as a status update).
+ */
+static int yfs_deliver_fs_store_status(struct afs_call *call)
+{
+ struct afs_vnode *vnode = call->reply[0];
+ const __be32 *bp;
+ int ret;
+
+ _enter("");
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ xdr_decode_YFSVolSync(&bp, NULL);
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * YFS.StoreStatus operation type
+ */
+static const struct afs_call_type yfs_RXYFSStoreStatus = {
+ .name = "YFS.StoreStatus",
+ .op = yfs_FS_StoreStatus,
+ .deliver = yfs_deliver_fs_store_status,
+ .destructor = afs_flat_call_destructor,
+};
+
+static const struct afs_call_type yfs_RXYFSStoreData64_as_Status = {
+ .name = "YFS.StoreData64",
+ .op = yfs_FS_StoreData64,
+ .deliver = yfs_deliver_fs_store_status,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Set the attributes on a file, using YFS.StoreData64 rather than
+ * YFS.StoreStatus so as to alter the file size also.
+ */
+static int yfs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
+{
+ struct afs_vnode *vnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(vnode);
+ __be32 *bp;
+
+ _enter(",%x,{%llx:%llu},,",
+ key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSStoreData64_as_Status,
+ sizeof(__be32) * 2 +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSStoreStatus) +
+ sizeof(struct yfs_xdr_u64) * 3,
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = vnode;
+ call->expected_version = vnode->status.data_version + 1;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSSTOREDATA64);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &vnode->fid);
+ bp = xdr_encode_YFS_StoreStatus(bp, attr);
+ bp = xdr_encode_u64(bp, 0); /* position of start of write */
+ bp = xdr_encode_u64(bp, 0); /* size of write */
+ bp = xdr_encode_u64(bp, attr->ia_size); /* new file length */
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &vnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Set the attributes on a file, using YFS.StoreData64 if there's a change in
+ * file size, and YFS.StoreStatus otherwise.
+ */
+int yfs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr)
+{
+ struct afs_vnode *vnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(vnode);
+ __be32 *bp;
+
+ if (attr->ia_valid & ATTR_SIZE)
+ return yfs_fs_setattr_size(fc, attr);
+
+ _enter(",%x,{%llx:%llu},,",
+ key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSStoreStatus,
+ sizeof(__be32) * 2 +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSStoreStatus),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = vnode;
+ call->expected_version = vnode->status.data_version;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSSTORESTATUS);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &vnode->fid);
+ bp = xdr_encode_YFS_StoreStatus(bp, attr);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &vnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.GetVolumeStatus operation.
+ *
+ * The reply is consumed incrementally, with call->unmarshall recording the
+ * current step: first the fixed-size status record (decoded into
+ * call->reply[1]), then three length-prefixed strings padded to a multiple
+ * of four bytes - the volume name, the offline message and the message of
+ * the day.  Each string is read into the scratch buffer at call->reply[2]
+ * and NUL-terminated there, so each one overwrites the previous one.  A
+ * length of AFSNAMEMAX or more is rejected as a protocol error.
+ *
+ * No case ends in a break: each one falls through to the next once its data
+ * has been extracted.  Returns 0 when done, or the negative value returned
+ * by afs_extract_data() or afs_protocol_error().
+ */
+static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
+{
+ const __be32 *bp;
+ char *p;
+ u32 size;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ switch (call->unmarshall) {
+ case 0:
+ call->unmarshall++;
+ afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchVolumeStatus));
+
+ /* Fall through - extract the returned status record */
+ case 1:
+ _debug("extract status");
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ bp = call->buffer;
+ xdr_decode_YFSFetchVolumeStatus(&bp, call->reply[1]);
+ call->unmarshall++;
+ afs_extract_to_tmp(call);
+
+ /* Fall through - extract the volume name length */
+ case 2:
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ call->count = ntohl(call->tmp);
+ _debug("volname length: %u", call->count);
+ if (call->count >= AFSNAMEMAX)
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_volname_len);
+ size = (call->count + 3) & ~3; /* It's padded */
+ afs_extract_begin(call, call->reply[2], size);
+ call->unmarshall++;
+
+ /* Fall through - extract the volume name */
+ case 3:
+ _debug("extract volname");
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ p = call->reply[2];
+ p[call->count] = 0;
+ _debug("volname '%s'", p);
+ afs_extract_to_tmp(call);
+ call->unmarshall++;
+
+ /* Fall through - extract the offline message length */
+ case 4:
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ call->count = ntohl(call->tmp);
+ _debug("offline msg length: %u", call->count);
+ if (call->count >= AFSNAMEMAX)
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_offline_msg_len);
+ size = (call->count + 3) & ~3; /* It's padded */
+ afs_extract_begin(call, call->reply[2], size);
+ call->unmarshall++;
+
+ /* Fall through - extract the offline message */
+ case 5:
+ _debug("extract offline");
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ p = call->reply[2];
+ p[call->count] = 0;
+ _debug("offline '%s'", p);
+
+ afs_extract_to_tmp(call);
+ call->unmarshall++;
+
+ /* Fall through - extract the message of the day length */
+ case 6:
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ call->count = ntohl(call->tmp);
+ _debug("motd length: %u", call->count);
+ if (call->count >= AFSNAMEMAX)
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_motd_len);
+ size = (call->count + 3) & ~3; /* It's padded */
+ afs_extract_begin(call, call->reply[2], size);
+ call->unmarshall++;
+
+ /* Fall through - extract the message of the day */
+ case 7:
+ _debug("extract motd");
+ ret = afs_extract_data(call, false);
+ if (ret < 0)
+ return ret;
+
+ p = call->reply[2];
+ p[call->count] = 0;
+ _debug("motd '%s'", p);
+
+ call->unmarshall++;
+
+ case 8:
+ break;
+ }
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * Destroy a YFS.GetVolumeStatus call.
+ */
+static void yfs_get_volume_status_call_destructor(struct afs_call *call)
+{
+ kfree(call->reply[2]);
+ call->reply[2] = NULL;
+ afs_flat_call_destructor(call);
+}
+
+/*
+ * YFS.GetVolumeStatus operation type
+ */
+static const struct afs_call_type yfs_RXYFSGetVolumeStatus = {
+ .name = "YFS.GetVolumeStatus",
+ .op = yfs_FS_GetVolumeStatus,
+ .deliver = yfs_deliver_fs_get_volume_status,
+ .destructor = yfs_get_volume_status_call_destructor,
+};
+
+/*
+ * Fetch the status of a volume.
+ *
+ * Allocates an AFSOPAQUEMAX-byte scratch buffer and attaches it to the call
+ * as reply[2]; once attached it is released by the call type's destructor,
+ * yfs_get_volume_status_call_destructor().  If the call itself cannot be
+ * allocated the buffer is freed here.  The decoded status record is written
+ * to @vs, which the delivery routine reaches through reply[1].
+ *
+ * Returns -ENOMEM on allocation failure, otherwise the result of
+ * afs_make_call().
+ */
+int yfs_fs_get_volume_status(struct afs_fs_cursor *fc,
+ struct afs_volume_status *vs)
+{
+ struct afs_vnode *vnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(vnode);
+ __be32 *bp;
+ void *tmpbuf;
+
+ _enter("");
+
+ tmpbuf = kmalloc(AFSOPAQUEMAX, GFP_KERNEL);
+ if (!tmpbuf)
+ return -ENOMEM;
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSGetVolumeStatus,
+ sizeof(__be32) * 2 +
+ sizeof(struct yfs_xdr_u64),
+ sizeof(struct yfs_xdr_YFSFetchVolumeStatus) +
+ sizeof(__be32));
+ if (!call) {
+ kfree(tmpbuf);
+ return -ENOMEM;
+ }
+
+ call->key = fc->key;
+ call->reply[0] = vnode;
+ call->reply[1] = vs;
+ call->reply[2] = tmpbuf; /* freed by the call destructor from here on */
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSGETVOLUMESTATUS);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_u64(bp, vnode->fid.vid);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &vnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.SetLock, YFS.ExtendLock or YFS.ReleaseLock operation.
+ */
+static int yfs_deliver_fs_xxxx_lock(struct afs_call *call)
+{
+ struct afs_vnode *vnode = call->reply[0];
+ const __be32 *bp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ xdr_decode_YFSVolSync(&bp, NULL);
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * YFS.SetLock operation type
+ */
+static const struct afs_call_type yfs_RXYFSSetLock = {
+ .name = "YFS.SetLock",
+ .op = yfs_FS_SetLock,
+ .deliver = yfs_deliver_fs_xxxx_lock,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * YFS.ExtendLock operation type
+ */
+static const struct afs_call_type yfs_RXYFSExtendLock = {
+ .name = "YFS.ExtendLock",
+ .op = yfs_FS_ExtendLock,
+ .deliver = yfs_deliver_fs_xxxx_lock,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * YFS.ReleaseLock operation type
+ */
+static const struct afs_call_type yfs_RXYFSReleaseLock = {
+ .name = "YFS.ReleaseLock",
+ .op = yfs_FS_ReleaseLock,
+ .deliver = yfs_deliver_fs_xxxx_lock,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Set a lock on a file
+ */
+int yfs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type)
+{
+ struct afs_vnode *vnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(vnode);
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSSetLock,
+ sizeof(__be32) * 2 +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(__be32),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = vnode;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSSETLOCK);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &vnode->fid);
+ bp = xdr_encode_u32(bp, type);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &vnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * extend a lock on a file
+ */
+int yfs_fs_extend_lock(struct afs_fs_cursor *fc)
+{
+ struct afs_vnode *vnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(vnode);
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSExtendLock,
+ sizeof(__be32) * 2 +
+ sizeof(struct yfs_xdr_YFSFid),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = vnode;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSEXTENDLOCK);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &vnode->fid);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &vnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * release a lock on a file
+ */
+int yfs_fs_release_lock(struct afs_fs_cursor *fc)
+{
+ struct afs_vnode *vnode = fc->vnode;
+ struct afs_call *call;
+ struct afs_net *net = afs_v2net(vnode);
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSReleaseLock,
+ sizeof(__be32) * 2 +
+ sizeof(struct yfs_xdr_YFSFid),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return -ENOMEM;
+
+ call->key = fc->key;
+ call->reply[0] = vnode;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSRELEASELOCK);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &vnode->fid);
+ yfs_check_req(call, bp);
+
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &vnode->fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.FetchStatus call made without a vnode handle.
+ *
+ * yfs_fs_fetch_status() sets call->reply[0] to NULL, so the vnode pointer
+ * read here may be NULL and must not be dereferenced in this function; it
+ * is only handed on to yfs_decode_status().  The status, callback and
+ * volsync records are decoded into call->reply[1], [2] and [3].
+ *
+ * Returns 0 on success, or the negative value returned by
+ * afs_transfer_reply() or yfs_decode_status().
+ */
+static int yfs_deliver_fs_fetch_status(struct afs_call *call)
+{
+ struct afs_file_status *status = call->reply[1];
+ struct afs_callback *callback = call->reply[2];
+ struct afs_volsync *volsync = call->reply[3];
+ struct afs_vnode *vnode = call->reply[0]; /* may be NULL */
+ const __be32 *bp;
+ int ret;
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ /* Don't print vnode->fid here: there may be no vnode. */
+ _enter("{%u}", call->unmarshall);
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ ret = yfs_decode_status(call, &bp, status, vnode,
+ &call->expected_version, NULL);
+ if (ret < 0)
+ return ret;
+ xdr_decode_YFSCallBack_raw(&bp, callback);
+ xdr_decode_YFSVolSync(&bp, volsync);
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * YFS.FetchStatus operation type
+ */
+static const struct afs_call_type yfs_RXYFSFetchStatus = {
+ .name = "YFS.FetchStatus",
+ .op = yfs_FS_FetchStatus,
+ .deliver = yfs_deliver_fs_fetch_status,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Fetch the status information for a fid without needing a vnode handle.
+ *
+ * The results are returned through @status, @callback and @volsync, which
+ * are passed to the delivery routine as call->reply[1], [2] and [3];
+ * call->reply[0] is left NULL because there is no vnode.  On allocation
+ * failure -ENOMEM is both stored in fc->ac.error and returned; otherwise
+ * the result of afs_make_call() is returned.
+ */
+int yfs_fs_fetch_status(struct afs_fs_cursor *fc,
+ struct afs_net *net,
+ struct afs_fid *fid,
+ struct afs_file_status *status,
+ struct afs_callback *callback,
+ struct afs_volsync *volsync)
+{
+ struct afs_call *call;
+ __be32 *bp;
+
+ _enter(",%x,{%llx:%llu},,",
+ key_serial(fc->key), fid->vid, fid->vnode);
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSFetchStatus,
+ sizeof(__be32) * 2 +
+ sizeof(struct yfs_xdr_YFSFid),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSCallBack) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call) {
+ fc->ac.error = -ENOMEM;
+ return -ENOMEM;
+ }
+
+ call->key = fc->key;
+ call->reply[0] = NULL; /* no vnode handle for this fid */
+ call->reply[1] = status;
+ call->reply[2] = callback;
+ call->reply[3] = volsync;
+ call->expected_version = 1; /* vnode->status.data_version */
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSFETCHSTATUS);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, fid);
+ yfs_check_req(call, bp);
+
+ call->cb_break = fc->cb_break;
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to a YFS.InlineBulkStatus call.
+ *
+ * call->unmarshall tracks the step being processed: a status count, then
+ * that many YFSFetchStatus records (decoded into the array at
+ * call->reply[1]), a callback count, then that many YFSCallBack records
+ * (decoded into the array at call->reply[2]) and finally one YFSVolSync
+ * record (decoded into call->reply[3]).  Both counts must equal
+ * call->count2 or the reply is rejected as a protocol error.  Records are
+ * extracted one at a time and decoded from call->buffer, looping through
+ * the more_counts and more_cbs labels with call->count as the array index.
+ *
+ * Returns 0 when done, or the negative value returned by
+ * afs_extract_data(), yfs_decode_status() or afs_protocol_error().
+ */
+static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
+{
+ struct afs_file_status *statuses;
+ struct afs_callback *callbacks;
+ struct afs_vnode *vnode = call->reply[0]; /* NULL as set by yfs_fs_inline_bulk_status() */
+ const __be32 *bp;
+ u32 tmp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ switch (call->unmarshall) {
+ case 0:
+ afs_extract_to_tmp(call);
+ call->unmarshall++;
+
+ /* Extract the file status count and array in two steps */
+ case 1:
+ _debug("extract status count");
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ tmp = ntohl(call->tmp);
+ _debug("status count: %u/%u", tmp, call->count2);
+ if (tmp != call->count2)
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_ibulkst_count);
+
+ call->count = 0;
+ call->unmarshall++;
+ more_counts:
+ afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchStatus));
+
+ case 2:
+ _debug("extract status array %u", call->count);
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ bp = call->buffer;
+ statuses = call->reply[1];
+ ret = yfs_decode_status(call, &bp, &statuses[call->count],
+ call->count == 0 ? vnode : NULL,
+ NULL, NULL);
+ if (ret < 0)
+ return ret;
+
+ call->count++;
+ if (call->count < call->count2)
+ goto more_counts;
+
+ call->count = 0;
+ call->unmarshall++;
+ afs_extract_to_tmp(call);
+
+ /* Extract the callback count and array in two steps */
+ case 3:
+ _debug("extract CB count");
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ tmp = ntohl(call->tmp);
+ _debug("CB count: %u", tmp);
+ if (tmp != call->count2)
+ return afs_protocol_error(call, -EBADMSG,
+ afs_eproto_ibulkst_cb_count);
+ call->count = 0;
+ call->unmarshall++;
+ more_cbs:
+ afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSCallBack));
+
+ case 4:
+ _debug("extract CB array");
+ ret = afs_extract_data(call, true);
+ if (ret < 0)
+ return ret;
+
+ _debug("unmarshall CB array");
+ bp = call->buffer;
+ callbacks = call->reply[2];
+ xdr_decode_YFSCallBack_raw(&bp, &callbacks[call->count]);
+ statuses = call->reply[1];
+ if (call->count == 0 && vnode && statuses[0].abort_code == 0) {
+ bp = call->buffer; /* rewind to decode the same record again */
+ xdr_decode_YFSCallBack(call, vnode, &bp);
+ }
+ call->count++;
+ if (call->count < call->count2)
+ goto more_cbs;
+
+ afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSVolSync));
+ call->unmarshall++;
+
+ case 5:
+ ret = afs_extract_data(call, false);
+ if (ret < 0)
+ return ret;
+
+ bp = call->buffer;
+ xdr_decode_YFSVolSync(&bp, call->reply[3]);
+
+ call->unmarshall++;
+
+ case 6:
+ break;
+ }
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * YFS.InlineBulkStatus operation type
+ */
+static const struct afs_call_type yfs_RXYFSInlineBulkStatus = {
+ .name = "YFS.InlineBulkStatus",
+ .op = yfs_FS_InlineBulkStatus,
+ .deliver = yfs_deliver_fs_inline_bulk_status,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Fetch the status information for up to 1024 files
+ *
+ * Issues a YFS.InlineBulkStatus call for the @nr_fids entries of @fids.
+ * The results are returned through the @statuses and @callbacks arrays and
+ * @volsync, which the delivery routine reaches via call->reply[1], [2] and
+ * [3]; call->count2 carries @nr_fids so the reply counts can be checked.
+ * On allocation failure -ENOMEM is both stored in fc->ac.error and
+ * returned; otherwise the result of afs_make_call() is returned.
+ */
+int yfs_fs_inline_bulk_status(struct afs_fs_cursor *fc,
+ struct afs_net *net,
+ struct afs_fid *fids,
+ struct afs_file_status *statuses,
+ struct afs_callback *callbacks,
+ unsigned int nr_fids,
+ struct afs_volsync *volsync)
+{
+ struct afs_call *call;
+ __be32 *bp;
+ unsigned int i; /* same type as nr_fids: no signed/unsigned compare */
+
+ /* Trace the first fid only; fids[1] need not exist when nr_fids is 1. */
+ _enter(",%x,{%llx:%llu},%u",
+ key_serial(fc->key), fids[0].vid, fids[0].vnode, nr_fids);
+
+ call = afs_alloc_flat_call(net, &yfs_RXYFSInlineBulkStatus,
+ sizeof(__be32) +
+ sizeof(__be32) +
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_YFSFid) * nr_fids,
+ sizeof(struct yfs_xdr_YFSFetchStatus));
+ if (!call) {
+ fc->ac.error = -ENOMEM;
+ return -ENOMEM;
+ }
+
+ call->key = fc->key;
+ call->reply[0] = NULL; /* vnode for fid[0] */
+ call->reply[1] = statuses;
+ call->reply[2] = callbacks;
+ call->reply[3] = volsync;
+ call->count2 = nr_fids;
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSINLINEBULKSTATUS);
+ bp = xdr_encode_u32(bp, 0); /* RPCFlags */
+ bp = xdr_encode_u32(bp, nr_fids);
+ for (i = 0; i < nr_fids; i++)
+ bp = xdr_encode_YFSFid(bp, &fids[i]);
+ yfs_check_req(call, bp);
+
+ call->cb_break = fc->cb_break;
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &fids[0]);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
diff --git a/fs/aio.c b/fs/aio.c
index b9350f3360c6..301e6314183b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2135,12 +2135,12 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
compat_long_t, min_nr,
compat_long_t, nr,
struct io_event __user *, events,
- struct compat_timespec __user *, timeout)
+ struct old_timespec32 __user *, timeout)
{
struct timespec64 t;
int ret;
- if (timeout && compat_get_timespec64(&t, timeout))
+ if (timeout && get_old_timespec32(&t, timeout))
return -EFAULT;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
@@ -2160,7 +2160,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
compat_long_t, min_nr,
compat_long_t, nr,
struct io_event __user *, events,
- struct compat_timespec __user *, timeout,
+ struct old_timespec32 __user *, timeout,
const struct __compat_aio_sigset __user *, usig)
{
struct __compat_aio_sigset ksig = { NULL, };
@@ -2168,7 +2168,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
struct timespec64 t;
int ret;
- if (timeout && compat_get_timespec64(&t, timeout))
+ if (timeout && get_old_timespec32(&t, timeout))
return -EFAULT;
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index efae2fb0930a..54207327f98f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1580,7 +1580,7 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
}
static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
- const siginfo_t *siginfo)
+ const kernel_siginfo_t *siginfo)
{
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
@@ -1782,7 +1782,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- const siginfo_t *siginfo, struct pt_regs *regs)
+ const kernel_siginfo_t *siginfo, struct pt_regs *regs)
{
struct task_struct *dump_task = current;
const struct user_regset_view *view = task_user_regset_view(dump_task);
@@ -2031,7 +2031,7 @@ static int elf_note_info_init(struct elf_note_info *info)
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- const siginfo_t *siginfo, struct pt_regs *regs)
+ const kernel_siginfo_t *siginfo, struct pt_regs *regs)
{
struct list_head *t;
struct core_thread *ct;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 38b8ce05cbc7..a80b4f0ee7c4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -349,7 +349,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
dio->size = 0;
dio->multi_bio = false;
- dio->should_dirty = is_read && (iter->type == ITER_IOVEC);
+ dio->should_dirty = is_read && iter_is_iovec(iter);
blk_start_plug(&plug);
for (;;) {
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ae750b1574a2..68ebe188446a 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -112,11 +112,11 @@ static int find_extent_in_eb(const struct extent_buffer *eb,
}
struct preftree {
- struct rb_root root;
+ struct rb_root_cached root;
unsigned int count;
};
-#define PREFTREE_INIT { .root = RB_ROOT, .count = 0 }
+#define PREFTREE_INIT { .root = RB_ROOT_CACHED, .count = 0 }
struct preftrees {
struct preftree direct; /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
@@ -225,14 +225,15 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
struct prelim_ref *newref,
struct share_check *sc)
{
- struct rb_root *root;
+ struct rb_root_cached *root;
struct rb_node **p;
struct rb_node *parent = NULL;
struct prelim_ref *ref;
int result;
+ bool leftmost = true;
root = &preftree->root;
- p = &root->rb_node;
+ p = &root->rb_root.rb_node;
while (*p) {
parent = *p;
@@ -242,6 +243,7 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
p = &(*p)->rb_left;
} else if (result > 0) {
p = &(*p)->rb_right;
+ leftmost = false;
} else {
/* Identical refs, merge them and free @newref */
struct extent_inode_elem *eie = ref->inode_list;
@@ -272,7 +274,7 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
preftree->count++;
trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
rb_link_node(&newref->rbnode, parent, p);
- rb_insert_color(&newref->rbnode, root);
+ rb_insert_color_cached(&newref->rbnode, root, leftmost);
}
/*
@@ -283,11 +285,11 @@ static void prelim_release(struct preftree *preftree)
{
struct prelim_ref *ref, *next_ref;
- rbtree_postorder_for_each_entry_safe(ref, next_ref, &preftree->root,
- rbnode)
+ rbtree_postorder_for_each_entry_safe(ref, next_ref,
+ &preftree->root.rb_root, rbnode)
free_pref(ref);
- preftree->root = RB_ROOT;
+ preftree->root = RB_ROOT_CACHED;
preftree->count = 0;
}
@@ -627,7 +629,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
* freeing the entire indirect tree when we're done. In some test
* cases, the tree can grow quite large (~200k objects).
*/
- while ((rnode = rb_first(&preftrees->indirect.root))) {
+ while ((rnode = rb_first_cached(&preftrees->indirect.root))) {
struct prelim_ref *ref;
ref = rb_entry(rnode, struct prelim_ref, rbnode);
@@ -637,7 +639,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
goto out;
}
- rb_erase(&ref->rbnode, &preftrees->indirect.root);
+ rb_erase_cached(&ref->rbnode, &preftrees->indirect.root);
preftrees->indirect.count--;
if (ref->count == 0) {
@@ -717,9 +719,9 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
struct preftree *tree = &preftrees->indirect_missing_keys;
struct rb_node *node;
- while ((node = rb_first(&tree->root))) {
+ while ((node = rb_first_cached(&tree->root))) {
ref = rb_entry(node, struct prelim_ref, rbnode);
- rb_erase(node, &tree->root);
+ rb_erase_cached(node, &tree->root);
BUG_ON(ref->parent); /* should not be a direct ref */
BUG_ON(ref->key_for_search.type);
@@ -769,7 +771,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
spin_lock(&head->lock);
- for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) {
+ for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
node = rb_entry(n, struct btrfs_delayed_ref_node,
ref_node);
if (node->seq > seq)
@@ -1229,14 +1231,14 @@ again:
if (ret)
goto out;
- WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root));
+ WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root));
ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
extent_item_pos, total_refs, sc, ignore_offset);
if (ret)
goto out;
- WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root));
+ WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root.rb_root));
/*
* This walks the tree of merged and resolved refs. Tree blocks are
@@ -1245,7 +1247,7 @@ again:
*
* We release the entire tree in one go before returning.
*/
- node = rb_first(&preftrees.direct.root);
+ node = rb_first_cached(&preftrees.direct.root);
while (node) {
ref = rb_entry(node, struct prelim_ref, rbnode);
node = rb_next(&ref->rbnode);
@@ -1468,7 +1470,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
struct seq_list elem = SEQ_LIST_INIT(elem);
int ret = 0;
struct share_check shared = {
- .root_objectid = root->objectid,
+ .root_objectid = root->root_key.objectid,
.inum = inum,
.share_count = 0,
};
@@ -2031,7 +2033,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
/* path must be released before calling iterate()! */
btrfs_debug(fs_root->fs_info,
"following ref at offset %u for inode %llu in tree %llu",
- cur, found_key.objectid, fs_root->objectid);
+ cur, found_key.objectid,
+ fs_root->root_key.objectid);
ret = iterate(parent, name_len,
(unsigned long)(iref + 1), eb, ctx);
if (ret)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1343ac57b438..97d91e55b70a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -206,7 +206,7 @@ static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
static inline unsigned long btrfs_inode_hash(u64 objectid,
const struct btrfs_root *root)
{
- u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME);
+ u64 h = objectid ^ (root->root_key.objectid * GOLDEN_RATIO_PRIME);
#if BITS_PER_LONG == 32
h = (h >> 32) ^ (h & 0xffffffff);
@@ -339,15 +339,15 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
struct btrfs_root *root = inode->root;
/* Output minus objectid, which is more meaningful */
- if (root->objectid >= BTRFS_LAST_FREE_OBJECTID)
+ if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
btrfs_warn_rl(root->fs_info,
"csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d",
- root->objectid, btrfs_ino(inode),
+ root->root_key.objectid, btrfs_ino(inode),
logical_start, csum, csum_expected, mirror_num);
else
btrfs_warn_rl(root->fs_info,
"csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d",
- root->objectid, btrfs_ino(inode),
+ root->root_key.objectid, btrfs_ino(inode),
logical_start, csum, csum_expected, mirror_num);
}
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 833cf3c35b4d..2e43fba44035 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1594,6 +1594,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
{
unsigned int num_pages;
unsigned int i;
+ size_t size;
u64 dev_bytenr;
int ret;
@@ -1608,9 +1609,8 @@ static int btrfsic_read_block(struct btrfsic_state *state,
num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
PAGE_SHIFT;
- block_ctx->mem_to_free = kcalloc(sizeof(*block_ctx->datav) +
- sizeof(*block_ctx->pagev),
- num_pages, GFP_NOFS);
+ size = sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev);
+ block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS);
if (!block_ctx->mem_to_free)
return -ENOMEM;
block_ctx->datav = block_ctx->mem_to_free;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9bfa66592aa7..2955a4ea2fa8 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -437,10 +437,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (pg_index > end_index)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&mapping->i_pages, pg_index);
- rcu_read_unlock();
- if (page && !radix_tree_exceptional_entry(page)) {
+ page = xa_load(&mapping->i_pages, pg_index);
+ if (page && !xa_is_value(page)) {
misses++;
if (misses > 4)
break;
@@ -528,7 +526,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_io_tree *tree;
struct extent_map_tree *em_tree;
struct compressed_bio *cb;
unsigned long compressed_len;
@@ -545,7 +542,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int faili = 0;
u32 *sums;
- tree = &BTRFS_I(inode)->io_tree;
em_tree = &BTRFS_I(inode)->extent_tree;
/* we need the actual starting offset of this extent in the file */
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d436fb4c002e..539901fb5165 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -52,42 +52,6 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
}
}
-/*
- * reset all the locked nodes in the patch to spinning locks.
- *
- * held is used to keep lockdep happy, when lockdep is enabled
- * we set held to a blocking lock before we go around and
- * retake all the spinlocks in the path. You can safely use NULL
- * for held
- */
-noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
- struct extent_buffer *held, int held_rw)
-{
- int i;
-
- if (held) {
- btrfs_set_lock_blocking_rw(held, held_rw);
- if (held_rw == BTRFS_WRITE_LOCK)
- held_rw = BTRFS_WRITE_LOCK_BLOCKING;
- else if (held_rw == BTRFS_READ_LOCK)
- held_rw = BTRFS_READ_LOCK_BLOCKING;
- }
- btrfs_set_path_blocking(p);
-
- for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
- if (p->nodes[i] && p->locks[i]) {
- btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
- if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
- p->locks[i] = BTRFS_WRITE_LOCK;
- else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
- p->locks[i] = BTRFS_READ_LOCK;
- }
- }
-
- if (held)
- btrfs_clear_lock_blocking_rw(held, held_rw);
-}
-
/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
@@ -207,7 +171,7 @@ static void add_root_to_dirty_list(struct btrfs_root *root)
spin_lock(&fs_info->trans_lock);
if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
/* Want the extent tree to be the last on the list */
- if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
list_move_tail(&root->dirty_list,
&fs_info->dirty_cowonly_roots);
else
@@ -1050,9 +1014,26 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
parent_start = parent->start;
+ /*
+ * If we are COWing a node/leaf from the extent, chunk or device trees,
+ * make sure that we do not finish block group creation of pending block
+ * groups. We do this to avoid a deadlock.
+ * COWing can result in allocation of a new chunk, and flushing pending
+ * block groups (btrfs_create_pending_block_groups()) can be triggered
+ * when finishing allocation of a new chunk. Creation of a pending block
+ * group modifies the extent, chunk and device trees, therefore we could
+ * deadlock with ourselves since we are holding a lock on an extent
+ * buffer that btrfs_create_pending_block_groups() may try to COW later.
+ */
+ if (root == fs_info->extent_root ||
+ root == fs_info->chunk_root ||
+ root == fs_info->dev_root)
+ trans->can_flush_pending_bgs = false;
+
cow = btrfs_alloc_tree_block(trans, root, parent_start,
root->root_key.objectid, &disk_key, level,
search_start, empty_size);
+ trans->can_flush_pending_bgs = true;
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -1306,7 +1287,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
}
}
- btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
@@ -1815,8 +1795,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
int orig_slot = path->slots[level];
u64 orig_ptr;
- if (level == 0)
- return 0;
+ ASSERT(level > 0);
mid = path->nodes[level];
@@ -2483,7 +2462,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
btrfs_set_path_blocking(p);
reada_for_balance(fs_info, p, level);
sret = split_node(trans, root, p, level);
- btrfs_clear_path_blocking(p, NULL, 0);
BUG_ON(sret > 0);
if (sret) {
@@ -2504,7 +2482,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
btrfs_set_path_blocking(p);
reada_for_balance(fs_info, p, level);
sret = balance_level(trans, root, p, level);
- btrfs_clear_path_blocking(p, NULL, 0);
if (sret) {
ret = sret;
@@ -2789,7 +2766,10 @@ again:
}
cow_done:
p->nodes[level] = b;
- btrfs_clear_path_blocking(p, NULL, 0);
+ /*
+ * Leave path with blocking locks to avoid massive
+ * lock context switch, this is made on purpose.
+ */
/*
* we have a lock on b and as long as we aren't changing
@@ -2871,8 +2851,6 @@ cow_done:
if (!err) {
btrfs_set_path_blocking(p);
btrfs_tree_lock(b);
- btrfs_clear_path_blocking(p, b,
- BTRFS_WRITE_LOCK);
}
p->locks[level] = BTRFS_WRITE_LOCK;
} else {
@@ -2880,8 +2858,6 @@ cow_done:
if (!err) {
btrfs_set_path_blocking(p);
btrfs_tree_read_lock(b);
- btrfs_clear_path_blocking(p, b,
- BTRFS_READ_LOCK);
}
p->locks[level] = BTRFS_READ_LOCK;
}
@@ -2900,7 +2876,6 @@ cow_done:
btrfs_set_path_blocking(p);
err = split_leaf(trans, root, key,
p, ins_len, ret == 0);
- btrfs_clear_path_blocking(p, NULL, 0);
BUG_ON(err > 0);
if (err) {
@@ -2910,7 +2885,7 @@ cow_done:
}
if (!p->search_for_split)
unlock_up(p, level, lowest_unlock,
- min_write_lock_level, &write_lock_level);
+ min_write_lock_level, NULL);
goto done;
}
}
@@ -2961,13 +2936,16 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
again:
b = get_old_root(root, time_seq);
+ if (!b) {
+ ret = -EIO;
+ goto done;
+ }
level = btrfs_header_level(b);
p->locks[level] = BTRFS_READ_LOCK;
while (b) {
level = btrfs_header_level(b);
p->nodes[level] = b;
- btrfs_clear_path_blocking(p, NULL, 0);
/*
* we have a lock on b and as long as we aren't changing
@@ -3013,8 +2991,6 @@ again:
if (!err) {
btrfs_set_path_blocking(p);
btrfs_tree_read_lock(b);
- btrfs_clear_path_blocking(p, b,
- BTRFS_READ_LOCK);
}
b = tree_mod_log_rewind(fs_info, p, b, time_seq);
if (!b) {
@@ -5198,7 +5174,6 @@ find_next_key:
path->locks[level - 1] = BTRFS_READ_LOCK;
path->nodes[level - 1] = cur;
unlock_up(path, level, 1, 0, NULL);
- btrfs_clear_path_blocking(path, NULL, 0);
}
out:
path->keep_locks = keep_locks;
@@ -5783,8 +5758,6 @@ again:
if (!ret) {
btrfs_set_path_blocking(path);
btrfs_tree_read_lock(next);
- btrfs_clear_path_blocking(path, next,
- BTRFS_READ_LOCK);
}
next_rw_lock = BTRFS_READ_LOCK;
}
@@ -5820,8 +5793,6 @@ again:
if (!ret) {
btrfs_set_path_blocking(path);
btrfs_tree_read_lock(next);
- btrfs_clear_path_blocking(path, next,
- BTRFS_READ_LOCK);
}
next_rw_lock = BTRFS_READ_LOCK;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 53af9f5253f4..68ca41dbbef3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -41,12 +41,6 @@ extern struct kmem_cache *btrfs_path_cachep;
extern struct kmem_cache *btrfs_free_space_cachep;
struct btrfs_ordered_sum;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-#define STATIC noinline
-#else
-#define STATIC static noinline
-#endif
-
#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
#define BTRFS_MAX_MIRRORS 3
@@ -367,11 +361,13 @@ struct btrfs_dev_replace {
struct mutex lock_finishing_cancel_unmount;
rwlock_t lock;
- atomic_t read_locks;
atomic_t blocking_readers;
wait_queue_head_t read_lock_wq;
struct btrfs_scrub_progress scrub_progress;
+
+ struct percpu_counter bio_counter;
+ wait_queue_head_t replace_wait;
};
/* For raid type sysfs entries */
@@ -1094,9 +1090,6 @@ struct btrfs_fs_info {
/* device replace state */
struct btrfs_dev_replace dev_replace;
- struct percpu_counter bio_counter;
- wait_queue_head_t replace_wait;
-
struct semaphore uuid_tree_rescan_sem;
/* Used to reclaim the metadata space in the background. */
@@ -1202,18 +1195,12 @@ struct btrfs_root {
int last_log_commit;
pid_t log_start_pid;
- u64 objectid;
u64 last_trans;
u32 type;
u64 highest_objectid;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
- u64 alloc_bytenr;
-#endif
-
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
@@ -1280,11 +1267,16 @@ struct btrfs_root {
int send_in_progress;
struct btrfs_subvolume_writers *subv_writers;
atomic_t will_be_snapshotted;
+ atomic_t snapshot_force_cow;
/* For qgroup metadata reserved space */
spinlock_t qgroup_meta_rsv_lock;
u64 qgroup_meta_rsv_pertrans;
u64 qgroup_meta_rsv_prealloc;
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ u64 alloc_bytenr;
+#endif
};
struct btrfs_file_private {
@@ -2606,10 +2598,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info,
return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
}
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start);
void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
@@ -2770,7 +2760,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
- int update_size);
+ bool update_size);
int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *dest, u64 num_bytes,
int min_factor);
@@ -2876,8 +2866,6 @@ void btrfs_release_path(struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
void btrfs_set_path_blocking(struct btrfs_path *p);
-void btrfs_clear_path_blocking(struct btrfs_path *p,
- struct extent_buffer *held, int held_rw);
void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -3020,8 +3008,7 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
/* dir-item.c */
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
const char *name, int name_len);
-int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, const char *name,
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
int name_len, struct btrfs_inode *dir,
struct btrfs_key *location, u8 type, u64 index);
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
@@ -3179,8 +3166,8 @@ void __cold btrfs_destroy_cachep(void);
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
struct btrfs_root *root, int *was_new);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 end, int create);
+ struct page *page, size_t pg_offset,
+ u64 start, u64 end, int create);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
@@ -3200,9 +3187,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
extern const struct dentry_operations btrfs_dentry_operations;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode);
-#endif
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -3390,9 +3374,9 @@ do { \
#define btrfs_debug(fs_info, fmt, args...) \
btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
- btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
+ btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
+ btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl(fs_info, fmt, args...) \
btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
#endif
@@ -3404,6 +3388,13 @@ do { \
rcu_read_unlock(); \
} while (0)
+#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ btrfs_no_printk(fs_info, fmt, ##args); \
+ rcu_read_unlock(); \
+} while (0)
+
#define btrfs_printk_ratelimited(fs_info, fmt, args...) \
do { \
static DEFINE_RATELIMIT_STATE(_rs, \
@@ -3708,18 +3699,19 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_inode_set_ops(struct inode *inode);
void btrfs_test_destroy_inode(struct inode *inode);
-#endif
static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO,
- &fs_info->fs_state)))
- return 1;
-#endif
+ return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+}
+#else
+static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
+{
return 0;
}
+#endif
static inline void cond_wake_up(struct wait_queue_head *wq)
{
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index f51b509f2d9b..c669f250d4a0 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -42,8 +42,8 @@ static inline void btrfs_init_delayed_node(
delayed_node->root = root;
delayed_node->inode_id = inode_id;
refcount_set(&delayed_node->refs, 0);
- delayed_node->ins_root = RB_ROOT;
- delayed_node->del_root = RB_ROOT;
+ delayed_node->ins_root = RB_ROOT_CACHED;
+ delayed_node->del_root = RB_ROOT_CACHED;
mutex_init(&delayed_node->mutex);
INIT_LIST_HEAD(&delayed_node->n_list);
INIT_LIST_HEAD(&delayed_node->p_list);
@@ -390,7 +390,7 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
struct btrfs_delayed_node *delayed_node,
struct btrfs_key *key)
{
- return __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
+ return __btrfs_lookup_delayed_item(&delayed_node->ins_root.rb_root, key,
NULL, NULL);
}
@@ -400,9 +400,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
{
struct rb_node **p, *node;
struct rb_node *parent_node = NULL;
- struct rb_root *root;
+ struct rb_root_cached *root;
struct btrfs_delayed_item *item;
int cmp;
+ bool leftmost = true;
if (action == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_node->ins_root;
@@ -410,7 +411,7 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
root = &delayed_node->del_root;
else
BUG();
- p = &root->rb_node;
+ p = &root->rb_root.rb_node;
node = &ins->rb_node;
while (*p) {
@@ -419,16 +420,18 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
rb_node);
cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
- if (cmp < 0)
+ if (cmp < 0) {
p = &(*p)->rb_right;
- else if (cmp > 0)
+ leftmost = false;
+ } else if (cmp > 0) {
p = &(*p)->rb_left;
- else
+ } else {
return -EEXIST;
+ }
}
rb_link_node(node, parent_node, p);
- rb_insert_color(node, root);
+ rb_insert_color_cached(node, root, leftmost);
ins->delayed_node = delayed_node;
ins->ins_or_del = action;
@@ -468,7 +471,7 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root)
static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
{
- struct rb_root *root;
+ struct rb_root_cached *root;
struct btrfs_delayed_root *delayed_root;
delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
@@ -482,7 +485,7 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
else
root = &delayed_item->delayed_node->del_root;
- rb_erase(&delayed_item->rb_node, root);
+ rb_erase_cached(&delayed_item->rb_node, root);
delayed_item->delayed_node->count--;
finish_one_item(delayed_root);
@@ -503,7 +506,7 @@ static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
struct rb_node *p;
struct btrfs_delayed_item *item = NULL;
- p = rb_first(&delayed_node->ins_root);
+ p = rb_first_cached(&delayed_node->ins_root);
if (p)
item = rb_entry(p, struct btrfs_delayed_item, rb_node);
@@ -516,7 +519,7 @@ static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
struct rb_node *p;
struct btrfs_delayed_item *item = NULL;
- p = rb_first(&delayed_node->del_root);
+ p = rb_first_cached(&delayed_node->del_root);
if (p)
item = rb_entry(p, struct btrfs_delayed_item, rb_node);
@@ -559,7 +562,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
* reserved space when starting a transaction. So no need to reserve
* qgroup space here.
*/
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
if (!ret) {
trace_btrfs_space_reservation(fs_info, "delayed_item",
item->key.objectid,
@@ -647,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata(
return ret;
}
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
if (!ret) {
trace_btrfs_space_reservation(fs_info, "delayed_inode",
btrfs_ino(inode), num_bytes, 1);
@@ -762,9 +765,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
i++;
}
- /* reset all the locked nodes in the patch to spinning locks. */
- btrfs_clear_path_blocking(path, NULL, 0);
-
/* insert the keys of the items */
setup_items_for_insert(root, path, keys, data_size,
total_data_size, total_size, nitems);
@@ -1462,7 +1462,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
- name_len, name, delayed_node->root->objectid,
+ name_len, name, delayed_node->root->root_key.objectid,
delayed_node->inode_id, ret);
BUG();
}
@@ -1533,7 +1533,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
- index, node->root->objectid, node->inode_id, ret);
+ index, node->root->root_key.objectid,
+ node->inode_id, ret);
BUG();
}
mutex_unlock(&node->mutex);
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 33536cd681d4..74ae226ffaf0 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -50,8 +50,8 @@ struct btrfs_delayed_node {
* is waiting to be dealt with by the async worker.
*/
struct list_head p_list;
- struct rb_root ins_root;
- struct rb_root del_root;
+ struct rb_root_cached ins_root;
+ struct rb_root_cached del_root;
struct mutex mutex;
struct btrfs_inode_item inode_item;
refcount_t refs;
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 62ff545ba1f7..9301b3ad9217 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -101,14 +101,15 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1,
}
/* insert a new ref to head ref rbtree */
-static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
+static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
struct rb_node *node)
{
- struct rb_node **p = &root->rb_node;
+ struct rb_node **p = &root->rb_root.rb_node;
struct rb_node *parent_node = NULL;
struct btrfs_delayed_ref_head *entry;
struct btrfs_delayed_ref_head *ins;
u64 bytenr;
+ bool leftmost = true;
ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
bytenr = ins->bytenr;
@@ -117,26 +118,29 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
href_node);
- if (bytenr < entry->bytenr)
+ if (bytenr < entry->bytenr) {
p = &(*p)->rb_left;
- else if (bytenr > entry->bytenr)
+ } else if (bytenr > entry->bytenr) {
p = &(*p)->rb_right;
- else
+ leftmost = false;
+ } else {
return entry;
+ }
}
rb_link_node(node, parent_node, p);
- rb_insert_color(node, root);
+ rb_insert_color_cached(node, root, leftmost);
return NULL;
}
-static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root,
+static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
struct btrfs_delayed_ref_node *ins)
{
- struct rb_node **p = &root->rb_node;
+ struct rb_node **p = &root->rb_root.rb_node;
struct rb_node *node = &ins->ref_node;
struct rb_node *parent_node = NULL;
struct btrfs_delayed_ref_node *entry;
+ bool leftmost = true;
while (*p) {
int comp;
@@ -145,29 +149,46 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root,
entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
ref_node);
comp = comp_refs(ins, entry, true);
- if (comp < 0)
+ if (comp < 0) {
p = &(*p)->rb_left;
- else if (comp > 0)
+ } else if (comp > 0) {
p = &(*p)->rb_right;
- else
+ leftmost = false;
+ } else {
return entry;
+ }
}
rb_link_node(node, parent_node, p);
- rb_insert_color(node, root);
+ rb_insert_color_cached(node, root, leftmost);
return NULL;
}
+static struct btrfs_delayed_ref_head *find_first_ref_head(
+ struct btrfs_delayed_ref_root *dr)
+{
+ struct rb_node *n;
+ struct btrfs_delayed_ref_head *entry;
+
+ n = rb_first_cached(&dr->href_root);
+ if (!n)
+ return NULL;
+
+ entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
+
+ return entry;
+}
+
/*
- * find an head entry based on bytenr. This returns the delayed ref
- * head if it was able to find one, or NULL if nothing was in that spot.
- * If return_bigger is given, the next bigger entry is returned if no exact
- * match is found.
+ * Find a head entry based on bytenr. This returns the delayed ref head if it
+ * was able to find one, or NULL if nothing was in that spot. If return_bigger
+ * is given, the next bigger entry is returned if no exact match is found.
*/
-static struct btrfs_delayed_ref_head *
-find_ref_head(struct rb_root *root, u64 bytenr,
- int return_bigger)
+static struct btrfs_delayed_ref_head *find_ref_head(
+ struct btrfs_delayed_ref_root *dr, u64 bytenr,
+ bool return_bigger)
{
+ struct rb_root *root = &dr->href_root.rb_root;
struct rb_node *n;
struct btrfs_delayed_ref_head *entry;
@@ -187,22 +208,18 @@ find_ref_head(struct rb_root *root, u64 bytenr,
if (bytenr > entry->bytenr) {
n = rb_next(&entry->href_node);
if (!n)
- n = rb_first(root);
+ return NULL;
entry = rb_entry(n, struct btrfs_delayed_ref_head,
href_node);
- return entry;
}
return entry;
}
return NULL;
}
-int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
- struct btrfs_delayed_ref_root *delayed_refs;
-
- delayed_refs = &trans->transaction->delayed_refs;
lockdep_assert_held(&delayed_refs->lock);
if (mutex_trylock(&head->mutex))
return 0;
@@ -227,7 +244,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref)
{
lockdep_assert_held(&head->lock);
- rb_erase(&ref->ref_node, &head->ref_tree);
+ rb_erase_cached(&ref->ref_node, &head->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
@@ -296,7 +313,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
lockdep_assert_held(&head->lock);
- if (RB_EMPTY_ROOT(&head->ref_tree))
+ if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
return;
/* We don't have too many refs to merge for data. */
@@ -314,7 +331,8 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
spin_unlock(&fs_info->tree_mod_seq_lock);
again:
- for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
+ for (node = rb_first_cached(&head->ref_tree); node;
+ node = rb_next(node)) {
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
if (seq && ref->seq >= seq)
continue;
@@ -345,40 +363,29 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
return ret;
}
-struct btrfs_delayed_ref_head *
-btrfs_select_ref_head(struct btrfs_trans_handle *trans)
+struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+ struct btrfs_delayed_ref_root *delayed_refs)
{
- struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *head;
- u64 start;
- bool loop = false;
-
- delayed_refs = &trans->transaction->delayed_refs;
again:
- start = delayed_refs->run_delayed_start;
- head = find_ref_head(&delayed_refs->href_root, start, 1);
- if (!head && !loop) {
+ head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
+ true);
+ if (!head && delayed_refs->run_delayed_start != 0) {
delayed_refs->run_delayed_start = 0;
- start = 0;
- loop = true;
- head = find_ref_head(&delayed_refs->href_root, start, 1);
- if (!head)
- return NULL;
- } else if (!head && loop) {
- return NULL;
+ head = find_first_ref_head(delayed_refs);
}
+ if (!head)
+ return NULL;
while (head->processing) {
struct rb_node *node;
node = rb_next(&head->href_node);
if (!node) {
- if (loop)
+ if (delayed_refs->run_delayed_start == 0)
return NULL;
delayed_refs->run_delayed_start = 0;
- start = 0;
- loop = true;
goto again;
}
head = rb_entry(node, struct btrfs_delayed_ref_head,
@@ -569,7 +576,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
head_ref->must_insert_reserved = must_insert_reserved;
head_ref->is_data = is_data;
head_ref->is_system = is_system;
- head_ref->ref_tree = RB_ROOT;
+ head_ref->ref_tree = RB_ROOT_CACHED;
INIT_LIST_HEAD(&head_ref->ref_add_list);
RB_CLEAR_NODE(&head_ref->href_node);
head_ref->processing = 0;
@@ -903,7 +910,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
{
- return find_ref_head(&delayed_refs->href_root, bytenr, 0);
+ return find_ref_head(delayed_refs, bytenr, false);
}
void __cold btrfs_delayed_ref_exit(void)
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index d9f2a4ebd5db..8e20c5cb5404 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -79,7 +79,7 @@ struct btrfs_delayed_ref_head {
struct mutex mutex;
spinlock_t lock;
- struct rb_root ref_tree;
+ struct rb_root_cached ref_tree;
/* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */
struct list_head ref_add_list;
@@ -148,7 +148,7 @@ struct btrfs_delayed_data_ref {
struct btrfs_delayed_ref_root {
/* head ref rbtree */
- struct rb_root href_root;
+ struct rb_root_cached href_root;
/* dirty extent records */
struct rb_root dirty_extent_root;
@@ -255,7 +255,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
u64 bytenr);
-int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
{
@@ -263,8 +263,8 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
}
-struct btrfs_delayed_ref_head *
-btrfs_select_ref_head(struct btrfs_trans_handle *trans);
+struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+ struct btrfs_delayed_ref_root *delayed_refs);
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index dec01970d8c5..2aa48aecc52b 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -382,14 +382,6 @@ out:
return ret;
}
-void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
-
- dev_replace->committed_cursor_left =
- dev_replace->cursor_left_last_write_of_item;
-}
-
static char* btrfs_dev_name(struct btrfs_device *device)
{
if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
@@ -408,11 +400,12 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
int ret;
struct btrfs_device *tgt_device = NULL;
struct btrfs_device *src_device = NULL;
+ bool need_unlock;
- ret = btrfs_find_device_by_devspec(fs_info, srcdevid,
- srcdev_name, &src_device);
- if (ret)
- return ret;
+ src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
+ srcdev_name);
+ if (IS_ERR(src_device))
+ return PTR_ERR(src_device);
ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
src_device, &tgt_device);
@@ -432,6 +425,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return PTR_ERR(trans);
}
+ need_unlock = true;
btrfs_dev_replace_write_lock(dev_replace);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
@@ -440,6 +434,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ ASSERT(0);
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
goto leave;
}
@@ -470,6 +465,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
atomic64_set(&dev_replace->num_write_errors, 0);
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
btrfs_dev_replace_write_unlock(dev_replace);
+ need_unlock = false;
ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
if (ret)
@@ -481,7 +477,12 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ need_unlock = true;
btrfs_dev_replace_write_lock(dev_replace);
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
goto leave;
}
@@ -503,9 +504,8 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return ret;
leave:
- dev_replace->srcdev = NULL;
- dev_replace->tgtdev = NULL;
- btrfs_dev_replace_write_unlock(dev_replace);
+ if (need_unlock)
+ btrfs_dev_replace_write_unlock(dev_replace);
btrfs_destroy_dev_replace_tgtdev(tgt_device);
return ret;
}
@@ -545,8 +545,8 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
{
set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
- wait_event(fs_info->replace_wait, !percpu_counter_sum(
- &fs_info->bio_counter));
+ wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
+ &fs_info->dev_replace.bio_counter));
}
/*
@@ -555,7 +555,7 @@ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
{
clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
- wake_up(&fs_info->replace_wait);
+ wake_up(&fs_info->dev_replace.replace_wait);
}
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
@@ -961,13 +961,10 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace)
{
read_lock(&dev_replace->lock);
- atomic_inc(&dev_replace->read_locks);
}
void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace)
{
- ASSERT(atomic_read(&dev_replace->read_locks) > 0);
- atomic_dec(&dev_replace->read_locks);
read_unlock(&dev_replace->lock);
}
@@ -985,7 +982,6 @@ again:
void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace)
{
- ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
write_unlock(&dev_replace->lock);
}
@@ -994,45 +990,31 @@ void btrfs_dev_replace_set_lock_blocking(
struct btrfs_dev_replace *dev_replace)
{
/* only set blocking for read lock */
- ASSERT(atomic_read(&dev_replace->read_locks) > 0);
atomic_inc(&dev_replace->blocking_readers);
read_unlock(&dev_replace->lock);
}
-/* acquire read lock and dec blocking cnt */
-void btrfs_dev_replace_clear_lock_blocking(
- struct btrfs_dev_replace *dev_replace)
-{
- /* only set blocking for read lock */
- ASSERT(atomic_read(&dev_replace->read_locks) > 0);
- ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
- read_lock(&dev_replace->lock);
- /* Barrier implied by atomic_dec_and_test */
- if (atomic_dec_and_test(&dev_replace->blocking_readers))
- cond_wake_up_nomb(&dev_replace->read_lock_wq);
-}
-
void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
{
- percpu_counter_inc(&fs_info->bio_counter);
+ percpu_counter_inc(&fs_info->dev_replace.bio_counter);
}
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
- percpu_counter_sub(&fs_info->bio_counter, amount);
- cond_wake_up_nomb(&fs_info->replace_wait);
+ percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
+ cond_wake_up_nomb(&fs_info->dev_replace.replace_wait);
}
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
{
while (1) {
- percpu_counter_inc(&fs_info->bio_counter);
+ percpu_counter_inc(&fs_info->dev_replace.bio_counter);
if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
&fs_info->fs_state)))
break;
btrfs_bio_counter_dec(fs_info);
- wait_event(fs_info->replace_wait,
+ wait_event(fs_info->dev_replace.replace_wait,
!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
&fs_info->fs_state));
}
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index b6d4206188bb..795c551f5b5e 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -11,7 +11,6 @@ struct btrfs_ioctl_dev_replace_args;
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
-void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args);
int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
@@ -28,12 +27,5 @@ void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_clear_lock_blocking(
- struct btrfs_dev_replace *dev_replace);
-
-static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
-{
- atomic64_inc(stat_value);
-}
#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index a678b07fcf01..8de74d835dba 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -105,13 +105,13 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
* to use for the second index (if one is created).
* Will return 0 or -ENOMEM
*/
-int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, const char *name, int name_len,
- struct btrfs_inode *dir, struct btrfs_key *location,
- u8 type, u64 index)
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
+ int name_len, struct btrfs_inode *dir,
+ struct btrfs_key *location, u8 type, u64 index)
{
int ret = 0;
int ret2 = 0;
+ struct btrfs_root *root = dir->root;
struct btrfs_path *path;
struct btrfs_dir_item *dir_item;
struct extent_buffer *leaf;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5124c15705ce..b0ab41da91d1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -125,8 +125,8 @@ struct async_submit_bio {
* Different roots are used for different purposes and may nest inside each
* other and they require separate keysets. As lockdep keys should be
* static, assign keysets according to the purpose of the root as indicated
- * by btrfs_root->objectid. This ensures that all special purpose roots
- * have separate keysets.
+ * by btrfs_root->root_key.objectid. This ensures that all special purpose
+ * roots have separate keysets.
*
* Lock-nesting across peer nodes is always done with the immediate parent
* node locked thus preventing deadlock. As lockdep doesn't know this, use
@@ -1148,7 +1148,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->state = 0;
root->orphan_cleanup_state = 0;
- root->objectid = objectid;
root->last_trans = 0;
root->highest_objectid = 0;
root->nr_delalloc_inodes = 0;
@@ -1187,6 +1186,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
atomic_set(&root->log_batch, 0);
refcount_set(&root->refs, 1);
atomic_set(&root->will_be_snapshotted, 0);
+ atomic_set(&root->snapshot_force_cow, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
@@ -2155,9 +2155,8 @@ static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
{
mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
rwlock_init(&fs_info->dev_replace.lock);
- atomic_set(&fs_info->dev_replace.read_locks, 0);
atomic_set(&fs_info->dev_replace.blocking_readers, 0);
- init_waitqueue_head(&fs_info->replace_wait);
+ init_waitqueue_head(&fs_info->dev_replace.replace_wait);
init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);
}
@@ -2647,7 +2646,8 @@ int open_ctree(struct super_block *sb,
goto fail_dirty_metadata_bytes;
}
- ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
+ ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
+ GFP_KERNEL);
if (ret) {
err = ret;
goto fail_delalloc_bytes;
@@ -3308,7 +3308,7 @@ fail_iput:
iput(fs_info->btree_inode);
fail_bio_counter:
- percpu_counter_destroy(&fs_info->bio_counter);
+ percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
fail_delalloc_bytes:
percpu_counter_destroy(&fs_info->delalloc_bytes);
fail_dirty_metadata_bytes:
@@ -3976,6 +3976,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
kthread_stop(fs_info->transaction_kthread);
kthread_stop(fs_info->cleaner_kthread);
+ ASSERT(list_empty(&fs_info->delayed_iputs));
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
btrfs_free_qgroup_config(fs_info);
@@ -4017,7 +4018,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
- percpu_counter_destroy(&fs_info->bio_counter);
+ percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
cleanup_srcu_struct(&fs_info->subvol_srcu);
btrfs_free_stripe_hash_table(fs_info);
@@ -4203,7 +4204,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
return ret;
}
- while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
+ while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
struct btrfs_delayed_ref_head *head;
struct rb_node *n;
bool pin_bytes = false;
@@ -4221,11 +4222,11 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
continue;
}
spin_lock(&head->lock);
- while ((n = rb_first(&head->ref_tree)) != NULL) {
+ while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
ref = rb_entry(n, struct btrfs_delayed_ref_node,
ref_node);
ref->in_tree = 0;
- rb_erase(&ref->ref_node, &head->ref_tree);
+ rb_erase_cached(&ref->ref_node, &head->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
@@ -4239,7 +4240,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
if (head->processing == 0)
delayed_refs->num_heads_ready--;
atomic_dec(&delayed_refs->num_entries);
- rb_erase(&head->href_node, &delayed_refs->href_root);
+ rb_erase_cached(&head->href_node, &delayed_refs->href_root);
RB_CLEAR_NODE(&head->href_node);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1f3755b3a37a..ddf28ecf17f9 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -33,7 +33,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
type = FILEID_BTRFS_WITHOUT_PARENT;
fid->objectid = btrfs_ino(BTRFS_I(inode));
- fid->root_objectid = BTRFS_I(inode)->root->objectid;
+ fid->root_objectid = BTRFS_I(inode)->root->root_key.objectid;
fid->gen = inode->i_generation;
if (parent) {
@@ -41,7 +41,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
fid->parent_objectid = BTRFS_I(parent)->location.objectid;
fid->parent_gen = parent->i_generation;
- parent_root_id = BTRFS_I(parent)->root->objectid;
+ parent_root_id = BTRFS_I(parent)->root->root_key.objectid;
if (parent_root_id != fid->root_objectid) {
fid->parent_root_objectid = parent_root_id;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index de6f75f5547b..a1febf155747 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2366,6 +2366,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
insert_reserved);
else
BUG();
+ if (ret && insert_reserved)
+ btrfs_pin_extent(trans->fs_info, node->bytenr,
+ node->num_bytes, 1);
return ret;
}
@@ -2374,7 +2377,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
struct btrfs_delayed_ref_node *ref;
- if (RB_EMPTY_ROOT(&head->ref_tree))
+ if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
return NULL;
/*
@@ -2387,7 +2390,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
return list_first_entry(&head->ref_add_list,
struct btrfs_delayed_ref_node, add_list);
- ref = rb_entry(rb_first(&head->ref_tree),
+ ref = rb_entry(rb_first_cached(&head->ref_tree),
struct btrfs_delayed_ref_node, ref_node);
ASSERT(list_empty(&ref->add_list));
return ref;
@@ -2448,13 +2451,13 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
spin_unlock(&head->lock);
spin_lock(&delayed_refs->lock);
spin_lock(&head->lock);
- if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
+ if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
return 1;
}
delayed_refs->num_heads--;
- rb_erase(&head->href_node, &delayed_refs->href_root);
+ rb_erase_cached(&head->href_node, &delayed_refs->href_root);
RB_CLEAR_NODE(&head->href_node);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
@@ -2502,102 +2505,66 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
return 0;
}
-/*
- * Returns 0 on success or if called with an already aborted transaction.
- * Returns -ENOMEM or -EIO on failure and will abort the transaction.
- */
-static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- unsigned long nr)
+static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
+ struct btrfs_trans_handle *trans)
+{
+ struct btrfs_delayed_ref_root *delayed_refs =
+ &trans->transaction->delayed_refs;
+ struct btrfs_delayed_ref_head *head = NULL;
+ int ret;
+
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_select_ref_head(delayed_refs);
+ if (!head) {
+ spin_unlock(&delayed_refs->lock);
+ return head;
+ }
+
+ /*
+ * Grab the lock that says we are going to process all the refs for
+ * this head
+ */
+ ret = btrfs_delayed_ref_lock(delayed_refs, head);
+ spin_unlock(&delayed_refs->lock);
+
+ /*
+ * We may have dropped the spin lock to get the head mutex lock, and
+ * that might have given someone else time to free the head. If that's
+ * true, it has been removed from our list and we can move on.
+ */
+ if (ret == -EAGAIN)
+ head = ERR_PTR(-EAGAIN);
+
+ return head;
+}
+
+static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *locked_ref,
+ unsigned long *run_refs)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_delayed_ref_node *ref;
- struct btrfs_delayed_ref_head *locked_ref = NULL;
struct btrfs_delayed_extent_op *extent_op;
- ktime_t start = ktime_get();
- int ret;
- unsigned long count = 0;
- unsigned long actual_count = 0;
+ struct btrfs_delayed_ref_node *ref;
int must_insert_reserved = 0;
+ int ret;
delayed_refs = &trans->transaction->delayed_refs;
- while (1) {
- if (!locked_ref) {
- if (count >= nr)
- break;
- spin_lock(&delayed_refs->lock);
- locked_ref = btrfs_select_ref_head(trans);
- if (!locked_ref) {
- spin_unlock(&delayed_refs->lock);
- break;
- }
+ lockdep_assert_held(&locked_ref->mutex);
+ lockdep_assert_held(&locked_ref->lock);
- /* grab the lock that says we are going to process
- * all the refs for this head */
- ret = btrfs_delayed_ref_lock(trans, locked_ref);
- spin_unlock(&delayed_refs->lock);
- /*
- * we may have dropped the spin lock to get the head
- * mutex lock, and that might have given someone else
- * time to free the head. If that's true, it has been
- * removed from our list and we can move on.
- */
- if (ret == -EAGAIN) {
- locked_ref = NULL;
- count++;
- continue;
- }
- }
-
- /*
- * We need to try and merge add/drops of the same ref since we
- * can run into issues with relocate dropping the implicit ref
- * and then it being added back again before the drop can
- * finish. If we merged anything we need to re-loop so we can
- * get a good ref.
- * Or we can get node references of the same type that weren't
- * merged when created due to bumps in the tree mod seq, and
- * we need to merge them to prevent adding an inline extent
- * backref before dropping it (triggering a BUG_ON at
- * insert_inline_extent_backref()).
- */
- spin_lock(&locked_ref->lock);
- btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
-
- ref = select_delayed_ref(locked_ref);
-
- if (ref && ref->seq &&
+ while ((ref = select_delayed_ref(locked_ref))) {
+ if (ref->seq &&
btrfs_check_delayed_seq(fs_info, ref->seq)) {
spin_unlock(&locked_ref->lock);
unselect_delayed_ref_head(delayed_refs, locked_ref);
- locked_ref = NULL;
- cond_resched();
- count++;
- continue;
+ return -EAGAIN;
}
- /*
- * We're done processing refs in this ref_head, clean everything
- * up and move on to the next ref_head.
- */
- if (!ref) {
- ret = cleanup_ref_head(trans, locked_ref);
- if (ret > 0 ) {
- /* We dropped our lock, we need to loop. */
- ret = 0;
- continue;
- } else if (ret) {
- return ret;
- }
- locked_ref = NULL;
- count++;
- continue;
- }
-
- actual_count++;
+ (*run_refs)++;
ref->in_tree = 0;
- rb_erase(&ref->ref_node, &locked_ref->ref_tree);
+ rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
@@ -2619,8 +2586,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
atomic_dec(&delayed_refs->num_entries);
/*
- * Record the must-insert_reserved flag before we drop the spin
- * lock.
+ * Record the must_insert_reserved flag before we drop the
+ * spin lock.
*/
must_insert_reserved = locked_ref->must_insert_reserved;
locked_ref->must_insert_reserved = 0;
@@ -2642,10 +2609,90 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
}
btrfs_put_delayed_ref(ref);
- count++;
cond_resched();
+
+ spin_lock(&locked_ref->lock);
+ btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
}
+ return 0;
+}
+
+/*
+ * Returns 0 on success or if called with an already aborted transaction.
+ * Returns -ENOMEM or -EIO on failure and will abort the transaction.
+ */
+static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ unsigned long nr)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_head *locked_ref = NULL;
+ ktime_t start = ktime_get();
+ int ret;
+ unsigned long count = 0;
+ unsigned long actual_count = 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ do {
+ if (!locked_ref) {
+ locked_ref = btrfs_obtain_ref_head(trans);
+ if (IS_ERR_OR_NULL(locked_ref)) {
+ if (PTR_ERR(locked_ref) == -EAGAIN) {
+ continue;
+ } else {
+ break;
+ }
+ }
+ count++;
+ }
+ /*
+ * We need to try and merge add/drops of the same ref since we
+ * can run into issues with relocate dropping the implicit ref
+ * and then it being added back again before the drop can
+ * finish. If we merged anything we need to re-loop so we can
+ * get a good ref.
+ * Or we can get node references of the same type that weren't
+ * merged when created due to bumps in the tree mod seq, and
+ * we need to merge them to prevent adding an inline extent
+ * backref before dropping it (triggering a BUG_ON at
+ * insert_inline_extent_backref()).
+ */
+ spin_lock(&locked_ref->lock);
+ btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
+
+ ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
+ &actual_count);
+ if (ret < 0 && ret != -EAGAIN) {
+ /*
+ * Error, btrfs_run_delayed_refs_for_head already
+ * unlocked everything so just bail out
+ */
+ return ret;
+ } else if (!ret) {
+ /*
+ * Success, perform the usual cleanup of a processed
+ * head
+ */
+ ret = cleanup_ref_head(trans, locked_ref);
+ if (ret > 0 ) {
+ /* We dropped our lock, we need to loop. */
+ ret = 0;
+ continue;
+ } else if (ret) {
+ return ret;
+ }
+ }
+
+ /*
+ * Either success case or btrfs_run_delayed_refs_for_head
+ * returned -EAGAIN, meaning we need to select another head
+ */
+
+ locked_ref = NULL;
+ cond_resched();
+ } while ((nr != -1 && count < nr) || locked_ref);
+
/*
* We don't want to include ref heads since we can have empty ref heads
* and those will drastically skew our runtime down since we just do
@@ -2745,9 +2792,9 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
return num_csums;
}
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_rsv *global_rsv;
u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
@@ -2782,8 +2829,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
{
u64 num_entries =
atomic_read(&trans->transaction->delayed_refs.num_entries);
@@ -2791,14 +2837,14 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
u64 val;
smp_mb();
- avg_runtime = fs_info->avg_delayed_ref_runtime;
+ avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
val = num_entries * avg_runtime;
if (val >= NSEC_PER_SEC)
return 1;
if (val >= NSEC_PER_SEC / 2)
return 2;
- return btrfs_check_space_for_delayed_refs(trans, fs_info);
+ return btrfs_check_space_for_delayed_refs(trans);
}
struct async_delayed_refs {
@@ -2911,7 +2957,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
int ret;
int run_all = count == (unsigned long)-1;
- bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
/* We'll clean this up in btrfs_cleanup_transaction */
if (trans->aborted)
@@ -2928,7 +2973,6 @@ again:
#ifdef SCRAMBLE_DELAYED_REFS
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
- trans->can_flush_pending_bgs = false;
ret = __btrfs_run_delayed_refs(trans, count);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
@@ -2940,7 +2984,7 @@ again:
btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
- node = rb_first(&delayed_refs->href_root);
+ node = rb_first_cached(&delayed_refs->href_root);
if (!node) {
spin_unlock(&delayed_refs->lock);
goto out;
@@ -2959,7 +3003,6 @@ again:
goto again;
}
out:
- trans->can_flush_pending_bgs = can_flush_pending_bgs;
return 0;
}
@@ -3040,7 +3083,8 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
* XXX: We should replace this with a proper search function in the
* future.
*/
- for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
+ for (node = rb_first_cached(&head->ref_tree); node;
+ node = rb_next(node)) {
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
/* If it's a shared ref we know a cross reference exists */
if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
@@ -3139,7 +3183,6 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
{
struct btrfs_path *path;
int ret;
- int ret2;
path = btrfs_alloc_path();
if (!path)
@@ -3151,17 +3194,9 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
if (ret && ret != -ENOENT)
goto out;
- ret2 = check_delayed_ref(root, path, objectid,
- offset, bytenr);
- } while (ret2 == -EAGAIN);
-
- if (ret2 && ret2 != -ENOENT) {
- ret = ret2;
- goto out;
- }
+ ret = check_delayed_ref(root, path, objectid, offset, bytenr);
+ } while (ret == -EAGAIN);
- if (ret != -ENOENT || ret2 != -ENOENT)
- ret = 0;
out:
btrfs_free_path(path);
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
@@ -4533,6 +4568,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
goto out;
} else {
ret = 1;
+ space_info->max_extent_size = 0;
}
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
@@ -4554,11 +4590,9 @@ out:
* the block groups that were made dirty during the lifetime of the
* transaction.
*/
- if (trans->can_flush_pending_bgs &&
- trans->chunk_bytes_reserved >= (u64)SZ_2M) {
+ if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
btrfs_create_pending_block_groups(trans);
- btrfs_trans_release_chunk_metadata(trans);
- }
+
return ret;
}
@@ -5284,7 +5318,7 @@ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
}
static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
- u64 num_bytes, int update_size)
+ u64 num_bytes, bool update_size)
{
spin_lock(&block_rsv->lock);
block_rsv->reserved += num_bytes;
@@ -5316,7 +5350,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
global_rsv->full = 0;
spin_unlock(&global_rsv->lock);
- block_rsv_add_bytes(dest, num_bytes, 1);
+ block_rsv_add_bytes(dest, num_bytes, true);
return 0;
}
@@ -5479,7 +5513,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
struct btrfs_block_rsv *dst, u64 num_bytes,
- int update_size)
+ bool update_size)
{
int ret;
@@ -5539,10 +5573,8 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
return 0;
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
- if (!ret) {
- block_rsv_add_bytes(block_rsv, num_bytes, 1);
- return 0;
- }
+ if (!ret)
+ block_rsv_add_bytes(block_rsv, num_bytes, true);
return ret;
}
@@ -5587,7 +5619,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (!ret) {
- block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ block_rsv_add_bytes(block_rsv, num_bytes, false);
return 0;
}
@@ -5629,7 +5661,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
return ret;
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (!ret) {
- block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ block_rsv_add_bytes(block_rsv, num_bytes, false);
trace_btrfs_space_reservation(root->fs_info, "delalloc",
btrfs_ino(inode), num_bytes, 1);
@@ -5800,7 +5832,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
* root: the root of the parent directory
* rsv: block reservation
* items: the number of items that we need do reservation
- * qgroup_reserved: used to return the reserved size in qgroup
+ * use_global_rsv: allow fallback to the global block reservation
*
* This function is used to reserve the space for snapshot/subvolume
* creation and deletion. Those operations are different with the
@@ -5810,10 +5842,10 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
* the space reservation mechanism in start_transaction().
*/
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
- struct btrfs_block_rsv *rsv,
- int items,
+ struct btrfs_block_rsv *rsv, int items,
bool use_global_rsv)
{
+ u64 qgroup_num_bytes = 0;
u64 num_bytes;
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -5821,12 +5853,11 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
/* One for parent inode, two for dir entries */
- num_bytes = 3 * fs_info->nodesize;
- ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+ qgroup_num_bytes = 3 * fs_info->nodesize;
+ ret = btrfs_qgroup_reserve_meta_prealloc(root,
+ qgroup_num_bytes, true);
if (ret)
return ret;
- } else {
- num_bytes = 0;
}
num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
@@ -5836,10 +5867,10 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
BTRFS_RESERVE_FLUSH_ALL);
if (ret == -ENOSPC && use_global_rsv)
- ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
+ ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
- if (ret && num_bytes)
- btrfs_qgroup_free_meta_prealloc(root, num_bytes);
+ if (ret && qgroup_num_bytes)
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
return ret;
}
@@ -6400,10 +6431,6 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
} else {
cache->reserved += num_bytes;
space_info->bytes_reserved += num_bytes;
-
- trace_btrfs_space_reservation(cache->fs_info,
- "space_info", space_info->flags,
- ram_bytes, 0);
space_info->bytes_may_use -= ram_bytes;
if (delalloc)
cache->delalloc_bytes += num_bytes;
@@ -6425,11 +6452,10 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
* reserve set to 0 in order to clear the reservation.
*/
-static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int delalloc)
+static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
- int ret = 0;
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
@@ -6437,12 +6463,12 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
space_info->bytes_readonly += num_bytes;
cache->reserved -= num_bytes;
space_info->bytes_reserved -= num_bytes;
+ space_info->max_extent_size = 0;
if (delalloc)
cache->delalloc_bytes -= num_bytes;
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
- return ret;
}
void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
{
@@ -6926,7 +6952,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
goto out_delayed_unlock;
spin_lock(&head->lock);
- if (!RB_EMPTY_ROOT(&head->ref_tree))
+ if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
goto out;
if (head->extent_op) {
@@ -6947,7 +6973,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
* at this point we have a head with no other entries. Go
* ahead and process it.
*/
- rb_erase(&head->href_node, &delayed_refs->href_root);
+ rb_erase_cached(&head->href_node, &delayed_refs->href_root);
RB_CLEAR_NODE(&head->href_node);
atomic_dec(&delayed_refs->num_entries);
@@ -7234,6 +7260,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group = NULL;
u64 search_start = 0;
u64 max_extent_size = 0;
+ u64 max_free_space = 0;
u64 empty_cluster = 0;
struct btrfs_space_info *space_info;
int loop = 0;
@@ -7529,8 +7556,8 @@ unclustered_alloc:
spin_lock(&ctl->tree_lock);
if (ctl->free_space <
num_bytes + empty_cluster + empty_size) {
- if (ctl->free_space > max_extent_size)
- max_extent_size = ctl->free_space;
+ max_free_space = max(max_free_space,
+ ctl->free_space);
spin_unlock(&ctl->tree_lock);
goto loop;
}
@@ -7697,6 +7724,8 @@ loop:
}
out:
if (ret == -ENOSPC) {
+ if (!max_extent_size)
+ max_extent_size = max_free_space;
spin_lock(&space_info->lock);
space_info->max_extent_size = max_extent_size;
spin_unlock(&space_info->lock);
@@ -7978,21 +8007,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
}
path = btrfs_alloc_path();
- if (!path) {
- btrfs_free_and_pin_reserved_extent(fs_info,
- extent_key.objectid,
- fs_info->nodesize);
+ if (!path)
return -ENOMEM;
- }
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
&extent_key, size);
if (ret) {
btrfs_free_path(path);
- btrfs_free_and_pin_reserved_extent(fs_info,
- extent_key.objectid,
- fs_info->nodesize);
return ret;
}
@@ -8120,6 +8142,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (IS_ERR(buf))
return buf;
+ /*
+ * Extra safety check in case the extent tree is corrupted and extent
+ * allocator chooses to use a tree block which is already used and
+ * locked.
+ */
+ if (buf->lock_owner == current->pid) {
+ btrfs_err_rl(fs_info,
+"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
+ buf->start, btrfs_header_owner(buf), current->pid);
+ free_extent_buffer(buf);
+ return ERR_PTR(-EUCLEAN);
+ }
+
btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
btrfs_tree_lock(buf);
clean_tree_block(fs_info, buf);
@@ -8216,7 +8251,7 @@ try_reserve:
static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u32 blocksize)
{
- block_rsv_add_bytes(block_rsv, blocksize, 0);
+ block_rsv_add_bytes(block_rsv, blocksize, false);
block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
}
@@ -8643,7 +8678,13 @@ skip:
parent = 0;
}
- if (need_account) {
+ /*
+ * Reloc tree doesn't contribute to qgroup numbers, and we have
+ * already accounted them at merge time (replace_path),
+ * thus we could skip expensive subtree trace here.
+ */
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+ need_account) {
ret = btrfs_qgroup_trace_subtree(trans, next,
generation, level - 1);
if (ret) {
@@ -8764,15 +8805,14 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (eb == root->node) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = eb->start;
- else
- BUG_ON(root->root_key.objectid !=
- btrfs_header_owner(eb));
+ else if (root->root_key.objectid != btrfs_header_owner(eb))
+ goto owner_mismatch;
} else {
if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = path->nodes[level + 1]->start;
- else
- BUG_ON(root->root_key.objectid !=
- btrfs_header_owner(path->nodes[level + 1]));
+ else if (root->root_key.objectid !=
+ btrfs_header_owner(path->nodes[level + 1]))
+ goto owner_mismatch;
}
btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
@@ -8780,6 +8820,11 @@ out:
wc->refs[level] = 0;
wc->flags[level] = 0;
return 0;
+
+owner_mismatch:
+ btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
+ btrfs_header_owner(eb), root->root_key.objectid);
+ return -EUCLEAN;
}
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -8833,6 +8878,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
ret = walk_up_proc(trans, root, path, wc);
if (ret > 0)
return 0;
+ if (ret < 0)
+ return ret;
if (path->locks[level]) {
btrfs_tree_unlock_rw(path->nodes[level],
@@ -8876,7 +8923,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
int level;
bool root_dropped = false;
- btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);
+ btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
path = btrfs_alloc_path();
if (!path) {
@@ -9614,6 +9661,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
block_group = btrfs_lookup_first_block_group(info, last);
while (block_group) {
+ wait_block_group_cache_done(block_group);
spin_lock(&block_group->lock);
if (block_group->iref)
break;
@@ -10075,15 +10123,19 @@ error:
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *block_group, *tmp;
+ struct btrfs_block_group_cache *block_group;
struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_block_group_item item;
struct btrfs_key key;
int ret = 0;
- bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
- trans->can_flush_pending_bgs = false;
- list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+ if (!trans->can_flush_pending_bgs)
+ return;
+
+ while (!list_empty(&trans->new_bgs)) {
+ block_group = list_first_entry(&trans->new_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
if (ret)
goto next;
@@ -10104,7 +10156,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
next:
list_del_init(&block_group->bg_list);
}
- trans->can_flush_pending_bgs = can_flush_pending_bgs;
+ btrfs_trans_release_chunk_metadata(trans);
}
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
@@ -10754,14 +10806,16 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
* We don't want a transaction for this since the discard may take a
* substantial amount of time. We don't require that a transaction be
* running, but we do need to take a running transaction into account
- * to ensure that we're not discarding chunks that were released in
- * the current transaction.
+ * to ensure that we're not discarding chunks that were released or
+ * allocated in the current transaction.
*
* Holding the chunks lock will prevent other threads from allocating
* or releasing chunks, but it won't prevent a running transaction
* from committing and releasing the memory that the pending chunks
* list head uses. For that, we need to take a reference to the
- * transaction.
+ * transaction and hold the commit root sem. We only need to hold
+ * it while performing the free space search since we have already
+ * held back allocations.
*/
static int btrfs_trim_free_extents(struct btrfs_device *device,
u64 minlen, u64 *trimmed)
@@ -10771,6 +10825,10 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
*trimmed = 0;
+ /* Discard not supported = nothing to do. */
+ if (!blk_queue_discard(bdev_get_queue(device->bdev)))
+ return 0;
+
/* Not writeable = nothing to do. */
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return 0;
@@ -10788,9 +10846,13 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
if (ret)
- return ret;
+ break;
- down_read(&fs_info->commit_root_sem);
+ ret = down_read_killable(&fs_info->commit_root_sem);
+ if (ret) {
+ mutex_unlock(&fs_info->chunk_mutex);
+ break;
+ }
spin_lock(&fs_info->trans_lock);
trans = fs_info->running_transaction;
@@ -10798,13 +10860,17 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
refcount_inc(&trans->use_count);
spin_unlock(&fs_info->trans_lock);
+ if (!trans)
+ up_read(&fs_info->commit_root_sem);
+
ret = find_free_dev_extent_start(trans, device, minlen, start,
&start, &len);
- if (trans)
+ if (trans) {
+ up_read(&fs_info->commit_root_sem);
btrfs_put_transaction(trans);
+ }
if (ret) {
- up_read(&fs_info->commit_root_sem);
mutex_unlock(&fs_info->chunk_mutex);
if (ret == -ENOSPC)
ret = 0;
@@ -10812,7 +10878,6 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
}
ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
- up_read(&fs_info->commit_root_sem);
mutex_unlock(&fs_info->chunk_mutex);
if (ret)
@@ -10832,6 +10897,15 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
return ret;
}
+/*
+ * Trim the whole filesystem by:
+ * 1) trimming the free space in each block group
+ * 2) trimming the unallocated space on each device
+ *
+ * This will also continue trimming even if a block group or device encounters
+ * an error. The return value will be the last error, or 0 if nothing bad
+ * happens.
+ */
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
struct btrfs_block_group_cache *cache = NULL;
@@ -10841,18 +10915,14 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
u64 start;
u64 end;
u64 trimmed = 0;
- u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
+ u64 bg_failed = 0;
+ u64 dev_failed = 0;
+ int bg_ret = 0;
+ int dev_ret = 0;
int ret = 0;
- /*
- * try to trim all FS space, our block group may start from non-zero.
- */
- if (range->len == total_bytes)
- cache = btrfs_lookup_first_block_group(fs_info, range->start);
- else
- cache = btrfs_lookup_block_group(fs_info, range->start);
-
- while (cache) {
+ cache = btrfs_lookup_first_block_group(fs_info, range->start);
+ for (; cache; cache = next_block_group(fs_info, cache)) {
if (cache->key.objectid >= (range->start + range->len)) {
btrfs_put_block_group(cache);
break;
@@ -10866,13 +10936,15 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
if (!block_group_cache_done(cache)) {
ret = cache_block_group(cache, 0);
if (ret) {
- btrfs_put_block_group(cache);
- break;
+ bg_failed++;
+ bg_ret = ret;
+ continue;
}
ret = wait_block_group_cache_done(cache);
if (ret) {
- btrfs_put_block_group(cache);
- break;
+ bg_failed++;
+ bg_ret = ret;
+ continue;
}
}
ret = btrfs_trim_block_group(cache,
@@ -10883,28 +10955,40 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
trimmed += group_trimmed;
if (ret) {
- btrfs_put_block_group(cache);
- break;
+ bg_failed++;
+ bg_ret = ret;
+ continue;
}
}
-
- cache = next_block_group(fs_info, cache);
}
+ if (bg_failed)
+ btrfs_warn(fs_info,
+ "failed to trim %llu block group(s), last error %d",
+ bg_failed, bg_ret);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- devices = &fs_info->fs_devices->alloc_list;
- list_for_each_entry(device, devices, dev_alloc_list) {
+ devices = &fs_info->fs_devices->devices;
+ list_for_each_entry(device, devices, dev_list) {
ret = btrfs_trim_free_extents(device, range->minlen,
&group_trimmed);
- if (ret)
+ if (ret) {
+ dev_failed++;
+ dev_ret = ret;
break;
+ }
trimmed += group_trimmed;
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ if (dev_failed)
+ btrfs_warn(fs_info,
+ "failed to trim %llu device(s), last error %d",
+ dev_failed, dev_ret);
range->len = trimmed;
- return ret;
+ if (bg_ret)
+ return bg_ret;
+ return dev_ret;
}
/*
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4dd6faab02bb..d228f706ff3e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1424,20 +1424,15 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
struct extent_state **cached_state)
{
struct extent_state *state;
- struct rb_node *n;
int ret = 1;
spin_lock(&tree->lock);
if (cached_state && *cached_state) {
state = *cached_state;
if (state->end == start - 1 && extent_state_in_tree(state)) {
- n = rb_next(&state->rb_node);
- while (n) {
- state = rb_entry(n, struct extent_state,
- rb_node);
+ while ((state = next_state(state)) != NULL) {
if (state->state & bits)
goto got_it;
- n = rb_next(n);
}
free_extent_state(*cached_state);
*cached_state = NULL;
@@ -1568,7 +1563,7 @@ static noinline int lock_delalloc_pages(struct inode *inode,
*
* 1 is returned if we find something, 0 if nothing was in the tree
*/
-STATIC u64 find_lock_delalloc_range(struct inode *inode,
+static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
u64 *end, u64 max_bytes)
@@ -1648,6 +1643,17 @@ out_failed:
return found;
}
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+u64 btrfs_find_lock_delalloc_range(struct inode *inode,
+ struct extent_io_tree *tree,
+ struct page *locked_page, u64 *start,
+ u64 *end, u64 max_bytes)
+{
+ return find_lock_delalloc_range(inode, tree, locked_page, start, end,
+ max_bytes);
+}
+#endif
+
static int __process_pages_contig(struct address_space *mapping,
struct page *locked_page,
pgoff_t start_index, pgoff_t end_index,
@@ -3778,7 +3784,7 @@ int btree_write_cache_pages(struct address_space *mapping,
pgoff_t index;
pgoff_t end; /* Inclusive */
int scanned = 0;
- int tag;
+ xa_mark_t tag;
pagevec_init(&pvec);
if (wbc->range_cyclic) {
@@ -3903,7 +3909,7 @@ static int extent_write_cache_pages(struct address_space *mapping,
pgoff_t done_index;
int range_whole = 0;
int scanned = 0;
- int tag;
+ xa_mark_t tag;
/*
* We have to hold onto the inode so that ordered extents can do their
@@ -5153,11 +5159,9 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
clear_page_dirty_for_io(page);
xa_lock_irq(&page->mapping->i_pages);
- if (!PageDirty(page)) {
- radix_tree_tag_clear(&page->mapping->i_pages,
- page_index(page),
- PAGECACHE_TAG_DIRTY);
- }
+ if (!PageDirty(page))
+ __xa_clear_mark(&page->mapping->i_pages,
+ page_index(page), PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&page->mapping->i_pages);
ClearPageError(page);
unlock_page(page);
@@ -5165,11 +5169,11 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(atomic_read(&eb->refs) == 0);
}
-int set_extent_buffer_dirty(struct extent_buffer *eb)
+bool set_extent_buffer_dirty(struct extent_buffer *eb)
{
int i;
int num_pages;
- int was_dirty = 0;
+ bool was_dirty;
check_buffer_tree_ref(eb);
@@ -5179,8 +5183,15 @@ int set_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(atomic_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+ if (!was_dirty)
+ for (i = 0; i < num_pages; i++)
+ set_page_dirty(eb->pages[i]);
+
+#ifdef CONFIG_BTRFS_DEBUG
for (i = 0; i < num_pages; i++)
- set_page_dirty(eb->pages[i]);
+ ASSERT(PageDirty(eb->pages[i]));
+#endif
+
return was_dirty;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b4d03e677e1d..369daa5d4f73 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -479,7 +479,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len);
void clear_extent_buffer_dirty(struct extent_buffer *eb);
-int set_extent_buffer_dirty(struct extent_buffer *eb);
+bool set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_under_io(struct extent_buffer *eb);
@@ -546,7 +546,7 @@ int free_io_failure(struct extent_io_tree *failure_tree,
struct extent_io_tree *io_tree,
struct io_failure_record *rec);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-noinline u64 find_lock_delalloc_range(struct inode *inode,
+u64 btrfs_find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
u64 *end, u64 max_bytes);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6648d55e5339..7eea8b6e2cd3 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -34,7 +34,7 @@ void __cold extent_map_exit(void)
*/
void extent_map_tree_init(struct extent_map_tree *tree)
{
- tree->map = RB_ROOT;
+ tree->map = RB_ROOT_CACHED;
INIT_LIST_HEAD(&tree->modified_extents);
rwlock_init(&tree->lock);
}
@@ -90,24 +90,27 @@ static u64 range_end(u64 start, u64 len)
return start + len;
}
-static int tree_insert(struct rb_root *root, struct extent_map *em)
+static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
{
- struct rb_node **p = &root->rb_node;
+ struct rb_node **p = &root->rb_root.rb_node;
struct rb_node *parent = NULL;
struct extent_map *entry = NULL;
struct rb_node *orig_parent = NULL;
u64 end = range_end(em->start, em->len);
+ bool leftmost = true;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct extent_map, rb_node);
- if (em->start < entry->start)
+ if (em->start < entry->start) {
p = &(*p)->rb_left;
- else if (em->start >= extent_map_end(entry))
+ } else if (em->start >= extent_map_end(entry)) {
p = &(*p)->rb_right;
- else
+ leftmost = false;
+ } else {
return -EEXIST;
+ }
}
orig_parent = parent;
@@ -130,7 +133,7 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
return -EEXIST;
rb_link_node(&em->rb_node, orig_parent, p);
- rb_insert_color(&em->rb_node, root);
+ rb_insert_color_cached(&em->rb_node, root, leftmost);
return 0;
}
@@ -242,7 +245,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
- rb_erase(&merge->rb_node, &tree->map);
+ rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
free_extent_map(merge);
}
@@ -254,7 +257,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
if (rb && mergable_maps(em, merge)) {
em->len += merge->len;
em->block_len += merge->block_len;
- rb_erase(&merge->rb_node, &tree->map);
+ rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
@@ -367,7 +370,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
struct rb_node *next = NULL;
u64 end = range_end(start, len);
- rb_node = __tree_search(&tree->map, start, &prev, &next);
+ rb_node = __tree_search(&tree->map.rb_root, start, &prev, &next);
if (!rb_node) {
if (prev)
rb_node = prev;
@@ -428,16 +431,13 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
* Removes @em from @tree. No reference counts are dropped, and no checks
* are done to see if the range is in use
*/
-int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
- int ret = 0;
-
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
- rb_erase(&em->rb_node, &tree->map);
+ rb_erase_cached(&em->rb_node, &tree->map);
if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
list_del_init(&em->list);
RB_CLEAR_NODE(&em->rb_node);
- return ret;
}
void replace_extent_mapping(struct extent_map_tree *tree,
@@ -449,7 +449,7 @@ void replace_extent_mapping(struct extent_map_tree *tree,
ASSERT(extent_map_in_tree(cur));
if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
list_del_init(&cur->list);
- rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
+ rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
RB_CLEAR_NODE(&cur->rb_node);
setup_extent_mapping(tree, new, modified);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 25d985e7532a..31977ffd6190 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -49,7 +49,7 @@ struct extent_map {
};
struct extent_map_tree {
- struct rb_root map;
+ struct rb_root_cached map;
struct list_head modified_extents;
rwlock_t lock;
};
@@ -78,7 +78,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
int add_extent_mapping(struct extent_map_tree *tree,
struct extent_map *em, int modified);
-int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
void replace_extent_mapping(struct extent_map_tree *tree,
struct extent_map *cur,
struct extent_map *new,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2be00e873e92..97c7a086f7bd 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -531,6 +531,14 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
end_of_last_block = start_pos + num_bytes - 1;
+ /*
+ * The pages may have already been dirty, clear out old accounting so
+ * we can set things up properly
+ */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached);
+
if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
if (start_pos >= isize &&
!(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
@@ -1500,18 +1508,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
}
if (ordered)
btrfs_put_ordered_extent(ordered);
- clear_extent_bit(&inode->io_tree, start_pos, last_pos,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, cached_state);
+
*lockstart = start_pos;
*lockend = last_pos;
ret = 1;
}
+ /*
+ * It's possible the pages are dirty right now, but we don't want
+ * to clean them yet because copy_from_user may catch a page fault
+ * and we might have to fall back to one page at a time. If that
+ * happens, we'll unlock these pages and we'd have a window where
+ * reclaim could sneak in and drop the once-dirty page on the floor
+ * without writing it.
+ *
+ * We have the pages locked and the extent range locked, so there's
+ * no way someone can start IO on any dirty pages in this range.
+ *
+ * We'll call btrfs_dirty_pages() later on, and that will flip around
+ * delalloc bits and dirty the pages as required.
+ */
for (i = 0; i < num_pages; i++) {
- if (clear_page_dirty_for_io(pages[i]))
- account_page_redirty(pages[i]);
set_page_extent_mapped(pages[i]);
WARN_ON(!PageLocked(pages[i]));
}
@@ -2061,6 +2078,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
inode_lock(inode);
+
+ /*
+ * We take the dio_sem here because the tree log stuff can race with
+ * lockless dio writes and get an extent map logged for an extent we
+ * never waited on. We need it this high up for lockdep reasons.
+ */
+ down_write(&BTRFS_I(inode)->dio_sem);
+
atomic_inc(&root->log_batch);
/*
@@ -2069,6 +2094,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = btrfs_wait_ordered_range(inode, start, len);
if (ret) {
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2092,6 +2118,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* checked called fsync.
*/
ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2110,6 +2137,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2131,6 +2159,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
/*
@@ -2544,7 +2573,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
- min_size, 0);
+ min_size, false);
BUG_ON(ret);
trans->block_rsv = rsv;
@@ -2594,7 +2623,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
- rsv, min_size, 0);
+ rsv, min_size, false);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0adf38b00fa0..4ba0aedc878b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -10,6 +10,7 @@
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
@@ -47,6 +48,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
struct inode *inode = NULL;
+ unsigned nofs_flag;
int ret;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -68,7 +70,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
btrfs_disk_key_to_cpu(&location, &disk_key);
btrfs_release_path(path);
+ /*
+ * We are often under a trans handle at this point, so we need to make
+ * sure NOFS is set to keep us from deadlocking.
+ */
+ nofs_flag = memalloc_nofs_save();
inode = btrfs_iget(fs_info->sb, &location, root, NULL);
+ memalloc_nofs_restore(nofs_flag);
if (IS_ERR(inode))
return inode;
@@ -1679,6 +1687,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
bitmap_clear(info->bitmap, start, count);
info->bytes -= bytes;
+ if (info->max_extent_size > ctl->unit)
+ info->max_extent_size = 0;
}
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
@@ -1762,6 +1772,13 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
return -1;
}
+static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
+{
+ if (entry->bitmap)
+ return entry->max_extent_size;
+ return entry->bytes;
+}
+
/* Cache the size of the max extent in bytes */
static struct btrfs_free_space *
find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
@@ -1783,8 +1800,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
for (node = &entry->offset_index; node; node = rb_next(node)) {
entry = rb_entry(node, struct btrfs_free_space, offset_index);
if (entry->bytes < *bytes) {
- if (entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
continue;
}
@@ -1802,8 +1819,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
}
if (entry->bytes < *bytes + align_off) {
- if (entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
continue;
}
@@ -1815,8 +1832,10 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
*offset = tmp;
*bytes = size;
return entry;
- } else if (size > *max_extent_size) {
- *max_extent_size = size;
+ } else {
+ *max_extent_size =
+ max(get_max_extent_size(entry),
+ *max_extent_size);
}
continue;
}
@@ -2110,8 +2129,7 @@ new_bitmap:
out:
if (info) {
- if (info->bitmap)
- kfree(info->bitmap);
+ kfree(info->bitmap);
kmem_cache_free(btrfs_free_space_cachep, info);
}
@@ -2440,6 +2458,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
struct rb_node *n;
int count = 0;
+ spin_lock(&ctl->tree_lock);
for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
info = rb_entry(n, struct btrfs_free_space, offset_index);
if (info->bytes >= bytes && !block_group->ro)
@@ -2448,6 +2467,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
info->offset, info->bytes,
(info->bitmap) ? "yes" : "no");
}
+ spin_unlock(&ctl->tree_lock);
btrfs_info(fs_info, "block group has cluster?: %s",
list_empty(&block_group->cluster_list) ? "no" : "yes");
btrfs_info(fs_info,
@@ -2676,8 +2696,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
if (err) {
- if (search_bytes > *max_extent_size)
- *max_extent_size = search_bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
return 0;
}
@@ -2714,8 +2734,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
entry = rb_entry(node, struct btrfs_free_space, offset_index);
while (1) {
- if (entry->bytes < bytes && entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ if (entry->bytes < bytes)
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
if (entry->bytes < bytes ||
(!entry->bitmap && entry->offset < min_start)) {
@@ -3601,8 +3622,7 @@ again:
if (info)
kmem_cache_free(btrfs_free_space_cachep, info);
- if (map)
- kfree(map);
+ kfree(map);
return 0;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9357a19d2bff..d3df5b52278c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -64,7 +64,6 @@ static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
-static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;
@@ -503,6 +502,7 @@ again:
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
/* just bail out to the uncompressed code */
+ nr_pages = 0;
goto cont;
}
@@ -1271,7 +1271,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
u64 disk_num_bytes;
u64 ram_bytes;
int extent_type;
- int ret, err;
+ int ret;
int type;
int nocow;
int check_prev = 1;
@@ -1403,11 +1403,8 @@ next_slot:
* if there are pending snapshots for this root,
* we fall into common COW way.
*/
- if (!nolock) {
- err = btrfs_start_write_no_snapshotting(root);
- if (!err)
- goto out_check;
- }
+ if (!nolock && atomic_read(&root->snapshot_force_cow))
+ goto out_check;
/*
* force cow if csum exists in the range.
* this ensure that csum for a given extent are
@@ -1416,9 +1413,6 @@ next_slot:
ret = csum_exist_in_range(fs_info, disk_bytenr,
num_bytes);
if (ret) {
- if (!nolock)
- btrfs_end_write_no_snapshotting(root);
-
/*
* ret could be -EIO if the above fails to read
* metadata.
@@ -1431,11 +1425,8 @@ next_slot:
WARN_ON_ONCE(nolock);
goto out_check;
}
- if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
- if (!nolock)
- btrfs_end_write_no_snapshotting(root);
+ if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
goto out_check;
- }
nocow = 1;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
extent_end = found_key.offset +
@@ -1448,8 +1439,6 @@ next_slot:
out_check:
if (extent_end <= start) {
path->slots[0]++;
- if (!nolock && nocow)
- btrfs_end_write_no_snapshotting(root);
if (nocow)
btrfs_dec_nocow_writers(fs_info, disk_bytenr);
goto next_slot;
@@ -1471,8 +1460,6 @@ out_check:
end, page_started, nr_written, 1,
NULL);
if (ret) {
- if (!nolock && nocow)
- btrfs_end_write_no_snapshotting(root);
if (nocow)
btrfs_dec_nocow_writers(fs_info,
disk_bytenr);
@@ -1492,8 +1479,6 @@ out_check:
ram_bytes, BTRFS_COMPRESS_NONE,
BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
- if (!nolock && nocow)
- btrfs_end_write_no_snapshotting(root);
if (nocow)
btrfs_dec_nocow_writers(fs_info,
disk_bytenr);
@@ -1532,8 +1517,6 @@ out_check:
EXTENT_CLEAR_DATA_RESV,
PAGE_UNLOCK | PAGE_SET_PRIVATE2);
- if (!nolock && nocow)
- btrfs_end_write_no_snapshotting(root);
cur_offset = extent_end;
/*
@@ -2767,12 +2750,9 @@ static void relink_file_extents(struct new_sa_defrag_extent *new)
struct btrfs_path *path;
struct sa_defrag_extent_backref *backref;
struct sa_defrag_extent_backref *prev = NULL;
- struct inode *inode;
struct rb_node *node;
int ret;
- inode = new->inode;
-
path = btrfs_alloc_path();
if (!path)
return;
@@ -2961,6 +2941,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
bool truncated = false;
bool range_locked = false;
bool clear_new_delalloc_bytes = false;
+ bool clear_reserved_extent = true;
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
@@ -3064,10 +3045,12 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
logical_len, logical_len,
compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
- if (!ret)
+ if (!ret) {
+ clear_reserved_extent = false;
btrfs_release_delalloc_bytes(fs_info,
ordered_extent->start,
ordered_extent->disk_len);
+ }
}
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
ordered_extent->file_offset, ordered_extent->len,
@@ -3128,8 +3111,13 @@ out:
* wrong we need to return the space for this ordered extent
* back to the allocator. We only free the extent in the
* truncated case if we didn't write out the extent at all.
+ *
+ * If we made it past insert_reserved_file_extent before we
+ * errored out then we don't need to do this as the accounting
+ * has already been done.
*/
if ((ret || !logical_len) &&
+ clear_reserved_extent &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
btrfs_free_reserved_extent(fs_info,
@@ -3488,8 +3476,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
/* this will do delete_inode and everything for us */
iput(inode);
- if (ret)
- goto out;
}
/* release the path since we're done with it */
btrfs_release_path(path);
@@ -3755,7 +3741,7 @@ cache_acl:
case S_IFLNK:
inode->i_op = &btrfs_symlink_inode_operations;
inode_nohighmem(inode);
- inode->i_mapping->a_ops = &btrfs_symlink_aops;
+ inode->i_mapping->a_ops = &btrfs_aops;
break;
default:
inode->i_op = &btrfs_special_inode_operations;
@@ -3927,12 +3913,8 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
name, name_len, -1);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto err;
- }
- if (!di) {
- ret = -ENOENT;
+ if (IS_ERR_OR_NULL(di)) {
+ ret = di ? PTR_ERR(di) : -ENOENT;
goto err;
}
leaf = path->nodes[0];
@@ -4092,10 +4074,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
name, name_len, -1);
if (IS_ERR_OR_NULL(di)) {
- if (!di)
- ret = -ENOENT;
- else
- ret = PTR_ERR(di);
+ ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
}
@@ -4287,18 +4266,17 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
* again is not run concurrently.
*/
spin_lock(&dest->root_item_lock);
- root_flags = btrfs_root_flags(&dest->root_item);
- if (dest->send_in_progress == 0) {
- btrfs_set_root_flags(&dest->root_item,
- root_flags | BTRFS_ROOT_SUBVOL_DEAD);
- spin_unlock(&dest->root_item_lock);
- } else {
+ if (dest->send_in_progress) {
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu during send",
dest->root_key.objectid);
return -EPERM;
}
+ root_flags = btrfs_root_flags(&dest->root_item);
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags | BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
down_write(&fs_info->subvol_sem);
@@ -4744,7 +4722,7 @@ delete:
btrfs_abort_transaction(trans, ret);
break;
}
- if (btrfs_should_throttle_delayed_refs(trans, fs_info))
+ if (btrfs_should_throttle_delayed_refs(trans))
btrfs_async_run_delayed_refs(fs_info,
trans->delayed_ref_updates * 2,
trans->transid, 0);
@@ -4753,8 +4731,7 @@ delete:
extent_num_bytes)) {
should_end = true;
}
- if (btrfs_should_throttle_delayed_refs(trans,
- fs_info))
+ if (btrfs_should_throttle_delayed_refs(trans))
should_throttle = true;
}
}
@@ -5252,10 +5229,10 @@ static void evict_inode_truncate_pages(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
write_lock(&map_tree->lock);
- while (!RB_EMPTY_ROOT(&map_tree->map)) {
+ while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
struct extent_map *em;
- node = rb_first(&map_tree->map);
+ node = rb_first_cached(&map_tree->map);
em = rb_entry(node, struct extent_map, rb_node);
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
@@ -5291,11 +5268,13 @@ static void evict_inode_truncate_pages(struct inode *inode)
struct extent_state *cached_state = NULL;
u64 start;
u64 end;
+ unsigned state_flags;
node = rb_first(&io_tree->state);
state = rb_entry(node, struct extent_state, rb_node);
start = state->start;
end = state->end;
+ state_flags = state->state;
spin_unlock(&io_tree->lock);
lock_extent_bits(io_tree, start, end, &cached_state);
@@ -5308,7 +5287,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
*
* Note, end is the bytenr of last byte, so we need + 1 here.
*/
- if (state->state & EXTENT_DELALLOC)
+ if (state_flags & EXTENT_DELALLOC)
btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
clear_extent_bit(io_tree, start, end,
@@ -5323,8 +5302,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
}
static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
- struct btrfs_block_rsv *rsv,
- u64 min_size)
+ struct btrfs_block_rsv *rsv)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
@@ -5334,7 +5312,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
struct btrfs_trans_handle *trans;
int ret;
- ret = btrfs_block_rsv_refill(root, rsv, min_size,
+ ret = btrfs_block_rsv_refill(root, rsv, rsv->size,
BTRFS_RESERVE_FLUSH_LIMIT);
if (ret && ++failures > 2) {
@@ -5351,8 +5329,8 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
* Try to steal from the global reserve if there is space for
* it.
*/
- if (!btrfs_check_space_for_delayed_refs(trans, fs_info) &&
- !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0))
+ if (!btrfs_check_space_for_delayed_refs(trans) &&
+ !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, false))
return trans;
/* If not, commit and try again. */
@@ -5368,7 +5346,6 @@ void btrfs_evict_inode(struct inode *inode)
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv;
- u64 min_size;
int ret;
trace_btrfs_inode_evict(inode);
@@ -5378,8 +5355,6 @@ void btrfs_evict_inode(struct inode *inode)
return;
}
- min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
-
evict_inode_truncate_pages(inode);
if (inode->i_nlink &&
@@ -5390,9 +5365,6 @@ void btrfs_evict_inode(struct inode *inode)
if (is_bad_inode(inode))
goto no_delete;
- /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
- if (!special_file(inode->i_mode))
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
@@ -5412,13 +5384,13 @@ void btrfs_evict_inode(struct inode *inode)
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
if (!rsv)
goto no_delete;
- rsv->size = min_size;
+ rsv->size = btrfs_calc_trunc_metadata_size(fs_info, 1);
rsv->failfast = 1;
btrfs_i_size_write(BTRFS_I(inode), 0);
while (1) {
- trans = evict_refill_and_join(root, rsv, min_size);
+ trans = evict_refill_and_join(root, rsv);
if (IS_ERR(trans))
goto free_rsv;
@@ -5443,7 +5415,7 @@ void btrfs_evict_inode(struct inode *inode)
* If it turns out that we are dropping too many of these, we might want
* to add a mechanism for retrying these after a commit.
*/
- trans = evict_refill_and_join(root, rsv, min_size);
+ trans = evict_refill_and_join(root, rsv);
if (!IS_ERR(trans)) {
trans->block_rsv = rsv;
btrfs_orphan_del(trans, BTRFS_I(inode));
@@ -5488,12 +5460,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
name, namelen, 0);
- if (!di) {
- ret = -ENOENT;
- goto out;
- }
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
+ if (IS_ERR_OR_NULL(di)) {
+ ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
}
@@ -5807,16 +5775,10 @@ static int btrfs_dentry_delete(const struct dentry *dentry)
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
- struct inode *inode;
-
- inode = btrfs_lookup_dentry(dir, dentry);
- if (IS_ERR(inode)) {
- if (PTR_ERR(inode) == -ENOENT)
- inode = NULL;
- else
- return ERR_CAST(inode);
- }
+ struct inode *inode = btrfs_lookup_dentry(dir, dentry);
+ if (inode == ERR_PTR(-ENOENT))
+ inode = NULL;
return d_splice_alias(inode, dentry);
}
@@ -6407,8 +6369,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = btrfs_insert_dir_item(trans, root, name, name_len,
- parent_inode, &key,
+ ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key,
btrfs_inode_type(&inode->vfs_inode), index);
if (ret == -EEXIST || ret == -EOVERFLOW)
goto fail_dir_item;
@@ -6601,7 +6562,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
int drop_inode = 0;
/* do not allow sys_link's with other subvols of the same device */
- if (root->objectid != BTRFS_I(inode)->root->objectid)
+ if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
return -EXDEV;
if (inode->i_nlink >= BTRFS_LINK_MAX)
@@ -6639,6 +6600,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
drop_inode = 1;
} else {
struct dentry *parent = dentry->d_parent;
+ int ret;
+
err = btrfs_update_inode(trans, root, inode);
if (err)
goto fail;
@@ -6652,7 +6615,12 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
goto fail;
}
d_instantiate(dentry, inode);
- btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
+ ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
+ true, NULL);
+ if (ret == BTRFS_NEED_TRANS_COMMIT) {
+ err = btrfs_commit_transaction(trans);
+ trans = NULL;
+ }
}
fail:
@@ -6787,9 +6755,9 @@ static noinline int uncompress_inline(struct btrfs_path *path,
* This also copies inline extents directly into the page.
*/
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page,
- size_t pg_offset, u64 start, u64 len,
- int create)
+ struct page *page,
+ size_t pg_offset, u64 start, u64 len,
+ int create)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret;
@@ -6833,19 +6801,21 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
em->len = (u64)-1;
em->block_len = (u64)-1;
+ path = btrfs_alloc_path();
if (!path) {
- path = btrfs_alloc_path();
- if (!path) {
- err = -ENOMEM;
- goto out;
- }
- /*
- * Chances are we'll be called again, so go ahead and do
- * readahead
- */
- path->reada = READA_FORWARD;
+ err = -ENOMEM;
+ goto out;
}
+ /* Chances are we'll be called again, so go ahead and do readahead */
+ path->reada = READA_FORWARD;
+
+ /*
+ * Unless we're going to uncompress the inline extent, no sleep would
+ * happen.
+ */
+ path->leave_spinning = 1;
+
ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
if (ret < 0) {
err = ret;
@@ -6948,6 +6918,8 @@ next:
em->orig_block_len = em->len;
em->orig_start = em->start;
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
+
+ btrfs_set_path_blocking(path);
if (!PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) !=
BTRFS_COMPRESS_NONE) {
@@ -6995,10 +6967,10 @@ insert:
err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
out:
+ btrfs_free_path(path);
trace_btrfs_get_extent(root, inode, em);
- btrfs_free_path(path);
if (err) {
free_extent_map(em);
return ERR_PTR(err);
@@ -9031,7 +9003,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
/* Migrate the slack space for the truncate to our reserve */
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
- min_size, 0);
+ min_size, false);
BUG_ON(ret);
/*
@@ -9068,7 +9040,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
btrfs_block_rsv_release(fs_info, rsv, -1);
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
- rsv, min_size, 0);
+ rsv, min_size, false);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
}
@@ -9388,14 +9360,21 @@ static int btrfs_rename_exchange(struct inode *old_dir,
u64 new_idx = 0;
u64 root_objectid;
int ret;
- int ret2;
bool root_log_pinned = false;
bool dest_log_pinned = false;
+ struct btrfs_log_ctx ctx_root;
+ struct btrfs_log_ctx ctx_dest;
+ bool sync_log_root = false;
+ bool sync_log_dest = false;
+ bool commit_transaction = false;
/* we only allow rename subvolume link between subvolumes */
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
return -EXDEV;
+ btrfs_init_log_ctx(&ctx_root, old_inode);
+ btrfs_init_log_ctx(&ctx_dest, new_inode);
+
/* close the race window with snapshot create/destroy ioctl */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
down_read(&fs_info->subvol_sem);
@@ -9542,15 +9521,29 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (root_log_pinned) {
parent = new_dentry->d_parent;
- btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
- parent);
+ ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
+ BTRFS_I(old_dir), parent,
+ false, &ctx_root);
+ if (ret == BTRFS_NEED_LOG_SYNC)
+ sync_log_root = true;
+ else if (ret == BTRFS_NEED_TRANS_COMMIT)
+ commit_transaction = true;
+ ret = 0;
btrfs_end_log_trans(root);
root_log_pinned = false;
}
if (dest_log_pinned) {
- parent = old_dentry->d_parent;
- btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
- parent);
+ if (!commit_transaction) {
+ parent = old_dentry->d_parent;
+ ret = btrfs_log_new_name(trans, BTRFS_I(new_inode),
+ BTRFS_I(new_dir), parent,
+ false, &ctx_dest);
+ if (ret == BTRFS_NEED_LOG_SYNC)
+ sync_log_dest = true;
+ else if (ret == BTRFS_NEED_TRANS_COMMIT)
+ commit_transaction = true;
+ ret = 0;
+ }
btrfs_end_log_trans(dest);
dest_log_pinned = false;
}
@@ -9583,8 +9576,26 @@ out_fail:
dest_log_pinned = false;
}
}
- ret2 = btrfs_end_transaction(trans);
- ret = ret ? ret : ret2;
+ if (!ret && sync_log_root && !commit_transaction) {
+ ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root,
+ &ctx_root);
+ if (ret)
+ commit_transaction = true;
+ }
+ if (!ret && sync_log_dest && !commit_transaction) {
+ ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root,
+ &ctx_dest);
+ if (ret)
+ commit_transaction = true;
+ }
+ if (commit_transaction) {
+ ret = btrfs_commit_transaction(trans);
+ } else {
+ int ret2;
+
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
+ }
out_notrans:
if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
@@ -9661,6 +9672,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
int ret;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
bool log_pinned = false;
+ struct btrfs_log_ctx ctx;
+ bool sync_log = false;
+ bool commit_transaction = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -9818,8 +9832,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (log_pinned) {
struct dentry *parent = new_dentry->d_parent;
- btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
- parent);
+ btrfs_init_log_ctx(&ctx, old_inode);
+ ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
+ BTRFS_I(old_dir), parent,
+ false, &ctx);
+ if (ret == BTRFS_NEED_LOG_SYNC)
+ sync_log = true;
+ else if (ret == BTRFS_NEED_TRANS_COMMIT)
+ commit_transaction = true;
+ ret = 0;
btrfs_end_log_trans(root);
log_pinned = false;
}
@@ -9856,7 +9877,19 @@ out_fail:
btrfs_end_log_trans(root);
log_pinned = false;
}
- btrfs_end_transaction(trans);
+ if (!ret && sync_log) {
+ ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
+ if (ret)
+ commit_transaction = true;
+ }
+ if (commit_transaction) {
+ ret = btrfs_commit_transaction(trans);
+ } else {
+ int ret2;
+
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
+ }
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
@@ -10140,7 +10173,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_op = &btrfs_symlink_inode_operations;
inode_nohighmem(inode);
- inode->i_mapping->a_ops = &btrfs_symlink_aops;
+ inode->i_mapping->a_ops = &btrfs_aops;
inode_set_bytes(inode, name_len);
btrfs_i_size_write(BTRFS_I(inode), name_len);
err = btrfs_update_inode(trans, root, inode);
@@ -10516,13 +10549,6 @@ static const struct address_space_operations btrfs_aops = {
.error_remove_page = generic_error_remove_page,
};
-static const struct address_space_operations btrfs_symlink_aops = {
- .readpage = btrfs_readpage,
- .writepage = btrfs_writepage,
- .invalidatepage = btrfs_invalidatepage,
- .releasepage = btrfs_releasepage,
-};
-
static const struct inode_operations btrfs_file_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 63600dc2ac4c..a990a9045139 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -491,7 +491,6 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
struct fstrim_range range;
u64 minlen = ULLONG_MAX;
u64 num_devices = 0;
- u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -515,11 +514,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
return -EOPNOTSUPP;
if (copy_from_user(&range, arg, sizeof(range)))
return -EFAULT;
- if (range.start > total_bytes ||
- range.len < fs_info->sb->s_blocksize)
+
+ /*
+ * NOTE: Don't truncate the range using super->total_bytes. Bytenr of
+ * block group is in the logical address space, which can be any
+ * sectorsize aligned bytenr in the range [0, U64_MAX].
+ */
+ if (range.len < fs_info->sb->s_blocksize)
return -EINVAL;
- range.len = min(range.len, total_bytes - range.start);
range.minlen = max(range.minlen, minlen);
ret = btrfs_trim_fs(fs_info, &range);
if (ret < 0)
@@ -686,8 +689,7 @@ static noinline int create_subvol(struct inode *dir,
goto fail;
}
- ret = btrfs_insert_dir_item(trans, root,
- name, namelen, BTRFS_I(dir), &key,
+ ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
BTRFS_FT_DIR, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -747,6 +749,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct btrfs_pending_snapshot *pending_snapshot;
struct btrfs_trans_handle *trans;
int ret;
+ bool snapshot_force_cow = false;
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return -EINVAL;
@@ -763,6 +766,11 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
goto free_pending;
}
+ /*
+ * Force new buffered writes to reserve space even when NOCOW is
+ * possible. This is to avoid later writeback (running delalloc) to
+ * fallback to COW mode and unexpectedly fail with ENOSPC.
+ */
atomic_inc(&root->will_be_snapshotted);
smp_mb__after_atomic();
/* wait for no snapshot writes */
@@ -773,6 +781,14 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (ret)
goto dec_and_free;
+ /*
+ * All previous writes have started writeback in NOCOW mode, so now
+ * we force future writes to fallback to COW mode during snapshot
+ * creation.
+ */
+ atomic_inc(&root->snapshot_force_cow);
+ snapshot_force_cow = true;
+
btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
@@ -837,6 +853,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
fail:
btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
dec_and_free:
+ if (snapshot_force_cow)
+ atomic_dec(&root->snapshot_force_cow);
if (atomic_dec_and_test(&root->will_be_snapshotted))
wake_up_var(&root->will_be_snapshotted);
free_pending:
@@ -1308,7 +1326,7 @@ again:
if (i_done != page_cnt) {
spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents++;
+ btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(inode, data_reserved,
start_index << PAGE_SHIFT,
@@ -3453,6 +3471,25 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
same_lock_start = min_t(u64, loff, dst_loff);
same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start;
+ } else {
+ /*
+ * If the source and destination inodes are different, the
+ * source's range end offset matches the source's i_size, that
+ * i_size is not a multiple of the sector size, and the
+ * destination range does not go past the destination's i_size,
+ * we must round down the length to the nearest sector size
+ * multiple. If we don't do this adjustment we end up replacing
+ * with zeroes the bytes in the range that starts at the
+ * deduplication range's end offset and ends at the next sector
+ * size multiple.
+ */
+ if (loff + olen == i_size_read(src) &&
+ dst_loff + len < i_size_read(dst)) {
+ const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize;
+
+ len = round_down(i_size_read(src), sz) - loff;
+ olen = len;
+ }
}
again:
@@ -4358,7 +4395,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
ret = PTR_ERR(new_root);
goto out;
}
- if (!is_fstree(new_root->objectid)) {
+ if (!is_fstree(new_root->root_key.objectid)) {
ret = -ENOENT;
goto out;
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 4353bb69bb86..45868fd76209 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1019,10 +1019,9 @@ out_add_root:
spin_unlock(&fs_info->qgroup_lock);
ret = btrfs_commit_transaction(trans);
- if (ret) {
- trans = NULL;
+ trans = NULL;
+ if (ret)
goto out_free_path;
- }
ret = qgroup_rescan_init(fs_info, 0, 1);
if (!ret) {
@@ -1417,13 +1416,14 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
if (!qgroup) {
ret = -ENOENT;
goto out;
- } else {
- /* check if there are no children of this qgroup */
- if (!list_empty(&qgroup->members)) {
- ret = -EBUSY;
- goto out;
- }
}
+
+ /* Check if there are no children of this qgroup */
+ if (!list_empty(&qgroup->members)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
ret = del_qgroup_item(trans, qgroupid);
if (ret && ret != -ENOENT)
goto out;
@@ -1713,6 +1713,416 @@ static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
return 0;
}
+/*
+ * Helper function to trace a subtree tree block swap.
+ *
+ * The swap will happen in highest tree block, but there may be a lot of
+ * tree blocks involved.
+ *
+ * For example:
+ * OO = Old tree blocks
+ * NN = New tree blocks allocated during balance
+ *
+ * File tree (257) Reloc tree for 257
+ * L2 OO NN
+ * / \ / \
+ * L1 OO OO (a) OO NN (a)
+ * / \ / \ / \ / \
+ * L0 OO OO OO OO OO OO NN NN
+ * (b) (c) (b) (c)
+ *
+ * When calling qgroup_trace_extent_swap(), we will pass:
+ * @src_eb = OO(a)
+ * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
+ * @dst_level = 0
+ * @root_level = 1
+ *
+ * In that case, qgroup_trace_extent_swap() will search from OO(a) to
+ * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
+ *
+ * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
+ *
+ * 1) Tree search from @src_eb
+ * It should act as a simplified btrfs_search_slot().
+ * The key for search can be extracted from @dst_path->nodes[dst_level]
+ * (first key).
+ *
+ * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
+ * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
+ * They should be marked during previous (@dst_level = 1) iteration.
+ *
+ * 3) Mark file extents in leaves dirty
+ * We don't have a good way to pick out new file extents only.
+ * So we still follow the old method by scanning all file extents in
+ * the leaf.
+ *
+ * This function can free us from keeping two paths, thus later we only need
+ * to care about how to iterate all new tree blocks in reloc tree.
+ */
+static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
+ struct extent_buffer *src_eb,
+ struct btrfs_path *dst_path,
+ int dst_level, int root_level,
+ bool trace_leaf)
+{
+ struct btrfs_key key;
+ struct btrfs_path *src_path;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u32 nodesize = fs_info->nodesize;
+ int cur_level = root_level;
+ int ret;
+
+ BUG_ON(dst_level > root_level);
+ /* Level mismatch */
+ if (btrfs_header_level(src_eb) != root_level)
+ return -EINVAL;
+
+ src_path = btrfs_alloc_path();
+ if (!src_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (dst_level)
+ btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
+ else
+ btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
+
+ /* For src_path */
+ extent_buffer_get(src_eb);
+ src_path->nodes[root_level] = src_eb;
+ src_path->slots[root_level] = dst_path->slots[root_level];
+ src_path->locks[root_level] = 0;
+
+ /* A simplified version of btrfs_search_slot() */
+ while (cur_level >= dst_level) {
+ struct btrfs_key src_key;
+ struct btrfs_key dst_key;
+
+ if (src_path->nodes[cur_level] == NULL) {
+ struct btrfs_key first_key;
+ struct extent_buffer *eb;
+ int parent_slot;
+ u64 child_gen;
+ u64 child_bytenr;
+
+ eb = src_path->nodes[cur_level + 1];
+ parent_slot = src_path->slots[cur_level + 1];
+ child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+ child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+ btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
+
+ eb = read_tree_block(fs_info, child_bytenr, child_gen,
+ cur_level, &first_key);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ ret = -EIO;
+ goto out;
+ }
+
+ src_path->nodes[cur_level] = eb;
+
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+ }
+
+ src_path->slots[cur_level] = dst_path->slots[cur_level];
+ if (cur_level) {
+ btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
+ &dst_key, dst_path->slots[cur_level]);
+ btrfs_node_key_to_cpu(src_path->nodes[cur_level],
+ &src_key, src_path->slots[cur_level]);
+ } else {
+ btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
+ &dst_key, dst_path->slots[cur_level]);
+ btrfs_item_key_to_cpu(src_path->nodes[cur_level],
+ &src_key, src_path->slots[cur_level]);
+ }
+ /* Content mismatch, something went wrong */
+ if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
+ ret = -ENOENT;
+ goto out;
+ }
+ cur_level--;
+ }
+
+ /*
+ * Now both @dst_path and @src_path have been populated, record the tree
+ * blocks for qgroup accounting.
+ */
+ ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
+ nodesize, GFP_NOFS);
+ if (ret < 0)
+ goto out;
+ ret = btrfs_qgroup_trace_extent(trans,
+ dst_path->nodes[dst_level]->start,
+ nodesize, GFP_NOFS);
+ if (ret < 0)
+ goto out;
+
+ /* Record leaf file extents */
+ if (dst_level == 0 && trace_leaf) {
+ ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
+ if (ret < 0)
+ goto out;
+ ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
+ }
+out:
+ btrfs_free_path(src_path);
+ return ret;
+}
+
+/*
+ * Helper function to do recursive generation-aware depth-first search, to
+ * locate all new tree blocks in a subtree of reloc tree.
+ *
+ * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
+ * reloc tree
+ * L2 NN (a)
+ * / \
+ * L1 OO NN (b)
+ * / \ / \
+ * L0 OO OO OO NN
+ * (c) (d)
+ * If we pass:
+ * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
+ * @cur_level = 1
+ * @root_level = 1
+ *
+ * We will iterate through tree blocks NN(b), NN(d) and inform qgroup to trace
+ * above tree blocks along with their counterparts in file tree.
+ * During the search, old tree block OO(c) will be skipped as tree block swap
+ * won't affect OO(c).
+ */
+static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
+ struct extent_buffer *src_eb,
+ struct btrfs_path *dst_path,
+ int cur_level, int root_level,
+ u64 last_snapshot, bool trace_leaf)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct extent_buffer *eb;
+ bool need_cleanup = false;
+ int ret = 0;
+ int i;
+
+ /* Level sanity check */
+ if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL ||
+ root_level < 0 || root_level >= BTRFS_MAX_LEVEL ||
+ root_level < cur_level) {
+ btrfs_err_rl(fs_info,
+ "%s: bad levels, cur_level=%d root_level=%d",
+ __func__, cur_level, root_level);
+ return -EUCLEAN;
+ }
+
+ /* Read the tree block if needed */
+ if (dst_path->nodes[cur_level] == NULL) {
+ struct btrfs_key first_key;
+ int parent_slot;
+ u64 child_gen;
+ u64 child_bytenr;
+
+ /*
+ * dst_path->nodes[root_level] must be initialized before
+ * calling this function.
+ */
+ if (cur_level == root_level) {
+ btrfs_err_rl(fs_info,
+ "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
+ __func__, root_level, root_level, cur_level);
+ return -EUCLEAN;
+ }
+
+ /*
+ * We need to get child blockptr/gen from parent before we can
+ * read it.
+ */
+ eb = dst_path->nodes[cur_level + 1];
+ parent_slot = dst_path->slots[cur_level + 1];
+ child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+ child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+ btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
+
+ /* This node is old, no need to trace */
+ if (child_gen < last_snapshot)
+ goto out;
+
+ eb = read_tree_block(fs_info, child_bytenr, child_gen,
+ cur_level, &first_key);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ ret = -EIO;
+ goto out;
+ }
+
+ dst_path->nodes[cur_level] = eb;
+ dst_path->slots[cur_level] = 0;
+
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+ need_cleanup = true;
+ }
+
+ /* Now record this tree block and its counterpart for qgroups */
+ ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
+ root_level, trace_leaf);
+ if (ret < 0)
+ goto cleanup;
+
+ eb = dst_path->nodes[cur_level];
+
+ if (cur_level > 0) {
+ /* Iterate all child tree blocks */
+ for (i = 0; i < btrfs_header_nritems(eb); i++) {
+ /* Skip old tree blocks as they won't be swapped */
+ if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
+ continue;
+ dst_path->slots[cur_level] = i;
+
+ /* Recursive call (at most 7 times) */
+ ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
+ dst_path, cur_level - 1, root_level,
+ last_snapshot, trace_leaf);
+ if (ret < 0)
+ goto cleanup;
+ }
+ }
+
+cleanup:
+ if (need_cleanup) {
+ /* Clean up */
+ btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
+ dst_path->locks[cur_level]);
+ free_extent_buffer(dst_path->nodes[cur_level]);
+ dst_path->nodes[cur_level] = NULL;
+ dst_path->slots[cur_level] = 0;
+ dst_path->locks[cur_level] = 0;
+ }
+out:
+ return ret;
+}
+
+/*
+ * Inform qgroup to trace subtree swap used in balance.
+ *
+ * Unlike btrfs_qgroup_trace_subtree(), this function will only trace
+ * new tree blocks whose generation is equal to (or larger than) @last_snapshot.
+ *
+ * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and
+ * @dst_slot), and find any tree blocks whose generation is at @last_snapshot,
+ * and then go down @src_eb (pointed by @src_parent and @src_slot) to find
+ * the counterpart of the tree block, then mark both tree blocks as qgroup dirty,
+ * and skip all tree blocks whose generation is smaller than last_snapshot.
+ *
+ * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(),
+ * which could be the cause of very slow balance if the file tree is large.
+ *
+ * @src_parent, @src_slot: pointer to src (file tree) eb.
+ * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb.
+ */
+int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *bg_cache,
+ struct extent_buffer *src_parent, int src_slot,
+ struct extent_buffer *dst_parent, int dst_slot,
+ u64 last_snapshot)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_path *dst_path = NULL;
+ struct btrfs_key first_key;
+ struct extent_buffer *src_eb = NULL;
+ struct extent_buffer *dst_eb = NULL;
+ bool trace_leaf = false;
+ u64 child_gen;
+ u64 child_bytenr;
+ int level;
+ int ret;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+
+ /* Check parameter order */
+ if (btrfs_node_ptr_generation(src_parent, src_slot) >
+ btrfs_node_ptr_generation(dst_parent, dst_slot)) {
+ btrfs_err_rl(fs_info,
+ "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
+ btrfs_node_ptr_generation(src_parent, src_slot),
+ btrfs_node_ptr_generation(dst_parent, dst_slot));
+ return -EUCLEAN;
+ }
+
+ /*
+ * Only trace leaf if we're relocating data block groups, this could
+ * reduce tons of data extents tracing for meta/sys bg relocation.
+ */
+ if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA)
+ trace_leaf = true;
+ /* Read out real @src_eb, pointed by @src_parent and @src_slot */
+ child_bytenr = btrfs_node_blockptr(src_parent, src_slot);
+ child_gen = btrfs_node_ptr_generation(src_parent, src_slot);
+ btrfs_node_key_to_cpu(src_parent, &first_key, src_slot);
+
+ src_eb = read_tree_block(fs_info, child_bytenr, child_gen,
+ btrfs_header_level(src_parent) - 1, &first_key);
+ if (IS_ERR(src_eb)) {
+ ret = PTR_ERR(src_eb);
+ goto out;
+ }
+
+ /* Read out real @dst_eb, pointed by @dst_parent and @dst_slot */
+ child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot);
+ child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot);
+ btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot);
+
+ dst_eb = read_tree_block(fs_info, child_bytenr, child_gen,
+ btrfs_header_level(dst_parent) - 1, &first_key);
+ if (IS_ERR(dst_eb)) {
+ ret = PTR_ERR(dst_eb);
+ goto out;
+ }
+
+ if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ level = btrfs_header_level(dst_eb);
+ dst_path = btrfs_alloc_path();
+ if (!dst_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* For dst_path */
+ extent_buffer_get(dst_eb);
+ dst_path->nodes[level] = dst_eb;
+ dst_path->slots[level] = 0;
+ dst_path->locks[level] = 0;
+
+ /* Do the generation-aware depth-first search */
+ ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
+ level, last_snapshot, trace_leaf);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+out:
+ free_extent_buffer(src_eb);
+ free_extent_buffer(dst_eb);
+ btrfs_free_path(dst_path);
+ if (ret < 0)
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ return ret;
+}
+
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level)
@@ -2133,6 +2543,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
struct btrfs_delayed_ref_root *delayed_refs;
struct ulist *new_roots = NULL;
struct rb_node *node;
+ u64 num_dirty_extents = 0;
u64 qgroup_to_skip;
int ret = 0;
@@ -2142,6 +2553,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
record = rb_entry(node, struct btrfs_qgroup_extent_record,
node);
+ num_dirty_extents++;
trace_btrfs_qgroup_account_extents(fs_info, record);
if (!ret) {
@@ -2187,6 +2599,8 @@ cleanup:
kfree(record);
}
+ trace_qgroup_num_dirty_extents(fs_info, trans->transid,
+ num_dirty_extents);
return ret;
}
@@ -2898,6 +3312,7 @@ qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
qgroup->rfer_cmpr = 0;
qgroup->excl = 0;
qgroup->excl_cmpr = 0;
+ qgroup_dirty(fs_info, qgroup);
}
spin_unlock(&fs_info->qgroup_lock);
}
@@ -3005,7 +3420,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
- !is_fstree(root->objectid) || len == 0)
+ !is_fstree(root->root_key.objectid) || len == 0)
return 0;
/* @reserved parameter is mandatory for qgroup */
@@ -3091,7 +3506,7 @@ static int qgroup_free_reserved_data(struct inode *inode,
goto out;
freed += changeset.bytes_changed;
}
- btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed,
+ btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
BTRFS_QGROUP_RSV_DATA);
ret = freed;
out:
@@ -3107,6 +3522,10 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
int trace_op = QGROUP_RELEASE;
int ret;
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED,
+ &BTRFS_I(inode)->root->fs_info->flags))
+ return 0;
+
/* In release case, we shouldn't have @reserved */
WARN_ON(!free && reserved);
if (free && reserved)
@@ -3123,7 +3542,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
changeset.bytes_changed, trace_op);
if (free)
btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
- BTRFS_I(inode)->root->objectid,
+ BTRFS_I(inode)->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
ret = changeset.bytes_changed;
out:
@@ -3216,7 +3635,7 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->objectid) || num_bytes == 0)
+ !is_fstree(root->root_key.objectid) || num_bytes == 0)
return 0;
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
@@ -3241,13 +3660,13 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
struct btrfs_fs_info *fs_info = root->fs_info;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->objectid))
+ !is_fstree(root->root_key.objectid))
return;
/* TODO: Update trace point to handle such free */
trace_qgroup_meta_free_all_pertrans(root);
/* Special value -1 means to free all reserved space */
- btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1,
+ btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
BTRFS_QGROUP_RSV_META_PERTRANS);
}
@@ -3257,7 +3676,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
struct btrfs_fs_info *fs_info = root->fs_info;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->objectid))
+ !is_fstree(root->root_key.objectid))
return;
/*
@@ -3268,7 +3687,8 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
num_bytes = sub_root_meta_rsv(root, num_bytes, type);
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
trace_qgroup_meta_reserve(root, type, -(s64)num_bytes);
- btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type);
+ btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
+ num_bytes, type);
}
static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
@@ -3322,13 +3742,13 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
struct btrfs_fs_info *fs_info = root->fs_info;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->objectid))
+ !is_fstree(root->root_key.objectid))
return;
/* Same as btrfs_qgroup_free_meta_prealloc() */
num_bytes = sub_root_meta_rsv(root, num_bytes,
BTRFS_QGROUP_RSV_META_PREALLOC);
trace_qgroup_meta_convert(root, num_bytes);
- qgroup_convert_meta(fs_info, root->objectid, num_bytes);
+ qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
}
/*
@@ -3355,7 +3775,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
inode->i_ino, unode->val, unode->aux);
}
btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
- BTRFS_I(inode)->root->objectid,
+ BTRFS_I(inode)->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 54b8bb282c0e..d8f78f5ab854 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -236,6 +236,12 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level);
+
+int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *bg_cache,
+ struct extent_buffer *src_parent, int src_slot,
+ struct extent_buffer *dst_parent, int dst_slot,
+ u64 last_snapshot);
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes, struct ulist *old_roots,
struct ulist *new_roots);
@@ -249,6 +255,8 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes)
{
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return;
trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
BTRFS_QGROUP_RSV_DATA);
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index e5b9e596bb92..d69fbfb30aa9 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -732,7 +732,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
INIT_LIST_HEAD(&ra->list);
ra->action = action;
- ra->root = root->objectid;
+ ra->root = root->root_key.objectid;
/*
* This is an allocation, preallocate the block_entry in case we haven't
@@ -787,8 +787,8 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
* one we want to lookup below when we modify the
* re->num_refs.
*/
- ref_root = root->objectid;
- re->root_objectid = root->objectid;
+ ref_root = root->root_key.objectid;
+ re->root_objectid = root->root_key.objectid;
re->num_refs = 0;
}
@@ -862,7 +862,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
* didn't thik of some other corner case.
*/
btrfs_err(fs_info, "failed to find root %llu for %llu",
- root->objectid, be->bytenr);
+ root->root_key.objectid, be->bytenr);
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
kfree(ra);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 8783a1776540..924116f654a1 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -648,8 +648,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
int level, u64 bytenr)
{
struct backref_cache *cache = &rc->backref_cache;
- struct btrfs_path *path1;
- struct btrfs_path *path2;
+ struct btrfs_path *path1; /* For searching extent root */
+ struct btrfs_path *path2; /* For searching parent of TREE_BLOCK_REF */
struct extent_buffer *eb;
struct btrfs_root *root;
struct backref_node *cur;
@@ -662,7 +662,7 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
struct btrfs_key key;
unsigned long end;
unsigned long ptr;
- LIST_HEAD(list);
+ LIST_HEAD(list); /* Pending edge list, upper node needs to be checked */
LIST_HEAD(useless);
int cowonly;
int ret;
@@ -778,6 +778,10 @@ again:
key.type != BTRFS_SHARED_BLOCK_REF_KEY);
}
+ /*
+ * Parent node found and matches current inline ref, no need to
+ * rebuild this node for this inline ref.
+ */
if (exist &&
((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
exist->owner == key.offset) ||
@@ -787,11 +791,12 @@ again:
goto next;
}
+ /* SHARED_BLOCK_REF means key.offset is the parent bytenr */
if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
if (key.objectid == key.offset) {
/*
- * only root blocks of reloc trees use
- * backref of this type.
+ * Only root blocks of reloc trees use backref
+ * pointing to itself.
*/
root = find_reloc_root(rc, cur->bytenr);
ASSERT(root);
@@ -840,7 +845,11 @@ again:
goto next;
}
- /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
+ /*
+ * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
+ * means the root objectid. We need to search the tree to get
+ * its parent bytenr.
+ */
root = read_fs_root(rc->extent_root->fs_info, key.offset);
if (IS_ERR(root)) {
err = PTR_ERR(root);
@@ -863,10 +872,7 @@ again:
level = cur->level + 1;
- /*
- * searching the tree to find upper level blocks
- * reference the block.
- */
+ /* Search the tree to find parent blocks referring to the block. */
path2->search_commit_root = 1;
path2->skip_locking = 1;
path2->lowest_level = level;
@@ -884,7 +890,8 @@ again:
cur->bytenr) {
btrfs_err(root->fs_info,
"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
- cur->bytenr, level - 1, root->objectid,
+ cur->bytenr, level - 1,
+ root->root_key.objectid,
node_key->objectid, node_key->type,
node_key->offset);
err = -ENOENT;
@@ -892,6 +899,8 @@ again:
}
lower = cur;
need_check = true;
+
+ /* Add all nodes and edges in the path */
for (; level < BTRFS_MAX_LEVEL; level++) {
if (!path2->nodes[level]) {
ASSERT(btrfs_root_bytenr(&root->root_item) ==
@@ -1281,7 +1290,7 @@ static void __del_reloc_root(struct btrfs_root *root)
struct mapping_node *node = NULL;
struct reloc_control *rc = fs_info->reloc_ctl;
- if (rc) {
+ if (rc && root->node) {
spin_lock(&rc->reloc_root_tree.lock);
rb_node = tree_search(&rc->reloc_root_tree.rb_root,
root->node->start);
@@ -1735,7 +1744,7 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
* errors, a negative error number is returned.
*/
static noinline_for_stack
-int replace_path(struct btrfs_trans_handle *trans,
+int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
struct btrfs_root *dest, struct btrfs_root *src,
struct btrfs_path *path, struct btrfs_key *next_key,
int lowest_level, int max_level)
@@ -1879,14 +1888,9 @@ again:
* and tree block numbers, if current trans doesn't free
* data reloc tree inode.
*/
- ret = btrfs_qgroup_trace_subtree(trans, parent,
- btrfs_header_generation(parent),
- btrfs_header_level(parent));
- if (ret < 0)
- break;
- ret = btrfs_qgroup_trace_subtree(trans, path->nodes[level],
- btrfs_header_generation(path->nodes[level]),
- btrfs_header_level(path->nodes[level]));
+ ret = btrfs_qgroup_trace_subtree_swap(trans, rc->block_group,
+ parent, slot, path->nodes[level],
+ path->slots[level], last_snapshot);
if (ret < 0)
break;
@@ -2205,7 +2209,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
ret = 0;
} else {
- ret = replace_path(trans, root, reloc_root, path,
+ ret = replace_path(trans, rc, root, reloc_root, path,
&next_key, level, max_level);
}
if (ret < 0) {
@@ -2911,7 +2915,6 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
free_extent_buffer(eb);
return -EIO;
}
- WARN_ON(btrfs_header_level(eb) != block->level);
if (block->level == 0)
btrfs_item_key_to_cpu(eb, &block->key, 0);
else
@@ -2987,7 +2990,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
struct backref_node *node;
struct btrfs_path *path;
struct tree_block *block;
- struct rb_node *rb_node;
+ struct tree_block *next;
int ret;
int err = 0;
@@ -2997,29 +3000,23 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
goto out_free_blocks;
}
- rb_node = rb_first(blocks);
- while (rb_node) {
- block = rb_entry(rb_node, struct tree_block, rb_node);
+ /* Kick in readahead for tree blocks with missing keys */
+ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
if (!block->key_ready)
readahead_tree_block(fs_info, block->bytenr);
- rb_node = rb_next(rb_node);
}
- rb_node = rb_first(blocks);
- while (rb_node) {
- block = rb_entry(rb_node, struct tree_block, rb_node);
+ /* Get first keys */
+ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
if (!block->key_ready) {
err = get_tree_block_key(fs_info, block);
if (err)
goto out_free_path;
}
- rb_node = rb_next(rb_node);
}
- rb_node = rb_first(blocks);
- while (rb_node) {
- block = rb_entry(rb_node, struct tree_block, rb_node);
-
+ /* Do tree relocation */
+ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
node = build_backref_tree(rc, &block->key,
block->level, block->bytenr);
if (IS_ERR(node)) {
@@ -3030,11 +3027,10 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
ret = relocate_tree_block(trans, rc, node, &block->key,
path);
if (ret < 0) {
- if (ret != -EAGAIN || rb_node == rb_first(blocks))
+ if (ret != -EAGAIN || &block->rb_node == rb_first(blocks))
err = ret;
goto out;
}
- rb_node = rb_next(rb_node);
}
out:
err = finish_pending_nodes(trans, rc, path, err);
@@ -4669,7 +4665,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
if (rc->merge_reloc_tree) {
ret = btrfs_block_rsv_migrate(&pending->block_rsv,
rc->block_rsv,
- rc->nodes_relocated, 1);
+ rc->nodes_relocated, true);
if (ret)
return ret;
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3be1456b5116..902819d3cf41 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1124,7 +1124,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
if (scrub_write_page_to_dev_replace(sblock_other,
page_num) != 0) {
- btrfs_dev_replace_stats_inc(
+ atomic64_inc(
&fs_info->dev_replace.num_write_errors);
success = 0;
}
@@ -1564,8 +1564,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
if (btrfsic_submit_bio_wait(bio)) {
btrfs_dev_stat_inc_and_print(page_bad->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
- btrfs_dev_replace_stats_inc(
- &fs_info->dev_replace.num_write_errors);
+ atomic64_inc(&fs_info->dev_replace.num_write_errors);
bio_put(bio);
return -EIO;
}
@@ -1592,8 +1591,7 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
ret = scrub_write_page_to_dev_replace(sblock, page_num);
if (ret)
- btrfs_dev_replace_stats_inc(
- &fs_info->dev_replace.num_write_errors);
+ atomic64_inc(&fs_info->dev_replace.num_write_errors);
}
}
@@ -1726,8 +1724,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
struct scrub_page *spage = sbio->pagev[i];
spage->io_error = 1;
- btrfs_dev_replace_stats_inc(&dev_replace->
- num_write_errors);
+ atomic64_inc(&dev_replace->num_write_errors);
}
}
@@ -3022,8 +3019,7 @@ out:
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *scrub_dev,
- int num, u64 base, u64 length,
- int is_dev_replace)
+ int num, u64 base, u64 length)
{
struct btrfs_path *path, *ppath;
struct btrfs_fs_info *fs_info = sctx->fs_info;
@@ -3299,7 +3295,7 @@ again:
extent_physical = extent_logical - logical + physical;
extent_dev = scrub_dev;
extent_mirror_num = mirror_num;
- if (is_dev_replace)
+ if (sctx->is_dev_replace)
scrub_remap_extent(fs_info, extent_logical,
extent_len, &extent_physical,
&extent_dev,
@@ -3397,8 +3393,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
u64 chunk_offset, u64 length,
u64 dev_offset,
- struct btrfs_block_group_cache *cache,
- int is_dev_replace)
+ struct btrfs_block_group_cache *cache)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
@@ -3435,8 +3430,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
map->stripes[i].physical == dev_offset) {
ret = scrub_stripe(sctx, map, scrub_dev, i,
- chunk_offset, length,
- is_dev_replace);
+ chunk_offset, length);
if (ret)
goto out;
}
@@ -3449,8 +3443,7 @@ out:
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_ctx *sctx,
- struct btrfs_device *scrub_dev, u64 start, u64 end,
- int is_dev_replace)
+ struct btrfs_device *scrub_dev, u64 start, u64 end)
{
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_path *path;
@@ -3544,7 +3537,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
*/
scrub_pause_on(fs_info);
ret = btrfs_inc_block_group_ro(cache);
- if (!ret && is_dev_replace) {
+ if (!ret && sctx->is_dev_replace) {
/*
* If we are doing a device replace wait for any tasks
* that started dellaloc right before we set the block
@@ -3609,7 +3602,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_replace->item_needs_writeback = 1;
btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
- found_key.offset, cache, is_dev_replace);
+ found_key.offset, cache);
/*
* flush, submit all pending read and write bios, afterwards
@@ -3670,7 +3663,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
btrfs_put_block_group(cache);
if (ret)
break;
- if (is_dev_replace &&
+ if (sctx->is_dev_replace &&
atomic64_read(&dev_replace->num_write_errors) > 0) {
ret = -EIO;
break;
@@ -3893,8 +3886,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
}
if (!ret)
- ret = scrub_enumerate_chunks(sctx, dev, start, end,
- is_dev_replace);
+ ret = scrub_enumerate_chunks(sctx, dev, start, end);
wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
atomic_dec(&fs_info->scrubs_running);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ba8950bfd9c7..094cc1444a90 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1186,9 +1186,9 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt)
u64 root = (u64)(uintptr_t)key;
struct clone_root *cr = (struct clone_root *)elt;
- if (root < cr->root->objectid)
+ if (root < cr->root->root_key.objectid)
return -1;
- if (root > cr->root->objectid)
+ if (root > cr->root->root_key.objectid)
return 1;
return 0;
}
@@ -1198,9 +1198,9 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
struct clone_root *cr1 = (struct clone_root *)e1;
struct clone_root *cr2 = (struct clone_root *)e2;
- if (cr1->root->objectid < cr2->root->objectid)
+ if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
return -1;
- if (cr1->root->objectid > cr2->root->objectid)
+ if (cr1->root->root_key.objectid > cr2->root->root_key.objectid)
return 1;
return 0;
}
@@ -1693,12 +1693,8 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
di = btrfs_lookup_dir_item(NULL, root, path,
dir, name, name_len, 0);
- if (!di) {
- ret = -ENOENT;
- goto out;
- }
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
+ if (IS_ERR_OR_NULL(di)) {
+ ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
@@ -2346,7 +2342,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
return -ENOMEM;
}
- key.objectid = send_root->objectid;
+ key.objectid = send_root->root_key.objectid;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = 0;
@@ -2362,7 +2358,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.type != BTRFS_ROOT_BACKREF_KEY ||
- key.objectid != send_root->objectid) {
+ key.objectid != send_root->root_key.objectid) {
ret = -ENOENT;
goto out;
}
@@ -4907,8 +4903,8 @@ static int send_clone(struct send_ctx *sctx,
btrfs_debug(sctx->send_root->fs_info,
"send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
- offset, len, clone_root->root->objectid, clone_root->ino,
- clone_root->offset);
+ offset, len, clone_root->root->root_key.objectid,
+ clone_root->ino, clone_root->offset);
p = fs_path_alloc();
if (!p)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6601c9aa5e35..b362b45dd757 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2177,8 +2177,10 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
/* Mask in the root object ID too, to disambiguate subvols */
- buf->f_fsid.val[0] ^= BTRFS_I(d_inode(dentry))->root->objectid >> 32;
- buf->f_fsid.val[1] ^= BTRFS_I(d_inode(dentry))->root->objectid;
+ buf->f_fsid.val[0] ^=
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
+ buf->f_fsid.val[1] ^=
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid;
return 0;
}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index d9269a531a4d..9e0f4a01be14 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -106,7 +106,7 @@ static int test_find_delalloc(u32 sectorsize)
set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL);
start = 0;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
test_err("should have found at least one delalloc");
@@ -137,7 +137,7 @@ static int test_find_delalloc(u32 sectorsize)
set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL);
start = test_start;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
test_err("couldn't find delalloc in our range");
@@ -171,7 +171,7 @@ static int test_find_delalloc(u32 sectorsize)
}
start = test_start;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (found) {
test_err("found range when we shouldn't have");
@@ -192,7 +192,7 @@ static int test_find_delalloc(u32 sectorsize)
set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL);
start = test_start;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
test_err("didn't find our range");
@@ -233,7 +233,7 @@ static int test_find_delalloc(u32 sectorsize)
* this changes at any point in the future we will need to fix this
* tests expected behavior.
*/
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
test_err("didn't find our range");
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 385a5316e4bf..bf15d3a7f20e 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -12,8 +12,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
struct extent_map *em;
struct rb_node *node;
- while (!RB_EMPTY_ROOT(&em_tree->map)) {
- node = rb_first(&em_tree->map);
+ while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) {
+ node = rb_first_cached(&em_tree->map);
em = rb_entry(node, struct extent_map, rb_node);
remove_extent_mapping(em_tree, em);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3b84f5015029..d1eeef9ec5da 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -44,7 +44,8 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
WARN_ON(refcount_read(&transaction->use_count) == 0);
if (refcount_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
- WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
+ WARN_ON(!RB_EMPTY_ROOT(
+ &transaction->delayed_refs.href_root.rb_root));
if (transaction->delayed_refs.pending_csums)
btrfs_err(transaction->fs_info,
"pending csums is %llu",
@@ -118,7 +119,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans)
list_del_init(&root->dirty_list);
free_extent_buffer(root->commit_root);
root->commit_root = btrfs_root_node(root);
- if (is_fstree(root->objectid))
+ if (is_fstree(root->root_key.objectid))
btrfs_unpin_free_ino(root);
clear_btree_io_tree(&root->dirty_log_pages);
}
@@ -245,7 +246,7 @@ loop:
memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
- cur_trans->delayed_refs.href_root = RB_ROOT;
+ cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
atomic_set(&cur_trans->delayed_refs.num_entries, 0);
@@ -759,7 +760,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- if (btrfs_check_space_for_delayed_refs(trans, fs_info))
+ if (btrfs_check_space_for_delayed_refs(trans))
return 1;
return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
@@ -834,7 +835,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
trans->delayed_ref_updates = 0;
if (!trans->sync) {
must_run_delayed_refs =
- btrfs_should_throttle_delayed_refs(trans, info);
+ btrfs_should_throttle_delayed_refs(trans);
cur = max_t(unsigned long, cur, 32);
/*
@@ -1197,7 +1198,10 @@ again:
list_add_tail(&fs_info->extent_root->dirty_list,
&trans->transaction->switch_commits);
- btrfs_after_dev_replace_commit(fs_info);
+
+ /* Update dev-replace pointer once everything is committed */
+ fs_info->dev_replace.committed_cursor_left =
+ fs_info->dev_replace.cursor_left_last_write_of_item;
return 0;
}
@@ -1613,10 +1617,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
if (ret < 0)
goto fail;
- ret = btrfs_insert_dir_item(trans, parent_root,
- dentry->d_name.name, dentry->d_name.len,
- BTRFS_I(parent_inode), &key,
- BTRFS_FT_DIR, index);
+ ret = btrfs_insert_dir_item(trans, dentry->d_name.name,
+ dentry->d_name.len, BTRFS_I(parent_inode),
+ &key, BTRFS_FT_DIR, index);
/* We have check then name at the beginning, so it is impossible. */
BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
if (ret) {
@@ -1929,6 +1932,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
return ret;
}
+ btrfs_trans_release_metadata(trans);
+ trans->block_rsv = NULL;
+
/* make a pass through all the delayed refs we have so far
* any runnings procs may add more while we are here
*/
@@ -1938,9 +1944,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
return ret;
}
- btrfs_trans_release_metadata(trans);
- trans->block_rsv = NULL;
-
cur_trans = trans->transaction;
/*
@@ -2280,15 +2283,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
kmem_cache_free(btrfs_trans_handle_cachep, trans);
- /*
- * If fs has been frozen, we can not handle delayed iputs, otherwise
- * it'll result in deadlock about SB_FREEZE_FS.
- */
- if (current != fs_info->transaction_kthread &&
- current != fs_info->cleaner_kthread &&
- !test_bit(BTRFS_FS_FROZEN, &fs_info->flags))
- btrfs_run_delayed_iputs(fs_info);
-
return ret;
scrub_continue:
@@ -2330,7 +2324,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
list_del_init(&root->root_list);
spin_unlock(&fs_info->trans_lock);
- btrfs_debug(fs_info, "cleaner removing %llu", root->objectid);
+ btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);
btrfs_kill_all_delayed_nodes(root);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index db835635372f..cab0b1f1f741 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -487,6 +487,13 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
u32 nritems = btrfs_header_nritems(leaf);
int slot;
+ if (btrfs_header_level(leaf) != 0) {
+ generic_err(fs_info, leaf, 0,
+ "invalid level for leaf, have %d expect 0",
+ btrfs_header_level(leaf));
+ return -EUCLEAN;
+ }
+
/*
* Extent buffers from a relocation tree have a owner field that
* corresponds to the subvolume tree they are based on. So just from an
@@ -645,9 +652,16 @@ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node)
unsigned long nr = btrfs_header_nritems(node);
struct btrfs_key key, next_key;
int slot;
+ int level = btrfs_header_level(node);
u64 bytenr;
int ret = 0;
+ if (level <= 0 || level >= BTRFS_MAX_LEVEL) {
+ generic_err(fs_info, node, 0,
+ "invalid level for node, have %d expect [1, %d]",
+ level, BTRFS_MAX_LEVEL - 1);
+ return -EUCLEAN;
+ }
if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info)) {
btrfs_crit(fs_info,
"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]",
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1650dc44a5e3..e07f3376b7df 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -205,14 +205,11 @@ static int join_running_log_trans(struct btrfs_root *root)
* until you call btrfs_end_log_trans() or it makes any future
* log transactions wait until you call btrfs_end_log_trans()
*/
-int btrfs_pin_log_trans(struct btrfs_root *root)
+void btrfs_pin_log_trans(struct btrfs_root *root)
{
- int ret = -ENOENT;
-
mutex_lock(&root->log_mutex);
atomic_inc(&root->log_writers);
mutex_unlock(&root->log_mutex);
- return ret;
}
/*
@@ -258,6 +255,13 @@ struct walk_control {
/* what stage of the replay code we're currently in */
int stage;
+ /*
+ * Ignore any items from the inode currently being processed. Needs
+ * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
+ * the LOG_WALK_REPLAY_INODES stage.
+ */
+ bool ignore_cur_inode;
+
/* the root we are currently replaying */
struct btrfs_root *replay_dest;
@@ -2487,6 +2491,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
inode_item = btrfs_item_ptr(eb, i,
struct btrfs_inode_item);
+ /*
+ * If we have a tmpfile (O_TMPFILE) that got fsync'ed
+ * and never got linked before the fsync, skip it, as
+ * replaying it is pointless since it would be deleted
+ * later. We skip logging tmpfiles, but it's always
+ * possible we are replaying a log created with a kernel
+ * that used to log tmpfiles.
+ */
+ if (btrfs_inode_nlink(eb, inode_item) == 0) {
+ wc->ignore_cur_inode = true;
+ continue;
+ } else {
+ wc->ignore_cur_inode = false;
+ }
ret = replay_xattr_deletes(wc->trans, root, log,
path, key.objectid);
if (ret)
@@ -2524,16 +2542,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
root->fs_info->sectorsize);
ret = btrfs_drop_extents(wc->trans, root, inode,
from, (u64)-1, 1);
- /*
- * If the nlink count is zero here, the iput
- * will free the inode. We bump it to make
- * sure it doesn't get freed until the link
- * count fixup is done.
- */
if (!ret) {
- if (inode->i_nlink == 0)
- inc_nlink(inode);
- /* Update link count and nbytes. */
+ /* Update the inode's nbytes. */
ret = btrfs_update_inode(wc->trans,
root, inode);
}
@@ -2548,6 +2558,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
break;
}
+ if (wc->ignore_cur_inode)
+ continue;
+
if (key.type == BTRFS_DIR_INDEX_KEY &&
wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
ret = replay_one_dir_item(wc->trans, root, path,
@@ -3196,9 +3209,12 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
};
ret = walk_log_tree(trans, log, &wc);
- /* I don't think this can happen but just in case */
- if (ret)
- btrfs_abort_transaction(trans, ret);
+ if (ret) {
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(log->fs_info, ret, NULL);
+ }
while (1) {
ret = find_first_extent_bit(&log->dirty_log_pages,
@@ -4374,7 +4390,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&extents);
- down_write(&inode->dio_sem);
write_lock(&tree->lock);
test_gen = root->fs_info->last_trans_committed;
logged_start = start;
@@ -4440,7 +4455,6 @@ process:
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
- up_write(&inode->dio_sem);
btrfs_release_path(path);
if (!ret)
@@ -4636,7 +4650,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
ASSERT(len == i_size ||
(len == fs_info->sectorsize &&
btrfs_file_extent_compression(leaf, extent) !=
- BTRFS_COMPRESS_NONE));
+ BTRFS_COMPRESS_NONE) ||
+ (len < i_size && i_size < fs_info->sectorsize));
return 0;
}
@@ -5564,9 +5579,33 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
dir_inode = btrfs_iget(fs_info->sb, &inode_key,
root, NULL);
- /* If parent inode was deleted, skip it. */
- if (IS_ERR(dir_inode))
- continue;
+ /*
+ * If the parent inode was deleted, return an error to
+ * fall back to a transaction commit. This is to prevent
+ * getting an inode that was moved from one parent A to
+ * a parent B, got its former parent A deleted and then
+ * it got fsync'ed, from existing at both parents after
+ * a log replay (and the old parent still existing).
+ * Example:
+ *
+ * mkdir /mnt/A
+ * mkdir /mnt/B
+ * touch /mnt/B/bar
+ * sync
+ * mv /mnt/B/bar /mnt/A/bar
+ * mv -T /mnt/A /mnt/B
+ * fsync /mnt/B/bar
+ * <power fail>
+ *
+ * If we ignore the old parent B which got deleted,
+ * after a log replay we would have file bar linked
+ * at both parents and the old parent B would still
+ * exist.
+ */
+ if (IS_ERR(dir_inode)) {
+ ret = PTR_ERR(dir_inode);
+ goto out;
+ }
if (ctx)
ctx->log_new_dentries = false;
@@ -5640,7 +5679,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- if (btrfs_inode_in_log(inode, trans->transid)) {
+ /*
+ * Skip already logged inodes or inodes corresponding to tmpfiles
+ * (since logging them is pointless, a link count of 0 means they
+ * will never be accessible).
+ */
+ if (btrfs_inode_in_log(inode, trans->transid) ||
+ inode->vfs_inode.i_nlink == 0) {
ret = BTRFS_NO_LOG_SYNC;
goto end_no_trans;
}
@@ -6025,14 +6070,25 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
* Call this after adding a new name for a file and it will properly
* update the log to reflect the new name.
*
- * It will return zero if all goes well, and it will return 1 if a
- * full transaction commit is required.
+ * @ctx can not be NULL when @sync_log is false, and should be NULL when it's
+ * true (because it's not used).
+ *
+ * Return value depends on whether @sync_log is true or false.
+ * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
+ * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT
+ * otherwise.
+ * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to
+ * sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log,
+ * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
+ * committed (without attempting to sync the log).
*/
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent)
+ struct dentry *parent,
+ bool sync_log, struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
+ int ret;
/*
* this will force the logging code to walk the dentry chain
@@ -6047,9 +6103,34 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
*/
if (inode->logged_trans <= fs_info->last_trans_committed &&
(!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
- return 0;
+ return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT :
+ BTRFS_DONT_NEED_LOG_SYNC;
+
+ if (sync_log) {
+ struct btrfs_log_ctx ctx2;
+
+ btrfs_init_log_ctx(&ctx2, &inode->vfs_inode);
+ ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
+ LOG_INODE_EXISTS, &ctx2);
+ if (ret == BTRFS_NO_LOG_SYNC)
+ return BTRFS_DONT_NEED_TRANS_COMMIT;
+ else if (ret)
+ return BTRFS_NEED_TRANS_COMMIT;
+
+ ret = btrfs_sync_log(trans, inode->root, &ctx2);
+ if (ret)
+ return BTRFS_NEED_TRANS_COMMIT;
+ return BTRFS_DONT_NEED_TRANS_COMMIT;
+ }
+
+ ASSERT(ctx);
+ ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
+ LOG_INODE_EXISTS, ctx);
+ if (ret == BTRFS_NO_LOG_SYNC)
+ return BTRFS_DONT_NEED_LOG_SYNC;
+ else if (ret)
+ return BTRFS_NEED_TRANS_COMMIT;
- return btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, NULL);
+ return BTRFS_NEED_LOG_SYNC;
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 122e68b89a5a..767765031e59 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -65,14 +65,22 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *inode, u64 dirid);
void btrfs_end_log_trans(struct btrfs_root *root);
-int btrfs_pin_log_trans(struct btrfs_root *root);
+void btrfs_pin_log_trans(struct btrfs_root *root);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir, struct btrfs_inode *inode,
int for_rename);
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir);
+/* Return values for btrfs_log_new_name() */
+enum {
+ BTRFS_DONT_NEED_TRANS_COMMIT,
+ BTRFS_NEED_TRANS_COMMIT,
+ BTRFS_DONT_NEED_LOG_SYNC,
+ BTRFS_NEED_LOG_SYNC,
+};
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent);
+ struct dentry *parent,
+ bool sync_log, struct btrfs_log_ctx *ctx);
#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index da86706123ff..f435d397019e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1613,7 +1613,7 @@ static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
em_tree = &fs_info->mapping_tree.map_tree;
read_lock(&em_tree->lock);
- n = rb_last(&em_tree->map);
+ n = rb_last(&em_tree->map.rb_root);
if (n) {
em = rb_entry(n, struct extent_map, rb_node);
ret = em->start + em->len;
@@ -1854,6 +1854,24 @@ void btrfs_assign_next_active_device(struct btrfs_device *device,
fs_info->fs_devices->latest_bdev = next_device->bdev;
}
+/*
+ * Return btrfs_fs_devices::num_devices excluding the device that's being
+ * currently replaced.
+ */
+static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
+{
+ u64 num_devices = fs_info->fs_devices->num_devices;
+
+ btrfs_dev_replace_read_lock(&fs_info->dev_replace);
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
+ ASSERT(num_devices > 1);
+ num_devices--;
+ }
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+
+ return num_devices;
+}
+
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
u64 devid)
{
@@ -1865,22 +1883,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
mutex_lock(&uuid_mutex);
- num_devices = fs_devices->num_devices;
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
- WARN_ON(num_devices < 1);
- num_devices--;
- }
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ num_devices = btrfs_num_devices(fs_info);
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
goto out;
- ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
- &device);
- if (ret)
+ device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
+
+ if (IS_ERR(device)) {
+ if (PTR_ERR(device) == -ENOENT &&
+ strcmp(device_path, "missing") == 0)
+ ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
+ else
+ ret = PTR_ERR(device);
goto out;
+ }
if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = BTRFS_ERROR_DEV_TGT_REPLACE;
@@ -2096,9 +2114,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
call_rcu(&tgtdev->rcu, free_device_rcu);
}
-static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
- const char *device_path,
- struct btrfs_device **device)
+static struct btrfs_device *btrfs_find_device_by_path(
+ struct btrfs_fs_info *fs_info, const char *device_path)
{
int ret = 0;
struct btrfs_super_block *disk_super;
@@ -2106,28 +2123,27 @@ static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
u8 *dev_uuid;
struct block_device *bdev;
struct buffer_head *bh;
+ struct btrfs_device *device;
- *device = NULL;
ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
fs_info->bdev_holder, 0, &bdev, &bh);
if (ret)
- return ret;
+ return ERR_PTR(ret);
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
- *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
+ device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
brelse(bh);
- if (!*device)
- ret = -ENOENT;
+ if (!device)
+ device = ERR_PTR(-ENOENT);
blkdev_put(bdev, FMODE_READ);
- return ret;
+ return device;
}
-int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
- const char *device_path,
- struct btrfs_device **device)
+static struct btrfs_device *btrfs_find_device_missing_or_by_path(
+ struct btrfs_fs_info *fs_info, const char *device_path)
{
- *device = NULL;
+ struct btrfs_device *device = NULL;
if (strcmp(device_path, "missing") == 0) {
struct list_head *devices;
struct btrfs_device *tmp;
@@ -2136,42 +2152,38 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
list_for_each_entry(tmp, devices, dev_list) {
if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&tmp->dev_state) && !tmp->bdev) {
- *device = tmp;
+ device = tmp;
break;
}
}
- if (!*device)
- return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
-
- return 0;
+ if (!device)
+ return ERR_PTR(-ENOENT);
} else {
- return btrfs_find_device_by_path(fs_info, device_path, device);
+ device = btrfs_find_device_by_path(fs_info, device_path);
}
+
+ return device;
}
/*
* Lookup a device given by device id, or the path if the id is 0.
*/
-int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
- const char *devpath,
- struct btrfs_device **device)
+struct btrfs_device *btrfs_find_device_by_devspec(
+ struct btrfs_fs_info *fs_info, u64 devid, const char *devpath)
{
- int ret;
+ struct btrfs_device *device;
if (devid) {
- ret = 0;
- *device = btrfs_find_device(fs_info, devid, NULL, NULL);
- if (!*device)
- ret = -ENOENT;
+ device = btrfs_find_device(fs_info, devid, NULL, NULL);
+ if (!device)
+ return ERR_PTR(-ENOENT);
} else {
if (!devpath || !devpath[0])
- return -EINVAL;
-
- ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
- device);
+ return ERR_PTR(-EINVAL);
+ device = btrfs_find_device_missing_or_by_path(fs_info, devpath);
}
- return ret;
+ return device;
}
/*
@@ -3679,7 +3691,7 @@ static int alloc_profile_is_valid(u64 flags, int extended)
return !extended; /* "0" is valid for usual profiles */
/* true if exactly one bit set */
- return (flags & (flags - 1)) == 0;
+ return is_power_of_2(flags);
}
static inline int balance_need_close(struct btrfs_fs_info *fs_info)
@@ -3740,13 +3752,8 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
}
}
- num_devices = fs_info->fs_devices->num_devices;
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
- BUG_ON(num_devices < 1);
- num_devices--;
- }
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ num_devices = btrfs_num_devices(fs_info);
+
allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
if (num_devices > 1)
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
@@ -4491,7 +4498,12 @@ again:
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
- btrfs_end_transaction(trans);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ } else {
+ ret = btrfs_commit_transaction(trans);
+ }
done:
btrfs_free_path(path);
if (ret) {
@@ -5892,7 +5904,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
out:
if (dev_replace_is_ongoing) {
- btrfs_dev_replace_clear_lock_blocking(dev_replace);
+ ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
+ btrfs_dev_replace_read_lock(dev_replace);
+ /* Barrier implied by atomic_dec_and_test */
+ if (atomic_dec_and_test(&dev_replace->blocking_readers))
+ cond_wake_up_nomb(&dev_replace->read_lock_wq);
btrfs_dev_replace_read_unlock(dev_replace);
}
free_extent_map(em);
@@ -7433,7 +7449,7 @@ static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
int ret = 0;
read_lock(&em_tree->lock);
- for (node = rb_first(&em_tree->map); node; node = rb_next(node)) {
+ for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
em = rb_entry(node, struct extent_map, rb_node);
if (em->map_lookup->num_stripes !=
em->map_lookup->verified_stripes) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 23e9285d88de..aefce895e994 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -410,12 +410,9 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
void btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *this_dev);
-int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
- const char *device_path,
- struct btrfs_device **device);
-int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
- const char *devpath,
- struct btrfs_device **device);
+struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info,
+ u64 devid,
+ const char *devpath);
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
const u8 *uuid);
diff --git a/fs/buffer.c b/fs/buffer.c
index 4cc679d5bf58..d60d61e8ed7d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -39,7 +39,6 @@
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
-#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/mpage.h>
@@ -563,7 +562,7 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
EXPORT_SYMBOL(mark_buffer_dirty_inode);
/*
- * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
+ * Mark the page dirty, and set it dirty in the page cache, and mark the inode
* dirty.
*
* If warn is true, then emit a warning if the page is not uptodate and has
@@ -580,8 +579,8 @@ void __set_page_dirty(struct page *page, struct address_space *mapping,
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->i_pages,
- page_index(page), PAGECACHE_TAG_DIRTY);
+ __xa_set_mark(&mapping->i_pages, page_index(page),
+ PAGECACHE_TAG_DIRTY);
}
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
@@ -1051,7 +1050,7 @@ __getblk_slow(struct block_device *bdev, sector_t block,
* The relationship between dirty buffers and dirty pages:
*
* Whenever a page has any dirty buffers, the page's dirty bit is set, and
- * the page is tagged dirty in its radix tree.
+ * the page is tagged dirty in the page cache.
*
* At all times, the dirtiness of the buffers represents the dirtiness of
* subsections of the page. If the page has buffers, the page dirty bit is
@@ -1074,9 +1073,9 @@ __getblk_slow(struct block_device *bdev, sector_t block,
* mark_buffer_dirty - mark a buffer_head as needing writeout
* @bh: the buffer_head to mark dirty
*
- * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
- * backing page dirty, then tag the page as dirty in its address_space's radix
- * tree and then attach the address_space's inode to its superblock's dirty