aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/cache.c100
-rw-r--r--fs/9p/v9fs.c7
-rw-r--r--fs/9p/vfs_inode.c61
-rw-r--r--fs/9p/vfs_super.c2
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/adfs/dir.c24
-rw-r--r--fs/affs/namei.c15
-rw-r--r--fs/affs/super.c1
-rw-r--r--fs/afs/Makefile2
-rw-r--r--fs/afs/addr_list.c31
-rw-r--r--fs/afs/afs.h27
-rw-r--r--fs/afs/afs_fs.h2
-rw-r--r--fs/afs/cache.c150
-rw-r--r--fs/afs/callback.c113
-rw-r--r--fs/afs/cell.c18
-rw-r--r--fs/afs/cmservice.c89
-rw-r--r--fs/afs/dir.c939
-rw-r--r--fs/afs/dir_edit.c505
-rw-r--r--fs/afs/dynroot.c209
-rw-r--r--fs/afs/file.c35
-rw-r--r--fs/afs/flock.c8
-rw-r--r--fs/afs/fsclient.c635
-rw-r--r--fs/afs/inode.c131
-rw-r--r--fs/afs/internal.h137
-rw-r--r--fs/afs/main.c44
-rw-r--r--fs/afs/proc.c416
-rw-r--r--fs/afs/rotate.c22
-rw-r--r--fs/afs/rxrpc.c27
-rw-r--r--fs/afs/security.c28
-rw-r--r--fs/afs/server.c44
-rw-r--r--fs/afs/server_list.c7
-rw-r--r--fs/afs/super.c15
-rw-r--r--fs/afs/vlclient.c47
-rw-r--r--fs/afs/volume.c6
-rw-r--r--fs/afs/write.c52
-rw-r--r--fs/afs/xdr_fs.h103
-rw-r--r--fs/aio.c736
-rw-r--r--fs/attr.c36
-rw-r--r--fs/autofs4/root.c2
-rw-r--r--fs/autofs4/waitq.c29
-rw-r--r--fs/befs/linuxvfs.c17
-rw-r--r--fs/bfs/dir.c43
-rw-r--r--fs/binfmt_aout.c1
-rw-r--r--fs/binfmt_elf.c23
-rw-r--r--fs/binfmt_elf_fdpic.c1
-rw-r--r--fs/binfmt_flat.c1
-rw-r--r--fs/block_dev.c35
-rw-r--r--fs/btrfs/Kconfig2
-rw-r--r--fs/btrfs/acl.c15
-rw-r--r--fs/btrfs/async-thread.c15
-rw-r--r--fs/btrfs/async-thread.h21
-rw-r--r--fs/btrfs/backref.c15
-rw-r--r--fs/btrfs/backref.h19
-rw-r--r--fs/btrfs/btrfs_inode.h41
-rw-r--r--fs/btrfs/check-integrity.c15
-rw-r--r--fs/btrfs/check-integrity.h19
-rw-r--r--fs/btrfs/compression.c24
-rw-r--r--fs/btrfs/compression.h21
-rw-r--r--fs/btrfs/ctree.c160
-rw-r--r--fs/btrfs/ctree.h123
-rw-r--r--fs/btrfs/dedupe.h20
-rw-r--r--fs/btrfs/delayed-inode.c44
-rw-r--r--fs/btrfs/delayed-inode.h19
-rw-r--r--fs/btrfs/delayed-ref.c285
-rw-r--r--fs/btrfs/delayed-ref.h27
-rw-r--r--fs/btrfs/dev-replace.c166
-rw-r--r--fs/btrfs/dev-replace.h20
-rw-r--r--fs/btrfs/dir-item.c15
-rw-r--r--fs/btrfs/disk-io.c444
-rw-r--r--fs/btrfs/disk-io.h20
-rw-r--r--fs/btrfs/export.c1
-rw-r--r--fs/btrfs/export.h1
-rw-r--r--fs/btrfs/extent-tree.c350
-rw-r--r--fs/btrfs/extent_io.c104
-rw-r--r--fs/btrfs/extent_io.h26
-rw-r--r--fs/btrfs/extent_map.c7
-rw-r--r--fs/btrfs/extent_map.h9
-rw-r--r--fs/btrfs/file-item.c15
-rw-r--r--fs/btrfs/file.c17
-rw-r--r--fs/btrfs/free-space-cache.c21
-rw-r--r--fs/btrfs/free-space-cache.h19
-rw-r--r--fs/btrfs/free-space-tree.c207
-rw-r--r--fs/btrfs/free-space-tree.h27
-rw-r--r--fs/btrfs/inode-item.c15
-rw-r--r--fs/btrfs/inode-map.c15
-rw-r--r--fs/btrfs/inode-map.h5
-rw-r--r--fs/btrfs/inode.c1412
-rw-r--r--fs/btrfs/ioctl.c1147
-rw-r--r--fs/btrfs/locking.c50
-rw-r--r--fs/btrfs/locking.h19
-rw-r--r--fs/btrfs/lzo.c91
-rw-r--r--fs/btrfs/math.h20
-rw-r--r--fs/btrfs/ordered-data.c29
-rw-r--r--fs/btrfs/ordered-data.h20
-rw-r--r--fs/btrfs/orphan.c15
-rw-r--r--fs/btrfs/print-tree.c61
-rw-r--r--fs/btrfs/print-tree.h23
-rw-r--r--fs/btrfs/props.c27
-rw-r--r--fs/btrfs/props.h19
-rw-r--r--fs/btrfs/qgroup.c127
-rw-r--r--fs/btrfs/qgroup.h22
-rw-r--r--fs/btrfs/raid56.c54
-rw-r--r--fs/btrfs/raid56.h21
-rw-r--r--fs/btrfs/rcu-string.h20
-rw-r--r--fs/btrfs/reada.c15
-rw-r--r--fs/btrfs/ref-verify.c15
-rw-r--r--fs/btrfs/ref-verify.h23
-rw-r--r--fs/btrfs/relocation.c25
-rw-r--r--fs/btrfs/root-tree.c15
-rw-r--r--fs/btrfs/scrub.c16
-rw-r--r--fs/btrfs/send.c65
-rw-r--r--fs/btrfs/send.h20
-rw-r--r--fs/btrfs/struct-funcs.c15
-rw-r--r--fs/btrfs/super.c22
-rw-r--r--fs/btrfs/sysfs.c67
-rw-r--r--fs/btrfs/sysfs.h11
-rw-r--r--fs/btrfs/tests/btrfs-tests.c19
-rw-r--r--fs/btrfs/tests/btrfs-tests.h25
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c71
-rw-r--r--fs/btrfs/tests/extent-io-tests.c90
-rw-r--r--fs/btrfs/tests/extent-map-tests.c105
-rw-r--r--fs/btrfs/tests/free-space-tests.c192
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c144
-rw-r--r--fs/btrfs/tests/inode-tests.c327
-rw-r--r--fs/btrfs/tests/qgroup-tests.c115
-rw-r--r--fs/btrfs/transaction.c31
-rw-r--r--fs/btrfs/transaction.h35
-rw-r--r--fs/btrfs/tree-checker.c13
-rw-r--r--fs/btrfs/tree-checker.h17
-rw-r--r--fs/btrfs/tree-defrag.c15
-rw-r--r--fs/btrfs/tree-log.c211
-rw-r--r--fs/btrfs/tree-log.h20
-rw-r--r--fs/btrfs/ulist.c2
-rw-r--r--fs/btrfs/ulist.h7
-rw-r--r--fs/btrfs/uuid-tree.c26
-rw-r--r--fs/btrfs/volumes.c531
-rw-r--r--fs/btrfs/volumes.h43
-rw-r--r--fs/btrfs/xattr.c16
-rw-r--r--fs/btrfs/xattr.h21
-rw-r--r--fs/btrfs/zlib.c15
-rw-r--r--fs/btrfs/zstd.c10
-rw-r--r--fs/buffer.c157
-rw-r--r--fs/cachefiles/interface.c61
-rw-r--r--fs/cachefiles/internal.h2
-rw-r--r--fs/cachefiles/main.c1
-rw-r--r--fs/cachefiles/namei.c85
-rw-r--r--fs/cachefiles/proc.c19
-rw-r--r--fs/cachefiles/rdwr.c1
-rw-r--r--fs/cachefiles/xattr.c8
-rw-r--r--fs/ceph/Makefile2
-rw-r--r--fs/ceph/addr.c63
-rw-r--r--fs/ceph/cache.c117
-rw-r--r--fs/ceph/caps.c128
-rw-r--r--fs/ceph/debugfs.c8
-rw-r--r--fs/ceph/dir.c205
-rw-r--r--fs/ceph/file.c322
-rw-r--r--fs/ceph/inode.c36
-rw-r--r--fs/ceph/ioctl.c13
-rw-r--r--fs/ceph/locks.c20
-rw-r--r--fs/ceph/mds_client.c87
-rw-r--r--fs/ceph/mds_client.h4
-rw-r--r--fs/ceph/quota.c361
-rw-r--r--fs/ceph/snap.c2
-rw-r--r--fs/ceph/super.c50
-rw-r--r--fs/ceph/super.h42
-rw-r--r--fs/ceph/xattr.c70
-rw-r--r--fs/cifs/Kconfig2
-rw-r--r--fs/cifs/Makefile7
-rw-r--r--fs/cifs/cache.c168
-rw-r--r--fs/cifs/cifs_debug.c75
-rw-r--r--fs/cifs/cifs_debug.h36
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifsfs.c61
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h51
-rw-r--r--fs/cifs/cifsproto.h16
-rw-r--r--fs/cifs/cifssmb.c37
-rw-r--r--fs/cifs/connect.c135
-rw-r--r--fs/cifs/dir.c49
-rw-r--r--fs/cifs/file.c86
-rw-r--r--fs/cifs/fscache.c130
-rw-r--r--fs/cifs/fscache.h13
-rw-r--r--fs/cifs/inode.c15
-rw-r--r--fs/cifs/misc.c7
-rw-r--r--fs/cifs/netmisc.c2
-rw-r--r--fs/cifs/readdir.c6
-rw-r--r--fs/cifs/smb2glob.h5
-rw-r--r--fs/cifs/smb2inode.c43
-rw-r--r--fs/cifs/smb2maperror.c11
-rw-r--r--fs/cifs/smb2misc.c154
-rw-r--r--fs/cifs/smb2ops.c350
-rw-r--r--fs/cifs/smb2pdu.c545
-rw-r--r--fs/cifs/smb2pdu.h96
-rw-r--r--fs/cifs/smb2proto.h11
-rw-r--r--fs/cifs/smb2transport.c12
-rw-r--r--fs/cifs/smbdirect.c42
-rw-r--r--fs/cifs/trace.c18
-rw-r--r--fs/cifs/trace.h429
-rw-r--r--fs/cifs/transport.c15
-rw-r--r--fs/cramfs/inode.c7
-rw-r--r--fs/crypto/crypto.c47
-rw-r--r--fs/crypto/fname.c32
-rw-r--r--fs/crypto/fscrypt_private.h23
-rw-r--r--fs/crypto/hooks.c5
-rw-r--r--fs/crypto/keyinfo.c286
-rw-r--r--fs/dax.c272
-rw-r--r--fs/dcache.c267
-rw-r--r--fs/debugfs/file.c10
-rw-r--r--fs/debugfs/inode.c4
-rw-r--r--fs/direct-io.c13
-rw-r--r--fs/dlm/lowcomms.c16
-rw-r--r--fs/ecryptfs/crypto.c41
-rw-r--r--fs/ecryptfs/file.c21
-rw-r--r--fs/ecryptfs/inode.c6
-rw-r--r--fs/ecryptfs/keystore.c2
-rw-r--r--fs/eventfd.c15
-rw-r--r--fs/eventpoll.c5
-rw-r--r--fs/exec.c33
-rw-r--r--fs/exofs/ore.c10
-rw-r--r--fs/exofs/super.c2
-rw-r--r--fs/exportfs/expfs.c9
-rw-r--r--fs/ext2/ext2.h1
-rw-r--r--fs/ext2/file.c4
-rw-r--r--fs/ext2/inode.c56
-rw-r--r--fs/ext2/namei.c24
-rw-r--r--fs/ext2/super.c3
-rw-r--r--fs/ext4/balloc.c42
-rw-r--r--fs/ext4/ext4.h9
-rw-r--r--fs/ext4/extents.c16
-rw-r--r--fs/ext4/extents_status.c3
-rw-r--r--fs/ext4/file.c93
-rw-r--r--fs/ext4/fsmap.c4
-rw-r--r--fs/ext4/ialloc.c35
-rw-r--r--fs/ext4/indirect.c14
-rw-r--r--fs/ext4/inline.c10
-rw-r--r--fs/ext4/inode.c123
-rw-r--r--fs/ext4/mballoc.c52
-rw-r--r--fs/ext4/namei.c6
-rw-r--r--fs/ext4/resize.c2
-rw-r--r--fs/ext4/super.c80
-rw-r--r--fs/ext4/sysfs.c49
-rw-r--r--fs/ext4/xattr.c2
-rw-r--r--fs/ext4/xattr_security.c2
-rw-r--r--fs/f2fs/data.c6
-rw-r--r--fs/f2fs/dir.c6
-rw-r--r--fs/f2fs/gc.c2
-rw-r--r--fs/f2fs/inline.c6
-rw-r--r--fs/f2fs/namei.c12
-rw-r--r--fs/f2fs/node.c8
-rw-r--r--fs/f2fs/super.c8
-rw-r--r--fs/f2fs/sysfs.c29
-rw-r--r--fs/fat/namei_msdos.c4
-rw-r--r--fs/fat/namei_vfat.c13
-rw-r--r--fs/fcntl.c15
-rw-r--r--fs/filesystems.c14
-rw-r--r--fs/freevxfs/vxfs_lookup.c8
-rw-r--r--fs/fs-writeback.c40
-rw-r--r--fs/fscache/cache.c2
-rw-r--r--fs/fscache/cookie.c388
-rw-r--r--fs/fscache/fsdef.c55
-rw-r--r--fs/fscache/histogram.c17
-rw-r--r--fs/fscache/internal.h49
-rw-r--r--fs/fscache/main.c1
-rw-r--r--fs/fscache/netfs.c71
-rw-r--r--fs/fscache/object-list.c28
-rw-r--r--fs/fscache/object.c68
-rw-r--r--fs/fscache/operation.c26
-rw-r--r--fs/fscache/page.c84
-rw-r--r--fs/fscache/proc.c8
-rw-r--r--fs/fscache/stats.c18
-rw-r--r--fs/fuse/inode.c3
-rw-r--r--fs/gfs2/aops.c69
-rw-r--r--fs/gfs2/bmap.c441
-rw-r--r--fs/gfs2/bmap.h6
-rw-r--r--fs/gfs2/file.c6
-rw-r--r--fs/gfs2/glock.c47
-rw-r--r--fs/gfs2/incore.h3
-rw-r--r--fs/gfs2/inode.c4
-rw-r--r--fs/gfs2/log.h7
-rw-r--r--fs/gfs2/ops_fstype.c21
-rw-r--r--fs/gfs2/quota.c5
-rw-r--r--fs/gfs2/rgrp.c2
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/gfs2/trans.c27
-rw-r--r--fs/gfs2/xattr.c8
-rw-r--r--fs/hfs/dir.c20
-rw-r--r--fs/hfs/inode.c4
-rw-r--r--fs/hfsplus/dir.c3
-rw-r--r--fs/hfsplus/super.c1
-rw-r--r--fs/hugetlbfs/inode.c10
-rw-r--r--fs/inode.c12
-rw-r--r--fs/ioctl.c4
-rw-r--r--fs/iomap.c408
-rw-r--r--fs/isofs/compress.c19
-rw-r--r--fs/isofs/inode.c3
-rw-r--r--fs/jbd2/journal.c20
-rw-r--r--fs/jbd2/revoke.c12
-rw-r--r--fs/jbd2/transaction.c7
-rw-r--r--fs/jffs2/dir.c12
-rw-r--r--fs/jffs2/erase.c37
-rw-r--r--fs/jffs2/super.c2
-rw-r--r--fs/jfs/jfs_debug.c43
-rw-r--r--fs/jfs/jfs_debug.h10
-rw-r--r--fs/jfs/jfs_logmgr.c14
-rw-r--r--fs/jfs/jfs_metapage.c14
-rw-r--r--fs/jfs/jfs_txnmgr.c28
-rw-r--r--fs/jfs/jfs_xtree.c14
-rw-r--r--fs/jfs/namei.c12
-rw-r--r--fs/kernfs/file.c8
-rw-r--r--fs/kernfs/mount.c1
-rw-r--r--fs/libfs.c39
-rw-r--r--fs/locks.c16
-rw-r--r--fs/minix/namei.c8
-rw-r--r--fs/namei.c159
-rw-r--r--fs/namespace.c9
-rw-r--r--fs/nfs/callback_xdr.c37
-rw-r--r--fs/nfs/client.c43
-rw-r--r--fs/nfs/delegation.c52
-rw-r--r--fs/nfs/delegation.h7
-rw-r--r--fs/nfs/dir.c15
-rw-r--r--fs/nfs/fscache-index.c159
-rw-r--r--fs/nfs/fscache.c89
-rw-r--r--fs/nfs/fscache.h15
-rw-r--r--fs/nfs/inode.c138
-rw-r--r--fs/nfs/nfs3proc.c24
-rw-r--r--fs/nfs/nfs3xdr.c7
-rw-r--r--fs/nfs/nfs4proc.c168
-rw-r--r--fs/nfs/nfs4state.c22
-rw-r--r--fs/nfs/nfs4xdr.c246
-rw-r--r--fs/nfs/proc.c19
-rw-r--r--fs/nfs/unlink.c7
-rw-r--r--fs/nfs/write.c8
-rw-r--r--fs/nfsd/blocklayout.c2
-rw-r--r--fs/nfsd/vfs.c22
-rw-r--r--fs/nilfs2/btnode.c20
-rw-r--r--fs/nilfs2/namei.c6
-rw-r--r--fs/nilfs2/page.c22
-rw-r--r--fs/notify/fanotify/fanotify.c34
-rw-r--r--fs/notify/fsnotify.c25
-rw-r--r--fs/ntfs/mft.c4
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/aops.c4
-rw-r--r--fs/ocfs2/aops.h2
-rw-r--r--fs/ocfs2/dir.c2
-rw-r--r--fs/ocfs2/dlm/dlmast.c2
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h4
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c29
-rw-r--r--fs/ocfs2/dlm/dlmdomain.h25
-rw-r--r--fs/ocfs2/dlm/dlmlock.c3
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c25
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c41
-rw-r--r--fs/ocfs2/dlmglue.c23
-rw-r--r--fs/ocfs2/file.c16
-rw-r--r--fs/ocfs2/filecheck.c358
-rw-r--r--fs/ocfs2/filecheck.h29
-rw-r--r--fs/ocfs2/inode.c8
-rw-r--r--fs/ocfs2/namei.c6
-rw-r--r--fs/ocfs2/ocfs2.h8
-rw-r--r--fs/ocfs2/ocfs2_trace.h6
-rw-r--r--fs/ocfs2/refcounttree.c24
-rw-r--r--fs/ocfs2/suballoc.c53
-rw-r--r--fs/ocfs2/super.c49
-rw-r--r--fs/ocfs2/uptodate.c3
-rw-r--r--fs/ocfs2/xattr.c2
-rw-r--r--fs/omfs/dir.c7
-rw-r--r--fs/openpromfs/inode.c3
-rw-r--r--fs/orangefs/acl.c1
-rw-r--r--fs/orangefs/devorangefs-req.c55
-rw-r--r--fs/orangefs/file.c125
-rw-r--r--fs/orangefs/inode.c4
-rw-r--r--fs/orangefs/namei.c73
-rw-r--r--fs/orangefs/orangefs-bufmap.c4
-rw-r--r--fs/orangefs/orangefs-debug.h6
-rw-r--r--fs/orangefs/orangefs-kernel.h80
-rw-r--r--fs/orangefs/protocol.h45
-rw-r--r--fs/orangefs/super.c5
-rw-r--r--fs/overlayfs/Kconfig17
-rw-r--r--fs/overlayfs/copy_up.c6
-rw-r--r--fs/overlayfs/export.c75
-rw-r--r--fs/overlayfs/inode.c188
-rw-r--r--fs/overlayfs/namei.c67
-rw-r--r--fs/overlayfs/overlayfs.h19
-rw-r--r--fs/overlayfs/ovl_entry.h21
-rw-r--r--fs/overlayfs/readdir.c45
-rw-r--r--fs/overlayfs/super.c157
-rw-r--r--fs/overlayfs/util.c39
-rw-r--r--fs/pipe.c22
-rw-r--r--fs/proc/array.c118
-rw-r--r--fs/proc/base.c194
-rw-r--r--fs/proc/cmdline.c17
-rw-r--r--fs/proc/consoles.c14
-rw-r--r--fs/proc/devices.c14
-rw-r--r--fs/proc/fd.c138
-rw-r--r--fs/proc/generic.c252
-rw-r--r--fs/proc/inode.c67
-rw-r--r--fs/proc/internal.h37
-rw-r--r--fs/proc/interrupts.c14
-rw-r--r--fs/proc/kcore.c23
-rw-r--r--fs/proc/loadavg.c16
-rw-r--r--fs/proc/meminfo.c29
-rw-r--r--fs/proc/namespaces.c24
-rw-r--r--fs/proc/nommu.c14
-rw-r--r--fs/proc/proc_net.c113
-rw-r--r--fs/proc/proc_sysctl.c29
-rw-r--r--fs/proc/proc_tty.c22
-rw-r--r--fs/proc/root.c21
-rw-r--r--fs/proc/self.c4
-rw-r--r--fs/proc/softirqs.c14
-rw-r--r--fs/proc/task_mmu.c163
-rw-r--r--fs/proc/thread_self.c4
-rw-r--r--fs/proc/uptime.c14
-rw-r--r--fs/proc/version.c14
-rw-r--r--fs/pstore/Kconfig101
-rw-r--r--fs/pstore/inode.c2
-rw-r--r--fs/pstore/internal.h3
-rw-r--r--fs/pstore/platform.c372
-rw-r--r--fs/pstore/ram.c2
-rw-r--r--fs/pstore/ram_core.c29
-rw-r--r--fs/qnx4/namei.c8
-rw-r--r--fs/qnx6/namei.c8
-rw-r--r--fs/quota/dquot.c2
-rw-r--r--fs/read_write.c6
-rw-r--r--fs/reiserfs/journal.c2
-rw-r--r--fs/reiserfs/namei.c12
-rw-r--r--fs/reiserfs/procfs.c16
-rw-r--r--fs/romfs/super.c36
-rw-r--r--fs/select.c85
-rw-r--r--fs/seq_file.c129
-rw-r--r--fs/signalfd.c93
-rw-r--r--fs/super.c144
-rw-r--r--fs/sysfs/mount.c6
-rw-r--r--fs/sysv/namei.c9
-rw-r--r--fs/timerfd.c22
-rw-r--r--fs/ubifs/crypto.c10
-rw-r--r--fs/ubifs/dir.c43
-rw-r--r--fs/ubifs/file.c2
-rw-r--r--fs/ubifs/find.c2
-rw-r--r--fs/ubifs/lprops.c4
-rw-r--r--fs/ubifs/scan.c1
-rw-r--r--fs/ubifs/super.c14
-rw-r--r--fs/udf/namei.c6
-rw-r--r--fs/udf/unicode.c6
-rw-r--r--fs/ufs/namei.c6
-rw-r--r--fs/xattr.c4
-rw-r--r--fs/xfs/Kconfig18
-rw-r--r--fs/xfs/Makefile9
-rw-r--r--fs/xfs/libxfs/xfs_ag.c464
-rw-r--r--fs/xfs/libxfs/xfs_ag.h30
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c135
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h28
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c9
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_attr.c13
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c99
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h32
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c9
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h3
-rw-r--r--fs/xfs/libxfs/xfs_btree.c27
-rw-r--r--fs/xfs/libxfs/xfs_btree.h7
-rw-r--r--fs/xfs/libxfs/xfs_defer.c24
-rw-r--r--fs/xfs/libxfs/xfs_defer.h1
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c94
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h4
-rw-r--r--fs/xfs/libxfs/xfs_format.h7
-rw-r--r--fs/xfs/libxfs/xfs_fs.h9
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c4
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h3
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c13
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c21
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h9
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c46
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h7
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c5
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.h3
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c86
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h4
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c5
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c26
-rw-r--r--fs/xfs/libxfs/xfs_sb.c157
-rw-r--r--fs/xfs/libxfs/xfs_sb.h16
-rw-r--r--fs/xfs/libxfs/xfs_shared.h16
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c10
-rw-r--r--fs/xfs/libxfs/xfs_types.h2
-rw-r--r--fs/xfs/scrub/agheader.c85
-rw-r--r--fs/xfs/scrub/agheader_repair.c70
-rw-r--r--fs/xfs/scrub/alloc.c4
-rw-r--r--fs/xfs/scrub/attr.c3
-rw-r--r--fs/xfs/scrub/bmap.c8
-rw-r--r--fs/xfs/scrub/btree.c42
-rw-r--r--fs/xfs/scrub/common.c131
-rw-r--r--fs/xfs/scrub/common.h32
-rw-r--r--fs/xfs/scrub/dir.c35
-rw-r--r--fs/xfs/scrub/ialloc.c7
-rw-r--r--fs/xfs/scrub/inode.c10
-rw-r--r--fs/xfs/scrub/parent.c19
-rw-r--r--fs/xfs/scrub/quota.c181
-rw-r--r--fs/xfs/scrub/refcount.c10
-rw-r--r--fs/xfs/scrub/repair.c1089
-rw-r--r--fs/xfs/scrub/repair.h132
-rw-r--r--fs/xfs/scrub/rmap.c6
-rw-r--r--fs/xfs/scrub/rtbitmap.c60
-rw-r--r--fs/xfs/scrub/scrub.c142
-rw-r--r--fs/xfs/scrub/scrub.h5
-rw-r--r--fs/xfs/scrub/trace.h258
-rw-r--r--fs/xfs/xfs_aops.c74
-rw-r--r--fs/xfs/xfs_aops.h3
-rw-r--r--fs/xfs/xfs_bmap_item.c43
-rw-r--r--fs/xfs/xfs_bmap_util.c13
-rw-r--r--fs/xfs/xfs_buf.c101
-rw-r--r--fs/xfs/xfs_buf.h31
-rw-r--r--fs/xfs/xfs_buf_item.c10
-rw-r--r--fs/xfs/xfs_discard.c14
-rw-r--r--fs/xfs/xfs_dquot.c780
-rw-r--r--fs/xfs/xfs_dquot.h22
-rw-r--r--fs/xfs/xfs_dquot_item.c7
-rw-r--r--fs/xfs/xfs_error.c3
-rw-r--r--fs/xfs/xfs_extfree_item.c44
-rw-r--r--fs/xfs/xfs_file.c79
-rw-r--r--fs/xfs/xfs_filestream.c21
-rw-r--r--fs/xfs/xfs_fsmap.c14
-rw-r--r--fs/xfs/xfs_fsops.c587
-rw-r--r--fs/xfs/xfs_globals.c1
-rw-r--r--fs/xfs/xfs_icache.c94
-rw-r--r--fs/xfs/xfs_icache.h3
-rw-r--r--fs/xfs/xfs_icreate_item.c4
-rw-r--r--fs/xfs/xfs_inode.c78
-rw-r--r--fs/xfs/xfs_inode.h20
-rw-r--r--fs/xfs/xfs_inode_item.c8
-rw-r--r--fs/xfs/xfs_ioctl.c89
-rw-r--r--fs/xfs/xfs_iomap.c186
-rw-r--r--fs/xfs/xfs_iops.c55
-rw-r--r--fs/xfs/xfs_log.c13
-rw-r--r--fs/xfs/xfs_log.h3
-rw-r--r--fs/xfs/xfs_log_cil.c24
-rw-r--r--fs/xfs/xfs_log_recover.c4
-rw-r--r--fs/xfs/xfs_mount.c4
-rw-r--r--fs/xfs/xfs_mru_cache.c8
-rw-r--r--fs/xfs/xfs_mru_cache.h8
-rw-r--r--fs/xfs/xfs_qm.c258
-rw-r--r--fs/xfs/xfs_qm.h6
-rw-r--r--fs/xfs/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/xfs_qm_syscalls.c97
-rw-r--r--fs/xfs/xfs_quota.h22
-rw-r--r--fs/xfs/xfs_quotaops.c8
-rw-r--r--fs/xfs/xfs_refcount_item.c43
-rw-r--r--fs/xfs/xfs_reflink.c18
-rw-r--r--fs/xfs/xfs_rmap_item.c42
-rw-r--r--fs/xfs/xfs_rtalloc.h9
-rw-r--r--fs/xfs/xfs_stats.c33
-rw-r--r--fs/xfs/xfs_super.c83
-rw-r--r--fs/xfs/xfs_symlink.c12
-rw-r--r--fs/xfs/xfs_sysctl.h1
-rw-r--r--fs/xfs/xfs_sysfs.c31
-rw-r--r--fs/xfs/xfs_trace.h86
-rw-r--r--fs/xfs/xfs_trans.c89
-rw-r--r--fs/xfs/xfs_trans.h29
-rw-r--r--fs/xfs/xfs_trans_ail.c50
-rw-r--r--fs/xfs/xfs_trans_bmap.c4
-rw-r--r--fs/xfs/xfs_trans_buf.c24
-rw-r--r--fs/xfs/xfs_trans_dquot.c4
-rw-r--r--fs/xfs/xfs_trans_extfree.c84
-rw-r--r--fs/xfs/xfs_trans_inode.c3
-rw-r--r--fs/xfs/xfs_trans_priv.h11
-rw-r--r--fs/xfs/xfs_trans_refcount.c4
-rw-r--r--fs/xfs/xfs_trans_rmap.c4
568 files changed, 21096 insertions, 13524 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 64c58eb26159..9eb34701a566 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -55,42 +55,27 @@ int v9fs_random_cachetag(struct v9fs_session_info *v9ses)
return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies);
}
-static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- struct v9fs_session_info *v9ses;
- uint16_t klen = 0;
-
- v9ses = (struct v9fs_session_info *)cookie_netfs_data;
- p9_debug(P9_DEBUG_FSC, "session %p buf %p size %u\n",
- v9ses, buffer, bufmax);
-
- if (v9ses->cachetag)
- klen = strlen(v9ses->cachetag);
-
- if (klen > bufmax)
- return 0;
-
- memcpy(buffer, v9ses->cachetag, klen);
- p9_debug(P9_DEBUG_FSC, "cache session tag %s\n", v9ses->cachetag);
- return klen;
-}
-
const struct fscache_cookie_def v9fs_cache_session_index_def = {
.name = "9P.session",
.type = FSCACHE_COOKIE_TYPE_INDEX,
- .get_key = v9fs_cache_session_get_key,
};
void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
{
/* If no cache session tag was specified, we generate a random one. */
- if (!v9ses->cachetag)
- v9fs_random_cachetag(v9ses);
+ if (!v9ses->cachetag) {
+ if (v9fs_random_cachetag(v9ses) < 0) {
+ v9ses->fscache = NULL;
+ return;
+ }
+ }
v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
&v9fs_cache_session_index_def,
- v9ses, true);
+ v9ses->cachetag,
+ strlen(v9ses->cachetag),
+ NULL, 0,
+ v9ses, 0, true);
p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n",
v9ses, v9ses->fscache);
}
@@ -99,45 +84,15 @@ void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
{
p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n",
v9ses, v9ses->fscache);
- fscache_relinquish_cookie(v9ses->fscache, 0);
+ fscache_relinquish_cookie(v9ses->fscache, NULL, false);
v9ses->fscache = NULL;
}
-
-static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct v9fs_inode *v9inode = cookie_netfs_data;
- memcpy(buffer, &v9inode->qid.path, sizeof(v9inode->qid.path));
- p9_debug(P9_DEBUG_FSC, "inode %p get key %llu\n",
- &v9inode->vfs_inode, v9inode->qid.path);
- return sizeof(v9inode->qid.path);
-}
-
-static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
- uint64_t *size)
-{
- const struct v9fs_inode *v9inode = cookie_netfs_data;
- *size = i_size_read(&v9inode->vfs_inode);
-
- p9_debug(P9_DEBUG_FSC, "inode %p get attr %llu\n",
- &v9inode->vfs_inode, *size);
-}
-
-static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
- void *buffer, uint16_t buflen)
-{
- const struct v9fs_inode *v9inode = cookie_netfs_data;
- memcpy(buffer, &v9inode->qid.version, sizeof(v9inode->qid.version));
- p9_debug(P9_DEBUG_FSC, "inode %p get aux %u\n",
- &v9inode->vfs_inode, v9inode->qid.version);
- return sizeof(v9inode->qid.version);
-}
-
static enum
fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
const void *buffer,
- uint16_t buflen)
+ uint16_t buflen,
+ loff_t object_size)
{
const struct v9fs_inode *v9inode = cookie_netfs_data;
@@ -154,9 +109,6 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
const struct fscache_cookie_def v9fs_cache_inode_index_def = {
.name = "9p.inode",
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
- .get_key = v9fs_cache_inode_get_key,
- .get_attr = v9fs_cache_inode_get_attr,
- .get_aux = v9fs_cache_inode_get_aux,
.check_aux = v9fs_cache_inode_check_aux,
};
@@ -175,7 +127,13 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)
v9ses = v9fs_inode2v9ses(inode);
v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
&v9fs_cache_inode_index_def,
- v9inode, true);
+ &v9inode->qid.path,
+ sizeof(v9inode->qid.path),
+ &v9inode->qid.version,
+ sizeof(v9inode->qid.version),
+ v9inode,
+ i_size_read(&v9inode->vfs_inode),
+ true);
p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
inode, v9inode->fscache);
@@ -190,7 +148,8 @@ void v9fs_cache_inode_put_cookie(struct inode *inode)
p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n",
inode, v9inode->fscache);
- fscache_relinquish_cookie(v9inode->fscache, 0);
+ fscache_relinquish_cookie(v9inode->fscache, &v9inode->qid.version,
+ false);
v9inode->fscache = NULL;
}
@@ -203,7 +162,7 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode)
p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n",
inode, v9inode->fscache);
- fscache_relinquish_cookie(v9inode->fscache, 1);
+ fscache_relinquish_cookie(v9inode->fscache, NULL, true);
v9inode->fscache = NULL;
}
@@ -236,12 +195,18 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
old = v9inode->fscache;
mutex_lock(&v9inode->fscache_lock);
- fscache_relinquish_cookie(v9inode->fscache, 1);
+ fscache_relinquish_cookie(v9inode->fscache, NULL, true);
v9ses = v9fs_inode2v9ses(inode);
v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
&v9fs_cache_inode_index_def,
- v9inode, true);
+ &v9inode->qid.path,
+ sizeof(v9inode->qid.path),
+ &v9inode->qid.version,
+ sizeof(v9inode->qid.version),
+ v9inode,
+ i_size_read(&v9inode->vfs_inode),
+ true);
p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n",
inode, old, v9inode->fscache);
@@ -367,7 +332,8 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
const struct v9fs_inode *v9inode = V9FS_I(inode);
p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
- ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL);
+ ret = fscache_write_page(v9inode->fscache, page,
+ i_size_read(&v9inode->vfs_inode), GFP_KERNEL);
p9_debug(P9_DEBUG_FSC, "ret = %d\n", ret);
if (ret != 0)
v9fs_uncache_page(inode, page);
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 8fb89ddc6cc7..e622f0f10502 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -292,6 +292,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
#ifdef CONFIG_9P_FSCACHE
kfree(v9ses->cachetag);
v9ses->cachetag = match_strdup(&args[0]);
+ if (!v9ses->cachetag) {
+ ret = -ENOMEM;
+ goto free_and_return;
+ }
#endif
break;
case Opt_cache:
@@ -471,6 +475,9 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
return fid;
err_clnt:
+#ifdef CONFIG_9P_FSCACHE
+ kfree(v9ses->cachetag);
+#endif
p9_client_destroy(v9ses->clnt);
err_names:
kfree(v9ses->uname);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index bdabb2765d1b..42e102e2e74a 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -579,6 +579,24 @@ static int v9fs_at_to_dotl_flags(int flags)
}
/**
+ * v9fs_dec_count - helper functon to drop i_nlink.
+ *
+ * If a directory had nlink <= 2 (including . and ..), then we should not drop
+ * the link count, which indicates the underlying exported fs doesn't maintain
+ * nlink accurately. e.g.
+ * - overlayfs sets nlink to 1 for merged dir
+ * - ext4 (with dir_nlink feature enabled) sets nlink to 1 if a dir has more
+ * than EXT4_LINK_MAX (65000) links.
+ *
+ * @inode: inode whose nlink is being dropped
+ */
+static void v9fs_dec_count(struct inode *inode)
+{
+ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
+ drop_nlink(inode);
+}
+
+/**
* v9fs_remove - helper function to remove files and directories
* @dir: directory inode that is being deleted
* @dentry: dentry that is being deleted
@@ -621,9 +639,9 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
*/
if (flags & AT_REMOVEDIR) {
clear_nlink(inode);
- drop_nlink(dir);
+ v9fs_dec_count(dir);
} else
- drop_nlink(inode);
+ v9fs_dec_count(inode);
v9fs_invalidate_inode_attr(inode);
v9fs_invalidate_inode_attr(dir);
@@ -805,28 +823,21 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
if (IS_ERR(dfid))
return ERR_CAST(dfid);
- name = dentry->d_name.name;
- fid = p9_client_walk(dfid, 1, &name, 1);
- if (IS_ERR(fid)) {
- if (fid == ERR_PTR(-ENOENT)) {
- d_add(dentry, NULL);
- return NULL;
- }
- return ERR_CAST(fid);
- }
/*
* Make sure we don't use a wrong inode due to parallel
* unlink. For cached mode create calls request for new
* inode. But with cache disabled, lookup should do this.
*/
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+ name = dentry->d_name.name;
+ fid = p9_client_walk(dfid, 1, &name, 1);
+ if (fid == ERR_PTR(-ENOENT))
+ inode = NULL;
+ else if (IS_ERR(fid))
+ inode = ERR_CAST(fid);
+ else if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
else
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
- if (IS_ERR(inode)) {
- p9_client_clunk(fid);
- return ERR_CAST(inode);
- }
/*
* If we had a rename on the server and a parallel lookup
* for the new name, then make sure we instantiate with
@@ -835,12 +846,14 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
* k/b.
*/
res = d_splice_alias(inode, dentry);
- if (!res)
- v9fs_fid_add(dentry, fid);
- else if (!IS_ERR(res))
- v9fs_fid_add(res, fid);
- else
- p9_client_clunk(fid);
+ if (!IS_ERR(fid)) {
+ if (!res)
+ v9fs_fid_add(dentry, fid);
+ else if (!IS_ERR(res))
+ v9fs_fid_add(res, fid);
+ else
+ p9_client_clunk(fid);
+ }
return res;
}
@@ -1024,12 +1037,12 @@ clunk_newdir:
if (S_ISDIR(new_inode->i_mode))
clear_nlink(new_inode);
else
- drop_nlink(new_inode);
+ v9fs_dec_count(new_inode);
}
if (S_ISDIR(old_inode->i_mode)) {
if (!new_inode)
inc_nlink(new_dir);
- drop_nlink(old_dir);
+ v9fs_dec_count(old_dir);
}
v9fs_invalidate_inode_attr(old_inode);
v9fs_invalidate_inode_attr(old_dir);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index af03c2a901eb..48ce50484e80 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -94,7 +94,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
if (v9ses->cache)
sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
- sb->s_flags |= SB_ACTIVE | SB_DIRSYNC | SB_NOATIME;
+ sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
if (!v9ses->cache)
sb->s_flags |= SB_SYNCHRONOUS;
diff --git a/fs/Kconfig b/fs/Kconfig
index bc821a86d965..ac4ac908f001 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -196,7 +196,7 @@ config HUGETLBFS
help
hugetlbfs is a filesystem backing for HugeTLB pages, based on
ramfs. For architectures that support it, say Y here and read
- <file:Documentation/vm/hugetlbpage.txt> for details.
+ <file:Documentation/admin-guide/mm/hugetlbpage.rst> for details.
If unsure, say N.
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 29444c83da48..e18eff854e1a 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -146,20 +146,6 @@ adfs_dir_lookup_byname(struct inode *inode, const struct qstr *name, struct obje
obj->parent_id = inode->i_ino;
- /*
- * '.' is handled by reserved_lookup() in fs/namei.c
- */
- if (name->len == 2 && name->name[0] == '.' && name->name[1] == '.') {
- /*
- * Currently unable to fill in the rest of 'obj',
- * but this is better than nothing. We need to
- * ascend one level to find it's parent.
- */
- obj->name_len = 0;
- obj->file_id = obj->parent_id;
- goto free_out;
- }
-
read_lock(&adfs_dir_lock);
ret = ops->setpos(&dir, 0);
@@ -266,17 +252,17 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
if (error == 0) {
- error = -EACCES;
/*
* This only returns NULL if get_empty_inode
* fails.
*/
inode = adfs_iget(dir->i_sb, &obj);
- if (inode)
- error = 0;
+ if (!inode)
+ inode = ERR_PTR(-EACCES);
+ } else if (error != -ENOENT) {
+ inode = ERR_PTR(error);
}
- d_add(dentry, inode);
- return ERR_PTR(error);
+ return d_splice_alias(inode, dentry);
}
/*
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d8aa0ae3d037..41c5749f4db7 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -201,14 +201,16 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
struct super_block *sb = dir->i_sb;
struct buffer_head *bh;
struct inode *inode = NULL;
+ struct dentry *res;
pr_debug("%s(\"%pd\")\n", __func__, dentry);
affs_lock_dir(dir);
bh = affs_find_entry(dir, dentry);
- affs_unlock_dir(dir);
- if (IS_ERR(bh))
+ if (IS_ERR(bh)) {
+ affs_unlock_dir(dir);
return ERR_CAST(bh);
+ }
if (bh) {
u32 ino = bh->b_blocknr;
@@ -222,11 +224,12 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
}
affs_brelse(bh);
inode = affs_iget(sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
}
- d_add(dentry, inode);
- return NULL;
+ res = d_splice_alias(inode, dentry);
+ if (!IS_ERR_OR_NULL(res))
+ res->d_fsdata = dentry->d_fsdata;
+ affs_unlock_dir(dir);
+ return res;
}
int
diff --git a/fs/affs/super.c b/fs/affs/super.c
index e602619aed9d..d1ad11a8a4a5 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -241,6 +241,7 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
affs_set_opt(*mount_opts, SF_NO_TRUNCATE);
break;
case Opt_prefix:
+ kfree(*prefix);
*prefix = match_strdup(&args[0]);
if (!*prefix)
return 0;
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 45b7fc405fa6..532acae25453 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -12,6 +12,8 @@ kafs-objs := \
cell.o \
cmservice.o \
dir.o \
+ dir_edit.o \
+ dynroot.o \
file.o \
flock.o \
fsclient.o \
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
index fd9f28b8a933..7587fb665ff1 100644
--- a/fs/afs/addr_list.c
+++ b/fs/afs/addr_list.c
@@ -121,7 +121,7 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
p = text;
do {
struct sockaddr_rxrpc *srx = &alist->addrs[alist->nr_addrs];
- char tdelim = delim;
+ const char *q, *stop;
if (*p == delim) {
p++;
@@ -130,28 +130,33 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
if (*p == '[') {
p++;
- tdelim = ']';
+ q = memchr(p, ']', end - p);
+ } else {
+ for (q = p; q < end; q++)
+ if (*q == '+' || *q == delim)
+ break;
}
- if (in4_pton(p, end - p,
+ if (in4_pton(p, q - p,
(u8 *)&srx->transport.sin6.sin6_addr.s6_addr32[3],
- tdelim, &p)) {
+ -1, &stop)) {
srx->transport.sin6.sin6_addr.s6_addr32[0] = 0;
srx->transport.sin6.sin6_addr.s6_addr32[1] = 0;
srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff);
- } else if (in6_pton(p, end - p,
+ } else if (in6_pton(p, q - p,
srx->transport.sin6.sin6_addr.s6_addr,
- tdelim, &p)) {
+ -1, &stop)) {
/* Nothing to do */
} else {
goto bad_address;
}
- if (tdelim == ']') {
- if (p == end || *p != ']')
- goto bad_address;
+ if (stop != q)
+ goto bad_address;
+
+ p = q;
+ if (q < end && *q == ']')
p++;
- }
if (p < end) {
if (*p == '+') {
@@ -243,9 +248,9 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
xport == a->sin6_port)
return;
if (xdr == a->sin6_addr.s6_addr32[3] &&
- xport < a->sin6_port)
+ (u16 __force)xport < (u16 __force)a->sin6_port)
break;
- if (xdr < a->sin6_addr.s6_addr32[3])
+ if ((u32 __force)xdr < (u32 __force)a->sin6_addr.s6_addr32[3])
break;
}
@@ -280,7 +285,7 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
xport == a->sin6_port)
return;
if (diff == 0 &&
- xport < a->sin6_port)
+ (u16 __force)xport < (u16 __force)a->sin6_port)
break;
if (diff < 0)
break;
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index b94d0edc2b78..b4ff1f7ae4ab 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -67,10 +67,14 @@ typedef enum {
} afs_callback_type_t;
struct afs_callback {
- struct afs_fid fid; /* file identifier */
- unsigned version; /* callback version */
- unsigned expiry; /* time at which expires */
- afs_callback_type_t type; /* type of callback */
+ unsigned version; /* Callback version */
+ unsigned expiry; /* Time at which expires */
+ afs_callback_type_t type; /* Type of callback */
+};
+
+struct afs_callback_break {
+ struct afs_fid fid; /* File identifier */
+ struct afs_callback cb; /* Callback details */
};
#define AFSCBMAX 50 /* maximum callbacks transferred per bulk op */
@@ -123,21 +127,20 @@ typedef u32 afs_access_t;
* AFS file status information
*/
struct afs_file_status {
- unsigned if_version; /* interface version */
-#define AFS_FSTATUS_VERSION 1
+ u64 size; /* file size */
+ afs_dataversion_t data_version; /* current data version */
+ time_t mtime_client; /* last time client changed data */
+ time_t mtime_server; /* last time server changed data */
+ unsigned abort_code; /* Abort if bulk-fetching this failed */
afs_file_type_t type; /* file type */
unsigned nlink; /* link count */
- u64 size; /* file size */
- afs_dataversion_t data_version; /* current data version */
u32 author; /* author ID */
- kuid_t owner; /* owner ID */
- kgid_t group; /* group ID */
+ u32 owner; /* owner ID */
+ u32 group; /* group ID */
afs_access_t caller_access; /* access rights for authenticated caller */
afs_access_t anon_access; /* access rights for unauthenticated caller */
umode_t mode; /* UNIX mode */
- time_t mtime_client; /* last time client changed data */
- time_t mtime_server; /* last time server changed data */
s32 lock_count; /* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */
};
diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h
index d47b6d01e4c0..ddfa88a7a9c0 100644
--- a/fs/afs/afs_fs.h
+++ b/fs/afs/afs_fs.h
@@ -31,10 +31,12 @@ enum AFS_FS_Operations {
FSGETVOLUMEINFO = 148, /* AFS Get information about a volume */
FSGETVOLUMESTATUS = 149, /* AFS Get volume status information */
FSGETROOTVOLUME = 151, /* AFS Get root volume name */
+ FSBULKSTATUS = 155, /* AFS Fetch multiple file statuses */
FSSETLOCK = 156, /* AFS Request a file lock */
FSEXTENDLOCK = 157, /* AFS Extend a file lock */
FSRELEASELOCK = 158, /* AFS Release a file lock */
FSLOOKUP = 161, /* AFS lookup file in directory */
+ FSINLINEBULKSTATUS = 65536, /* AFS Fetch multiple file statuses with inline errors */
FSFETCHDATA64 = 65537, /* AFS Fetch file data */
FSSTOREDATA64 = 65538, /* AFS Store file data */
FSGIVEUPALLCALLBACKS = 65539, /* AFS Give up all outstanding callbacks on a server */
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index f62ff71d28c9..b1c31ec4523a 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -12,167 +12,39 @@
#include <linux/sched.h>
#include "internal.h"
-static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t buflen);
-static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t buflen);
-
-static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t buflen);
-static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
- uint64_t *size);
-static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
- void *buffer, uint16_t buflen);
static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
const void *buffer,
- uint16_t buflen);
+ uint16_t buflen,
+ loff_t object_size);
struct fscache_netfs afs_cache_netfs = {
.name = "afs",
- .version = 1,
+ .version = 2,
};
struct fscache_cookie_def afs_cell_cache_index_def = {
.name = "AFS.cell",
.type = FSCACHE_COOKIE_TYPE_INDEX,
- .get_key = afs_cell_cache_get_key,
};
struct fscache_cookie_def afs_volume_cache_index_def = {
.name = "AFS.volume",
.type = FSCACHE_COOKIE_TYPE_INDEX,
- .get_key = afs_volume_cache_get_key,
};
struct fscache_cookie_def afs_vnode_cache_index_def = {
- .name = "AFS.vnode",
- .type = FSCACHE_COOKIE_TYPE_DATAFILE,
- .get_key = afs_vnode_cache_get_key,
- .get_attr = afs_vnode_cache_get_attr,
- .get_aux = afs_vnode_cache_get_aux,
- .check_aux = afs_vnode_cache_check_aux,
+ .name = "AFS.vnode",
+ .type = FSCACHE_COOKIE_TYPE_DATAFILE,
+ .check_aux = afs_vnode_cache_check_aux,
};
/*
- * set the key for the index entry
- */
-static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct afs_cell *cell = cookie_netfs_data;
- uint16_t klen;
-
- _enter("%p,%p,%u", cell, buffer, bufmax);
-
- klen = strlen(cell->name);
- if (klen > bufmax)
- return 0;
-
- memcpy(buffer, cell->name, klen);
- return klen;
-}
-
-/*****************************************************************************/
-/*
- * set the key for the volume index entry
- */
-static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct afs_volume *volume = cookie_netfs_data;
- struct {
- u64 volid;
- } __packed key;
-
- _enter("{%u},%p,%u", volume->type, buffer, bufmax);
-
- if (bufmax < sizeof(key))
- return 0;
-
- key.volid = volume->vid;
- memcpy(buffer, &key, sizeof(key));
- return sizeof(key);
-}
-
-/*****************************************************************************/
-/*
- * set the key for the index entry
- */
-static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct afs_vnode *vnode = cookie_netfs_data;
- struct {
- u32 vnode_id[3];
- } __packed key;
-
- _enter("{%x,%x,%llx},%p,%u",
- vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
- buffer, bufmax);
-
- /* Allow for a 96-bit key */
- memset(&key, 0, sizeof(key));
- key.vnode_id[0] = vnode->fid.vnode;
- key.vnode_id[1] = 0;
- key.vnode_id[2] = 0;
-
- if (sizeof(key) > bufmax)
- return 0;
-
- memcpy(buffer, &key, sizeof(key));
- return sizeof(key);
-}
-
-/*
- * provide updated file attributes
- */
-static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
- uint64_t *size)
-{
- const struct afs_vnode *vnode = cookie_netfs_data;
-
- _enter("{%x,%x,%llx},",
- vnode->fid.vnode, vnode->fid.unique,
- vnode->status.data_version);
-
- *size = vnode->status.size;
-}
-
-struct afs_vnode_cache_aux {
- u64 data_version;
- u32 fid_unique;
-} __packed;
-
-/*
- * provide new auxiliary cache data
- */
-static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct afs_vnode *vnode = cookie_netfs_data;
- struct afs_vnode_cache_aux aux;
-
- _enter("{%x,%x,%Lx},%p,%u",
- vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
- buffer, bufmax);
-
- memset(&aux, 0, sizeof(aux));
- aux.data_version = vnode->status.data_version;
- aux.fid_unique = vnode->fid.unique;
-
- if (bufmax < sizeof(aux))
- return 0;
-
- memcpy(buffer, &aux, sizeof(aux));
- return sizeof(aux);
-}
-
-/*
* check that the auxiliary data indicates that the entry is still valid
*/
static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
const void *buffer,
- uint16_t buflen)
+ uint16_t buflen,
+ loff_t object_size)
{
struct afs_vnode *vnode = cookie_netfs_data;
struct afs_vnode_cache_aux aux;
@@ -189,12 +61,6 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
return FSCACHE_CHECKAUX_OBSOLETE;
}
- if (vnode->fid.unique != aux.fid_unique) {
- _leave(" = OBSOLETE [uniq %x != %x]",
- aux.fid_unique, vnode->fid.unique);
- return FSCACHE_CHECKAUX_OBSOLETE;
- }
-
if (vnode->status.data_version != aux.data_version) {
_leave(" = OBSOLETE [vers %llx != %llx]",
aux.data_version, vnode->status.data_version);
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index f4291b576054..571437dcb252 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -23,36 +23,55 @@
/*
* Set up an interest-in-callbacks record for a volume on a server and
* register it with the server.
- * - Called with volume->server_sem held.
+ * - Called with vnode->io_lock held.
*/
int afs_register_server_cb_interest(struct afs_vnode *vnode,
- struct afs_server_entry *entry)
+ struct afs_server_list *slist,
+ unsigned int index)
{
- struct afs_cb_interest *cbi = entry->cb_interest, *vcbi, *new, *x;
+ struct afs_server_entry *entry = &slist->servers[index];
+ struct afs_cb_interest *cbi, *vcbi, *new, *old;
struct afs_server *server = entry->server;
again:
+ if (vnode->cb_interest &&
+ likely(vnode->cb_interest == entry->cb_interest))
+ return 0;
+
+ read_lock(&slist->lock);
+ cbi = afs_get_cb_interest(entry->cb_interest);
+ read_unlock(&slist->lock);
+
vcbi = vnode->cb_interest;
if (vcbi) {
- if (vcbi == cbi)
+ if (vcbi == cbi) {
+ afs_put_cb_interest(afs_v2net(vnode), cbi);
return 0;
+ }
+ /* Use a new interest in the server list for the same server
+ * rather than an old one that's still attached to a vnode.
+ */
if (cbi && vcbi->server == cbi->server) {
write_seqlock(&vnode->cb_lock);
- vnode->cb_interest = afs_get_cb_interest(cbi);
+ old = vnode->cb_interest;
+ vnode->cb_interest = cbi;
write_sequnlock(&vnode->cb_lock);
- afs_put_cb_interest(afs_v2net(vnode), cbi);
+ afs_put_cb_interest(afs_v2net(vnode), old);
return 0;
}
+ /* Re-use the one attached to the vnode. */
if (!cbi && vcbi->server == server) {
- afs_get_cb_interest(vcbi);
- x = cmpxchg(&entry->cb_interest, cbi, vcbi);
- if (x != cbi) {
- cbi = x;
- afs_put_cb_interest(afs_v2net(vnode), vcbi);
+ write_lock(&slist->lock);
+ if (entry->cb_interest) {
+ write_unlock(&slist->lock);
+ afs_put_cb_interest(afs_v2net(vnode), cbi);
goto again;
}
+
+ entry->cb_interest = cbi;
+ write_unlock(&slist->lock);
return 0;
}
}
@@ -72,13 +91,16 @@ again:
list_add_tail(&new->cb_link, &server->cb_interests);
write_unlock(&server->cb_break_lock);
- x = cmpxchg(&entry->cb_interest, cbi, new);
- if (x == cbi) {
+ write_lock(&slist->lock);
+ if (!entry->cb_interest) {
+ entry->cb_interest = afs_get_cb_interest(new);
cbi = new;
+ new = NULL;
} else {
- cbi = x;
- afs_put_cb_interest(afs_v2net(vnode), new);
+ cbi = afs_get_cb_interest(entry->cb_interest);
}
+ write_unlock(&slist->lock);
+ afs_put_cb_interest(afs_v2net(vnode), new);
}
ASSERT(cbi);
@@ -88,35 +110,18 @@ again:
*/
write_seqlock(&vnode->cb_lock);
- vnode->cb_interest = afs_get_cb_interest(cbi);
+ old = vnode->cb_interest;
+ vnode->cb_interest = cbi;
vnode->cb_s_break = cbi->server->cb_s_break;
+ vnode->cb_v_break = vnode->volume->cb_v_break;
clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
write_sequnlock(&vnode->cb_lock);
+ afs_put_cb_interest(afs_v2net(vnode), old);
return 0;
}
/*
- * Set a vnode's interest on a server.
- */
-void afs_set_cb_interest(struct afs_vnode *vnode, struct afs_cb_interest *cbi)
-{
- struct afs_cb_interest *old_cbi = NULL;
-
- if (vnode->cb_interest == cbi)
- return;
-
- write_seqlock(&vnode->cb_lock);
- if (vnode->cb_interest != cbi) {
- afs_get_cb_interest(cbi);
- old_cbi = vnode->cb_interest;
- vnode->cb_interest = cbi;
- }
- write_sequnlock(&vnode->cb_lock);
- afs_put_cb_interest(afs_v2net(vnode), cbi);
-}
-
-/*
* Remove an interest on a server.
*/
void afs_put_cb_interest(struct afs_net *net, struct afs_cb_interest *cbi)
@@ -150,6 +155,7 @@ void afs_break_callback(struct afs_vnode *vnode)
write_seqlock(&vnode->cb_lock);
+ clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
vnode->cb_break++;
afs_clear_permits(vnode);
@@ -190,13 +196,24 @@ static void afs_break_one_callback(struct afs_server *server,
if (cbi->vid != fid->vid)
continue;
- data.volume = NULL;
- data.fid = *fid;
- inode = ilookup5_nowait(cbi->sb, fid->vnode, afs_iget5_test, &data);
- if (inode) {
- vnode = AFS_FS_I(inode);
- afs_break_callback(vnode);
- iput(inode);
+ if (fid->vnode == 0 && fid->unique == 0) {
+ /* The callback break applies to an entire volume. */
+ struct afs_super_info *as = AFS_FS_S(cbi->sb);
+ struct afs_volume *volume = as->volume;
+
+ write_lock(&volume->cb_break_lock);
+ volume->cb_v_break++;
+ write_unlock(&volume->cb_break_lock);
+ } else {
+ data.volume = NULL;
+ data.fid = *fid;
+ inode = ilookup5_nowait(cbi->sb, fid->vnode,
+ afs_iget5_test, &data);
+ if (inode) {
+ vnode = AFS_FS_I(inode);
+ afs_break_callback(vnode);
+ iput(inode);
+ }
}
}
@@ -207,21 +224,23 @@ static void afs_break_one_callback(struct afs_server *server,
* allow the fileserver to break callback promises
*/
void afs_break_callbacks(struct afs_server *server, size_t count,
- struct afs_callback callbacks[])
+ struct afs_callback_break *callbacks)
{
_enter("%p,%zu,", server, count);
ASSERT(server != NULL);
ASSERTCMP(count, <=, AFSCBMAX);
+ /* TODO: Sort the callback break list by volume ID */
+
for (; count > 0; callbacks++, count--) {
_debug("- Fid { vl=%08x n=%u u=%u } CB { v=%u x=%u t=%u }",
callbacks->fid.vid,
callbacks->fid.vnode,
callbacks->fid.unique,
- callbacks->version,
- callbacks->expiry,
- callbacks->type
+ callbacks->cb.version,
+ callbacks->cb.expiry,
+ callbacks->cb.type
);
afs_break_one_callback(server, &callbacks->fid);
}
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 3d2c5e0e854e..fdf4c36cff79 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -18,7 +18,7 @@
#include <keys/rxrpc-type.h>
#include "internal.h"
-unsigned __read_mostly afs_cell_gc_delay = 10;
+static unsigned __read_mostly afs_cell_gc_delay = 10;
static void afs_manage_cell(struct work_struct *);
@@ -75,7 +75,7 @@ struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net,
cell = rcu_dereference_raw(net->ws_cell);
if (cell) {
afs_get_cell(cell);
- continue;
+ break;
}
ret = -EDESTADDRREQ;
continue;
@@ -130,6 +130,8 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
_leave(" = -ENAMETOOLONG");
return ERR_PTR(-ENAMETOOLONG);
}
+ if (namelen == 5 && memcmp(name, "@cell", 5) == 0)
+ return ERR_PTR(-EINVAL);
_enter("%*.*s,%s", namelen, namelen, name, vllist);
@@ -334,8 +336,8 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
return PTR_ERR(new_root);
}
- set_bit(AFS_CELL_FL_NO_GC, &new_root->flags);
- afs_get_cell(new_root);
+ if (!test_and_set_bit(AFS_CELL_FL_NO_GC, &new_root->flags))
+ afs_get_cell(new_root);
/* install the new cell */
write_seqlock(&net->cells_lock);
@@ -411,7 +413,7 @@ static void afs_cell_destroy(struct rcu_head *rcu)
ASSERTCMP(atomic_read(&cell->usage), ==, 0);
- afs_put_addrlist(cell->vl_addrs);
+ afs_put_addrlist(rcu_access_pointer(cell->vl_addrs));
key_put(cell->anonymous_key);
kfree(cell);
@@ -522,7 +524,9 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
#ifdef CONFIG_AFS_FSCACHE
cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
&afs_cell_cache_index_def,
- cell, true);
+ cell->name, strlen(cell->name),
+ NULL, 0,
+ cell, 0, true);
#endif
ret = afs_proc_cell_setup(net, cell);
if (ret < 0)
@@ -547,7 +551,7 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell)
spin_unlock(&net->proc_cells_lock);
#ifdef CONFIG_AFS_FSCACHE
- fscache_relinquish_cookie(cell->cache, 0);
+ fscache_relinquish_cookie(cell->cache, NULL, false);
cell->cache = NULL;
#endif
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 41e277f57b20..c332c95a6940 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -133,21 +133,10 @@ bool afs_cm_incoming_call(struct afs_call *call)
}
/*
- * clean up a cache manager call
+ * Clean up a cache manager call.
*/
static void afs_cm_destructor(struct afs_call *call)
{
- _enter("");
-
- /* Break the callbacks here so that we do it after the final ACK is
- * received. The step number here must match the final number in
- * afs_deliver_cb_callback().
- */
- if (call->unmarshall == 5) {
- ASSERT(call->cm_server && call->count && call->request);
- afs_break_callbacks(call->cm_server, call->count, call->request);
- }
-
kfree(call->buffer);
call->buffer = NULL;
}
@@ -161,14 +150,14 @@ static void SRXAFSCB_CallBack(struct work_struct *work)
_enter("");
- /* be sure to send the reply *before* attempting to spam the AFS server
- * with FSFetchStatus requests on the vnodes with broken callbacks lest
- * the AFS server get into a vicious cycle of trying to break further
- * callbacks because it hadn't received completion of the CBCallBack op
- * yet */
- afs_send_empty_reply(call);
+ /* We need to break the callbacks before sending the reply as the
+ * server holds up change visibility till it receives our reply so as
+ * to maintain cache coherency.
+ */
+ if (call->cm_server)
+ afs_break_callbacks(call->cm_server, call->count, call->request);
- afs_break_callbacks(call->cm_server, call->count, call->request);
+ afs_send_empty_reply(call);
afs_put_call(call);
_leave("");
}
@@ -178,9 +167,8 @@ static void SRXAFSCB_CallBack(struct work_struct *work)
*/
static int afs_deliver_cb_callback(struct afs_call *call)
{
+ struct afs_callback_break *cb;
struct sockaddr_rxrpc srx;
- struct afs_callback *cb;
- struct afs_server *server;
__be32 *bp;
int ret, loop;
@@ -201,7 +189,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
call->count = ntohl(call->tmp);
_debug("FID count: %u", call->count);
if (call->count > AFSCBMAX)
- return -EBADMSG;
+ return afs_protocol_error(call, -EBADMSG);
call->buffer = kmalloc(call->count * 3 * 4, GFP_KERNEL);
if (!call->buffer)
@@ -218,7 +206,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
_debug("unmarshall FID array");
call->request = kcalloc(call->count,
- sizeof(struct afs_callback),
+ sizeof(struct afs_callback_break),
GFP_KERNEL);
if (!call->request)
return -ENOMEM;
@@ -229,7 +217,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
cb->fid.vid = ntohl(*bp++);
cb->fid.vnode = ntohl(*bp++);
cb->fid.unique = ntohl(*bp++);
- cb->type = AFSCM_CB_UNTYPED;
+ cb->cb.type = AFSCM_CB_UNTYPED;
}
call->offset = 0;
@@ -245,7 +233,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
call->count2 = ntohl(call->tmp);
_debug("CB count: %u", call->count2);
if (call->count2 != call->count && call->count2 != 0)
- return -EBADMSG;
+ return afs_protocol_error(call, -EBADMSG);
call->offset = 0;
call->unmarshall++;
@@ -260,22 +248,13 @@ static int afs_deliver_cb_callback(struct afs_call *call)
cb = call->request;
bp = call->buffer;
for (loop = call->count2; loop > 0; loop--, cb++) {
- cb->version = ntohl(*bp++);
- cb->expiry = ntohl(*bp++);
- cb->type = ntohl(*bp++);
+ cb->cb.version = ntohl(*bp++);
+ cb->cb.expiry = ntohl(*bp++);
+ cb->cb.type = ntohl(*bp++);
}
call->offset = 0;
call->unmarshall++;
-
- /* Record that the message was unmarshalled successfully so
- * that the call destructor can know do the callback breaking
- * work, even if the final ACK isn't received.
- *
- * If the step number changes, then afs_cm_destructor() must be
- * updated also.
- */
- call->unmarshall++;
case 5:
break;
}
@@ -286,10 +265,9 @@ static int afs_deliver_cb_callback(struct afs_call *call)
/* we'll need the file server record as that tells us which set of
* vnodes to operate upon */
rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
- server = afs_find_server(call->net, &srx);
- if (!server)
- return -ENOTCONN;
- call->cm_server = server;
+ call->cm_server = afs_find_server(call->net, &srx);
+ if (!call->cm_server)
+ trace_afs_cm_no_server(call, &srx);
return afs_queue_call_work(call);
}
@@ -303,7 +281,8 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work)
_enter("{%p}", call->cm_server);
- afs_init_callback_state(call->cm_server);
+ if (call->cm_server)
+ afs_init_callback_state(call->cm_server);
afs_send_empty_reply(call);
afs_put_call(call);
_leave("");
@@ -315,7 +294,6 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work)
static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
{
struct sockaddr_rxrpc srx;
- struct afs_server *server;
int ret;
_enter("");
@@ -328,10 +306,9 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
/* we'll need the file server record as that tells us which set of
* vnodes to operate upon */
- server = afs_find_server(call->net, &srx);
- if (!server)
- return -ENOTCONN;
- call->cm_server = server;
+ call->cm_server = afs_find_server(call->net, &srx);
+ if (!call->cm_server)
+ trace_afs_cm_no_server(call, &srx);
return afs_queue_call_work(call);
}
@@ -341,8 +318,6 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
*/
static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
{
- struct sockaddr_rxrpc srx;
- struct afs_server *server;
struct afs_uuid *r;
unsigned loop;
__be32 *b;
@@ -398,11 +373,11 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
/* we'll need the file server record as that tells us which set of
* vnodes to operate upon */
- rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
- server = afs_find_server(call->net, &srx);
- if (!server)
- return -ENOTCONN;
- call->cm_server = server;
+ rcu_read_lock();
+ call->cm_server = afs_find_server_by_uuid(call->net, call->request);
+ rcu_read_unlock();
+ if (!call->cm_server)
+ trace_afs_cm_no_server_u(call, call->request);
return afs_queue_call_work(call);
}
@@ -500,9 +475,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
b = call->buffer;
r = call->request;
- r->time_low = ntohl(b[0]);
- r->time_mid = ntohl(b[1]);
- r->time_hi_and_version = ntohl(b[2]);
+ r->time_low = b[0];
+ r->time_mid = htons(ntohl(b[1]));
+ r->time_hi_and_version = htons(ntohl(b[2]));
r->clock_seq_hi_and_reserved = ntohl(b[3]);
r->clock_seq_low = ntohl(b[4]);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index ba2b458b36d1..7d623008157f 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1,6 +1,6 @@
/* dir.c: AFS filesystem directory handling
*
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2018 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
@@ -10,27 +10,26 @@
*/
#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
+#include <linux/swap.h>
#include <linux/ctype.h>
#include <linux/sched.h>
-#include <linux/dns_resolver.h>
+#include <linux/task_io_accounting_ops.h>
#include "internal.h"
+#include "xdr_fs.h"
static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
-static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags);
static int afs_dir_open(struct inode *inode, struct file *file);
static int afs_readdir(struct file *file, struct dir_context *ctx);
static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
static int afs_d_delete(const struct dentry *dentry);
-static void afs_d_release(struct dentry *dentry);
-static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen,
+static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen,
loff_t fpos, u64 ino, unsigned dtype);
+static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen,
+ loff_t fpos, u64 ino, unsigned dtype);
static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl);
static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
@@ -43,6 +42,14 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags);
+static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags);
+static void afs_dir_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length);
+
+static int afs_dir_set_page_dirty(struct page *page)
+{
+ BUG(); /* This should never happen. */
+}
const struct file_operations afs_dir_file_operations = {
.open = afs_dir_open,
@@ -67,15 +74,10 @@ const struct inode_operations afs_dir_inode_operations = {
.listxattr = afs_listxattr,
};
-const struct file_operations afs_dynroot_file_operations = {
- .open = dcache_dir_open,
- .release = dcache_dir_close,
- .iterate_shared = dcache_readdir,
- .llseek = dcache_dir_lseek,
-};
-
-const struct inode_operations afs_dynroot_inode_operations = {
- .lookup = afs_dynroot_lookup,
+const struct address_space_operations afs_dir_aops = {
+ .set_page_dirty = afs_dir_set_page_dirty,
+ .releasepage = afs_dir_releasepage,
+ .invalidatepage = afs_dir_invalidatepage,
};
const struct dentry_operations afs_fs_dentry_operations = {
@@ -85,91 +87,38 @@ const struct dentry_operations afs_fs_dentry_operations = {
.d_automount = afs_d_automount,
};
-#define AFS_DIR_HASHTBL_SIZE 128
-#define AFS_DIR_DIRENT_SIZE 32
-#define AFS_DIRENT_PER_BLOCK 64
-
-union afs_dirent {
- struct {
- uint8_t valid;
- uint8_t unused[1];
- __be16 hash_next;
- __be32 vnode;
- __be32 unique;
- uint8_t name[16];
- uint8_t overflow[4]; /* if any char of the name (inc
- * NUL) reaches here, consume
- * the next dirent too */
- } u;
- uint8_t extended_name[32];
-};
-
-/* AFS directory page header (one at the beginning of every 2048-byte chunk) */
-struct afs_dir_pagehdr {
- __be16 npages;
- __be16 magic;
-#define AFS_DIR_MAGIC htons(1234)
- uint8_t nentries;
- uint8_t bitmap[8];
- uint8_t pad[19];
-};
-
-/* directory block layout */
-union afs_dir_block {
-
- struct afs_dir_pagehdr pagehdr;
-
- struct {
- struct afs_dir_pagehdr pagehdr;
- uint8_t alloc_ctrs[128];
- /* dir hash table */
- uint16_t hashtable[AFS_DIR_HASHTBL_SIZE];
- } hdr;
-
- union afs_dirent dirents[AFS_DIRENT_PER_BLOCK];
-};
-
-/* layout on a linux VM page */
-struct afs_dir_page {
- union afs_dir_block blocks[PAGE_SIZE / sizeof(union afs_dir_block)];
+struct afs_lookup_one_cookie {
+ struct dir_context ctx;
+ struct qstr name;
+ bool found;
+ struct afs_fid fid;
};
struct afs_lookup_cookie {
- struct dir_context ctx;
- struct afs_fid fid;
- struct qstr name;
- int found;
+ struct dir_context ctx;
+ struct qstr name;
+ bool found;
+ bool one_only;
+ unsigned short nr_fids;
+ struct afs_file_status *statuses;
+ struct afs_callback *callbacks;
+ struct afs_fid fids[50];
};
/*
* check that a directory page is valid
*/
-bool afs_dir_check_page(struct inode *dir, struct page *page)
+static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page,
+ loff_t i_size)
{
- struct afs_dir_page *dbuf;
- struct afs_vnode *vnode = AFS_FS_I(dir);
- loff_t latter, i_size, off;
+ struct afs_xdr_dir_page *dbuf;
+ loff_t latter, off;
int tmp, qty;
-#if 0
- /* check the page count */
- qty = desc.size / sizeof(dbuf->blocks[0]);
- if (qty == 0)
- goto error;
-
- if (page->index == 0 && qty != ntohs(dbuf->blocks[0].pagehdr.npages)) {
- printk("kAFS: %s(%lu): wrong number of dir blocks %d!=%hu\n",
- __func__, dir->i_ino, qty,
- ntohs(dbuf->blocks[0].pagehdr.npages));
- goto error;
- }
-#endif
-
/* Determine how many magic numbers there should be in this page, but
* we must take care because the directory may change size under us.
*/
off = page_offset(page);
- i_size = i_size_read(dir);
if (i_size <= off)
goto checked;
@@ -178,112 +127,222 @@ bool afs_dir_check_page(struct inode *dir, struct page *page)
qty = PAGE_SIZE;
else
qty = latter;
- qty /= sizeof(union afs_dir_block);
+ qty /= sizeof(union afs_xdr_dir_block);
/* check them */
- dbuf = page_address(page);
+ dbuf = kmap(page);
for (tmp = 0; tmp < qty; tmp++) {
- if (dbuf->blocks[tmp].pagehdr.magic != AFS_DIR_MAGIC) {
+ if (dbuf->blocks[tmp].hdr.magic != AFS_DIR_MAGIC) {
printk("kAFS: %s(%lx): bad magic %d/%d is %04hx\n",
- __func__, dir->i_ino, tmp, qty,
- ntohs(dbuf->blocks[tmp].pagehdr.magic));
- trace_afs_dir_check_failed(vnode, off, i_size);
+ __func__, dvnode->vfs_inode.i_ino, tmp, qty,
+ ntohs(dbuf->blocks[tmp].hdr.magic));
+ trace_afs_dir_check_failed(dvnode, off, i_size);
+ kunmap(page);
goto error;
}
+
+ /* Make sure each block is NUL terminated so we can reasonably
+ * use string functions on it. The filenames in the page
+ * *should* be NUL-terminated anyway.
+ */
+ ((u8 *)&dbuf->blocks[tmp])[AFS_DIR_BLOCK_SIZE - 1] = 0;
}
+ kunmap(page);
+
checked:
- SetPageChecked(page);
+ afs_stat_v(dvnode, n_read_dir);
return true;
error:
- SetPageError(page);
return false;
}
/*
- * discard a page cached in the pagecache
+ * open an AFS directory file
*/
-static inline void afs_dir_put_page(struct page *page)
+static int afs_dir_open(struct inode *inode, struct file *file)
{
- kunmap(page);
- unlock_page(page);
- put_page(page);
+ _enter("{%lu}", inode->i_ino);
+
+ BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048);
+ BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32);
+
+ if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(inode)->flags))
+ return -ENOENT;
+
+ return afs_open(inode, file);
}
/*
- * get a page into the pagecache
+ * Read the directory into the pagecache in one go, scrubbing the previous
+ * contents. The list of pages is returned, pinning them so that they don't
+ * get reclaimed during the iteration.
*/
-static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
- struct key *key)
+static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
+ __acquires(&dvnode->validate_lock)
{
- struct page *page;
- _enter("{%lu},%lu", dir->i_ino, index);
-
- page = read_cache_page(dir->i_mapping, index, afs_page_filler, key);
- if (!IS_ERR(page)) {
- lock_page(page);
- kmap(page);
- if (unlikely(!PageChecked(page))) {
- if (PageError(page))
- goto fail;
- }
+ struct afs_read *req;
+ loff_t i_size;
+ int nr_pages, nr_inline, i, n;
+ int ret = -ENOMEM;
+
+retry:
+ i_size = i_size_read(&dvnode->vfs_inode);
+ if (i_size < 2048)
+ return ERR_PTR(-EIO);
+ if (i_size > 2048 * 1024)
+ return ERR_PTR(-EFBIG);
+
+ _enter("%llu", i_size);
+
+ /* Get a request record to hold the page list. We want to hold it
+ * inline if we can, but we don't want to make an order 1 allocation.
+ */
+ nr_pages = (i_size + PAGE_SIZE - 1) / PAGE_SIZE;
+ nr_inline = nr_pages;
+ if (nr_inline > (PAGE_SIZE - sizeof(*req)) / sizeof(struct page *))
+ nr_inline = 0;
+
+ req = kzalloc(sizeof(*req) + sizeof(struct page *) * nr_inline,
+ GFP_KERNEL);
+ if (!req)
+ return ERR_PTR(-ENOMEM);
+
+ refcount_set(&req->usage, 1);
+ req->nr_pages = nr_pages;
+ req->actual_len = i_size; /* May change */
+ req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */
+ req->data_version = dvnode->status.data_version; /* May change */
+ if (nr_inline > 0) {
+ req->pages = req->array;
+ } else {
+ req->pages = kcalloc(nr_pages, sizeof(struct page *),
+ GFP_KERNEL);
+ if (!req->pages)
+ goto error;
}
- return page;
-fail:
- afs_dir_put_page(page);
- _leave(" = -EIO");
- return ERR_PTR(-EIO);
-}
+ /* Get a list of all the pages that hold or will hold the directory
+ * content. We need to fill in any gaps that we might find where the
+ * memory reclaimer has been at work. If there are any gaps, we will
+ * need to reread the entire directory contents.
+ */
+ i = 0;
+ do {
+ n = find_get_pages_contig(dvnode->vfs_inode.i_mapping, i,
+ req->nr_pages - i,
+ req->pages + i);
+ _debug("find %u at %u/%u", n, i, req->nr_pages);
+ if (n == 0) {
+ gfp_t gfp = dvnode->vfs_inode.i_mapping->gfp_mask;
+
+ if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ afs_stat_v(dvnode, n_inval);
+
+ ret = -ENOMEM;
+ req->pages[i] = __page_cache_alloc(gfp);
+ if (!req->pages[i])
+ goto error;
+ ret = add_to_page_cache_lru(req->pages[i],
+ dvnode->vfs_inode.i_mapping,
+ i, gfp);
+ if (ret < 0)
+ goto error;
+
+ set_page_private(req->pages[i], 1);
+ SetPagePrivate(req->pages[i]);
+ unlock_page(req->pages[i]);
+ i++;
+ } else {
+ i += n;
+ }
+ } while (i < req->nr_pages);
-/*
- * open an AFS directory file
- */
-static int afs_dir_open(struct inode *inode, struct file *file)
-{
- _enter("{%lu}", inode->i_ino);
+ /* If we're going to reload, we need to lock all the pages to prevent
+ * races.
+ */
+ ret = -ERESTARTSYS;
+ if (down_read_killable(&dvnode->validate_lock) < 0)
+ goto error;
- BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
- BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ goto success;
- if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(inode)->flags))
- return -ENOENT;
+ up_read(&dvnode->validate_lock);
+ if (down_write_killable(&dvnode->validate_lock) < 0)
+ goto error;
- return afs_open(inode, file);
+ if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {
+ ret = afs_fetch_data(dvnode, key, req);
+ if (ret < 0)
+ goto error_unlock;
+
+ task_io_account_read(PAGE_SIZE * req->nr_pages);
+
+ if (req->len < req->file_size)
+ goto content_has_grown;
+
+ /* Validate the data we just read. */
+ ret = -EIO;
+ for (i = 0; i < req->nr_pages; i++)
+ if (!afs_dir_check_page(dvnode, req->pages[i],
+ req->actual_len))
+ goto error_unlock;
+
+ // TODO: Trim excess pages
+
+ set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags);
+ }
+
+ downgrade_write(&dvnode->validate_lock);
+success:
+ return req;
+
+error_unlock:
+ up_write(&dvnode->validate_lock);
+error:
+ afs_put_read(req);
+ _leave(" = %d", ret);
+ return ERR_PTR(ret);
+
+content_has_grown:
+ up_write(&dvnode->validate_lock);
+ afs_put_read(req);
+ goto retry;
}
/*
* deal with one block in an AFS directory
*/
static int afs_dir_iterate_block(struct dir_context *ctx,
- union afs_dir_block *block,
+ union afs_xdr_dir_block *block,
unsigned blkoff)
{
- union afs_dirent *dire;
+ union afs_xdr_dirent *dire;
unsigned offset, next, curr;
size_t nlen;
int tmp;
_enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block);
- curr = (ctx->pos - blkoff) / sizeof(union afs_dirent);
+ curr = (ctx->pos - blkoff) / sizeof(union afs_xdr_dirent);
/* walk through the block, an entry at a time */
- for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries;
- offset < AFS_DIRENT_PER_BLOCK;
+ for (offset = (blkoff == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS);
+ offset < AFS_DIR_SLOTS_PER_BLOCK;
offset = next
) {
next = offset + 1;
/* skip entries marked unused in the bitmap */
- if (!(block->pagehdr.bitmap[offset / 8] &
+ if (!(block->hdr.bitmap[offset / 8] &
(1 << (offset % 8)))) {
_debug("ENT[%zu.%u]: unused",
- blkoff / sizeof(union afs_dir_block), offset);
+ blkoff / sizeof(union afs_xdr_dir_block), offset);
if (offset >= curr)
ctx->pos = blkoff +
- next * sizeof(union afs_dirent);
+ next * sizeof(union afs_xdr_dirent);
continue;
}
@@ -291,34 +350,34 @@ static int afs_dir_iterate_block(struct dir_context *ctx,
dire = &block->dirents[offset];
nlen = strnlen(dire->u.name,
sizeof(*block) -
- offset * sizeof(union afs_dirent));
+ offset * sizeof(union afs_xdr_dirent));
_debug("ENT[%zu.%u]: %s %zu \"%s\"",
- blkoff / sizeof(union afs_dir_block), offset,
+ blkoff / sizeof(union afs_xdr_dir_block), offset,
(offset < curr ? "skip" : "fill"),
nlen, dire->u.name);
/* work out where the next possible entry is */
- for (tmp = nlen; tmp > 15; tmp -= sizeof(union afs_dirent)) {
- if (next >= AFS_DIRENT_PER_BLOCK) {
+ for (tmp = nlen; tmp > 15; tmp -= sizeof(union afs_xdr_dirent)) {
+ if (next >= AFS_DIR_SLOTS_PER_BLOCK) {
_debug("ENT[%zu.%u]:"
" %u travelled beyond end dir block"
" (len %u/%zu)",
- blkoff / sizeof(union afs_dir_block),
+ blkoff / sizeof(union afs_xdr_dir_block),
offset, next, tmp, nlen);
return -EIO;
}
- if (!(block->pagehdr.bitmap[next / 8] &
+ if (!(block->hdr.bitmap[next / 8] &
(1 << (next % 8)))) {
_debug("ENT[%zu.%u]:"
" %u unmarked extension (len %u/%zu)",
- blkoff / sizeof(union afs_dir_block),
+ blkoff / sizeof(union afs_xdr_dir_block),
offset, next, tmp, nlen);
return -EIO;
}
_debug("ENT[%zu.%u]: ext %u/%zu",
- blkoff / sizeof(union afs_dir_block),
+ blkoff / sizeof(union afs_xdr_dir_block),
next, tmp, nlen);
next++;
}
@@ -330,13 +389,14 @@ static int afs_dir_iterate_block(struct dir_context *ctx,
/* found the next entry */
if (!dir_emit(ctx, dire->u.name, nlen,
ntohl(dire->u.vnode),
- ctx->actor == afs_lookup_filldir ?
+ (ctx->actor == afs_lookup_filldir ||
+ ctx->actor == afs_lookup_one_filldir)?
ntohl(dire->u.unique) : DT_UNKNOWN)) {
_leave(" = 0 [full]");
return 0;
}
- ctx->pos = blkoff + next * sizeof(union afs_dirent);
+ ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent);
}
_leave(" = 1 [more]");
@@ -349,8 +409,10 @@ static int afs_dir_iterate_block(struct dir_context *ctx,
static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
struct key *key)
{
- union afs_dir_block *dblock;
- struct afs_dir_page *dbuf;
+ struct afs_vnode *dvnode = AFS_FS_I(dir);
+ struct afs_xdr_dir_page *dbuf;
+ union afs_xdr_dir_block *dblock;
+ struct afs_read *req;
struct page *page;
unsigned blkoff, limit;
int ret;
@@ -362,45 +424,54 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
return -ESTALE;
}
+ req = afs_read_dir(dvnode, key);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
/* round the file position up to the next entry boundary */
- ctx->pos += sizeof(union afs_dirent) - 1;
- ctx->pos &= ~(sizeof(union afs_dirent) - 1);
+ ctx->pos += sizeof(union afs_xdr_dirent) - 1;
+ ctx->pos &= ~(sizeof(union afs_xdr_dirent) - 1);
/* walk through the blocks in sequence */
ret = 0;
- while (ctx->pos < dir->i_size) {
- blkoff = ctx->pos & ~(sizeof(union afs_dir_block) - 1);
+ while (ctx->pos < req->actual_len) {
+ blkoff = ctx->pos & ~(sizeof(union afs_xdr_dir_block) - 1);
- /* fetch the appropriate page from the directory */
- page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key);
- if (IS_ERR(page)) {
- ret = PTR_ERR(page);
+ /* Fetch the appropriate page from the directory and re-add it
+ * to the LRU.
+ */
+ page = req->pages[blkoff / PAGE_SIZE];
+ if (!page) {
+ ret = -EIO;
break;
}
+ mark_page_accessed(page);
limit = blkoff & ~(PAGE_SIZE - 1);
- dbuf = page_address(page);
+ dbuf = kmap(page);
/* deal with the individual blocks stashed on this page */
do {
dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) /
- sizeof(union afs_dir_block)];
+ sizeof(union afs_xdr_dir_block)];
ret = afs_dir_iterate_block(ctx, dblock, blkoff);
if (ret != 1) {
- afs_dir_put_page(page);
+ kunmap(page);
goto out;
}
- blkoff += sizeof(union afs_dir_block);
+ blkoff += sizeof(union afs_xdr_dir_block);
} while (ctx->pos < dir->i_size && blkoff < limit);
- afs_dir_put_page(page);
+ kunmap(page);
ret = 0;
}
out:
+ up_read(&dvnode->validate_lock);
+ afs_put_read(req);
_leave(" = %d", ret);
return ret;
}
@@ -414,23 +485,23 @@ static int afs_readdir(struct file *file, struct dir_context *ctx)
}
/*
- * search the directory for a name
+ * Search the directory for a single name
* - if afs_dir_iterate_block() spots this function, it'll pass the FID
* uniquifier through dtype
*/
-static int afs_lookup_filldir(struct dir_context *ctx, const char *name,
- int nlen, loff_t fpos, u64 ino, unsigned dtype)
+static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name,
+ int nlen, loff_t fpos, u64 ino, unsigned dtype)
{
- struct afs_lookup_cookie *cookie =
- container_of(ctx, struct afs_lookup_cookie, ctx);
+ struct afs_lookup_one_cookie *cookie =
+ container_of(ctx, struct afs_lookup_one_cookie, ctx);
_enter("{%s,%u},%s,%u,,%llu,%u",
cookie->name.name, cookie->name.len, name, nlen,
(unsigned long long) ino, dtype);
/* insanity checks first */
- BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
- BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
+ BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048);
+ BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32);
if (cookie->name.len != nlen ||
memcmp(cookie->name.name, name, nlen) != 0) {
@@ -447,15 +518,15 @@ static int afs_lookup_filldir(struct dir_context *ctx, const char *name,
}
/*
- * do a lookup in a directory
+ * Do a lookup of a single name in a directory
* - just returns the FID the dentry name maps to if found
*/
-static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
- struct afs_fid *fid, struct key *key)
+static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry,
+ struct afs_fid *fid, struct key *key)
{
struct afs_super_info *as = dir->i_sb->s_fs_info;
- struct afs_lookup_cookie cookie = {
- .ctx.actor = afs_lookup_filldir,
+ struct afs_lookup_one_cookie cookie = {
+ .ctx.actor = afs_lookup_one_filldir,
.name = dentry->d_name,
.fid.vid = as->volume->vid
};
@@ -482,70 +553,265 @@ static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
}
/*
- * Probe to see if a cell may exist. This prevents positive dentries from
- * being created unnecessarily.
+ * search the directory for a name
+ * - if afs_dir_iterate_block() spots this function, it'll pass the FID
+ * uniquifier through dtype
*/
-static int afs_probe_cell_name(struct dentry *dentry)
+static int afs_lookup_filldir(struct dir_context *ctx, const char *name,
+ int nlen, loff_t fpos, u64 ino, unsigned dtype)
{
- struct afs_cell *cell;
- const char *name = dentry->d_name.name;
- size_t len = dentry->d_name.len;
+ struct afs_lookup_cookie *cookie =
+ container_of(ctx, struct afs_lookup_cookie, ctx);
int ret;
- /* Names prefixed with a dot are R/W mounts. */
- if (name[0] == '.') {
- if (len == 1)
- return -EINVAL;
- name++;
- len--;
- }
+ _enter("{%s,%u},%s,%u,,%llu,%u",
+ cookie->name.name, cookie->name.len, name, nlen,
+ (unsigned long long) ino, dtype);
- cell = afs_lookup_cell_rcu(afs_d2net(dentry), name, len);
- if (!IS_ERR(cell)) {
- afs_put_cell(afs_d2net(dentry), cell);
- return 0;
+ /* insanity checks first */
+ BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048);
+ BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32);
+
+ if (cookie->found) {
+ if (cookie->nr_fids < 50) {
+ cookie->fids[cookie->nr_fids].vnode = ino;
+ cookie->fids[cookie->nr_fids].unique = dtype;
+ cookie->nr_fids++;
+ }
+ } else if (cookie->name.len == nlen &&
+ memcmp(cookie->name.name, name, nlen) == 0) {
+ cookie->fids[0].vnode = ino;
+ cookie->fids[0].unique = dtype;
+ cookie->found = 1;
+ if (cookie->one_only)
+ return -1;
}
- ret = dns_query("afsdb", name, len, "ipv4", NULL, NULL);
- if (ret == -ENODATA)
- ret = -EDESTADDRREQ;
+ ret = cookie->nr_fids >= 50 ? -1 : 0;
+ _leave(" = %d", ret);
return ret;
}
/*
- * Try to auto mount the mountpoint with pseudo directory, if the autocell
- * operation is setted.
+ * Do a lookup in a directory. We make use of bulk lookup to query a slew of
+ * files in one go and create inodes for them. The inode of the file we were
+ * asked for is returned.
*/
-static struct inode *afs_try_auto_mntpt(struct dentry *dentry,
- struct inode *dir, struct afs_fid *fid)
+static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
+ struct key *key)
{
- struct afs_vnode *vnode = AFS_FS_I(dir);
- struct inode *inode;
- int ret = -ENOENT;
+ struct afs_lookup_cookie *cookie;
+ struct afs_cb_interest *cbi = NULL;
+ struct afs_super_info *as = dir->i_sb->s_fs_info;
+ struct afs_iget_data data;
+ struct afs_fs_cursor fc;
+ struct afs_vnode *dvnode = AFS_FS_I(dir);
+ struct inode *inode = NULL;
+ int ret, i;
+
+ _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry);
+
+ cookie = kzalloc(sizeof(struct afs_lookup_cookie), GFP_KERNEL);
+ if (!cookie)
+ return ERR_PTR(-ENOMEM);
+
+ cookie->ctx.actor = afs_lookup_filldir;
+ cookie->name = dentry->d_name;
+ cookie->nr_fids = 1; /* slot 0 is saved for the fid we actually want */
- _enter("%p{%pd}, {%x:%u}",
- dentry, dentry, vnode->fid.vid, vnode->fid.vnode);
+ read_seqlock_excl(&dvnode->cb_lock);
+ if (dvnode->cb_interest &&
+ dvnode->cb_interest->server &&
+ test_bit(AFS_SERVER_FL_NO_IBULK, &dvnode->cb_interest->server->flags))
+ cookie->one_only = true;
+ read_sequnlock_excl(&dvnode->cb_lock);
- if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
+ for (i = 0; i < 50; i++)
+ cookie->fids[i].vid = as->volume->vid;
+
+ /* search the directory */
+ ret = afs_dir_iterate(dir, &cookie->ctx, key);
+ if (ret < 0) {
+ inode = ERR_PTR(ret);
goto out;
+ }
- ret = afs_probe_cell_name(dentry);
- if (ret < 0)
+ inode = ERR_PTR(-ENOENT);
+ if (!cookie->found)
goto out;
- inode = afs_iget_pseudo_dir(dir->i_sb, false);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
+ /* Check to see if we already have an inode for the primary fid. */
+ data.volume = dvnode->volume;
+ data.fid = cookie->fids[0];
+ inode = ilookup5(dir->i_sb, cookie->fids[0].vnode, afs_iget5_test, &data);
+ if (inode)
+ goto out;
+
+ /* Need space for examining all the selected files */
+ inode = ERR_PTR(-ENOMEM);
+ cookie->statuses = kcalloc(cookie->nr_fids, sizeof(struct afs_file_status),
+ GFP_KERNEL);
+ if (!cookie->statuses)
goto out;
+
+ cookie->callbacks = kcalloc(cookie->nr_fids, sizeof(struct afs_callback),
+ GFP_KERNEL);
+ if (!cookie->callbacks)
+ goto out_s;
+
+ /* Try FS.InlineBulkStatus first. Abort codes for the individual
+ * lookups contained therein are stored in the reply without aborting
+ * the whole operation.
+ */
+ if (cookie->one_only)
+ goto no_inline_bulk_status;
+
+ inode = ERR_PTR(-ERESTARTSYS);
+ if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+ while (afs_select_fileserver(&fc)) {
+ if (test_bit(AFS_SERVER_FL_NO_IBULK,
+ &fc.cbi->server->flags)) {
+ fc.ac.abort_code = RX_INVALID_OPERATION;
+ fc.ac.error = -ECONNABORTED;
+ break;
+ }
+ afs_fs_inline_bulk_status(&fc,
+ afs_v2net(dvnode),
+ cookie->fids,
+ cookie->statuses,
+ cookie->callbacks,
+ cookie->nr_fids, NULL);
+ }
+
+ if (fc.ac.error == 0)
+ cbi = afs_get_cb_interest(fc.cbi);
+ if (fc.ac.abort_code == RX_INVALID_OPERATION)
+ set_bit(AFS_SERVER_FL_NO_IBULK, &fc.cbi->server->flags);
+ inode = ERR_PTR(afs_end_vnode_operation(&fc));
}
- *fid = AFS_FS_I(inode)->fid;
- _leave("= %p", inode);
- return inode;
+ if (!IS_ERR(inode))
+ goto success;
+ if (fc.ac.abort_code != RX_INVALID_OPERATION)
+ goto out_c;
+no_inline_bulk_status:
+ /* We could try FS.BulkStatus next, but this aborts the entire op if
+ * any of the lookups fails - so, for the moment, revert to
+ * FS.FetchStatus for just the primary fid.
+ */
+ cookie->nr_fids = 1;
+ inode = ERR_PTR(-ERESTARTSYS);
+ if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+ while (afs_select_fileserver(&fc)) {
+ afs_fs_fetch_status(&fc,
+ afs_v2net(dvnode),
+ cookie->fids,
+ cookie->statuses,
+ cookie->callbacks,
+ NULL);
+ }
+
+ if (fc.ac.error == 0)
+ cbi = afs_get_cb_interest(fc.cbi);
+ inode = ERR_PTR(afs_end_vnode_operation(&fc));
+ }
+
+ if (IS_ERR(inode))
+ goto out_c;
+
+ for (i = 0; i < cookie->nr_fids; i++)
+ cookie->statuses[i].abort_code = 0;
+
+success:
+ /* Turn all the files into inodes and save the first one - which is the
+ * one we actually want.
+ */
+ if (cookie->statuses[0].abort_code != 0)
+ inode = ERR_PTR(afs_abort_to_error(cookie->statuses[0].abort_code));
+
+ for (i = 0; i < cookie->nr_fids; i++) {
+ struct inode *ti;
+
+ if (cookie->statuses[i].abort_code != 0)
+ continue;
+
+ ti = afs_iget(dir->i_sb, key, &cookie->fids[i],
+ &cookie->statuses[i],
+ &cookie->callbacks[i],
+ cbi);
+ if (i == 0) {
+ inode = ti;
+ } else {
+ if (!IS_ERR(ti))
+ iput(ti);
+ }
+ }
+
+out_c:
+ afs_put_cb_interest(afs_v2net(dvnode), cbi);
+ kfree(cookie->callbacks);
+out_s:
+ kfree(cookie->statuses);
out:
- _leave("= %d", ret);
- return ERR_PTR(ret);
+ kfree(cookie);
+ return inode;
+}
+
+/*
+ * Look up an entry in a directory with @sys substitution.
+ */
+static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry,
+ struct key *key)
+{
+ struct afs_sysnames *subs;
+ struct afs_net *net = afs_i2net(dir);
+ struct dentry *ret;
+ char *buf, *p, *name;
+ int len, i;
+
+ _enter("");
+
+ ret = ERR_PTR(-ENOMEM);
+ p = buf = kmalloc(AFSNAMEMAX, GFP_KERNEL);
+ if (!buf)
+ goto out_p;
+ if (dentry->d_name.len > 4) {
+ memcpy(p, dentry->d_name.name, dentry->d_name.len - 4);
+ p += dentry->d_name.len - 4;
+ }
+
+ /* There is an ordered list of substitutes that we have to try. */
+ read_lock(&net->sysnames_lock);
+ subs = net->sysnames;
+ refcount_inc(&subs->usage);
+ read_unlock(&net->sysnames_lock);
+
+ for (i = 0; i < subs->nr; i++) {
+ name = subs->subs[i];
+ len = dentry->d_name.len - 4 + strlen(name);
+ if (len >= AFSNAMEMAX) {
+ ret = ERR_PTR(-ENAMETOOLONG);
+ goto out_s;
+ }
+
+ strcpy(p, name);
+ ret = lookup_one_len(buf, dentry->d_parent, len);
+ if (IS_ERR(ret) || d_is_positive(ret))
+ goto out_s;
+ dput(ret);
+ }
+
+ /* We don't want to d_add() the @sys dentry here as we don't want to
+ * the cached dentry to hide changes to the sysnames list.
+ */
+ ret = NULL;
+out_s:
+ afs_put_sysnames(subs);
+ kfree(buf);
+out_p:
+ key_put(key);
+ return ret;
}
/*
@@ -554,16 +820,13 @@ out:
static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
- struct afs_vnode *vnode;
- struct afs_fid fid;
+ struct afs_vnode *dvnode = AFS_FS_I(dir);
struct inode *inode;
struct key *key;
int ret;
- vnode = AFS_FS_I(dir);
-
_enter("{%x:%u},%p{%pd},",
- vnode->fid.vid, vnode->fid.vnode, dentry, dentry);
+ dvnode->fid.vid, dvnode->fid.vnode, dentry, dentry);
ASSERTCMP(d_inode(dentry), ==, NULL);
@@ -572,28 +835,37 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
return ERR_PTR(-ENAMETOOLONG);
}
- if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+ if (test_bit(AFS_VNODE_DELETED, &dvnode->flags)) {
_leave(" = -ESTALE");
return ERR_PTR(-ESTALE);
}
- key = afs_request_key(vnode->volume->cell);
+ key = afs_request_key(dvnode->volume->cell);
if (IS_ERR(key)) {
_leave(" = %ld [key]", PTR_ERR(key));
return ERR_CAST(key);
}
- ret = afs_validate(vnode, key);
+ ret = afs_validate(dvnode, key);
if (ret < 0) {
key_put(key);
_leave(" = %d [val]", ret);
return ERR_PTR(ret);
}
- ret = afs_do_lookup(dir, dentry, &fid, key);
- if (ret < 0) {
+ if (dentry->d_name.len >= 4 &&
+ dentry->d_name.name[dentry->d_name.len - 4] == '@' &&
+ dentry->d_name.name[dentry->d_name.len - 3] == 's' &&
+ dentry->d_name.name[dentry->d_name.len - 2] == 'y' &&
+ dentry->d_name.name[dentry->d_name.len - 1] == 's')
+ return afs_lookup_atsys(dir, dentry, key);
+
+ afs_stat_v(dvnode, n_lookup);
+ inode = afs_do_lookup(dir, dentry, key);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
if (ret == -ENOENT) {
- inode = afs_try_auto_mntpt(dentry, dir, &fid);
+ inode = afs_try_auto_mntpt(dentry, dir);
if (!IS_ERR(inode)) {
key_put(key);
goto success;
@@ -611,10 +883,9 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
_leave(" = %d [do]", ret);
return ERR_PTR(ret);
}
- dentry->d_fsdata = (void *)(unsigned long) vnode->status.data_version;
+ dentry->d_fsdata = (void *)(unsigned long)dvnode->status.data_version;
/* instantiate the dentry */
- inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL, NULL);
key_put(key);
if (IS_ERR(inode)) {
_leave(" = %ld", PTR_ERR(inode));
@@ -623,9 +894,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
success:
d_add(dentry, inode);
- _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%u }",
- fid.vnode,
- fid.unique,
+ _leave(" = 0 { ino=%lu v=%u }",
d_inode(dentry)->i_ino,
d_inode(dentry)->i_generation);
@@ -633,67 +902,23 @@ success:
}
/*
- * Look up an entry in a dynroot directory.
- */
-static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
-{
- struct afs_vnode *vnode;
- struct afs_fid fid;
- struct inode *inode;
- int ret;
-
- vnode = AFS_FS_I(dir);
-
- _enter("%pd", dentry);
-
- ASSERTCMP(d_inode(dentry), ==, NULL);
-
- if (dentry->d_name.len >= AFSNAMEMAX) {
- _leave(" = -ENAMETOOLONG");
- return ERR_PTR(-ENAMETOOLONG);
- }
-
- inode = afs_try_auto_mntpt(dentry, dir, &fid);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- if (ret == -ENOENT) {
- d_add(dentry, NULL);
- _leave(" = NULL [negative]");
- return NULL;
- }
- _leave(" = %d [do]", ret);
- return ERR_PTR(ret);
- }
-
- d_add(dentry, inode);
- _leave(" = 0 { ino=%lu v=%u }",
- d_inode(dentry)->i_ino, d_inode(dentry)->i_generation);
- return NULL;
-}
-
-/*
* check that a dentry lookup hit has found a valid entry
* - NOTE! the hit can be a negative hit too, so we can't assume we have an
* inode
*/
static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
{
- struct afs_super_info *as = dentry->d_sb->s_fs_info;
struct afs_vnode *vnode, *dir;
struct afs_fid uninitialized_var(fid);
struct dentry *parent;
struct inode *inode;
struct key *key;
- void *dir_version;
+ long dir_version, de_version;
int ret;
if (flags & LOOKUP_RCU)
return -ECHILD;
- if (as->dyn_root)
- return 1;
-
if (d_really_is_positive(dentry)) {
vnode = AFS_FS_I(d_inode(dentry));
_enter("{v={%x:%u} n=%pd fl=%lx},",
@@ -729,14 +954,25 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
goto out_bad_parent;
}
- dir_version = (void *) (unsigned long) dir->status.data_version;
- if (dentry->d_fsdata == dir_version)
- goto out_valid; /* the dir contents are unchanged */
+ /* We only need to invalidate a dentry if the server's copy changed
+ * behind our back. If we made the change, it's no problem. Note that
+ * on a 32-bit system, we only have 32 bits in the dentry to store the
+ * version.
+ */
+ dir_version = (long)dir->status.data_version;
+ de_version = (long)dentry->d_fsdata;
+ if (de_version == dir_version)
+ goto out_valid;
+
+ dir_version = (long)dir->invalid_before;
+ if (de_version - dir_version >= 0)
+ goto out_valid;
_debug("dir modified");
+ afs_stat_v(dir, n_reval);
/* search the directory for this vnode */
- ret = afs_do_lookup(&dir->vfs_inode, dentry, &fid, key);
+ ret = afs_do_lookup_one(&dir->vfs_inode, dentry, &fid, key);
switch (ret) {
case 0:
/* the filename maps to something */
@@ -789,7 +1025,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
}
out_valid:
- dentry->d_fsdata = dir_version;
+ dentry->d_fsdata = (void *)dir_version;
dput(parent);
key_put(key);
_leave(" = 1 [valid]");
@@ -840,7 +1076,7 @@ zap:
/*
* handle dentry release
*/
-static void afs_d_release(struct dentry *dentry)
+void afs_d_release(struct dentry *dentry)
{
_enter("%pd", dentry);
}
@@ -854,6 +1090,7 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc,
struct afs_file_status *newstatus,
struct afs_callback *newcb)
{
+ struct afs_vnode *vnode;
struct inode *inode;
if (fc->ac.error < 0)
@@ -871,6 +1108,8 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc,
return;
}
+ vnode = AFS_FS_I(inode);
+ set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
d_add(new_dentry, inode);
}
@@ -885,6 +1124,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct afs_vnode *dvnode = AFS_FS_I(dir);
struct afs_fid newfid;
struct key *key;
+ u64 data_version = dvnode->status.data_version;
int ret;
mode |= S_IFDIR;
@@ -901,8 +1141,8 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key)) {
while (afs_select_fileserver(&fc)) {
- fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
- afs_fs_create(&fc, dentry->d_name.name, mode,
+ fc.cb_break = afs_calc_vnode_cb_break(dvnode);
+ afs_fs_create(&fc, dentry->d_name.name, mode, data_version,
&newfid, &newstatus, &newcb);
}
@@ -916,6 +1156,11 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
goto error_key;
}
+ if (ret == 0 &&
+ test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ afs_edit_dir_add(dvnode, &dentry->d_name, &newfid,
+ afs_edit_dir_for_create);
+
key_put(key);
_leave(" = 0");
return 0;
@@ -939,6 +1184,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry)
clear_nlink(&vnode->vfs_inode);
set_bit(AFS_VNODE_DELETED, &vnode->flags);
clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
}
}
@@ -950,6 +1196,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
struct afs_fs_cursor fc;
struct afs_vnode *dvnode = AFS_FS_I(dir);
struct key *key;
+ u64 data_version = dvnode->status.data_version;
int ret;
_enter("{%x:%u},{%pd}",
@@ -964,14 +1211,19 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key)) {
while (afs_select_fileserver(&fc)) {
- fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
- afs_fs_remove(&fc, dentry->d_name.name, true);
+ fc.cb_break = afs_calc_vnode_cb_break(dvnode);
+ afs_fs_remove(&fc, dentry->d_name.name, true,
+ data_version);
}
afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
ret = afs_end_vnode_operation(&fc);
- if (ret == 0)
+ if (ret == 0) {
afs_dir_remove_subdir(dentry);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ afs_edit_dir_remove(dvnode, &dentry->d_name,
+ afs_edit_dir_for_rmdir);
+ }
}
key_put(key);
@@ -1036,6 +1288,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
struct key *key;
unsigned long d_version = (unsigned long)dentry->d_fsdata;
+ u64 data_version = dvnode->status.data_version;
int ret;
_enter("{%x:%u},{%pd}",
@@ -1061,8 +1314,9 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key)) {
while (afs_select_fileserver(&fc)) {
- fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
- afs_fs_remove(&fc, dentry->d_name.name, false);
+ fc.cb_break = afs_calc_vnode_cb_break(dvnode);
+ afs_fs_remove(&fc, dentry->d_name.name, false,
+ data_version);
}
afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
@@ -1071,6 +1325,10 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
ret = afs_dir_remove_link(
dentry, key, d_version,
(unsigned long)dvnode->status.data_version);
+ if (ret == 0 &&
+ test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ afs_edit_dir_remove(dvnode, &dentry->d_name,
+ afs_edit_dir_for_unlink);
}
error_key:
@@ -1092,6 +1350,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct afs_vnode *dvnode = AFS_FS_I(dir);
struct afs_fid newfid;
struct key *key;
+ u64 data_version = dvnode->status.data_version;
int ret;
mode |= S_IFREG;
@@ -1112,8 +1371,8 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key)) {
while (afs_select_fileserver(&fc)) {
- fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
- afs_fs_create(&fc, dentry->d_name.name, mode,
+ fc.cb_break = afs_calc_vnode_cb_break(dvnode);
+ afs_fs_create(&fc, dentry->d_name.name, mode, data_version,
&newfid, &newstatus, &newcb);
}
@@ -1127,6 +1386,10 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
goto error_key;
}
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ afs_edit_dir_add(dvnode, &dentry->d_name, &newfid,
+ afs_edit_dir_for_create);
+
key_put(key);
_leave(" = 0");
return 0;
@@ -1148,10 +1411,12 @@ static int afs_link(struct dentry *from, struct inode *dir,
struct afs_fs_cursor fc;
struct afs_vnode *dvnode, *vnode;
struct key *key;
+ u64 data_version;
int ret;
vnode = AFS_FS_I(d_inode(from));
dvnode = AFS_FS_I(dir);
+ data_version = dvnode->status.data_version;
_enter("{%x:%u},{%x:%u},{%pd}",
vnode->fid.vid, vnode->fid.vnode,
@@ -1176,9 +1441,9 @@ static int afs_link(struct dentry *from, struct inode *dir,
}
while (afs_select_fileserver(&fc)) {
- fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
- fc.cb_break_2 = vnode->cb_break + vnode->cb_s_break;
- afs_fs_link(&fc, vnode, dentry->d_name.name);
+ fc.cb_break = afs_calc_vnode_cb_break(dvnode);
+ fc.cb_break_2 = afs_calc_vnode_cb_break(vnode);
+ afs_fs_link(&fc, vnode, dentry->d_name.name, data_version);
}
afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
@@ -1194,6 +1459,10 @@ static int afs_link(struct dentry *from, struct inode *dir,
goto error_key;
}
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ afs_edit_dir_add(dvnode, &dentry->d_name, &vnode->fid,
+ afs_edit_dir_for_link);
+
key_put(key);
_leave(" = 0");
return 0;
@@ -1217,6 +1486,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
struct afs_vnode *dvnode = AFS_FS_I(dir);
struct afs_fid newfid;
struct key *key;
+ u64 data_version = dvnode->status.data_version;
int ret;
_enter("{%x:%u},{%pd},%s",
@@ -1240,8 +1510,9 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key)) {
while (afs_select_fileserver(&fc)) {
- fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
- afs_fs_symlink(&fc, dentry->d_name.name, content,
+ fc.cb_break = afs_calc_vnode_cb_break(dvnode);
+ afs_fs_symlink(&fc, dentry->d_name.name,
+ content, data_version,
&newfid, &newstatus);
}
@@ -1255,6 +1526,10 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
goto error_key;
}
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ afs_edit_dir_add(dvnode, &dentry->d_name, &newfid,
+ afs_edit_dir_for_symlink);
+
key_put(key);
_leave(" = 0");
return 0;
@@ -1277,6 +1552,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct afs_fs_cursor fc;
struct afs_vnode *orig_dvnode, *new_dvnode, *vnode;
struct key *key;
+ u64 orig_data_version, new_data_version;
+ bool new_negative = d_is_negative(new_dentry);
int ret;
if (flags)
@@ -1285,6 +1562,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
vnode = AFS_FS_I(d_inode(old_dentry));
orig_dvnode = AFS_FS_I(old_dir);
new_dvnode = AFS_FS_I(new_dir);
+ orig_data_version = orig_dvnode->status.data_version;
+ new_data_version = new_dvnode->status.data_version;
_enter("{%x:%u},{%x:%u},{%x:%u},{%pd}",
orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
@@ -1307,10 +1586,11 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
}
while (afs_select_fileserver(&fc)) {
- fc.cb_break = orig_dvnode->cb_break + orig_dvnode->cb_s_break;
- fc.cb_break_2 = new_dvnode->cb_break + new_dvnode->cb_s_break;
+ fc.cb_break = afs_calc_vnode_cb_break(orig_dvnode);
+ fc.cb_break_2 = afs_calc_vnode_cb_break(new_dvnode);
afs_fs_rename(&fc, old_dentry->d_name.name,
- new_dvnode, new_dentry->d_name.name);
+ new_dvnode, new_dentry->d_name.name,
+ orig_data_version, new_data_version);
}
afs_vnode_commit_status(&fc, orig_dvnode, fc.cb_break);
@@ -1322,9 +1602,68 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto error_key;
}
+ if (ret == 0) {
+ if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags))
+ afs_edit_dir_remove(orig_dvnode, &old_dentry->d_name,
+ afs_edit_dir_for_rename);
+
+ if (!new_negative &&
+ test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags))
+ afs_edit_dir_remove(new_dvnode, &new_dentry->d_name,
+ afs_edit_dir_for_rename);
+
+ if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags))
+ afs_edit_dir_add(new_dvnode, &new_dentry->d_name,
+ &vnode->fid, afs_edit_dir_for_rename);
+ }
+
error_key:
key_put(key);
error:
_leave(" = %d", ret);
return ret;
}
+
+/*
+ * Release a directory page and clean up its private state if it's not busy
+ * - return true if the page can now be released, false if not
+ */
+static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags)
+{
+ struct afs_vnode *dvnode = AFS_FS_I(page->mapping->host);
+
+ _enter("{{%x:%u}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index);
+
+ set_page_private(page, 0);
+ ClearPagePrivate(page);
+
+ /* The directory will need reloading. */
+ if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ afs_stat_v(dvnode, n_relpg);
+ return 1;
+}
+
+/*
+ * invalidate part or all of a page
+ * - release a page and clean up its private data if offset is 0 (indicating
+ * the entire page)
+ */
+static void afs_dir_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
+{
+ struct afs_vnode *dvnode = AFS_FS_I(page->mapping->host);
+
+ _enter("{%lu},%u,%u", page->index, offset, length);
+
+ BUG_ON(!PageLocked(page));
+
+ /* The directory will need reloading. */
+ if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ afs_stat_v(dvnode, n_inval);
+
+ /* we clean up only if the entire page is being invalidated */
+ if (offset == 0 && length == PAGE_SIZE) {
+ set_page_private(page, 0);
+ ClearPagePrivate(page);
+ }
+}
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
new file mode 100644
index 000000000000..8b400f5aead5
--- /dev/null
+++ b/fs/afs/dir_edit.c
@@ -0,0 +1,505 @@
+/* AFS filesystem directory editing
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/iversion.h>
+#include "internal.h"
+#include "xdr_fs.h"
+
+/*
+ * Find a number of contiguous clear bits in a directory block bitmask.
+ *
+ * There are 64 slots, which means we can load the entire bitmap into a
+ * variable. The first bit doesn't count as it corresponds to the block header
+ * slot. nr_slots is between 1 and 9.
+ */
+static int afs_find_contig_bits(union afs_xdr_dir_block *block, unsigned int nr_slots)
+{
+ u64 bitmap;
+ u32 mask;
+ int bit, n;
+
+ bitmap = (u64)block->hdr.bitmap[0] << 0 * 8;
+ bitmap |= (u64)block->hdr.bitmap[1] << 1 * 8;
+ bitmap |= (u64)block->hdr.bitmap[2] << 2 * 8;
+ bitmap |= (u64)block->hdr.bitmap[3] << 3 * 8;
+ bitmap |= (u64)block->hdr.bitmap[4] << 4 * 8;
+ bitmap |= (u64)block->hdr.bitmap[5] << 5 * 8;
+ bitmap |= (u64)block->hdr.bitmap[6] << 6 * 8;
+ bitmap |= (u64)block->hdr.bitmap[7] << 7 * 8;
+ bitmap >>= 1; /* The first entry is metadata */
+ bit = 1;
+ mask = (1 << nr_slots) - 1;
+
+ do {
+ if (sizeof(unsigned long) == 8)
+ n = ffz(bitmap);
+ else
+ n = ((u32)bitmap) != 0 ?
+ ffz((u32)bitmap) :
+ ffz((u32)(bitmap >> 32)) + 32;
+ bitmap >>= n;
+ bit += n;
+
+ if ((bitmap & mask) == 0) {
+ if (bit > 64 - nr_slots)
+ return -1;
+ return bit;
+ }
+
+ n = __ffs(bitmap);
+ bitmap >>= n;
+ bit += n;
+ } while (bitmap);
+
+ return -1;
+}
+
+/*
+ * Set a number of contiguous bits in the directory block bitmap.
+ */
+static void afs_set_contig_bits(union afs_xdr_dir_block *block,
+ int bit, unsigned int nr_slots)
+{
+ u64 mask, before, after;
+
+ mask = (1 << nr_slots) - 1;
+ mask <<= bit;
+
+ before = *(u64 *)block->hdr.bitmap;
+
+ block->hdr.bitmap[0] |= (u8)(mask >> 0 * 8);
+ block->hdr.bitmap[1] |= (u8)(mask >> 1 * 8);
+ block->hdr.bitmap[2] |= (u8)(mask >> 2 * 8);
+ block->hdr.bitmap[3] |= (u8)(mask >> 3 * 8);
+ block->hdr.bitmap[4] |= (u8)(mask >> 4 * 8);
+ block->hdr.bitmap[5] |= (u8)(mask >> 5 * 8);
+ block->hdr.bitmap[6] |= (u8)(mask >> 6 * 8);
+ block->hdr.bitmap[7] |= (u8)(mask >> 7 * 8);
+
+ after = *(u64 *)block->hdr.bitmap;
+}
+
+/*
+ * Clear a number of contiguous bits in the directory block bitmap.
+ */
+static void afs_clear_contig_bits(union afs_xdr_dir_block *block,
+ int bit, unsigned int nr_slots)
+{
+ u64 mask, before, after;
+
+ mask = (1 << nr_slots) - 1;
+ mask <<= bit;
+
+ before = *(u64 *)block->hdr.bitmap;
+
+ block->hdr.bitmap[0] &= ~(u8)(mask >> 0 * 8);
+ block->hdr.bitmap[1] &= ~(u8)(mask >> 1 * 8);
+ block->hdr.bitmap[2] &= ~(u8)(mask >> 2 * 8);
+ block->hdr.bitmap[3] &= ~(u8)(mask >> 3 * 8);
+ block->hdr.bitmap[4] &= ~(u8)(mask >> 4 * 8);
+ block->hdr.bitmap[5] &= ~(u8)(mask >> 5 * 8);
+ block->hdr.bitmap[6] &= ~(u8)(mask >> 6 * 8);
+ block->hdr.bitmap[7] &= ~(u8)(mask >> 7 * 8);
+
+ after = *(u64 *)block->hdr.bitmap;
+}
+
+/*
+ * Scan a directory block looking for a dirent of the right name.
+ */
+static int afs_dir_scan_block(union afs_xdr_dir_block *block, struct qstr *name,
+ unsigned int blocknum)
+{
+ union afs_xdr_dirent *de;
+ u64 bitmap;
+ int d, len, n;
+
+ _enter("");
+
+ bitmap = (u64)block->hdr.bitmap[0] << 0 * 8;
+ bitmap |= (u64)block->hdr.bitmap[1] << 1 * 8;
+ bitmap |= (u64)block->hdr.bitmap[2] << 2 * 8;
+ bitmap |= (u64)block->hdr.bitmap[3] << 3 * 8;
+ bitmap |= (u64)block->hdr.bitmap[4] << 4 * 8;
+ bitmap |= (u64)block->hdr.bitmap[5] << 5 * 8;
+ bitmap |= (u64)block->hdr.bitmap[6] << 6 * 8;
+ bitmap |= (u64)block->hdr.bitmap[7] << 7 * 8;
+
+ for (d = (blocknum == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS);
+ d < AFS_DIR_SLOTS_PER_BLOCK;
+ d++) {
+ if (!((bitmap >> d) & 1))
+ continue;
+ de = &block->dirents[d];
+ if (de->u.valid != 1)
+ continue;
+
+ /* The block was NUL-terminated by afs_dir_check_page(). */
+ len = strlen(de->u.name);
+ if (len == name->len &&
+ memcmp(de->u.name, name->name, name->len) == 0)
+ return d;
+
+ n = round_up(12 + len + 1 + 4, AFS_DIR_DIRENT_SIZE);
+ n /= AFS_DIR_DIRENT_SIZE;
+ d += n - 1;
+ }
+
+ return -1;
+}
+
+/*
+ * Initialise a new directory block. Note that block 0 is special and contains
+ * some extra metadata.
+ */
+static void afs_edit_init_block(union afs_xdr_dir_block *meta,
+ union afs_xdr_dir_block *block, int block_num)
+{
+ memset(block, 0, sizeof(*block));
+ block->hdr.npages = htons(1);
+ block->hdr.magic = AFS_DIR_MAGIC;
+ block->hdr.bitmap[0] = 1;
+
+ if (block_num == 0) {
+ block->hdr.bitmap[0] = 0xff;
+ block->hdr.bitmap[1] = 0x1f;
+ memset(block->meta.alloc_ctrs,
+ AFS_DIR_SLOTS_PER_BLOCK,
+ sizeof(block->meta.alloc_ctrs));
+ meta->meta.alloc_ctrs[0] =
+ AFS_DIR_SLOTS_PER_BLOCK - AFS_DIR_RESV_BLOCKS0;
+ }
+
+ if (block_num < AFS_DIR_BLOCKS_WITH_CTR)
+ meta->meta.alloc_ctrs[block_num] =
+ AFS_DIR_SLOTS_PER_BLOCK - AFS_DIR_RESV_BLOCKS;
+}
+
+/*
+ * Edit a directory's file data to add a new directory entry. Doing this after
+ * create, mkdir, symlink, link or rename if the data version number is
+ * incremented by exactly one avoids the need to re-download the entire
+ * directory contents.
+ *
+ * The caller must hold the inode locked.
+ */
+void afs_edit_dir_add(struct afs_vnode *vnode,
+ struct qstr *name, struct afs_fid *new_fid,
+ enum afs_edit_dir_reason why)
+{
+ union afs_xdr_dir_block *meta, *block;
+ struct afs_xdr_dir_page *meta_page, *dir_page;
+ union afs_xdr_dirent *de;
+ struct page *page0, *page;
+ unsigned int need_slots, nr_blocks, b;
+ pgoff_t index;
+ loff_t i_size;
+ gfp_t gfp;
+ int slot;
+
+ _enter(",,{%d,%s},", name->len, name->name);
+
+ i_size = i_size_read(&vnode->vfs_inode);
+ if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||
+ (i_size & (AFS_DIR_BLOCK_SIZE - 1))) {
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ return;
+ }
+
+ gfp = vnode->vfs_inode.i_mapping->gfp_mask;
+ page0 = find_or_create_page(vnode->vfs_inode.i_mapping, 0, gfp);
+ if (!page0) {
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ _leave(" [fgp]");
+ return;
+ }
+
+ /* Work out how many slots we're going to need. */
+ need_slots = round_up(12 + name->len + 1 + 4, AFS_DIR_DIRENT_SIZE);
+ need_slots /= AFS_DIR_DIRENT_SIZE;
+
+ meta_page = kmap(page0);
+ meta = &meta_page->blocks[0];
+ if (i_size == 0)
+ goto new_directory;
+ nr_blocks = i_size / AFS_DIR_BLOCK_SIZE;
+
+ /* Find a block that has sufficient slots available. Each VM page
+ * contains two or more directory blocks.
+ */
+ for (b = 0; b < nr_blocks + 1; b++) {
+ /* If the directory extended into a new page, then we need to
+ * tack a new page on the end.
+ */
+ index = b / AFS_DIR_BLOCKS_PER_PAGE;
+ if (index == 0) {
+ page = page0;
+ dir_page = meta_page;
+ } else {
+ if (nr_blocks >= AFS_DIR_MAX_BLOCKS)
+ goto error;
+ gfp = vnode->vfs_inode.i_mapping->gfp_mask;
+ page = find_or_create_page(vnode->vfs_inode.i_mapping,
+ index, gfp);
+ if (!page)
+ goto error;
+ if (!PagePrivate(page)) {
+ set_page_private(page, 1);
+ SetPagePrivate(page);
+ }
+ dir_page = kmap(page);
+ }
+
+ /* Abandon the edit if we got a callback break. */
+ if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
+ goto invalidated;
+
+ block = &dir_page->blocks[b % AFS_DIR_BLOCKS_PER_PAGE];
+
+ _debug("block %u: %2u %3u %u",
+ b,
+ (b < AFS_DIR_BLOCKS_WITH_CTR) ? meta->meta.alloc_ctrs[b] : 99,
+ ntohs(block->hdr.npages),
+ ntohs(block->hdr.magic));
+
+ /* Initialise the block if necessary. */
+ if (b == nr_blocks) {
+ _debug("init %u", b);
+ afs_edit_init_block(meta, block, b);
+ i_size_write(&vnode->vfs_inode, (b + 1) * AFS_DIR_BLOCK_SIZE);
+ }
+
+ /* Only lower dir pages have a counter in the header. */
+ if (b >= AFS_DIR_BLOCKS_WITH_CTR ||
+ meta->meta.alloc_ctrs[b] >= need_slots) {
+ /* We need to try and find one or more consecutive
+ * slots to hold the entry.
+ */
+ slot = afs_find_contig_bits(block, need_slots);
+ if (slot >= 0) {
+ _debug("slot %u", slot);
+ goto found_space;
+ }
+ }
+
+ if (page != page0) {
+ unlock_page(page);
+ kunmap(page);
+ put_page(page);
+ }
+ }
+
+ /* There are no spare slots of sufficient size, yet the operation
+ * succeeded. Download the directory again.
+ */
+ trace_afs_edit_dir(vnode, why, afs_edit_dir_create_nospc, 0, 0, 0, 0, name->name);
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ goto out_unmap;
+
+new_directory:
+ afs_edit_init_block(meta, meta, 0);
+ i_size = AFS_DIR_BLOCK_SIZE;
+ i_size_write(&vnode->vfs_inode, i_size);
+ slot = AFS_DIR_RESV_BLOCKS0;
+ page = page0;
+ block = meta;
+ nr_blocks = 1;
+ b = 0;
+
+found_space:
+ /* Set the dirent slot. */
+ trace_afs_edit_dir(vnode, why, afs_edit_dir_create, b, slot,
+ new_fid->vnode, new_fid->unique, name->name);
+ de = &block->dirents[slot];
+ de->u.valid = 1;
+ de->u.unused[0] = 0;
+ de->u.hash_next = 0; // TODO: Really need to maintain this
+ de->u.vnode = htonl(new_fid->vnode);
+ de->u.unique = htonl(new_fid->unique);
+ memcpy(de->u.name, name->name, name->len + 1);
+ de->u.name[name->len] = 0;
+
+ /* Adjust the bitmap. */
+ afs_set_contig_bits(block, slot, need_slots);
+ if (page != page0) {
+ unlock_page(page);
+ kunmap(page);
+ put_page(page);
+ }
+
+ /* Adjust the allocation counter. */
+ if (b < AFS_DIR_BLOCKS_WITH_CTR)
+ meta->meta.alloc_ctrs[b] -= need_slots;
+
+ inode_inc_iversion_raw(&vnode->vfs_inode);
+ afs_stat_v(vnode, n_dir_cr);
+ _debug("Insert %s in %u[%u]", name->name, b, slot);
+
+out_unmap:
+ unlock_page(page0);
+ kunmap(page0);
+ put_page(page0);
+ _leave("");
+ return;
+
+invalidated:
+ trace_afs_edit_dir(vnode, why, afs_edit_dir_create_inval, 0, 0, 0, 0, name->name);
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ if (page != page0) {
+ kunmap(page);
+ put_page(page);
+ }
+ goto out_unmap;
+
+error:
+ trace_afs_edit_dir(vnode, why, afs_edit_dir_create_error, 0, 0, 0, 0, name->name);
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ goto out_unmap;
+}
+
+/*
+ * Edit a directory's file data to remove a new directory entry. Doing this
+ * after unlink, rmdir or rename if the data version number is incremented by
+ * exactly one avoids the need to re-download the entire directory contents.
+ *
+ * The caller must hold the inode locked.
+ */
+void afs_edit_dir_remove(struct afs_vnode *vnode,
+ struct qstr *name, enum afs_edit_dir_reason why)
+{
+ struct afs_xdr_dir_page *meta_page, *dir_page;
+ union afs_xdr_dir_block *meta, *block;
+ union afs_xdr_dirent *de;
+ struct page *page0, *page;
+ unsigned int need_slots, nr_blocks, b;
+ pgoff_t index;
+ loff_t i_size;
+ int slot;
+
+ _enter(",,{%d,%s},", name->len, name->name);
+
+ i_size = i_size_read(&vnode->vfs_inode);
+ if (i_size < AFS_DIR_BLOCK_SIZE ||
+ i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||
+ (i_size & (AFS_DIR_BLOCK_SIZE - 1))) {
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ return;
+ }
+ nr_blocks = i_size / AFS_DIR_BLOCK_SIZE;
+
+ page0 = find_lock_page(vnode->vfs_inode.i_mapping, 0);
+ if (!page0) {
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ _leave(" [fgp]");
+ return;
+ }
+
+ /* Work out how many slots we're going to discard. */
+ need_slots = round_up(12 + name->len + 1 + 4, AFS_DIR_DIRENT_SIZE);
+ need_slots /= AFS_DIR_DIRENT_SIZE;
+
+ meta_page = kmap(page0);
+ meta = &meta_page->blocks[0];
+
+ /* Find a page that has sufficient slots available. Each VM page
+ * contains two or more directory blocks.
+ */
+ for (b = 0; b < nr_blocks; b++) {
+ index = b / AFS_DIR_BLOCKS_PER_PAGE;
+ if (index != 0) {
+ page = find_lock_page(vnode->vfs_inode.i_mapping, index);
+ if (!page)
+ goto error;
+ dir_page = kmap(page);
+ } else {
+ page = page0;
+ dir_page = meta_page;
+ }
+
+ /* Abandon the edit if we got a callback break. */
+ if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
+ goto invalidated;
+
+ block = &dir_page->blocks[b % AFS_DIR_BLOCKS_PER_PAGE];
+
+ if (b > AFS_DIR_BLOCKS_WITH_CTR ||
+ meta->meta.alloc_ctrs[b] <= AFS_DIR_SLOTS_PER_BLOCK - 1 - need_slots) {
+ slot = afs_dir_scan_block(block, name, b);
+ if (slot >= 0)
+ goto found_dirent;
+ }
+
+ if (page != page0) {
+ unlock_page(page);
+ kunmap(page);
+ put_page(page);
+ }
+ }
+
+ /* Didn't find the dirent to clobber. Download the directory again. */
+ trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_noent,
+ 0, 0, 0, 0, name->name);
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ goto out_unmap;
+
+found_dirent:
+ de = &block->dirents[slot];
+
+ trace_afs_edit_dir(vnode, why, afs_edit_dir_delete, b, slot,
+ ntohl(de->u.vnode), ntohl(de->u.unique),
+ name->name);
+
+ memset(de, 0, sizeof(*de) * need_slots);
+
+ /* Adjust the bitmap. */
+ afs_clear_contig_bits(block, slot, need_slots);
+ if (page != page0) {
+ unlock_page(page);
+ kunmap(page);
+ put_page(page);
+ }
+
+ /* Adjust the allocation counter. */
+ if (b < AFS_DIR_BLOCKS_WITH_CTR)
+ meta->meta.alloc_ctrs[b] += need_slots;
+
+ inode_set_iversion_raw(&vnode->vfs_inode, vnode->status.data_version);
+ afs_stat_v(vnode, n_dir_rm);
+ _debug("Remove %s from %u[%u]", name->name, b, slot);
+
+out_unmap:
+ unlock_page(page0);
+ kunmap(page0);
+ put_page(page0);
+ _leave("");
+ return;
+
+invalidated:
+ trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_inval,
+ 0, 0, 0, 0, name->name);
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ if (page != page0) {
+ unlock_page(page);
+ kunmap(page);
+ put_page(page);
+ }
+ goto out_unmap;
+
+error:
+ trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_error,
+ 0, 0, 0, 0, name->name);
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ goto out_unmap;
+}
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
new file mode 100644
index 000000000000..983f3946ab57
--- /dev/null
+++ b/fs/afs/dynroot.c
@@ -0,0 +1,209 @@
+/* dir.c: AFS dynamic root handling
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/dns_resolver.h>
+#include "internal.h"
+
+const struct file_operations afs_dynroot_file_operations = {
+ .open = dcache_dir_open,
+ .release = dcache_dir_close,
+ .iterate_shared = dcache_readdir,
+ .llseek = dcache_dir_lseek,
+};
+
+/*
+ * Probe to see if a cell may exist. This prevents positive dentries from
+ * being created unnecessarily.
+ */
+static int afs_probe_cell_name(struct dentry *dentry)
+{
+ struct afs_cell *cell;
+ const char *name = dentry->d_name.name;
+ size_t len = dentry->d_name.len;
+ int ret;
+
+ /* Names prefixed with a dot are R/W mounts. */
+ if (name[0] == '.') {
+ if (len == 1)
+ return -EINVAL;
+ name++;
+ len--;
+ }
+
+ cell = afs_lookup_cell_rcu(afs_d2net(dentry), name, len);
+ if (!IS_ERR(cell)) {
+ afs_put_cell(afs_d2net(dentry), cell);
+ return 0;
+ }
+
+ ret = dns_query("afsdb", name, len, "ipv4", NULL, NULL);
+ if (ret == -ENODATA)
+ ret = -EDESTADDRREQ;
+ return ret;
+}
+
+/*
+ * Try to auto mount the mountpoint with pseudo directory, if the autocell
+ * operation is setted.
+ */
+struct inode *afs_try_auto_mntpt(struct dentry *dentry, struct inode *dir)
+{
+ struct afs_vnode *vnode = AFS_FS_I(dir);
+ struct inode *inode;
+ int ret = -ENOENT;
+
+ _enter("%p{%pd}, {%x:%u}",
+ dentry, dentry, vnode->fid.vid, vnode->fid.vnode);
+
+ if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
+ goto out;
+
+ ret = afs_probe_cell_name(dentry);
+ if (ret < 0)
+ goto out;
+
+ inode = afs_iget_pseudo_dir(dir->i_sb, false);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ goto out;
+ }
+
+ _leave("= %p", inode);
+ return inode;
+
+out:
+ _leave("= %d", ret);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Look up @cell in a dynroot directory. This is a substitution for the
+ * local cell name for the net namespace.
+ */
+static struct dentry *afs_lookup_atcell(struct dentry *dentry)
+{
+ struct afs_cell *cell;
+ struct afs_net *net = afs_d2net(dentry);
+ struct dentry *ret;
+ unsigned int seq = 0;
+ char *name;
+ int len;
+
+ if (!net->ws_cell)
+ return ERR_PTR(-ENOENT);
+
+ ret = ERR_PTR(-ENOMEM);
+ name = kmalloc(AFS_MAXCELLNAME + 1, GFP_KERNEL);
+ if (!name)
+ goto out_p;
+
+ rcu_read_lock();
+ do {
+ read_seqbegin_or_lock(&net->cells_lock, &seq);
+ cell = rcu_dereference_raw(net->ws_cell);
+ if (cell) {
+ len = cell->name_len;
+ memcpy(name, cell->name, len + 1);
+ }
+ } while (need_seqretry(&net->cells_lock, seq));
+ done_seqretry(&net->cells_lock, seq);
+ rcu_read_unlock();
+
+ ret = ERR_PTR(-ENOENT);
+ if (!cell)
+ goto out_n;
+
+ ret = lookup_one_len(name, dentry->d_parent, len);
+
+ /* We don't want to d_add() the @cell dentry here as we don't want to
+ * the cached dentry to hide changes to the local cell name.
+ */
+
+out_n:
+ kfree(name);
+out_p:
+ return ret;
+}
+
+/*
+ * Look up an entry in a dynroot directory.
+ */
+static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct afs_vnode *vnode;
+ struct inode *inode;
+ int ret;
+
+ vnode = AFS_FS_I(dir);
+
+ _enter("%pd", dentry);
+
+ ASSERTCMP(d_inode(dentry), ==, NULL);
+
+ if (dentry->d_name.len >= AFSNAMEMAX) {
+ _leave(" = -ENAMETOOLONG");
+ return ERR_PTR(-ENAMETOOLONG);
+ }
+
+ if (dentry->d_name.len == 5 &&
+ memcmp(dentry->d_name.name, "@cell", 5) == 0)
+ return afs_lookup_atcell(dentry);
+
+ inode = afs_try_auto_mntpt(dentry, dir);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ if (ret == -ENOENT) {
+ d_add(dentry, NULL);
+ _leave(" = NULL [negative]");
+ return NULL;
+ }
+ _leave(" = %d [do]", ret);
+ return ERR_PTR(ret);
+ }
+
+ d_add(dentry, inode);
+ _leave(" = 0 { ino=%lu v=%u }",
+ d_inode(dentry)->i_ino, d_inode(dentry)->i_generation);
+ return NULL;
+}
+
+const struct inode_operations afs_dynroot_inode_operations = {
+ .lookup = afs_dynroot_lookup,
+};
+
+/*
+ * Dirs in the dynamic root don't need revalidation.
+ */
+static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return 1;
+}
+
+/*
+ * Allow the VFS to enquire as to whether a dentry should be unhashed (mustn't
+ * sleep)
+ * - called from dput() when d_count is going to 0.
+ * - return 1 to request dentry be unhashed, 0 otherwise
+ */
+static int afs_dynroot_d_delete(const struct dentry *dentry)
+{
+ return d_really_is_positive(dentry);
+}
+
+const struct dentry_operations afs_dynroot_dentry_operations = {
+ .d_revalidate = afs_dynroot_d_revalidate,
+ .d_delete = afs_dynroot_d_delete,
+ .d_release = afs_d_release,
+ .d_automount = afs_d_automount,
+};
diff --git a/fs/afs/file.c b/fs/afs/file.c
index a39192ced99e..7d4f26198573 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -30,7 +30,6 @@ static int afs_readpages(struct file *filp, struct address_space *mapping,
const struct file_operations afs_file_operations = {
.open = afs_open,
- .flush = afs_flush,
.release = afs_release,
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
@@ -146,6 +145,9 @@ int afs_open(struct inode *inode, struct file *file)
if (ret < 0)
goto error_af;
}
+
+ if (file->f_flags & O_TRUNC)
+ set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
file->private_data = af;
_leave(" = 0");
@@ -170,6 +172,9 @@ int afs_release(struct inode *inode, struct file *file)
_enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
+ if ((file->f_mode & FMODE_WRITE))
+ return vfs_fsync(file, 0);
+
file->private_data = NULL;
if (af->wb)
afs_put_wb_key(af->wb);
@@ -187,10 +192,12 @@ void afs_put_read(struct afs_read *req)
{
int i;
- if (atomic_dec_and_test(&req->usage)) {
+ if (refcount_dec_and_test(&req->usage)) {
for (i = 0; i < req->nr_pages; i++)
if (req->pages[i])
put_page(req->pages[i]);
+ if (req->pages != req->array)
+ kfree(req->pages);
kfree(req);
}
}
@@ -231,7 +238,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, vnode, key)) {
while (afs_select_fileserver(&fc)) {
- fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+ fc.cb_break = afs_calc_vnode_cb_break(vnode);
afs_fs_fetch_data(&fc, desc);
}
@@ -240,6 +247,12 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de
ret = afs_end_vnode_operation(&fc);
}
+ if (ret == 0) {
+ afs_stat_v(vnode, n_fetches);
+ atomic_long_add(desc->actual_len,
+ &afs_v2net(vnode)->n_fetch_bytes);
+ }
+
_leave(" = %d", ret);
return ret;
}
@@ -297,10 +310,11 @@ int afs_page_filler(void *data, struct page *page)
* end of the file, the server will return a short read and the
* unmarshalling code will clear the unfilled space.
*/
- atomic_set(&req->usage, 1);
+ refcount_set(&req->usage, 1);
req->pos = (loff_t)page->index << PAGE_SHIFT;
req->len = PAGE_SIZE;
req->nr_pages = 1;
+ req->pages = req->array;
req->pages[0] = page;
get_page(page);
@@ -309,10 +323,6 @@ int afs_page_filler(void *data, struct page *page)
ret = afs_fetch_data(vnode, key, req);
afs_put_read(req);
- if (ret >= 0 && S_ISDIR(inode->i_mode) &&
- !afs_dir_check_page(inode, page))
- ret = -EIO;
-
if (ret < 0) {
if (ret == -ENOENT) {
_debug("got NOENT from server"
@@ -339,7 +349,8 @@ int afs_page_filler(void *data, struct page *page)
/* send the page to the cache */
#ifdef CONFIG_AFS_FSCACHE
if (PageFsCache(page) &&
- fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) {
+ fscache_write_page(vnode->cache, page, vnode->status.size,
+ GFP_KERNEL) != 0) {
fscache_uncache_page(vnode->cache, page);
BUG_ON(PageFsCache(page));
}
@@ -403,7 +414,8 @@ static void afs_readpages_page_done(struct afs_call *call, struct afs_read *req)
/* send the page to the cache */
#ifdef CONFIG_AFS_FSCACHE
if (PageFsCache(page) &&
- fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) {
+ fscache_write_page(vnode->cache, page, vnode->status.size,
+ GFP_KERNEL) != 0) {
fscache_uncache_page(vnode->cache, page);
BUG_ON(PageFsCache(page));
}
@@ -445,10 +457,11 @@ static int afs_readpages_one(struct file *file, struct address_space *mapping,
if (!req)
return -ENOMEM;
- atomic_set(&req->usage, 1);
+ refcount_set(&req->usage, 1);
req->page_done = afs_readpages_page_done;
req->pos = first->index;
req->pos <<= PAGE_SHIFT;
+ req->pages = req->array;
/* Transfer the pages to the request. We add them in until one fails
* to add to the LRU and then we stop (as that'll make a hole in the
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index c40ba2fe3cbe..dc62d15a964b 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -86,7 +86,7 @@ static int afs_set_lock(struct afs_vnode *vnode, struct key *key,
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, vnode, key)) {
while (afs_select_fileserver(&fc)) {
- fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+ fc.cb_break = afs_calc_vnode_cb_break(vnode);
afs_fs_set_lock(&fc, type);
}
@@ -117,7 +117,7 @@ static int afs_extend_lock(struct afs_vnode *vnode, struct key *key)
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, vnode, key)) {
while (afs_select_current_fileserver(&fc)) {
- fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+ fc.cb_break = afs_calc_vnode_cb_break(vnode);
afs_fs_extend_lock(&fc);
}
@@ -148,7 +148,7 @@ static int afs_release_lock(struct afs_vnode *vnode, struct key *key)
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, vnode, key)) {
while (afs_select_current_fileserver(&fc)) {
- fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+ fc.cb_break = afs_calc_vnode_cb_break(vnode);
afs_fs_release_lock(&fc);
}
@@ -613,7 +613,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
posix_test_lock(file, fl);
if (fl->fl_type == F_UNLCK) {
/* no local locks; consult the server */
- ret = afs_fetch_status(vnode, key);
+ ret = afs_fetch_status(vnode, key, false);
if (ret < 0)
goto error;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 88ec38c2d83c..b273e1d60478 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -16,6 +16,7 @@
#include <linux/iversion.h>
#include "internal.h"
#include "afs_fs.h"
+#include "xdr_fs.h"
static const struct afs_fid afs_zero_fid;
@@ -44,109 +45,207 @@ static void xdr_decode_AFSFid(const __be32 **_bp, struct afs_fid *fid)
}
/*
- * decode an AFSFetchStatus block
+ * Dump a bad file status record.
*/
-static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
- struct afs_file_status *status,
- struct afs_vnode *vnode,
- afs_dataversion_t *store_version)
+static void xdr_dump_bad(const __be32 *bp)
{
- afs_dataversion_t expected_version;
- const __be32 *bp = *_bp;
+ __be32 x[4];
+ int i;
+
+ pr_notice("AFS XDR: Bad status record\n");
+ for (i = 0; i < 5 * 4 * 4; i += 16) {
+ memcpy(x, bp, 16);
+ bp += 4;
+ pr_notice("%03x: %08x %08x %08x %08x\n",
+ i, ntohl(x[0]), ntohl(x[1]), ntohl(x[2]), ntohl(x[3]));
+ }
+
+ memcpy(x, bp, 4);
+ pr_notice("0x50: %08x\n", ntohl(x[0]));
+}
+
+/*
+ * Update the core inode struct from a returned status record.
+ */
+void afs_update_inode_from_status(struct afs_vnode *vnode,
+ struct afs_file_status *status,
+ const afs_dataversion_t *expected_version,
+ u8 flags)
+{
+ struct timespec t;
umode_t mode;
+
+ t.tv_sec = status->mtime_client;
+ t.tv_nsec = 0;
+ vnode->vfs_inode.i_ctime = t;
+ vnode->vfs_inode.i_mtime = t;
+ vnode->vfs_inode.i_atime = t;
+
+ if (flags & (AFS_VNODE_META_CHANGED | AFS_VNODE_NOT_YET_SET)) {
+ vnode->vfs_inode.i_uid = make_kuid(&init_user_ns, status->owner);
+ vnode->vfs_inode.i_gid = make_kgid(&init_user_ns, status->group);
+ set_nlink(&vnode->vfs_inode, status->nlink);
+
+ mode = vnode->vfs_inode.i_mode;
+ mode &= ~S_IALLUGO;
+ mode |= status->mode;
+ barrier();
+ vnode->vfs_inode.i_mode = mode;
+ }
+
+ if (!(flags & AFS_VNODE_NOT_YET_SET)) {
+ if (expected_version &&
+ *expected_version != status->data_version) {
+ _debug("vnode modified %llx on {%x:%u} [exp %llx]",
+ (unsigned long long) status->data_version,
+ vnode->fid.vid, vnode->fid.vnode,
+ (unsigned long long) *expected_version);
+ vnode->invalid_before = status->data_version;
+ if (vnode->status.type == AFS_FTYPE_DIR) {
+ if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
+ afs_stat_v(vnode, n_inval);
+ } else {
+ set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
+ }
+ } else if (vnode->status.type == AFS_FTYPE_DIR) {
+ /* Expected directory change is handled elsewhere so
+ * that we can locally edit the directory and save on a
+ * download.
+ */
+ if (test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
+ flags &= ~AFS_VNODE_DATA_CHANGED;
+ }
+ }
+
+ if (flags & (AFS_VNODE_DATA_CHANGED | AFS_VNODE_NOT_YET_SET)) {
+ inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
+ i_size_write(&vnode->vfs_inode, status->size);
+ }
+}
+
+/*
+ * decode an AFSFetchStatus block
+ */
+static int xdr_decode_AFSFetchStatus(struct afs_call *call,
+ const __be32 **_bp,
+ struct afs_file_status *status,
+ struct afs_vnode *vnode,
+ const afs_dataversion_t *expected_version,
+ struct afs_read *read_req)
+{
+ const struct afs_xdr_AFSFetchStatus *xdr = (const void *)*_bp;
+ bool inline_error = (call->operation_ID == afs_FS_InlineBulkStatus);
u64 data_version, size;
- bool changed = false;
- kuid_t owner;
- kgid_t group;
+ u32 type, abort_code;
+ u8 flags = 0;
+ int ret;
if (vnode)
write_seqlock(&vnode->cb_lock);
-#define EXTRACT(DST) \
- do { \
- u32 x = ntohl(*bp++); \
- if (DST != x) \
- changed |= true; \
- DST = x; \
- } while (0)
-
- status->if_version = ntohl(*bp++);
- EXTRACT(status->type);
- EXTRACT(status->nlink);
- size = ntohl(*bp++);
- data_version = ntohl(*bp++);
- EXTRACT(status->author);
- owner = make_kuid(&init_user_ns, ntohl(*bp++));
- changed |= !uid_eq(owner, status->owner);
- status->owner = owner;
- EXTRACT(status->caller_access); /* call ticket dependent */
- EXTRACT(status->anon_access);
- EXTRACT(status->mode);
- bp++; /* parent.vnode */
- bp++; /* parent.unique */
- bp++; /* seg size */
- status->mtime_client = ntohl(*bp++);
- status->mtime_server = ntohl(*bp++);
- group = make_kgid(&init_user_ns, ntohl(*bp++));
- changed |= !gid_eq(group, status->group);
- status->group = group;
- bp++; /* sync counter */
- data_version |= (u64) ntohl(*bp++) << 32;
- EXTRACT(status->lock_count);
- size |= (u64) ntohl(*bp++) << 32;
- bp++; /* spare 4 */
- *_bp = bp;
+ abort_code = ntohl(xdr->abort_code);
+
+ if (xdr->if_version != htonl(AFS_FSTATUS_VERSION)) {
+ if (xdr->if_version == htonl(0) &&
+ abort_code != 0 &&
+ inline_error) {
+ /* The OpenAFS fileserver has a bug in FS.InlineBulkStatus
+ * whereby it doesn't set the interface version in the error
+ * case.
+ */
+ status->abort_code = abort_code;
+ ret = 0;
+ goto out;
+ }
- if (size != status->size) {
- status->size = size;
- changed |= true;
+ pr_warn("Unknown AFSFetchStatus version %u\n", ntohl(xdr->if_version));
+ goto bad;
}
- status->mode &= S_IALLUGO;
- _debug("vnode time %lx, %lx",
- status->mtime_client, status->mtime_server);
+ if (abort_code != 0 && inline_error) {
+ status->abort_code = abort_code;
+ ret = 0;
+ goto out;
+ }
- if (vnode) {
- if (changed && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
- _debug("vnode changed");
- i_size_write(&vnode->vfs_inode, size);
- vnode->vfs_inode.i_uid = status->owner;
- vnode->vfs_inode.i_gid = status->group;
- vnode->vfs_inode.i_generation = vnode->fid.unique;
- set_nlink(&vnode->vfs_inode, status->nlink);
-
- mode = vnode->vfs_inode.i_mode;
- mode &= ~S_IALLUGO;
- mode |= status->mode;
- barrier();
- vnode->vfs_inode.i_mode = mode;
+ type = ntohl(xdr->type);
+ switch (type) {
+ case AFS_FTYPE_FILE:
+ case AFS_FTYPE_DIR:
+ case AFS_FTYPE_SYMLINK:
+ if (type != status->type &&
+ vnode &&
+ !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
+ pr_warning("Vnode %x:%x:%x changed type %u to %u\n",
+ vnode->fid.vid,
+ vnode->fid.vnode,
+ vnode->fid.unique,
+ status->type, type);
+ goto bad;
}
-
- vnode->vfs_inode.i_ctime.tv_sec = status->mtime_client;
- vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime;
- vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime;
- inode_set_iversion_raw(&vnode->vfs_inode, data_version);
+ status->type = type;
+ break;
+ default:
+ goto bad;
}
- expected_version = status->data_version;
- if (store_version)
- expected_version = *store_version;
+#define EXTRACT_M(FIELD) \
+ do { \
+ u32 x = ntohl(xdr->FIELD); \
+ if (status->FIELD != x) { \
+ flags |= AFS_VNODE_META_CHANGED; \
+ status->FIELD = x; \
+ } \
+ } while (0)
- if (expected_version != data_version) {
- status->data_version = data_version;
- if (vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
- _debug("vnode modified %llx on {%x:%u}",
- (unsigned long long) data_version,
- vnode->fid.vid, vnode->fid.vnode);
- set_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags);
- set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
- }
- } else if (store_version) {
+ EXTRACT_M(nlink);
+ EXTRACT_M(author);
+ EXTRACT_M(owner);
+ EXTRACT_M(caller_access); /* call ticket dependent */
+ EXTRACT_M(anon_access);
+ EXTRACT_M(mode);
+ EXTRACT_M(group);
+
+ status->mtime_client = ntohl(xdr->mtime_client);
+ status->mtime_server = ntohl(xdr->mtime_server);
+ status->lock_count = ntohl(xdr->lock_count);
+
+ size = (u64)ntohl(xdr->size_lo);
+ size |= (u64)ntohl(xdr->size_hi) << 32;
+ status->size = size;
+
+ data_version = (u64)ntohl(xdr->data_version_lo);
+ data_version |= (u64)ntohl(xdr->data_version_hi) << 32;
+ if (data_version != status->data_version) {
status->data_version = data_version;
+ flags |= AFS_VNODE_DATA_CHANGED;
+ }
+
+ if (read_req) {
+ read_req->data_version = data_version;
+ read_req->file_size = size;
+ }
+
+ *_bp = (const void *)*_bp + sizeof(*xdr);
+
+ if (vnode) {
+ if (test_bit(AFS_VNODE_UNSET, &vnode->flags))
+ flags |= AFS_VNODE_NOT_YET_SET;
+ afs_update_inode_from_status(vnode, status, expected_version,
+ flags);
}
+ ret = 0;
+
+out:
if (vnode)
write_sequnlock(&vnode->cb_lock);
+ return ret;
+
+bad:
+ xdr_dump_bad(*_bp);
+ ret = afs_protocol_error(call, -EBADMSG);
+ goto out;
}
/*
@@ -162,7 +261,7 @@ static void xdr_decode_AFSCallBack(struct afs_call *call,
write_seqlock(&vnode->cb_lock);
- if (call->cb_break == (vnode->cb_break + cbi->server->cb_s_break)) {
+ if (call->cb_break == afs_cb_break_sum(vnode, cbi)) {
vnode->cb_version = ntohl(*bp++);
cb_expiry = ntohl(*bp++);
vnode->cb_type = ntohl(*bp++);
@@ -274,7 +373,7 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp,
/*
* deliver reply data to an FS.FetchStatus
*/
-static int afs_deliver_fs_fetch_status(struct afs_call *call)
+static int afs_deliver_fs_fetch_status_vnode(struct afs_call *call)
{
struct afs_vnode *vnode = call->reply[0];
const __be32 *bp;
@@ -288,7 +387,9 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
+ if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL) < 0)
+ return afs_protocol_error(call, -EBADMSG);
xdr_decode_AFSCallBack(call, vnode, &bp);
if (call->reply[1])
xdr_decode_AFSVolSync(&bp, call->reply[1]);
@@ -300,17 +401,18 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call)
/*
* FS.FetchStatus operation type
*/
-static const struct afs_call_type afs_RXFSFetchStatus = {
- .name = "FS.FetchStatus",
+static const struct afs_call_type afs_RXFSFetchStatus_vnode = {
+ .name = "FS.FetchStatus(vnode)",
.op = afs_FS_FetchStatus,
- .deliver = afs_deliver_fs_fetch_status,
+ .deliver = afs_deliver_fs_fetch_status_vnode,
.destructor = afs_flat_call_destructor,
};
/*
* fetch the status information for a file
*/
-int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsync)
+int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsync,
+ bool new_inode)
{
struct afs_vnode *vnode = fc->vnode;
struct afs_call *call;
@@ -320,7 +422,8 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy
_enter(",%x,{%x:%u},,",
key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
- call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
+ call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus_vnode,
+ 16, (21 + 3 + 6) * 4);
if (!call) {
fc->ac.error = -ENOMEM;
return -ENOMEM;
@@ -329,6 +432,7 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy
call->key = fc->key;
call->reply[0] = vnode;
call->reply[1] = volsync;
+ call->expected_version = new_inode ? 1 : vnode->status.data_version;
/* marshall the parameters */
bp = call->request;
@@ -464,7 +568,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
return ret;
bp = call->buffer;
- xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
+ if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode,
+ &vnode->status.data_version, req) < 0)
+ return afs_protocol_error(call, -EBADMSG);
xdr_decode_AFSCallBack(call, vnode, &bp);
if (call->reply[1])
xdr_decode_AFSVolSync(&bp, call->reply[1]);
@@ -534,6 +640,7 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req)
call->reply[0] = vnode;
call->reply[1] = NULL; /* volsync */
call->reply[2] = req;
+ call->expected_version = vnode->status.data_version;
/* marshall the parameters */
bp = call->request;
@@ -546,7 +653,7 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req)
bp[6] = 0;
bp[7] = htonl(lower_32_bits(req->len));
- atomic_inc(&req->usage);
+ refcount_inc(&req->usage);
call->cb_break = fc->cb_break;
afs_use_fs_server(call, fc->cbi);
trace_afs_make_fs_call(call, &vnode->fid);
@@ -578,6 +685,7 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
call->reply[0] = vnode;
call->reply[1] = NULL; /* volsync */
call->reply[2] = req;
+ call->expected_version = vnode->status.data_version;
/* marshall the parameters */
bp = call->request;
@@ -588,7 +696,7 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
bp[4] = htonl(lower_32_bits(req->pos));
bp[5] = htonl(lower_32_bits(req->len));
- atomic_inc(&req->usage);
+ refcount_inc(&req->usage);
call->cb_break = fc->cb_break;
afs_use_fs_server(call, fc->cbi);
trace_afs_make_fs_call(call, &vnode->fid);
@@ -613,8 +721,10 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
xdr_decode_AFSFid(&bp, call->reply[1]);
- xdr_decode_AFSFetchStatus(&bp, call->reply[2], NULL, NULL);
- xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
+ if (xdr_decode_AFSFetchStatus(call, &bp, call->reply[2], NULL, NULL, NULL) < 0 ||
+ xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL) < 0)
+ return afs_protocol_error(call, -EBADMSG);
xdr_decode_AFSCallBack_raw(&bp, call->reply[3]);
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
@@ -645,6 +755,7 @@ static const struct afs_call_type afs_RXFSMakeDir = {
int afs_fs_create(struct afs_fs_cursor *fc,
const char *name,
umode_t mode,
+ u64 current_data_version,
struct afs_fid *newfid,
struct afs_file_status *newstatus,
struct afs_callback *newcb)
@@ -672,6 +783,7 @@ int afs_fs_create(struct afs_fs_cursor *fc,
call->reply[1] = newfid;
call->reply[2] = newstatus;
call->reply[3] = newcb;
+ call->expected_version = current_data_version + 1;
/* marshall the parameters */
bp = call->request;
@@ -715,7 +827,9 @@ static int afs_deliver_fs_remove(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
+ if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL) < 0)
+ return afs_protocol_error(call, -EBADMSG);
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
_leave(" = 0 [done]");
@@ -742,7 +856,8 @@ static const struct afs_call_type afs_RXFSRemoveDir = {
/*
* remove a file or directory
*/
-int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir)
+int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir,
+ u64 current_data_version)
{
struct afs_vnode *vnode = fc->vnode;
struct afs_call *call;
@@ -764,6 +879,7 @@ int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir)
call->key = fc->key;
call->reply[0] = vnode;
+ call->expected_version = current_data_version + 1;
/* marshall the parameters */
bp = call->request;
@@ -801,8 +917,10 @@ static int afs_deliver_fs_link(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
- xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode, NULL);
+ if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, NULL, NULL) < 0 ||
+ xdr_decode_AFSFetchStatus(call, &bp, &dvnode->status, dvnode,
+ &call->expected_version, NULL) < 0)
+ return afs_protocol_error(call, -EBADMSG);
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
_leave(" = 0 [done]");
@@ -823,7 +941,7 @@ static const struct afs_call_type afs_RXFSLink = {
* make a hard link
*/
int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
- const char *name)
+ const char *name, u64 current_data_version)
{
struct afs_vnode *dvnode = fc->vnode;
struct afs_call *call;
@@ -844,6 +962,7 @@ int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
call->key = fc->key;
call->reply[0] = dvnode;
call->reply[1] = vnode;
+ call->expected_version = current_data_version + 1;
/* marshall the parameters */
bp = call->request;
@@ -885,8 +1004,10 @@ static int afs_deliver_fs_symlink(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
xdr_decode_AFSFid(&bp, call->reply[1]);
- xdr_decode_AFSFetchStatus(&bp, call->reply[2], NULL, NULL);
- xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
+ if (xdr_decode_AFSFetchStatus(call, &bp, call->reply[2], NULL, NULL, NULL) ||
+ xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL) < 0)
+ return afs_protocol_error(call, -EBADMSG);
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
_leave(" = 0 [done]");
@@ -909,6 +1030,7 @@ static const struct afs_call_type afs_RXFSSymlink = {
int afs_fs_symlink(struct afs_fs_cursor *fc,
const char *name,
const char *contents,
+ u64 current_data_version,
struct afs_fid *newfid,
struct afs_file_status *newstatus)
{
@@ -937,6 +1059,7 @@ int afs_fs_symlink(struct afs_fs_cursor *fc,
call->reply[0] = vnode;
call->reply[1] = newfid;
call->reply[2] = newstatus;
+ call->expected_version = current_data_version + 1;
/* marshall the parameters */
bp = call->request;
@@ -987,10 +1110,13 @@ static int afs_deliver_fs_rename(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode, NULL);
- if (new_dvnode != orig_dvnode)
- xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode,
- NULL);
+ if (xdr_decode_AFSFetchStatus(call, &bp, &orig_dvnode->status, orig_dvnode,
+ &call->expected_version, NULL) < 0)
+ return afs_protocol_error(call, -EBADMSG);
+ if (new_dvnode != orig_dvnode &&
+ xdr_decode_AFSFetchStatus(call, &bp, &new_dvnode->status, new_dvnode,
+ &call->expected_version_2, NULL) < 0)
+ return afs_protocol_error(call, -EBADMSG);
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
_leave(" = 0 [done]");
@@ -1013,7 +1139,9 @@ static const struct afs_call_type afs_RXFSRename = {
int afs_fs_rename(struct afs_fs_cursor *fc,
const char *orig_name,
struct afs_vnode *new_dvnode,
- const char *new_name)
+ const char *new_name,
+ u64 current_orig_data_version,
+ u64 current_new_data_version)
{
struct afs_vnode *orig_dvnode = fc->vnode;
struct afs_call *call;
@@ -1041,6 +1169,8 @@ int afs_fs_rename(struct afs_fs_cursor *fc,
call->key = fc->key;
call->reply[0] = orig_dvnode;
call->reply[1] = new_dvnode;
+ call->expected_version = current_orig_data_version + 1;
+ call->expected_version_2 = current_new_data_version + 1;
/* marshall the parameters */
bp = call->request;
@@ -1089,8 +1219,9 @@ static int afs_deliver_fs_store_data(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode,
- &call->store_version);
+ if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL) < 0)
+ return afs_protocol_error(call, -EBADMSG);
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
afs_pages_written_back(vnode, call);
@@ -1147,7 +1278,7 @@ static int afs_fs_store_data64(struct afs_fs_cursor *fc,
call->first_offset = offset;
call->last_to = to;
call->send_pages = true;
- call->store_version = vnode->status.data_version + 1;
+ call->expected_version = vnode->status.data_version + 1;
/* marshall the parameters */
bp = call->request;
@@ -1222,7 +1353,7 @@ int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
call->first_offset = offset;
call->last_to = to;
call->send_pages = true;
- call->store_version = vnode->status.data_version + 1;
+ call->expected_version = vnode->status.data_version + 1;
/* marshall the parameters */
bp = call->request;
@@ -1252,7 +1383,6 @@ int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
*/
static int afs_deliver_fs_store_status(struct afs_call *call)
{
- afs_dataversion_t *store_version;
struct afs_vnode *vnode = call->reply[0];
const __be32 *bp;
int ret;
@@ -1264,12 +1394,10 @@ static int afs_deliver_fs_store_status(struct afs_call *call)
return ret;
/* unmarshall the reply once we've received all of it */
- store_version = NULL;
- if (call->operation_ID == FSSTOREDATA)
- store_version = &call->store_version;
-
bp = call->buffer;
- xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, store_version);
+ if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode,
+ &call->expected_version, NULL) < 0)
+ return afs_protocol_error(call, -EBADMSG);
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
_leave(" = 0 [done]");
@@ -1324,7 +1452,7 @@ static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr)
call->key = fc->key;
call->reply[0] = vnode;
- call->store_version = vnode->status.data_version + 1;
+ call->expected_version = vnode->status.data_version + 1;
/* marshall the parameters */
bp = call->request;
@@ -1373,7 +1501,7 @@ static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
call->key = fc->key;
call->reply[0] = vnode;
- call->store_version = vnode->status.data_version + 1;
+ call->expected_version = vnode->status.data_version + 1;
/* marshall the parameters */
bp = call->request;
@@ -1418,6 +1546,7 @@ int afs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr)
call->key = fc->key;
call->reply[0] = vnode;
+ call->expected_version = vnode->status.data_version;
/* marshall the parameters */
bp = call->request;
@@ -1471,7 +1600,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
call->count = ntohl(call->tmp);
_debug("volname length: %u", call->count);
if (call->count >= AFSNAMEMAX)
- return -EBADMSG;
+ return afs_protocol_error(call, -EBADMSG);
call->offset = 0;
call->unmarshall++;
@@ -1518,7 +1647,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
call->count = ntohl(call->tmp);
_debug("offline msg length: %u", call->count);
if (call->count >= AFSNAMEMAX)
- return -EBADMSG;
+ return afs_protocol_error(call, -EBADMSG);
call->offset = 0;
call->unmarshall++;
@@ -1565,7 +1694,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
call->count = ntohl(call->tmp);
_debug("motd length: %u", call->count);
if (call->count >= AFSNAMEMAX)
- return -EBADMSG;
+ return afs_protocol_error(call, -EBADMSG);
call->offset = 0;
call->unmarshall++;
@@ -1947,3 +2076,265 @@ int afs_fs_get_capabilities(struct afs_net *net,
trace_afs_make_fs_call(call, NULL);
return afs_make_call(ac, call, GFP_NOFS, false);
}
+
+/*
+ * Deliver reply data to an FS.FetchStatus with no vnode.
+ */
+static int afs_deliver_fs_fetch_status(struct afs_call *call)
+{
+ struct afs_file_status *status = call->reply[1];
+ struct afs_callback *callback = call->reply[2];
+ struct afs_volsync *volsync = call->reply[3];
+ struct afs_vnode *vnode = call->reply[0];
+ const __be32 *bp;
+ int ret;
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ xdr_decode_AFSFetchStatus(call, &bp, status, vnode,
+ &call->expected_version, NULL);
+ callback[call->count].version = ntohl(bp[0]);
+ callback[call->count].expiry = ntohl(bp[1]);
+ callback[call->count].type = ntohl(bp[2]);
+ if (vnode)
+ xdr_decode_AFSCallBack(call, vnode, &bp);
+ else
+ bp += 3;
+ if (volsync)
+ xdr_decode_AFSVolSync(&bp, volsync);
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * FS.FetchStatus operation type
+ */
+static const struct afs_call_type afs_RXFSFetchStatus = {
+ .name = "FS.FetchStatus",
+ .op = afs_FS_FetchStatus,
+ .deliver = afs_deliver_fs_fetch_status,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Fetch the status information for a fid without needing a vnode handle.
+ */
+int afs_fs_fetch_status(struct afs_fs_cursor *fc,
+ struct afs_net *net,
+ struct afs_fid *fid,
+ struct afs_file_status *status,
+ struct afs_callback *callback,
+ struct afs_volsync *volsync)
+{
+ struct afs_call *call;
+ __be32 *bp;
+
+ _enter(",%x,{%x:%u},,",
+ key_serial(fc->key), fid->vid, fid->vnode);
+
+ call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
+ if (!call) {
+ fc->ac.error = -ENOMEM;
+ return -ENOMEM;
+ }
+
+ call->key = fc->key;
+ call->reply[0] = NULL; /* vnode for fid[0] */
+ call->reply[1] = status;
+ call->reply[2] = callback;
+ call->reply[3] = volsync;
+ call->expected_version = 1; /* vnode->status.data_version */
+
+ /* marshall the parameters */
+ bp = call->request;
+ bp[0] = htonl(FSFETCHSTATUS);
+ bp[1] = htonl(fid->vid);
+ bp[2] = htonl(fid->vnode);
+ bp[3] = htonl(fid->unique);
+
+ call->cb_break = fc->cb_break;
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, fid);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to an FS.InlineBulkStatus call
+ */
+static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
+{
+ struct afs_file_status *statuses;
+ struct afs_callback *callbacks;
+ struct afs_vnode *vnode = call->reply[0];
+ const __be32 *bp;
+ u32 tmp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ switch (call->unmarshall) {
+ case 0:
+ call->offset = 0;
+ call->unmarshall++;
+
+ /* Extract the file status count and array in two steps */
+ case 1:
+ _debug("extract status count");
+ ret = afs_extract_data(call, &call->tmp, 4, true);
+ if (ret < 0)
+ return ret;
+
+ tmp = ntohl(call->tmp);
+ _debug("status count: %u/%u", tmp, call->count2);
+ if (tmp != call->count2)
+ return afs_protocol_error(call, -EBADMSG);
+
+ call->count = 0;
+ call->unmarshall++;
+ more_counts:
+ call->offset = 0;
+
+ case 2:
+ _debug("extract status array %u", call->count);
+ ret = afs_extract_data(call, call->buffer, 21 * 4, true);
+ if (ret < 0)
+ return ret;
+
+ bp = call->buffer;
+ statuses = call->reply[1];
+ if (xdr_decode_AFSFetchStatus(call, &bp, &statuses[call->count],
+ call->count == 0 ? vnode : NULL,
+ NULL, NULL) < 0)
+ return afs_protocol_error(call, -EBADMSG);
+
+ call->count++;
+ if (call->count < call->count2)
+ goto more_counts;
+
+ call->count = 0;
+ call->unmarshall++;
+ call->offset = 0;
+
+ /* Extract the callback count and array in two steps */
+ case 3:
+ _debug("extract CB count");
+ ret = afs_extract_data(call, &call->tmp, 4, true);
+ if (ret < 0)
+ return ret;
+
+ tmp = ntohl(call->tmp);
+ _debug("CB count: %u", tmp);
+ if (tmp != call->count2)
+ return afs_protocol_error(call, -EBADMSG);
+ call->count = 0;
+ call->unmarshall++;
+ more_cbs:
+ call->offset = 0;
+
+ case 4:
+ _debug("extract CB array");
+ ret = afs_extract_data(call, call->buffer, 3 * 4, true);
+ if (ret < 0)
+ return ret;
+
+ _debug("unmarshall CB array");
+ bp = call->buffer;
+ callbacks = call->reply[2];
+ callbacks[call->count].version = ntohl(bp[0]);
+ callbacks[call->count].expiry = ntohl(bp[1]);
+ callbacks[call->count].type = ntohl(bp[2]);
+ statuses = call->reply[1];
+ if (call->count == 0 && vnode && statuses[0].abort_code == 0)
+ xdr_decode_AFSCallBack(call, vnode, &bp);
+ call->count++;
+ if (call->count < call->count2)
+ goto more_cbs;
+
+ call->offset = 0;
+ call->unmarshall++;
+
+ case 5:
+ ret = afs_extract_data(call, call->buffer, 6 * 4, false);
+ if (ret < 0)
+ return ret;
+
+ bp = call->buffer;
+ if (call->reply[3])
+ xdr_decode_AFSVolSync(&bp, call->reply[3]);
+
+ call->offset = 0;
+ call->unmarshall++;
+
+ case 6:
+ break;
+ }
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * FS.InlineBulkStatus operation type
+ */
+static const struct afs_call_type afs_RXFSInlineBulkStatus = {
+ .name = "FS.InlineBulkStatus",
+ .op = afs_FS_InlineBulkStatus,
+ .deliver = afs_deliver_fs_inline_bulk_status,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Fetch the status information for up to 50 files
+ */
+int afs_fs_inline_bulk_status(struct afs_fs_cursor *fc,
+ struct afs_net *net,
+ struct afs_fid *fids,
+ struct afs_file_status *statuses,
+ struct afs_callback *callbacks,
+ unsigned int nr_fids,
+ struct afs_volsync *volsync)
+{
+ struct afs_call *call;
+ __be32 *bp;
+ int i;
+
+ _enter(",%x,{%x:%u},%u",
+ key_serial(fc->key), fids[0].vid, fids[1].vnode, nr_fids);
+
+ call = afs_alloc_flat_call(net, &afs_RXFSInlineBulkStatus,
+ (2 + nr_fids * 3) * 4,
+ 21 * 4);
+ if (!call) {
+ fc->ac.error = -ENOMEM;
+ return -ENOMEM;
+ }
+
+ call->key = fc->key;
+ call->reply[0] = NULL; /* vnode for fid[0] */
+ call->reply[1] = statuses;
+ call->reply[2] = callbacks;
+ call->reply[3] = volsync;
+ call->count2 = nr_fids;
+
+ /* marshall the parameters */
+ bp = call->request;
+ *bp++ = htonl(FSINLINEBULKSTATUS);
+ *bp++ = htonl(nr_fids);
+ for (i = 0; i < nr_fids; i++) {
+ *bp++ = htonl(fids[i].vid);
+ *bp++ = htonl(fids[i].vnode);
+ *bp++ = htonl(fids[i].unique);
+ }
+
+ call->cb_break = fc->cb_break;
+ afs_use_fs_server(call, fc->cbi);
+ trace_afs_make_fs_call(call, &fids[0]);
+ return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 6b39d0255b72..479b7fdda124 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -30,12 +30,11 @@ static const struct inode_operations afs_symlink_inode_operations = {
};
/*
- * map the AFS file status to the inode member variables
+ * Initialise an inode from the vnode status.
*/
-static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
+static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key)
{
struct inode *inode = AFS_VNODE_TO_I(vnode);
- bool changed;
_debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu",
vnode->status.type,
@@ -46,16 +45,21 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
read_seqlock_excl(&vnode->cb_lock);
+ afs_update_inode_from_status(vnode, &vnode->status, NULL,
+ AFS_VNODE_NOT_YET_SET);
+
switch (vnode->status.type) {
case AFS_FTYPE_FILE:
inode->i_mode = S_IFREG | vnode->status.mode;
inode->i_op = &afs_file_inode_operations;
inode->i_fop = &afs_file_operations;
+ inode->i_mapping->a_ops = &afs_fs_aops;
break;
case AFS_FTYPE_DIR:
inode->i_mode = S_IFDIR | vnode->status.mode;
inode->i_op = &afs_dir_inode_operations;
inode->i_fop = &afs_dir_file_operations;
+ inode->i_mapping->a_ops = &afs_dir_aops;
break;
case AFS_FTYPE_SYMLINK:
/* Symlinks with a mode of 0644 are actually mountpoints. */
@@ -67,45 +71,31 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
inode->i_mode = S_IFDIR | 0555;
inode->i_op = &afs_mntpt_inode_operations;
inode->i_fop = &afs_mntpt_file_operations;
+ inode->i_mapping->a_ops = &afs_fs_aops;
} else {
inode->i_mode = S_IFLNK | vnode->status.mode;
inode->i_op = &afs_symlink_inode_operations;
+ inode->i_mapping->a_ops = &afs_fs_aops;
}
inode_nohighmem(inode);
break;
default:
printk("kAFS: AFS vnode with undefined type\n");
read_sequnlock_excl(&vnode->cb_lock);
- return -EBADMSG;
+ return afs_protocol_error(NULL, -EBADMSG);
}
- changed = (vnode->status.size != inode->i_size);
-
- set_nlink(inode, vnode->status.nlink);
- inode->i_uid = vnode->status.owner;
- inode->i_gid = vnode->status.group;
- inode->i_size = vnode->status.size;
- inode->i_ctime.tv_sec = vnode->status.mtime_client;
- inode->i_ctime.tv_nsec = 0;
- inode->i_atime = inode->i_mtime = inode->i_ctime;
inode->i_blocks = 0;
- inode->i_generation = vnode->fid.unique;
- inode_set_iversion_raw(inode, vnode->status.data_version);
- inode->i_mapping->a_ops = &afs_fs_aops;
+ vnode->invalid_before = vnode->status.data_version;
read_sequnlock_excl(&vnode->cb_lock);
-
-#ifdef CONFIG_AFS_FSCACHE
- if (changed)
- fscache_attr_changed(vnode->cache);
-#endif
return 0;
}
/*
* Fetch file status from the volume.
*/
-int afs_fetch_status(struct afs_vnode *vnode, struct key *key)
+int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode)
{
struct afs_fs_cursor fc;
int ret;
@@ -118,8 +108,8 @@ int afs_fetch_status(struct afs_vnode *vnode, struct key *key)
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, vnode, key)) {
while (afs_select_fileserver(&fc)) {
- fc.cb_break = vnode->cb_break + vnode->cb_s_break;
- afs_fs_fetch_file_status(&fc, NULL);
+ fc.cb_break = afs_calc_vnode_cb_break(vnode);
+ afs_fs_fetch_file_status(&fc, NULL, new_inode);
}
afs_check_for_remote_deletion(&fc, fc.vnode);
@@ -243,6 +233,38 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
}
/*
+ * Get a cache cookie for an inode.
+ */
+static void afs_get_inode_cache(struct afs_vnode *vnode)
+{
+#ifdef CONFIG_AFS_FSCACHE
+ struct {
+ u32 vnode_id;
+ u32 unique;
+ u32 vnode_id_ext[2]; /* Allow for a 96-bit key */
+ } __packed key;
+ struct afs_vnode_cache_aux aux;
+
+ if (vnode->status.type == AFS_FTYPE_DIR) {
+ vnode->cache = NULL;
+ return;
+ }
+
+ key.vnode_id = vnode->fid.vnode;
+ key.unique = vnode->fid.unique;
+ key.vnode_id_ext[0] = 0;
+ key.vnode_id_ext[1] = 0;
+ aux.data_version = vnode->status.data_version;
+
+ vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
+ &afs_vnode_cache_index_def,
+ &key, sizeof(key),
+ &aux, sizeof(aux),
+ vnode, vnode->status.size, true);
+#endif
+}
+
+/*
* inode retrieval
*/
struct inode *afs_iget(struct super_block *sb, struct key *key,
@@ -280,7 +302,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
if (!status) {
/* it's a remotely extant inode */
- ret = afs_fetch_status(vnode, key);
+ ret = afs_fetch_status(vnode, key, true);
if (ret < 0)
goto bad_inode;
} else {
@@ -304,19 +326,12 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
vnode->cb_expires_at += ktime_get_real_seconds();
}
- /* set up caching before mapping the status, as map-status reads the
- * first page of symlinks to see if they're really mountpoints */
- inode->i_size = vnode->status.size;
-#ifdef CONFIG_AFS_FSCACHE
- vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
- &afs_vnode_cache_index_def,
- vnode, true);
-#endif
-
- ret = afs_inode_map_status(vnode, key);
+ ret = afs_inode_init_from_status(vnode, key);
if (ret < 0)
goto bad_inode;
+ afs_get_inode_cache(vnode);
+
/* success */
clear_bit(AFS_VNODE_UNSET, &vnode->flags);
inode->i_flags |= S_NOATIME;
@@ -326,10 +341,6 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
/* failure */
bad_inode:
-#ifdef CONFIG_AFS_FSCACHE
- fscache_relinquish_cookie(vnode->cache, 0);
- vnode->cache = NULL;
-#endif
iget_failed(inode);
_leave(" = %d [bad]", ret);
return ERR_PTR(ret);
@@ -343,6 +354,10 @@ void afs_zap_data(struct afs_vnode *vnode)
{
_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+#ifdef CONFIG_AFS_FSCACHE
+ fscache_invalidate(vnode->cache);
+#endif
+
/* nuke all the non-dirty pages that aren't locked, mapped or being
* written back in a regular file and completely discard the pages in a
* directory or symlink */
@@ -378,12 +393,18 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
read_seqlock_excl(&vnode->cb_lock);
if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
- if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break) {
+ if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break ||
+ vnode->cb_v_break != vnode->volume->cb_v_break) {
vnode->cb_s_break = vnode->cb_interest->server->cb_s_break;
- } else if (!test_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags) &&
- !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) &&
+ vnode->cb_v_break = vnode->volume->cb_v_break;
+ valid = false;
+ } else if (vnode->status.type == AFS_FTYPE_DIR &&
+ test_bit(AFS_VNODE_DIR_VALID, &vnode->flags) &&
vnode->cb_expires_at - 10 > now) {
- valid = true;
+ valid = true;
+ } else if (!test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) &&
+ vnode->cb_expires_at - 10 > now) {
+ valid = true;
}
} else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
valid = true;
@@ -397,7 +418,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
if (valid)
goto valid;
- mutex_lock(&vnode->validate_lock);
+ down_write(&vnode->validate_lock);
/* if the promise has expired, we need to check the server again to get
* a new promise - note that if the (parent) directory's metadata was
@@ -405,7 +426,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
* access */
if (!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
_debug("not promised");
- ret = afs_fetch_status(vnode, key);
+ ret = afs_fetch_status(vnode, key, false);
if (ret < 0) {
if (ret == -ENOENT) {
set_bit(AFS_VNODE_DELETED, &vnode->flags);
@@ -426,15 +447,13 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
* different */
if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
afs_zap_data(vnode);
-
- clear_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags);
- mutex_unlock(&vnode->validate_lock);
+ up_write(&vnode->validate_lock);
valid:
_leave(" = 0");
return 0;
error_unlock:
- mutex_unlock(&vnode->validate_lock);
+ up_write(&vnode->validate_lock);
_leave(" = %d", ret);
return ret;
}
@@ -507,11 +526,17 @@ void afs_evict_inode(struct inode *inode)
}
#ifdef CONFIG_AFS_FSCACHE
- fscache_relinquish_cookie(vnode->cache, 0);
- vnode->cache = NULL;
+ {
+ struct afs_vnode_cache_aux aux;
+
+ aux.data_version = vnode->status.data_version;
+ fscache_relinquish_cookie(vnode->cache, &aux,
+ test_bit(AFS_VNODE_DELETED, &vnode->flags));
+ vnode->cache = NULL;
+ }
#endif
- afs_put_permits(vnode->permit_cache);
+ afs_put_permits(rcu_access_pointer(vnode->permit_cache));
_leave("");
}
@@ -552,7 +577,7 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, vnode, key)) {
while (afs_select_fileserver(&fc)) {
- fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+ fc.cb_break = afs_calc_vnode_cb_break(vnode);
afs_fs_setattr(&fc, attr);
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 72217170b155..e3f8a46663db 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -122,7 +122,8 @@ struct afs_call {
u32 operation_ID; /* operation ID for an incoming call */
u32 count; /* count for use in unmarshalling */
__be32 tmp; /* place to extract temporary data */
- afs_dataversion_t store_version; /* updated version expected from store */
+ afs_dataversion_t expected_version; /* Updated version expected from store */
+ afs_dataversion_t expected_version_2; /* 2nd updated version expected from store */
};
struct afs_call_type {
@@ -173,11 +174,14 @@ struct afs_read {
loff_t len; /* How much we're asking for */
loff_t actual_len; /* How much we're actually getting */
loff_t remain; /* Amount remaining */
- atomic_t usage;
+ loff_t file_size; /* File size returned by server */
+ afs_dataversion_t data_version; /* Version number returned by server */
+ refcount_t usage;
unsigned int index; /* Which page we're reading into */
unsigned int nr_pages;
void (*page_done)(struct afs_call *, struct afs_read *);
- struct page *pages[];
+ struct page **pages;
+ struct page *array[];
};
/*
@@ -199,6 +203,18 @@ static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
extern struct file_system_type afs_fs_type;
/*
+ * Set of substitutes for @sys.
+ */
+struct afs_sysnames {
+#define AFS_NR_SYSNAME 16
+ char *subs[AFS_NR_SYSNAME];
+ refcount_t usage;
+ unsigned short nr;
+ short error;
+ char blank[1];
+};
+
+/*
* AFS network namespace record.
*/
struct afs_net {
@@ -245,9 +261,25 @@ struct afs_net {
struct mutex lock_manager_mutex;
/* Misc */
- struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */
+ struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */
+ struct afs_sysnames *sysnames;
+ rwlock_t sysnames_lock;
+
+ /* Statistics counters */
+ atomic_t n_lookup; /* Number of lookups done */
+ atomic_t n_reval; /* Number of dentries needing revalidation */
+ atomic_t n_inval; /* Number of invalidations by the server */
+ atomic_t n_relpg; /* Number of invalidations by releasepage */
+ atomic_t n_read_dir; /* Number of directory pages read */
+ atomic_t n_dir_cr; /* Number of directory entry creation edits */
+ atomic_t n_dir_rm; /* Number of directory entry removal edits */
+ atomic_t n_stores; /* Number of store ops */
+ atomic_long_t n_store_bytes; /* Number of bytes stored */
+ atomic_long_t n_fetch_bytes; /* Number of bytes fetched */
+ atomic_t n_fetches; /* Number of data fetch ops */
};
+extern const char afs_init_sysname[];
extern struct afs_net __afs_net;// Dummy AFS network namespace; TODO: replace with real netns
enum afs_cell_state {
@@ -363,6 +395,8 @@ struct afs_server {
#define AFS_SERVER_FL_UPDATING 4
#define AFS_SERVER_FL_PROBED 5 /* The fileserver has been probed */
#define AFS_SERVER_FL_PROBING 6 /* Fileserver is being probed */
+#define AFS_SERVER_FL_NO_IBULK 7 /* Fileserver doesn't support FS.InlineBulkStatus */
+#define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */
atomic_t usage;
u32 addr_version; /* Address list version */
@@ -400,6 +434,7 @@ struct afs_server_list {
unsigned short index; /* Server currently in use */
unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */
unsigned int seq; /* Set to ->servers_seq when installed */
+ rwlock_t lock;
struct afs_server_entry servers[];
};
@@ -426,6 +461,9 @@ struct afs_volume {
rwlock_t servers_lock; /* Lock for ->servers */
unsigned int servers_seq; /* Incremented each time ->servers changes */
+ unsigned cb_v_break; /* Break-everything counter. */
+ rwlock_t cb_break_lock;
+
afs_voltype_t type; /* type of volume */
short error;
char type_force; /* force volume type (suppress R/O -> R/W) */
@@ -455,23 +493,25 @@ struct afs_vnode {
struct afs_volume *volume; /* volume on which vnode resides */
struct afs_fid fid; /* the file identifier for this inode */
struct afs_file_status status; /* AFS status info for this file */
+ afs_dataversion_t invalid_before; /* Child dentries are invalid before this */
#ifdef CONFIG_AFS_FSCACHE
struct fscache_cookie *cache; /* caching cookie */
#endif
- struct afs_permits *permit_cache; /* cache of permits so far obtained */
+ struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */
struct mutex io_lock; /* Lock for serialising I/O on this mutex */
- struct mutex validate_lock; /* lock for validating this vnode */
+ struct rw_semaphore validate_lock; /* lock for validating this vnode */
spinlock_t wb_lock; /* lock for wb_keys */
spinlock_t lock; /* waitqueue/flags lock */
unsigned long flags;
#define AFS_VNODE_CB_PROMISED 0 /* Set if vnode has a callback promise */
#define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */
-#define AFS_VNODE_DIR_MODIFIED 2 /* set if dir vnode's data modified */
+#define AFS_VNODE_DIR_VALID 2 /* Set if dir contents are valid */
#define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */
#define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */
#define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */
#define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */
#define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */
+#define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */
struct list_head wb_keys; /* List of keys available for writeback */
struct list_head pending_locks; /* locks waiting to be granted */
@@ -484,6 +524,7 @@ struct afs_vnode {
/* outstanding callback notification on this file */
struct afs_cb_interest *cb_interest; /* Server on which this resides */
unsigned int cb_s_break; /* Mass break counter on ->server */
+ unsigned int cb_v_break; /* Mass break counter on ->volume */
unsigned int cb_break; /* Break counter on vnode */
seqlock_t cb_lock; /* Lock for ->cb_interest, ->status, ->cb_*break */
@@ -559,6 +600,13 @@ struct afs_fs_cursor {
#define AFS_FS_CURSOR_NO_VSLEEP 0x0020 /* Set to prevent sleep on VBUSY, VOFFLINE, ... */
};
+/*
+ * Cache auxiliary data.
+ */
+struct afs_vnode_cache_aux {
+ u64 data_version;
+} __packed;
+
#include <trace/events/afs.h>
/*****************************************************************************/
@@ -604,18 +652,31 @@ extern struct fscache_cookie_def afs_vnode_cache_index_def;
*/
extern void afs_init_callback_state(struct afs_server *);
extern void afs_break_callback(struct afs_vnode *);
-extern void afs_break_callbacks(struct afs_server *, size_t,struct afs_callback[]);
+extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break*);
-extern int afs_register_server_cb_interest(struct afs_vnode *, struct afs_server_entry *);
+extern int afs_register_server_cb_interest(struct afs_vnode *,
+ struct afs_server_list *, unsigned int);
extern void afs_put_cb_interest(struct afs_net *, struct afs_cb_interest *);
extern void afs_clear_callback_interests(struct afs_net *, struct afs_server_list *);
static inline struct afs_cb_interest *afs_get_cb_interest(struct afs_cb_interest *cbi)
{
- refcount_inc(&cbi->usage);
+ if (cbi)
+ refcount_inc(&cbi->usage);
return cbi;
}
+static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode)
+{
+ return vnode->cb_break + vnode->cb_s_break + vnode->cb_v_break;
+}
+
+static inline unsigned int afs_cb_break_sum(struct afs_vnode *vnode,
+ struct afs_cb_interest *cbi)
+{
+ return vnode->cb_break + cbi->server->cb_s_break + vnode->volume->cb_v_break;
+}
+
/*
* cell.c
*/
@@ -639,11 +700,26 @@ extern bool afs_cm_incoming_call(struct afs_call *);
*/
extern const struct file_operations afs_dir_file_operations;
extern const struct inode_operations afs_dir_inode_operations;
+extern const struct address_space_operations afs_dir_aops;
+extern const struct dentry_operations afs_fs_dentry_operations;
+
+extern void afs_d_release(struct dentry *);
+
+/*
+ * dir_edit.c
+ */
+extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *,
+ enum afs_edit_dir_reason);
+extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason);
+
+/*
+ * dynroot.c
+ */
extern const struct file_operations afs_dynroot_file_operations;
extern const struct inode_operations afs_dynroot_inode_operations;
-extern const struct dentry_operations afs_fs_dentry_operations;
+extern const struct dentry_operations afs_dynroot_dentry_operations;
-extern bool afs_dir_check_page(struct inode *, struct page *);
+extern struct inode *afs_try_auto_mntpt(struct dentry *, struct inode *);
/*
* file.c
@@ -673,17 +749,23 @@ extern int afs_flock(struct file *, int, struct file_lock *);
/*
* fsclient.c
*/
-extern int afs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_volsync *);
+#define AFS_VNODE_NOT_YET_SET 0x01
+#define AFS_VNODE_META_CHANGED 0x02
+#define AFS_VNODE_DATA_CHANGED 0x04
+extern void afs_update_inode_from_status(struct afs_vnode *, struct afs_file_status *,
+ const afs_dataversion_t *, u8);
+
+extern int afs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_volsync *, bool);
extern int afs_fs_give_up_callbacks(struct afs_net *, struct afs_server *);
extern int afs_fs_fetch_data(struct afs_fs_cursor *, struct afs_read *);
-extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t,
+extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t, u64,
struct afs_fid *, struct afs_file_status *, struct afs_callback *);
-extern int afs_fs_remove(struct afs_fs_cursor *, const char *, bool);
-extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *);
-extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *,
+extern int afs_fs_remove(struct afs_fs_cursor *, const char *, bool, u64);
+extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64);
+extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, u64,
struct afs_fid *, struct afs_file_status *);
extern int afs_fs_rename(struct afs_fs_cursor *, const char *,
- struct afs_vnode *, const char *);
+ struct afs_vnode *, const char *, u64, u64);
extern int afs_fs_store_data(struct afs_fs_cursor *, struct address_space *,
pgoff_t, pgoff_t, unsigned, unsigned);
extern int afs_fs_setattr(struct afs_fs_cursor *, struct iattr *);
@@ -695,11 +777,18 @@ extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *,
struct afs_addr_cursor *, struct key *);
extern int afs_fs_get_capabilities(struct afs_net *, struct afs_server *,
struct afs_addr_cursor *, struct key *);
+extern int afs_fs_inline_bulk_status(struct afs_fs_cursor *, struct afs_net *,
+ struct afs_fid *, struct afs_file_status *,
+ struct afs_callback *, unsigned int,
+ struct afs_volsync *);
+extern int afs_fs_fetch_status(struct afs_fs_cursor *, struct afs_net *,
+ struct afs_fid *, struct afs_file_status *,
+ struct afs_callback *, struct afs_volsync *);
/*
* inode.c
*/
-extern int afs_fetch_status(struct afs_vnode *, struct key *);
+extern int afs_fetch_status(struct afs_vnode *, struct key *, bool);
extern int afs_iget5_test(struct inode *, void *);
extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool);
extern struct inode *afs_iget(struct super_block *, struct key *,
@@ -747,6 +836,13 @@ static inline void afs_put_net(struct afs_net *net)
{
}
+static inline void __afs_stat(atomic_t *s)
+{
+ atomic_inc(s);
+}
+
+#define afs_stat_v(vnode, n) __afs_stat(&afs_v2net(vnode)->n)
+
/*
* misc.c
*/
@@ -774,6 +870,7 @@ extern int __net_init afs_proc_init(struct afs_net *);
extern void __net_exit afs_proc_cleanup(struct afs_net *);
extern int afs_proc_cell_setup(struct afs_net *, struct afs_cell *);
extern void afs_proc_cell_remove(struct afs_net *, struct afs_cell *);
+extern void afs_put_sysnames(struct afs_sysnames *);
/*
* rotate.c
@@ -802,6 +899,7 @@ extern void afs_flat_call_destructor(struct afs_call *);
extern void afs_send_empty_reply(struct afs_call *);
extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
extern int afs_extract_data(struct afs_call *, void *, size_t, bool);
+extern int afs_protocol_error(struct afs_call *, int);
static inline int afs_transfer_reply(struct afs_call *call)
{
@@ -948,7 +1046,6 @@ extern int afs_writepage(struct page *, struct writeback_control *);
extern int afs_writepages(struct address_space *, struct writeback_control *);
extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *);
-extern int afs_flush(struct file *, fl_owner_t);
extern int afs_fsync(struct file *, loff_t, loff_t, int);
extern int afs_page_mkwrite(struct vm_fault *);
extern void afs_prune_wb_keys(struct afs_vnode *);
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 15a02a05ff40..d7560168b3bf 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -34,11 +34,42 @@ MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
struct workqueue_struct *afs_wq;
struct afs_net __afs_net;
+#if defined(CONFIG_ALPHA)
+const char afs_init_sysname[] = "alpha_linux26";
+#elif defined(CONFIG_X86_64)
+const char afs_init_sysname[] = "amd64_linux26";
+#elif defined(CONFIG_ARM)
+const char afs_init_sysname[] = "arm_linux26";
+#elif defined(CONFIG_ARM64)
+const char afs_init_sysname[] = "aarch64_linux26";
+#elif defined(CONFIG_X86_32)
+const char afs_init_sysname[] = "i386_linux26";
+#elif defined(CONFIG_IA64)
+const char afs_init_sysname[] = "ia64_linux26";
+#elif defined(CONFIG_PPC64)
+const char afs_init_sysname[] = "ppc64_linux26";
+#elif defined(CONFIG_PPC32)
+const char afs_init_sysname[] = "ppc_linux26";
+#elif defined(CONFIG_S390)
+#ifdef CONFIG_64BIT
+const char afs_init_sysname[] = "s390x_linux26";
+#else
+const char afs_init_sysname[] = "s390_linux26";
+#endif
+#elif defined(CONFIG_SPARC64)
+const char afs_init_sysname[] = "sparc64_linux26";
+#elif defined(CONFIG_SPARC32)
+const char afs_init_sysname[] = "sparc_linux26";
+#else
+const char afs_init_sysname[] = "unknown_linux26";
+#endif
+
/*
* Initialise an AFS network namespace record.
*/
static int __net_init afs_net_init(struct afs_net *net)
{
+ struct afs_sysnames *sysnames;
int ret;
net->live = true;
@@ -67,6 +98,16 @@ static int __net_init afs_net_init(struct afs_net *net)
INIT_WORK(&net->fs_manager, afs_manage_servers);
timer_setup(&net->fs_timer, afs_servers_timer, 0);
+ ret = -ENOMEM;
+ sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL);
+ if (!sysnames)
+ goto error_sysnames;
+ sysnames->subs[0] = (char *)&afs_init_sysname;
+ sysnames->nr = 1;
+ refcount_set(&sysnames->usage, 1);
+ net->sysnames = sysnames;
+ rwlock_init(&net->sysnames_lock);
+
/* Register the /proc stuff */
ret = afs_proc_init(net);
if (ret < 0)
@@ -92,6 +133,8 @@ error_cell_init:
net->live = false;
afs_proc_cleanup(net);
error_proc:
+ afs_put_sysnames(net->sysnames);
+error_sysnames:
net->live = false;
return ret;
}
@@ -106,6 +149,7 @@ static void __net_exit afs_net_exit(struct afs_net *net)
afs_purge_servers(net);
afs_close_socket(net);
afs_proc_cleanup(net);
+ afs_put_sysnames(net->sysnames);
}
/*
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 4508dd54f789..3aad32762989 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -62,7 +62,6 @@ static const struct file_operations afs_proc_rootcell_fops = {
.llseek = no_llseek,
};
-static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file);
static void *afs_proc_cell_volumes_start(struct seq_file *p, loff_t *pos);
static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
loff_t *pos);
@@ -76,15 +75,6 @@ static const struct seq_operations afs_proc_cell_volumes_ops = {
.show = afs_proc_cell_volumes_show,
};
-static const struct file_operations afs_proc_cell_volumes_fops = {
- .open = afs_proc_cell_volumes_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static int afs_proc_cell_vlservers_open(struct inode *inode,
- struct file *file);
static void *afs_proc_cell_vlservers_start(struct seq_file *p, loff_t *pos);
static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
loff_t *pos);
@@ -98,14 +88,6 @@ static const struct seq_operations afs_proc_cell_vlservers_ops = {
.show = afs_proc_cell_vlservers_show,
};
-static const struct file_operations afs_proc_cell_vlservers_fops = {
- .open = afs_proc_cell_vlservers_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static int afs_proc_servers_open(struct inode *inode, struct file *file);
static void *afs_proc_servers_start(struct seq_file *p, loff_t *pos);
static void *afs_proc_servers_next(struct seq_file *p, void *v,
loff_t *pos);
@@ -119,13 +101,34 @@ static const struct seq_operations afs_proc_servers_ops = {
.show = afs_proc_servers_show,
};
-static const struct file_operations afs_proc_servers_fops = {
- .open = afs_proc_servers_open,
+static int afs_proc_sysname_open(struct inode *inode, struct file *file);
+static int afs_proc_sysname_release(struct inode *inode, struct file *file);
+static void *afs_proc_sysname_start(struct seq_file *p, loff_t *pos);
+static void *afs_proc_sysname_next(struct seq_file *p, void *v,
+ loff_t *pos);
+static void afs_proc_sysname_stop(struct seq_file *p, void *v);
+static int afs_proc_sysname_show(struct seq_file *m, void *v);
+static ssize_t afs_proc_sysname_write(struct file *file,
+ const char __user *buf,
+ size_t size, loff_t *_pos);
+
+static const struct seq_operations afs_proc_sysname_ops = {
+ .start = afs_proc_sysname_start,
+ .next = afs_proc_sysname_next,
+ .stop = afs_proc_sysname_stop,
+ .show = afs_proc_sysname_show,
+};
+
+static const struct file_operations afs_proc_sysname_fops = {
+ .open = afs_proc_sysname_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = afs_proc_sysname_release,
+ .write = afs_proc_sysname_write,
};
+static int afs_proc_stats_show(struct seq_file *m, void *v);
+
/*
* initialise the /proc/fs/afs/ directory
*/
@@ -139,7 +142,9 @@ int afs_proc_init(struct afs_net *net)
if (!proc_create("cells", 0644, net->proc_afs, &afs_proc_cells_fops) ||
!proc_create("rootcell", 0644, net->proc_afs, &afs_proc_rootcell_fops) ||
- !proc_create("servers", 0644, net->proc_afs, &afs_proc_servers_fops))
+ !proc_create_seq("servers", 0644, net->proc_afs, &afs_proc_servers_ops) ||
+ !proc_create_single("stats", 0644, net->proc_afs, afs_proc_stats_show) ||
+ !proc_create("sysname", 0644, net->proc_afs, &afs_proc_sysname_fops))
goto error_tree;
_leave(" = 0");
@@ -166,16 +171,7 @@ void afs_proc_cleanup(struct afs_net *net)
*/
static int afs_proc_cells_open(struct inode *inode, struct file *file)
{
- struct seq_file *m;
- int ret;
-
- ret = seq_open(file, &afs_proc_cells_ops);
- if (ret < 0)
- return ret;
-
- m = file->private_data;
- m->private = PDE_DATA(inode);
- return 0;
+ return seq_open(file, &afs_proc_cells_ops);
}
/*
@@ -183,6 +179,7 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file)
* first item
*/
static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos)
+ __acquires(rcu)
{
struct afs_net *net = afs_seq2net(m);
@@ -204,6 +201,7 @@ static void *afs_proc_cells_next(struct seq_file *m, void *v, loff_t *pos)
* clean up after reading from the cells list
*/
static void afs_proc_cells_stop(struct seq_file *m, void *v)
+ __releases(rcu)
{
rcu_read_unlock();
}
@@ -282,7 +280,8 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
goto done;
}
- set_bit(AFS_CELL_FL_NO_GC, &cell->flags);
+ if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags))
+ afs_put_cell(net, cell);
printk("kAFS: Added new cell '%s'\n", name);
} else {
goto inval;
@@ -304,7 +303,40 @@ inval:
static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf,
size_t size, loff_t *_pos)
{
- return 0;
+ struct afs_cell *cell;
+ struct afs_net *net = afs_proc2net(file);
+ unsigned int seq = 0;
+ char name[AFS_MAXCELLNAME + 1];
+ int len;
+
+ if (*_pos > 0)
+ return 0;
+ if (!net->ws_cell)
+ return 0;
+
+ rcu_read_lock();
+ do {
+ read_seqbegin_or_lock(&net->cells_lock, &seq);
+ len = 0;
+ cell = rcu_dereference_raw(net->ws_cell);
+ if (cell) {
+ len = cell->name_len;
+ memcpy(name, cell->name, len);
+ }
+ } while (need_seqretry(&net->cells_lock, seq));
+ done_seqretry(&net->cells_lock, seq);
+ rcu_read_unlock();
+
+ if (!len)
+ return 0;
+
+ name[len++] = '\n';
+ if (len > size)
+ len = size;
+ if (copy_to_user(buf, name, len) != 0)
+ return -EFAULT;
+ *_pos = 1;
+ return len;
}
/*
@@ -327,6 +359,12 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
+ ret = -EINVAL;
+ if (kbuf[0] == '.')
+ goto out;
+ if (memchr(kbuf, '/', size))
+ goto out;
+
/* trim to first NL */
s = memchr(kbuf, '\n', size);
if (s)
@@ -339,6 +377,7 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
if (ret >= 0)
ret = size; /* consume everything, always */
+out:
kfree(kbuf);
_leave(" = %d", ret);
return ret;
@@ -357,10 +396,11 @@ int afs_proc_cell_setup(struct afs_net *net, struct afs_cell *cell)
if (!dir)
goto error_dir;
- if (!proc_create_data("vlservers", 0, dir,
- &afs_proc_cell_vlservers_fops, cell) ||
- !proc_create_data("volumes", 0, dir,
- &afs_proc_cell_volumes_fops, cell))
+ if (!proc_create_seq_data("vlservers", 0, dir,
+ &afs_proc_cell_vlservers_ops, cell))
+ goto error_tree;
+ if (!proc_create_seq_data("volumes", 0, dir, &afs_proc_cell_volumes_ops,
+ cell))
goto error_tree;
_leave(" = 0");
@@ -386,35 +426,13 @@ void afs_proc_cell_remove(struct afs_net *net, struct afs_cell *cell)
}
/*
- * open "/proc/fs/afs/<cell>/volumes" which provides a summary of extant cells
- */
-static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
-{
- struct afs_cell *cell;
- struct seq_file *m;
- int ret;
-
- cell = PDE_DATA(inode);
- if (!cell)
- return -ENOENT;
-
- ret = seq_open(file, &afs_proc_cell_volumes_ops);
- if (ret < 0)
- return ret;
-
- m = file->private_data;
- m->private = cell;
-
- return 0;
-}
-
-/*
* set up the iterator to start reading from the cells list and return the
* first item
*/
static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
+ __acquires(cell->proc_lock)
{
- struct afs_cell *cell = m->private;
+ struct afs_cell *cell = PDE_DATA(file_inode(m->file));
_enter("cell=%p pos=%Ld", cell, *_pos);
@@ -428,7 +446,7 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
loff_t *_pos)
{
- struct afs_cell *cell = p->private;
+ struct afs_cell *cell = PDE_DATA(file_inode(p->file));
_enter("cell=%p pos=%Ld", cell, *_pos);
return seq_list_next(v, &cell->proc_volumes, _pos);
@@ -438,8 +456,9 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
* clean up after reading from the cells list
*/
static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v)
+ __releases(cell->proc_lock)
{
- struct afs_cell *cell = p->private;
+ struct afs_cell *cell = PDE_DATA(file_inode(p->file));
read_unlock(&cell->proc_lock);
}
@@ -455,7 +474,7 @@ static const char afs_vol_types[3][3] = {
*/
static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
{
- struct afs_cell *cell = m->private;
+ struct afs_cell *cell = PDE_DATA(file_inode(m->file));
struct afs_volume *vol = list_entry(v, struct afs_volume, proc_link);
/* Display header on line 1 */
@@ -472,37 +491,14 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
}
/*
- * open "/proc/fs/afs/<cell>/vlservers" which provides a list of volume
- * location server
- */
-static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
-{
- struct afs_cell *cell;
- struct seq_file *m;
- int ret;
-
- cell = PDE_DATA(inode);
- if (!cell)
- return -ENOENT;
-
- ret = seq_open(file, &afs_proc_cell_vlservers_ops);
- if (ret<0)
- return ret;
-
- m = file->private_data;
- m->private = cell;
-
- return 0;
-}
-
-/*
* set up the iterator to start reading from the cells list and return the
* first item
*/
static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos)
+ __acquires(rcu)
{
struct afs_addr_list *alist;
- struct afs_cell *cell = m->private;
+ struct afs_cell *cell = PDE_DATA(file_inode(m->file));
loff_t pos = *_pos;
rcu_read_lock();
@@ -527,7 +523,7 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
loff_t *_pos)
{
struct afs_addr_list *alist;
- struct afs_cell *cell = p->private;
+ struct afs_cell *cell = PDE_DATA(file_inode(p->file));
loff_t pos;
alist = rcu_dereference(cell->vl_addrs);
@@ -544,6 +540,7 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
* clean up after reading from the cells list
*/
static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v)
+ __releases(rcu)
{
rcu_read_unlock();
}
@@ -567,19 +564,11 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
}
/*
- * open "/proc/fs/afs/servers" which provides a summary of active
- * servers
- */
-static int afs_proc_servers_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &afs_proc_servers_ops);
-}
-
-/*
* Set up the iterator to start reading from the server list and return the
* first item.
*/
static void *afs_proc_servers_start(struct seq_file *m, loff_t *_pos)
+ __acquires(rcu)
{
struct afs_net *net = afs_seq2net(m);
@@ -601,6 +590,7 @@ static void *afs_proc_servers_next(struct seq_file *m, void *v, loff_t *_pos)
* clean up after reading from the cells list
*/
static void afs_proc_servers_stop(struct seq_file *p, void *v)
+ __releases(rcu)
{
rcu_read_unlock();
}
@@ -626,3 +616,229 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
&alist->addrs[alist->index].transport);
return 0;
}
+
+void afs_put_sysnames(struct afs_sysnames *sysnames)
+{
+ int i;
+
+ if (sysnames && refcount_dec_and_test(&sysnames->usage)) {
+ for (i = 0; i < sysnames->nr; i++)
+ if (sysnames->subs[i] != afs_init_sysname &&
+ sysnames->subs[i] != sysnames->blank)
+ kfree(sysnames->subs[i]);
+ }
+}
+
+/*
+ * Handle opening of /proc/fs/afs/sysname. If it is opened for writing, we
+ * assume the caller wants to change the substitution list and we allocate a
+ * buffer to hold the list.
+ */
+static int afs_proc_sysname_open(struct inode *inode, struct file *file)
+{
+ struct afs_sysnames *sysnames;
+ struct seq_file *m;
+ int ret;
+
+ ret = seq_open(file, &afs_proc_sysname_ops);
+ if (ret < 0)
+ return ret;
+
+ if (file->f_mode & FMODE_WRITE) {
+ sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL);
+ if (!sysnames) {
+ seq_release(inode, file);
+ return -ENOMEM;
+ }
+
+ refcount_set(&sysnames->usage, 1);
+ m = file->private_data;
+ m->private = sysnames;
+ }
+
+ return 0;
+}
+
+/*
+ * Handle writes to /proc/fs/afs/sysname to set the @sys substitution.
+ */
+static ssize_t afs_proc_sysname_write(struct file *file,
+ const char __user *buf,
+ size_t size, loff_t *_pos)
+{
+ struct afs_sysnames *sysnames;
+ struct seq_file *m = file->private_data;
+ char *kbuf = NULL, *s, *p, *sub;
+ int ret, len;
+
+ sysnames = m->private;
+ if (!sysnames)
+ return -EINVAL;
+ if (sysnames->error)
+ return sysnames->error;
+
+ if (size >= PAGE_SIZE - 1) {
+ sysnames->error = -EINVAL;
+ return -EINVAL;
+ }
+ if (size == 0)
+ return 0;
+
+ kbuf = memdup_user_nul(buf, size);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+
+ inode_lock(file_inode(file));
+
+ p = kbuf;
+ while ((s = strsep(&p, " \t\n"))) {
+ len = strlen(s);
+ if (len == 0)
+ continue;
+ ret = -ENAMETOOLONG;
+ if (len >= AFSNAMEMAX)
+ goto error;
+
+ if (len >= 4 &&
+ s[len - 4] == '@' &&
+ s[len - 3] == 's' &&
+ s[len - 2] == 'y' &&
+ s[len - 1] == 's')
+ /* Protect against recursion */
+ goto invalid;
+
+ if (s[0] == '.' &&
+ (len < 2 || (len == 2 && s[1] == '.')))
+ goto invalid;
+
+ if (memchr(s, '/', len))
+ goto invalid;
+
+ ret = -EFBIG;
+ if (sysnames->nr >= AFS_NR_SYSNAME)
+ goto out;
+
+ if (strcmp(s, afs_init_sysname) == 0) {
+ sub = (char *)afs_init_sysname;
+ } else {
+ ret = -ENOMEM;
+ sub = kmemdup(s, len + 1, GFP_KERNEL);
+ if (!sub)
+ goto out;
+ }
+
+ sysnames->subs[sysnames->nr] = sub;
+ sysnames->nr++;
+ }
+
+ ret = size; /* consume everything, always */
+out:
+ inode_unlock(file_inode(file));
+ kfree(kbuf);
+ return ret;
+
+invalid:
+ ret = -EINVAL;
+error:
+ sysnames->error = ret;
+ goto out;
+}
+
+static int afs_proc_sysname_release(struct inode *inode, struct file *file)
+{
+ struct afs_sysnames *sysnames, *kill = NULL;
+ struct seq_file *m = file->private_data;
+ struct afs_net *net = afs_seq2net(m);
+
+ sysnames = m->private;
+ if (sysnames) {
+ if (!sysnames->error) {
+ kill = sysnames;
+ if (sysnames->nr == 0) {
+ sysnames->subs[0] = sysnames->blank;
+ sysnames->nr++;
+ }
+ write_lock(&net->sysnames_lock);
+ kill = net->sysnames;
+ net->sysnames = sysnames;
+ write_unlock(&net->sysnames_lock);
+ }
+ afs_put_sysnames(kill);
+ }
+
+ return seq_release(inode, file);
+}
+
+static void *afs_proc_sysname_start(struct seq_file *m, loff_t *pos)
+ __acquires(&net->sysnames_lock)
+{
+ struct afs_net *net = afs_seq2net(m);
+ struct afs_sysnames *names = net->sysnames;
+
+ read_lock(&net->sysnames_lock);
+
+ if (*pos >= names->nr)
+ return NULL;
+ return (void *)(unsigned long)(*pos + 1);
+}
+
+static void *afs_proc_sysname_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct afs_net *net = afs_seq2net(m);
+ struct afs_sysnames *names = net->sysnames;
+
+ *pos += 1;
+ if (*pos >= names->nr)
+ return NULL;
+ return (void *)(unsigned long)(*pos + 1);
+}
+
+static void afs_proc_sysname_stop(struct seq_file *m, void *v)
+ __releases(&net->sysnames_lock)
+{
+ struct afs_net *net = afs_seq2net(m);
+
+ read_unlock(&net->sysnames_lock);
+}
+
+static int afs_proc_sysname_show(struct seq_file *m, void *v)
+{
+ struct afs_net *net = afs_seq2net(m);
+ struct afs_sysnames *sysnames = net->sysnames;
+ unsigned int i = (unsigned long)v - 1;
+
+ if (i < sysnames->nr)
+ seq_printf(m, "%s\n", sysnames->subs[i]);
+ return 0;
+}
+
+/*
+ * Display general per-net namespace statistics
+ */
+static int afs_proc_stats_show(struct seq_file *m, void *v)
+{
+ struct afs_net *net = afs_seq2net(m);
+
+ seq_puts(m, "kAFS statistics\n");
+
+ seq_printf(m, "dir-mgmt: look=%u reval=%u inval=%u relpg=%u\n",
+ atomic_read(&net->n_lookup),
+ atomic_read(&net->n_reval),
+ atomic_read(&net->n_inval),
+ atomic_read(&net->n_relpg));
+
+ seq_printf(m, "dir-data: rdpg=%u\n",
+ atomic_read(&net->n_read_dir));
+
+ seq_printf(m, "dir-edit: cr=%u rm=%u\n",
+ atomic_read(&net->n_dir_cr),
+ atomic_read(&net->n_dir_rm));
+
+ seq_printf(m, "file-rd : n=%u nb=%lu\n",
+ atomic_read(&net->n_fetches),
+ atomic_long_read(&net->n_fetch_bytes));
+ seq_printf(m, "file-wr : n=%u nb=%lu\n",
+ atomic_read(&net->n_stores),
+ atomic_long_read(&net->n_store_bytes));
+ return 0;
+}
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index ad1328d85526..e065bc0768e6 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -21,7 +21,7 @@
/*
* Initialise a filesystem server cursor for iterating over FS servers.
*/
-void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode)
+static void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode)
{
memset(fc, 0, sizeof(*fc));
}
@@ -179,7 +179,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
*/
if (fc->flags & AFS_FS_CURSOR_VNOVOL) {
fc->ac.error = -EREMOTEIO;
- goto failed;
+ goto next_server;
}
write_lock(&vnode->volume->servers_lock);
@@ -201,7 +201,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
*/
if (vnode->volume->servers == fc->server_list) {
fc->ac.error = -EREMOTEIO;
- goto failed;
+ goto next_server;
}
/* Try again */
@@ -350,8 +350,8 @@ use_server:
* break request before we've finished decoding the reply and
* installing the vnode.
*/
- fc->ac.error = afs_register_server_cb_interest(
- vnode, &fc->server_list->servers[fc->index]);
+ fc->ac.error = afs_register_server_cb_interest(vnode, fc->server_list,
+ fc->index);
if (fc->ac.error < 0)
goto failed;
@@ -369,8 +369,16 @@ use_server:
if (!test_bit(AFS_SERVER_FL_PROBED, &server->flags)) {
fc->ac.alist = afs_get_addrlist(alist);
- if (!afs_probe_fileserver(fc))
- goto failed;
+ if (!afs_probe_fileserver(fc)) {
+ switch (fc->ac.error) {
+ case -ENOMEM:
+ case -ERESTARTSYS:
+ case -EINTR:
+ goto failed;
+ default:
+ goto next_server;
+ }
+ }
}
if (!fc->ac.alist)
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index f7ae54b6a393..08735948f15d 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -41,6 +41,7 @@ int afs_open_socket(struct afs_net *net)
{
struct sockaddr_rxrpc srx;
struct socket *socket;
+ unsigned int min_level;
int ret;
_enter("");
@@ -60,6 +61,12 @@ int afs_open_socket(struct afs_net *net)
srx.transport.sin6.sin6_family = AF_INET6;
srx.transport.sin6.sin6_port = htons(AFS_CM_PORT);
+ min_level = RXRPC_SECURITY_ENCRYPT;
+ ret = kernel_setsockopt(socket, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL,
+ (void *)&min_level, sizeof(min_level));
+ if (ret < 0)
+ goto error_2;
+
ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
if (ret == -EADDRINUSE) {
srx.transport.sin6.sin6_port = 0;
@@ -482,8 +489,12 @@ static void afs_deliver_to_call(struct afs_call *call)
state = READ_ONCE(call->state);
switch (ret) {
case 0:
- if (state == AFS_CALL_CL_PROC_REPLY)
+ if (state == AFS_CALL_CL_PROC_REPLY) {
+ if (call->cbi)
+ set_bit(AFS_SERVER_FL_MAY_HAVE_CB,
+ &call->cbi->server->flags);
goto call_complete;
+ }
ASSERTCMP(state, >, AFS_CALL_CL_PROC_REPLY);
goto done;
case -EINPROGRESS:
@@ -493,11 +504,6 @@ static void afs_deliver_to_call(struct afs_call *call)
case -ECONNABORTED:
ASSERTCMP(state, ==, AFS_CALL_COMPLETE);
goto done;
- case -ENOTCONN:
- abort_code = RX_CALL_DEAD;
- rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
- abort_code, ret, "KNC");
- goto local_abort;
case -ENOTSUPP:
abort_code = RXGEN_OPCODE;
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
@@ -926,3 +932,12 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count,
afs_set_call_complete(call, ret, remote_abort);
return ret;
}
+
+/*
+ * Log protocol error production.
+ */
+noinline int afs_protocol_error(struct afs_call *call, int error)
+{
+ trace_afs_protocol_error(call, error, __builtin_return_address(0));
+ return error;
+}
diff --git a/fs/afs/security.c b/fs/afs/security.c
index b88b7d45fdaa..81dfedb7879f 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -147,8 +147,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
break;
}
- if (cb_break != (vnode->cb_break +
- vnode->cb_interest->server->cb_s_break)) {
+ if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest)) {
changed = true;
break;
}
@@ -178,18 +177,14 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
}
}
- if (cb_break != (vnode->cb_break + vnode->cb_interest->server->cb_s_break)) {
- rcu_read_unlock();
+ if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest))
goto someone_else_changed_it;
- }
/* We need a ref on any permits list we want to copy as we'll have to
* drop the lock to do memory allocation.
*/
- if (permits && !refcount_inc_not_zero(&permits->usage)) {
- rcu_read_unlock();
+ if (permits && !refcount_inc_not_zero(&permits->usage))
goto someone_else_changed_it;
- }
rcu_read_unlock();
@@ -261,7 +256,7 @@ found:
spin_lock(&vnode->lock);
zap = rcu_access_pointer(vnode->permit_cache);
- if (cb_break == (vnode->cb_break + vnode->cb_interest->server->cb_s_break) &&
+ if (cb_break == afs_cb_break_sum(vnode, vnode->cb_interest) &&
zap == permits)
rcu_assign_pointer(vnode->permit_cache, replacement);
else
@@ -278,6 +273,7 @@ someone_else_changed_it:
/* Someone else changed the cache under us - don't recheck at this
* time.
*/
+ rcu_read_unlock();
return;
}
@@ -296,8 +292,6 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key,
_enter("{%x:%u},%x",
vnode->fid.vid, vnode->fid.vnode, key_serial(key));
- permits = vnode->permit_cache;
-
/* check the permits to see if we've got one yet */
if (key == vnode->volume->cell->anonymous_key) {
_debug("anon");
@@ -327,7 +321,7 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key,
*/
_debug("no valid permit");
- ret = afs_fetch_status(vnode, key);
+ ret = afs_fetch_status(vnode, key, false);
if (ret < 0) {
*_access = 0;
_leave(" = %d", ret);
@@ -378,18 +372,14 @@ int afs_permission(struct inode *inode, int mask)
mask, access, S_ISDIR(inode->i_mode) ? "dir" : "file");
if (S_ISDIR(inode->i_mode)) {
- if (mask & MAY_EXEC) {
- if (!(access & AFS_ACE_LOOKUP))
- goto permission_denied;
- } else if (mask & MAY_READ) {
+ if (mask & (MAY_EXEC | MAY_READ | MAY_CHDIR)) {
if (!(access & AFS_ACE_LOOKUP))
goto permission_denied;
- } else if (mask & MAY_WRITE) {
+ }
+ if (mask & MAY_WRITE) {
if (!(access & (AFS_ACE_DELETE | /* rmdir, unlink, rename from */
AFS_ACE_INSERT))) /* create, mkdir, symlink, rename to */
goto permission_denied;
- } else {
- BUG();
}
} else {
if (!(access & AFS_ACE_LOOKUP))
diff --git a/fs/afs/server.c b/fs/afs/server.c
index a43ef77dabae..3af4625e2f8c 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -59,19 +59,14 @@ struct afs_server *afs_find_server(struct afs_net *net,
alist = rcu_dereference(server->addresses);
for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
b = &alist->addrs[i].transport.sin6;
- diff = (u16)a->sin6_port - (u16)b->sin6_port;
+ diff = ((u16 __force)a->sin6_port -
+ (u16 __force)b->sin6_port);
if (diff == 0)
diff = memcmp(&a->sin6_addr,
&b->sin6_addr,
sizeof(struct in6_addr));
if (diff == 0)
goto found;
- if (diff < 0) {
- // TODO: Sort the list
- //if (i == alist->nr_ipv4)
- // goto not_found;
- break;
- }
}
}
} else {
@@ -79,23 +74,17 @@ struct afs_server *afs_find_server(struct afs_net *net,
alist = rcu_dereference(server->addresses);
for (i = 0; i < alist->nr_ipv4; i++) {
b = &alist->addrs[i].transport.sin6;
- diff = (u16)a->sin6_port - (u16)b->sin6_port;
+ diff = ((u16 __force)a->sin6_port -
+ (u16 __force)b->sin6_port);
if (diff == 0)
- diff = ((u32)a->sin6_addr.s6_addr32[3] -
- (u32)b->sin6_addr.s6_addr32[3]);
+ diff = ((u32 __force)a->sin6_addr.s6_addr32[3] -
+ (u32 __force)b->sin6_addr.s6_addr32[3]);
if (diff == 0)
goto found;
- if (diff < 0) {
- // TODO: Sort the list
- //if (i == 0)
- // goto not_found;
- break;
- }
}
}
}
- //not_found:
server = NULL;
found:
if (server && !atomic_inc_not_zero(&server->usage))
@@ -381,7 +370,7 @@ static void afs_server_rcu(struct rcu_head *rcu)
{
struct afs_server *server = container_of(rcu, struct afs_server, rcu);
- afs_put_addrlist(server->addresses);
+ afs_put_addrlist(rcu_access_pointer(server->addresses));
kfree(server);
}
@@ -390,17 +379,19 @@ static void afs_server_rcu(struct rcu_head *rcu)
*/
static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
{
- struct afs_addr_list *alist = server->addresses;
+ struct afs_addr_list *alist = rcu_access_pointer(server->addresses);
struct afs_addr_cursor ac = {
.alist = alist,
- .addr = &alist->addrs[0],
.start = alist->index,
- .index = alist->index,
+ .index = 0,
+ .addr = &alist->addrs[alist->index],
.error = 0,
};
_enter("%p", server);
- afs_fs_give_up_all_callbacks(net, server, &ac, NULL);
+ if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
+ afs_fs_give_up_all_callbacks(net, server, &ac, NULL);
+
call_rcu(&server->rcu, afs_server_rcu);
afs_dec_servers_outstanding(net);
}
@@ -426,8 +417,15 @@ static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list)
}
write_sequnlock(&net->fs_lock);
- if (deleted)
+ if (deleted) {
+ write_seqlock(&net->fs_addr_lock);
+ if (!hlist_unhashed(&server->addr4_link))
+ hlist_del_rcu(&server->addr4_link);
+ if (!hlist_unhashed(&server->addr6_link))
+ hlist_del_rcu(&server->addr6_link);
+ write_sequnlock(&net->fs_addr_lock);
afs_destroy_server(net, server);
+ }
}
}
diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c
index 0f8dc4c8f07c..8a5760aa5832 100644
--- a/fs/afs/server_list.c
+++ b/fs/afs/server_list.c
@@ -49,6 +49,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
goto error;
refcount_set(&slist->usage, 1);
+ rwlock_init(&slist->lock);
/* Make sure a records exists for each server in the list. */
for (i = 0; i < vldb->nr_servers; i++) {
@@ -64,9 +65,11 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
goto error_2;
}
- /* Insertion-sort by server pointer */
+ /* Insertion-sort by UUID */
for (j = 0; j < slist->nr_servers; j++)
- if (slist->servers[j].server >= server)
+ if (memcmp(&slist->servers[j].server->uuid,
+ &server->uuid,
+ sizeof(server->uuid)) >= 0)
break;
if (j < slist->nr_servers) {
if (slist->servers[j].server == server) {
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 3623c952b6ff..9e5d7966621c 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -154,7 +154,7 @@ static int afs_show_devname(struct seq_file *m, struct dentry *root)
seq_puts(m, "none");
return 0;
}
-
+
switch (volume->type) {
case AFSVL_RWVOL:
break;
@@ -269,7 +269,7 @@ static int afs_parse_device_name(struct afs_mount_params *params,
int cellnamesz;
_enter(",%s", name);
-
+
if (!name) {
printk(KERN_ERR "kAFS: no volume name specified\n");
return -EINVAL;
@@ -418,7 +418,10 @@ static int afs_fill_super(struct super_block *sb,
if (!sb->s_root)
goto error;
- sb->s_d_op = &afs_fs_dentry_operations;
+ if (params->dyn_root)
+ sb->s_d_op = &afs_dynroot_dentry_operations;
+ else
+ sb->s_d_op = &afs_fs_dentry_operations;
_leave(" = 0");
return 0;
@@ -587,7 +590,7 @@ static void afs_i_init_once(void *_vnode)
memset(vnode, 0, sizeof(*vnode));
inode_init_once(&vnode->vfs_inode);
mutex_init(&vnode->io_lock);
- mutex_init(&vnode->validate_lock);
+ init_rwsem(&vnode->validate_lock);
spin_lock_init(&vnode->wb_lock);
spin_lock_init(&vnode->lock);
INIT_LIST_HEAD(&vnode->wb_keys);
@@ -676,7 +679,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bfree = 0;
return 0;
}
-
+
key = afs_request_key(vnode->volume->cell);
if (IS_ERR(key))
return PTR_ERR(key);
@@ -685,7 +688,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf)
if (afs_begin_vnode_operation(&fc, vnode, key)) {
fc.flags |= AFS_FS_CURSOR_NO_VSLEEP;
while (afs_select_fileserver(&fc)) {
- fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+ fc.cb_break = afs_calc_vnode_cb_break(vnode);
afs_fs_get_volume_status(&fc, &vs);
}
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 5d8562f1ad4a..c3b740813fc7 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -23,7 +23,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
struct afs_uvldbentry__xdr *uvldb;
struct afs_vldb_entry *entry;
bool new_only = false;
- u32 tmp, nr_servers;
+ u32 tmp, nr_servers, vlflags;
int i, ret;
_enter("");
@@ -55,6 +55,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
new_only = true;
}
+ vlflags = ntohl(uvldb->flags);
for (i = 0; i < nr_servers; i++) {
struct afs_uuid__xdr *xdr;
struct afs_uuid *uuid;
@@ -64,12 +65,13 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
if (tmp & AFS_VLSF_DONTUSE ||
(new_only && !(tmp & AFS_VLSF_NEWREPSITE)))
continue;
- if (tmp & AFS_VLSF_RWVOL)
+ if (tmp & AFS_VLSF_RWVOL) {
entry->fs_mask[i] |= AFS_VOL_VTM_RW;
+ if (vlflags & AFS_VLF_BACKEXISTS)
+ entry->fs_mask[i] |= AFS_VOL_VTM_BAK;
+ }
if (tmp & AFS_VLSF_ROVOL)
entry->fs_mask[i] |= AFS_VOL_VTM_RO;
- if (tmp & AFS_VLSF_BACKVOL)
- entry->fs_mask[i] |= AFS_VOL_VTM_BAK;
if (!entry->fs_mask[i])
continue;
@@ -89,15 +91,14 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
for (i = 0; i < AFS_MAXTYPES; i++)
entry->vid[i] = ntohl(uvldb->volumeId[i]);
- tmp = ntohl(uvldb->flags);
- if (tmp & AFS_VLF_RWEXISTS)
+ if (vlflags & AFS_VLF_RWEXISTS)
__set_bit(AFS_VLDB_HAS_RW, &entry->flags);
- if (tmp & AFS_VLF_ROEXISTS)
+ if (vlflags & AFS_VLF_ROEXISTS)
__set_bit(AFS_VLDB_HAS_RO, &entry->flags);
- if (tmp & AFS_VLF_BACKEXISTS)
+ if (vlflags & AFS_VLF_BACKEXISTS)
__set_bit(AFS_VLDB_HAS_BAK, &entry->flags);
- if (!(tmp & (AFS_VLF_RWEXISTS | AFS_VLF_ROEXISTS | AFS_VLF_BACKEXISTS))) {
+ if (!(vlflags & (AFS_VLF_RWEXISTS | AFS_VLF_ROEXISTS | AFS_VLF_BACKEXISTS))) {
entry->error = -ENOMEDIUM;
__set_bit(AFS_VLDB_QUERY_ERROR, &entry->flags);
}
@@ -303,7 +304,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net,
r->uuid.clock_seq_hi_and_reserved = htonl(u->clock_seq_hi_and_reserved);
r->uuid.clock_seq_low = htonl(u->clock_seq_low);
for (i = 0; i < 6; i++)
- r->uuid.node[i] = ntohl(u->node[i]);
+ r->uuid.node[i] = htonl(u->node[i]);
trace_afs_make_vl_call(call);
return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false);
@@ -450,7 +451,7 @@ again:
call->count2 = ntohl(*bp); /* Type or next count */
if (call->count > YFS_MAXENDPOINTS)
- return -EBADMSG;
+ return afs_protocol_error(call, -EBADMSG);
alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT);
if (!alist)
@@ -474,7 +475,7 @@ again:
size = sizeof(__be32) * (1 + 4 + 1);
break;
default:
- return -EBADMSG;
+ return afs_protocol_error(call, -EBADMSG);
}
size += sizeof(__be32);
@@ -487,24 +488,24 @@ again:
switch (call->count2) {
case YFS_ENDPOINT_IPV4:
if (ntohl(bp[0]) != sizeof(__be32) * 2)
- return -EBADMSG;
+ return afs_protocol_error(call, -EBADMSG);
afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2]));
bp += 3;
break;
case YFS_ENDPOINT_IPV6:
if (ntohl(bp[0]) != sizeof(__be32) * 5)
- return -EBADMSG;
+ return afs_protocol_error(call, -EBADMSG);
afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5]));
bp += 6;
break;
default:
- return -EBADMSG;
+ return afs_protocol_error(call, -EBADMSG);
}
/* Got either the type of the next entry or the count of
* volEndpoints if no more fsEndpoints.
*/
- call->count2 = htonl(*bp++);
+ call->count2 = ntohl(*bp++);
call->offset = 0;
call->count--;
@@ -517,7 +518,7 @@ again:
if (!call->count)
goto end;
if (call->count > YFS_MAXENDPOINTS)
- return -EBADMSG;
+ return afs_protocol_error(call, -EBADMSG);
call->unmarshall = 3;
@@ -531,7 +532,7 @@ again:
return ret;
bp = call->buffer;
- call->count2 = htonl(*bp++);
+ call->count2 = ntohl(*bp++);
call->offset = 0;
call->unmarshall = 4;
@@ -545,7 +546,7 @@ again:
size = sizeof(__be32) * (1 + 4 + 1);
break;
default:
- return -EBADMSG;
+ return afs_protocol_error(call, -EBADMSG);
}
if (call->count > 1)
@@ -558,16 +559,16 @@ again:
switch (call->count2) {
case YFS_ENDPOINT_IPV4:
if (ntohl(bp[0]) != sizeof(__be32) * 2)
- return -EBADMSG;
+ return afs_protocol_error(call, -EBADMSG);
bp += 3;
break;
case YFS_ENDPOINT_IPV6:
if (ntohl(bp[0]) != sizeof(__be32) * 5)
- return -EBADMSG;
+ return afs_protocol_error(call, -EBADMSG);
bp += 6;
break;
default:
- return -EBADMSG;
+ return afs_protocol_error(call, -EBADMSG);
}
/* Got either the type of the next entry or the count of
@@ -576,7 +577,7 @@ again:
call->offset = 0;
call->count--;
if (call->count > 0) {
- call->count2 = htonl(*bp++);
+ call->count2 = ntohl(*bp++);
goto again;
}
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index b517a588781f..3037bd01f617 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -225,7 +225,9 @@ void afs_activate_volume(struct afs_volume *volume)
#ifdef CONFIG_AFS_FSCACHE
volume->cache = fscache_acquire_cookie(volume->cell->cache,
&afs_volume_cache_index_def,
- volume, true);
+ &volume->vid, sizeof(volume->vid),
+ NULL, 0,
+ volume, 0, true);
#endif
write_lock(&volume->cell->proc_lock);
@@ -245,7 +247,7 @@ void afs_deactivate_volume(struct afs_volume *volume)
write_unlock(&volume->cell->proc_lock);
#ifdef CONFIG_AFS_FSCACHE
- fscache_relinquish_cookie(volume->cache,
+ fscache_relinquish_cookie(volume->cache, NULL,
test_bit(AFS_VOLUME_DELETED, &volume->flags));
volume->cache = NULL;
#endif
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 9370e2feb999..8b39e6ebb40b 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -42,10 +42,11 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
if (!req)
return -ENOMEM;
- atomic_set(&req->usage, 1);
+ refcount_set(&req->usage, 1);
req->pos = pos;
req->len = len;
req->nr_pages = 1;
+ req->pages = req->array;
req->pages[0] = page;
get_page(page);
@@ -124,7 +125,12 @@ try_again:
page->index, priv);
goto flush_conflicting_write;
}
- if (to < f || from > t)
+ /* If the file is being filled locally, allow inter-write
+ * spaces to be merged into writes. If it's not, only write
+ * back what the user gives us.
+ */
+ if (!test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags) &&
+ (to < f || from > t))
goto flush_conflicting_write;
if (from < f)
f = from;
@@ -345,7 +351,7 @@ found_key:
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, vnode, wbk->key)) {
while (afs_select_fileserver(&fc)) {
- fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+ fc.cb_break = afs_calc_vnode_cb_break(vnode);
afs_fs_store_data(&fc, mapping, first, last, offset, to);
}
@@ -355,6 +361,12 @@ found_key:
}
switch (ret) {
+ case 0:
+ afs_stat_v(vnode, n_stores);
+ atomic_long_add((last * PAGE_SIZE + to) -
+ (first * PAGE_SIZE + offset),
+ &afs_v2net(vnode)->n_store_bytes);
+ break;
case -EACCES:
case -EPERM:
case -ENOKEY:
@@ -412,7 +424,8 @@ static int afs_write_back_from_locked_page(struct address_space *mapping,
trace_afs_page_dirty(vnode, tracepoint_string("WARN"),
primary_page->index, priv);
- if (start >= final_page || to < PAGE_SIZE)
+ if (start >= final_page ||
+ (to < PAGE_SIZE && !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)))
goto no_more;
start++;
@@ -433,9 +446,10 @@ static int afs_write_back_from_locked_page(struct address_space *mapping,
}
for (loop = 0; loop < n; loop++) {
- if (to != PAGE_SIZE)
- break;
page = pages[loop];
+ if (to != PAGE_SIZE &&
+ !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags))
+ break;
if (page->index > final_page)
break;
if (!trylock_page(page))
@@ -448,7 +462,8 @@ static int afs_write_back_from_locked_page(struct address_space *mapping,
priv = page_private(page);
f = priv & AFS_PRIV_MAX;
t = priv >> AFS_PRIV_SHIFT;
- if (f != 0) {
+ if (f != 0 &&
+ !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) {
unlock_page(page);
break;
}
@@ -570,10 +585,11 @@ static int afs_writepages_region(struct address_space *mapping,
_debug("wback %lx", page->index);
- /* at this point we hold neither mapping->tree_lock nor lock on
- * the page itself: the page may be truncated or invalidated
- * (changing page->mapping to NULL), or even swizzled back from
- * swapper_space to tmpfs file mapping
+ /*
+ * at this point we hold neither the i_pages lock nor the
+ * page lock: the page may be truncated or invalidated
+ * (changing page->mapping to NULL), or even swizzled
+ * back from swapper_space to tmpfs file mapping
*/
ret = lock_page_killable(page);
if (ret < 0) {
@@ -734,20 +750,6 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
}
/*
- * Flush out all outstanding writes on a file opened for writing when it is
- * closed.
- */
-int afs_flush(struct file *file, fl_owner_t id)
-{
- _enter("");
-
- if ((file->f_mode & FMODE_WRITE) == 0)
- return 0;
-
- return vfs_fsync(file, 0);
-}
-
-/*
* notification that a previously read-only page is about to become writable
* - if it returns an error, the caller will deliver a bus error signal
*/
diff --git a/fs/afs/xdr_fs.h b/fs/afs/xdr_fs.h
new file mode 100644
index 000000000000..aa21f3068d52
--- /dev/null
+++ b/fs/afs/xdr_fs.h
@@ -0,0 +1,103 @@
+/* AFS fileserver XDR types
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef XDR_FS_H
+#define XDR_FS_H
+
+struct afs_xdr_AFSFetchStatus {
+ __be32 if_version;
+#define AFS_FSTATUS_VERSION 1
+ __be32 type;
+ __be32 nlink;
+ __be32 size_lo;
+ __be32 data_version_lo;
+ __be32 author;
+ __be32 owner;
+ __be32 caller_access;
+ __be32 anon_access;
+ __be32 mode;
+ __be32 parent_vnode;
+ __be32 parent_unique;
+ __be32 seg_size;
+ __be32 mtime_client;
+ __be32 mtime_server;
+ __be32 group;
+ __be32 sync_counter;
+ __be32 data_version_hi;
+ __be32 lock_count;
+ __be32 size_hi;
+ __be32 abort_code;
+} __packed;
+
+#define AFS_DIR_HASHTBL_SIZE 128
+#define AFS_DIR_DIRENT_SIZE 32
+#define AFS_DIR_SLOTS_PER_BLOCK 64
+#define AFS_DIR_BLOCK_SIZE 2048
+#define AFS_DIR_BLOCKS_PER_PAGE (PAGE_SIZE / AFS_DIR_BLOCK_SIZE)
+#define AFS_DIR_MAX_SLOTS 65536
+#define AFS_DIR_BLOCKS_WITH_CTR 128
+#define AFS_DIR_MAX_BLOCKS 1023
+#define AFS_DIR_RESV_BLOCKS 1
+#define AFS_DIR_RESV_BLOCKS0 13
+
+/*
+ * Directory entry structure.
+ */
+union afs_xdr_dirent {
+ struct {
+ u8 valid;
+ u8 unused[1];
+ __be16 hash_next;
+ __be32 vnode;
+ __be32 unique;
+ u8 name[16];
+ u8 overflow[4]; /* if any char of the name (inc
+ * NUL) reaches here, consume
+ * the next dirent too */
+ } u;
+ u8 extended_name[32];
+} __packed;
+
+/*
+ * Directory block header (one at the beginning of every 2048-byte block).
+ */
+struct afs_xdr_dir_hdr {
+ __be16 npages;
+ __be16 magic;
+#define AFS_DIR_MAGIC htons(1234)
+ u8 reserved;
+ u8 bitmap[8];
+ u8 pad[19];
+} __packed;
+
+/*
+ * Directory block layout
+ */
+union afs_xdr_dir_block {
+ struct afs_xdr_dir_hdr hdr;
+
+ struct {
+ struct afs_xdr_dir_hdr hdr;
+ u8 alloc_ctrs[AFS_DIR_MAX_BLOCKS];
+ __be16 hashtable[AFS_DIR_HASHTBL_SIZE];
+ } meta;
+
+ union afs_xdr_dirent dirents[AFS_DIR_SLOTS_PER_BLOCK];
+} __packed;
+
+/*
+ * Directory layout on a linux VM page.
+ */
+struct afs_xdr_dir_page {
+ union afs_xdr_dir_block blocks[AFS_DIR_BLOCKS_PER_PAGE];
+};
+
+#endif /* XDR_FS_H */
diff --git a/fs/aio.c b/fs/aio.c
index 88d7927ffbc6..b850e92ee0d5 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -5,6 +5,7 @@
* Implements an efficient asynchronous io interface.
*
* Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright 2018 Christoph Hellwig.
*
* See ../COPYING for licensing terms.
*/
@@ -46,6 +47,8 @@
#include "internal.h"
+#define KIOCB_KEY 0
+
#define AIO_RING_MAGIC 0xa10a10a1
#define AIO_RING_COMPAT_FEATURES 1
#define AIO_RING_INCOMPAT_FEATURES 0
@@ -156,21 +159,29 @@ struct kioctx {
unsigned id;
};
-/*
- * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
- * cancelled or completed (this makes a certain amount of sense because
- * successful cancellation - io_cancel() - does deliver the completion to
- * userspace).
- *
- * And since most things don't implement kiocb cancellation and we'd really like
- * kiocb completion to be lockless when possible, we use ki_cancel to
- * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
- * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
- */
-#define KIOCB_CANCELLED ((void *) (~0ULL))
+struct fsync_iocb {
+ struct work_struct work;
+ struct file *file;
+ bool datasync;
+};
+
+struct poll_iocb {
+ struct file *file;
+ __poll_t events;
+ struct wait_queue_head *head;
+
+ union {
+ struct wait_queue_entry wait;
+ struct work_struct work;
+ };
+};
struct aio_kiocb {
- struct kiocb common;
+ union {
+ struct kiocb rw;
+ struct fsync_iocb fsync;
+ struct poll_iocb poll;
+ };
struct kioctx *ki_ctx;
kiocb_cancel_fn *ki_cancel;
@@ -264,9 +275,6 @@ static int __init aio_setup(void)
kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
-
- pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
-
return 0;
}
__initcall(aio_setup);
@@ -552,42 +560,20 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
{
- struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common);
+ struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
struct kioctx *ctx = req->ki_ctx;
unsigned long flags;
- spin_lock_irqsave(&ctx->ctx_lock, flags);
-
- if (!req->ki_list.next)
- list_add(&req->ki_list, &ctx->active_reqs);
+ if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
+ return;
+ spin_lock_irqsave(&ctx->ctx_lock, flags);
+ list_add_tail(&req->ki_list, &ctx->active_reqs);
req->ki_cancel = cancel;
-
spin_unlock_irqrestore(&ctx->ctx_lock, flags);
}
EXPORT_SYMBOL(kiocb_set_cancel_fn);
-static int kiocb_cancel(struct aio_kiocb *kiocb)
-{
- kiocb_cancel_fn *old, *cancel;
-
- /*
- * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
- * actually has a cancel function, hence the cmpxchg()
- */
-
- cancel = READ_ONCE(kiocb->ki_cancel);
- do {
- if (!cancel || cancel == KIOCB_CANCELLED)
- return -EINVAL;
-
- old = cancel;
- cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
- } while (cancel != old);
-
- return cancel(&kiocb->common);
-}
-
/*
* free_ioctx() should be RCU delayed to synchronize against the RCU
* protected lookup_ioctx() and also needs process context to call
@@ -634,9 +620,8 @@ static void free_ioctx_users(struct percpu_ref *ref)
while (!list_empty(&ctx->active_reqs)) {
req = list_first_entry(&ctx->active_reqs,
struct aio_kiocb, ki_list);
-
+ req->ki_cancel(&req->rw);
list_del_init(&req->ki_list);
- kiocb_cancel(req);
}
spin_unlock_irq(&ctx->ctx_lock);
@@ -1042,7 +1027,7 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
goto out_put;
percpu_ref_get(&ctx->reqs);
-
+ INIT_LIST_HEAD(&req->ki_list);
req->ki_ctx = ctx;
return req;
out_put:
@@ -1050,15 +1035,6 @@ out_put:
return NULL;
}
-static void kiocb_free(struct aio_kiocb *req)
-{
- if (req->common.ki_filp)
- fput(req->common.ki_filp);
- if (req->ki_eventfd != NULL)
- eventfd_ctx_put(req->ki_eventfd);
- kmem_cache_free(kiocb_cachep, req);
-}
-
static struct kioctx *lookup_ioctx(unsigned long ctx_id)
{
struct aio_ring __user *ring = (void __user *)ctx_id;
@@ -1078,8 +1054,8 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
ctx = rcu_dereference(table->table[id]);
if (ctx && ctx->user_id == ctx_id) {
- percpu_ref_get(&ctx->users);
- ret = ctx;
+ if (percpu_ref_tryget_live(&ctx->users))
+ ret = ctx;
}
out:
rcu_read_unlock();
@@ -1089,44 +1065,14 @@ out:
/* aio_complete
* Called when the io request on the given iocb is complete.
*/
-static void aio_complete(struct kiocb *kiocb, long res, long res2)
+static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
{
- struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common);
struct kioctx *ctx = iocb->ki_ctx;
struct aio_ring *ring;
struct io_event *ev_page, *event;
unsigned tail, pos, head;
unsigned long flags;
- if (kiocb->ki_flags & IOCB_WRITE) {
- struct file *file = kiocb->ki_filp;
-
- /*
- * Tell lockdep we inherited freeze protection from submission
- * thread.
- */
- if (S_ISREG(file_inode(file)->i_mode))
- __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE);
- file_end_write(file);
- }
-
- /*
- * Special case handling for sync iocbs:
- * - events go directly into the iocb for fast handling
- * - the sync task with the iocb in its stack holds the single iocb
- * ref, no other paths have a way to get another ref
- * - the sync task helpfully left a reference to itself in the iocb
- */
- BUG_ON(is_sync_kiocb(kiocb));
-
- if (iocb->ki_list.next) {
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->ctx_lock, flags);
- list_del(&iocb->ki_list);
- spin_unlock_irqrestore(&ctx->ctx_lock, flags);
- }
-
/*
* Add a completion event to the ring buffer. Must be done holding
* ctx->completion_lock to prevent other code from messing with the tail
@@ -1180,11 +1126,12 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2)
* eventfd. The eventfd_signal() function is safe to be called
* from IRQ context.
*/
- if (iocb->ki_eventfd != NULL)
+ if (iocb->ki_eventfd) {
eventfd_signal(iocb->ki_eventfd, 1);
+ eventfd_ctx_put(iocb->ki_eventfd);
+ }
- /* everything turned out well, dispose of the aiocb. */
- kiocb_free(iocb);
+ kmem_cache_free(kiocb_cachep, iocb);
/*
* We have to order our ring_info tail store above and test
@@ -1250,14 +1197,13 @@ static long aio_read_events_ring(struct kioctx *ctx,
if (head == tail)
break;
- avail = min(avail, nr - ret);
- avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -
- ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));
-
pos = head + AIO_EVENTS_OFFSET;
page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
pos %= AIO_EVENTS_PER_PAGE;
+ avail = min(avail, nr - ret);
+ avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos);
+
ev = kmap(page);
copy_ret = copy_to_user(event + ret, ev + pos,
sizeof(*ev) * avail);
@@ -1328,10 +1274,6 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
wait_event_interruptible_hrtimeout(ctx->wait,
aio_read_events(ctx, min_nr, nr, event, &ret),
until);
-
- if (!ret && signal_pending(current))
- ret = -EINTR;
-
return ret;
}
@@ -1447,6 +1389,58 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
return -EINVAL;
}
+static void aio_remove_iocb(struct aio_kiocb *iocb)
+{
+ struct kioctx *ctx = iocb->ki_ctx;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->ctx_lock, flags);
+ list_del(&iocb->ki_list);
+ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+}
+
+static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
+{
+ struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);
+
+ if (!list_empty_careful(&iocb->ki_list))
+ aio_remove_iocb(iocb);
+
+ if (kiocb->ki_flags & IOCB_WRITE) {
+ struct inode *inode = file_inode(kiocb->ki_filp);
+
+ /*
+ * Tell lockdep we inherited freeze protection from submission
+ * thread.
+ */
+ if (S_ISREG(inode->i_mode))
+ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
+ file_end_write(kiocb->ki_filp);
+ }
+
+ fput(kiocb->ki_filp);
+ aio_complete(iocb, res, res2);
+}
+
+static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
+{
+ int ret;
+
+ req->ki_filp = fget(iocb->aio_fildes);
+ if (unlikely(!req->ki_filp))
+ return -EBADF;
+ req->ki_complete = aio_complete_rw;
+ req->ki_pos = iocb->aio_offset;
+ req->ki_flags = iocb_flags(req->ki_filp);
+ if (iocb->aio_flags & IOCB_FLAG_RESFD)
+ req->ki_flags |= IOCB_EVENTFD;
+ req->ki_hint = file_write_hint(req->ki_filp);
+ ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
+ if (unlikely(ret))
+ fput(req->ki_filp);
+ return ret;
+}
+
static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
bool vectored, bool compat, struct iov_iter *iter)
{
@@ -1466,11 +1460,11 @@ static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter);
}
-static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret)
+static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
{
switch (ret) {
case -EIOCBQUEUED:
- return ret;
+ break;
case -ERESTARTSYS:
case -ERESTARTNOINTR:
case -ERESTARTNOHAND:
@@ -1482,85 +1476,270 @@ static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret)
ret = -EINTR;
/*FALLTHRU*/
default:
- aio_complete(req, ret, 0);
- return 0;
+ aio_complete_rw(req, ret, 0);
}
}
static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored,
bool compat)
{
- struct file *file = req->ki_filp;
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
+ struct file *file;
ssize_t ret;
+ ret = aio_prep_rw(req, iocb);
+ if (ret)
+ return ret;
+ file = req->ki_filp;
+
+ ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_READ)))
- return -EBADF;
+ goto out_fput;
+ ret = -EINVAL;
if (unlikely(!file->f_op->read_iter))
- return -EINVAL;
+ goto out_fput;
ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
if (ret)
- return ret;
+ goto out_fput;
ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret)
- ret = aio_ret(req, call_read_iter(file, req, &iter));
+ aio_rw_done(req, call_read_iter(file, req, &iter));
kfree(iovec);
+out_fput:
+ if (unlikely(ret))
+ fput(file);
return ret;
}
static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
bool compat)
{
- struct file *file = req->ki_filp;
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
+ struct file *file;
ssize_t ret;
+ ret = aio_prep_rw(req, iocb);
+ if (ret)
+ return ret;
+ file = req->ki_filp;
+
+ ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_WRITE)))
- return -EBADF;
+ goto out_fput;
+ ret = -EINVAL;
if (unlikely(!file->f_op->write_iter))
- return -EINVAL;
+ goto out_fput;
ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
if (ret)
- return ret;
+ goto out_fput;
ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret) {
- req->ki_flags |= IOCB_WRITE;
- file_start_write(file);
- ret = aio_ret(req, call_write_iter(file, req, &iter));
/*
- * We release freeze protection in aio_complete(). Fool lockdep
- * by telling it the lock got released so that it doesn't
- * complain about held lock when we return to userspace.
+ * Open-code file_start_write here to grab freeze protection,
+ * which will be released by another thread in
+ * aio_complete_rw(). Fool lockdep by telling it the lock got
+ * released so that it doesn't complain about the held lock when
+ * we return to userspace.
*/
- if (S_ISREG(file_inode(file)->i_mode))
+ if (S_ISREG(file_inode(file)->i_mode)) {
+ __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true);
__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
+ }
+ req->ki_flags |= IOCB_WRITE;
+ aio_rw_done(req, call_write_iter(file, req, &iter));
}
kfree(iovec);
+out_fput:
+ if (unlikely(ret))
+ fput(file);
return ret;
}
+static void aio_fsync_work(struct work_struct *work)
+{
+ struct fsync_iocb *req = container_of(work, struct fsync_iocb, work);
+ int ret;
+
+ ret = vfs_fsync(req->file, req->datasync);
+ fput(req->file);
+ aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
+}
+
+static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
+{
+ if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
+ iocb->aio_rw_flags))
+ return -EINVAL;
+ req->file = fget(iocb->aio_fildes);
+ if (unlikely(!req->file))
+ return -EBADF;
+ if (unlikely(!req->file->f_op->fsync)) {
+ fput(req->file);
+ return -EINVAL;
+ }
+
+ req->datasync = datasync;
+ INIT_WORK(&req->work, aio_fsync_work);
+ schedule_work(&req->work);
+ return 0;
+}
+
+/* need to use list_del_init so we can check if item was present */
+static inline bool __aio_poll_remove(struct poll_iocb *req)
+{
+ if (list_empty(&req->wait.entry))
+ return false;
+ list_del_init(&req->wait.entry);
+ return true;
+}
+
+static inline void __aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
+{
+ fput(iocb->poll.file);
+ aio_complete(iocb, mangle_poll(mask), 0);
+}
+
+static void aio_poll_work(struct work_struct *work)
+{
+ struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, poll.work);
+
+ if (!list_empty_careful(&iocb->ki_list))
+ aio_remove_iocb(iocb);
+ __aio_poll_complete(iocb, iocb->poll.events);
+}
+
+static int aio_poll_cancel(struct kiocb *iocb)
+{
+ struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
+ struct poll_iocb *req = &aiocb->poll;
+ struct wait_queue_head *head = req->head;
+ bool found = false;
+
+ spin_lock(&head->lock);
+ found = __aio_poll_remove(req);
+ spin_unlock(&head->lock);
+
+ if (found) {
+ req->events = 0;
+ INIT_WORK(&req->work, aio_poll_work);
+ schedule_work(&req->work);
+ }
+ return 0;
+}
+
+static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+ void *key)
+{
+ struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
+ struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
+ struct file *file = req->file;
+ __poll_t mask = key_to_poll(key);
+
+ assert_spin_locked(&req->head->lock);
+
+ /* for instances that support it check for an event match first: */
+ if (mask && !(mask & req->events))
+ return 0;
+
+ mask = file->f_op->poll_mask(file, req->events);
+ if (!mask)
+ return 0;
+
+ __aio_poll_remove(req);
+
+ /*
+ * Try completing without a context switch if we can acquire ctx_lock
+ * without spinning. Otherwise we need to defer to a workqueue to
+ * avoid a deadlock due to the lock order.
+ */
+ if (spin_trylock(&iocb->ki_ctx->ctx_lock)) {
+ list_del_init(&iocb->ki_list);
+ spin_unlock(&iocb->ki_ctx->ctx_lock);
+
+ __aio_poll_complete(iocb, mask);
+ } else {
+ req->events = mask;
+ INIT_WORK(&req->work, aio_poll_work);
+ schedule_work(&req->work);
+ }
+
+ return 1;
+}
+
+static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
+{
+ struct kioctx *ctx = aiocb->ki_ctx;
+ struct poll_iocb *req = &aiocb->poll;
+ __poll_t mask;
+
+ /* reject any unknown events outside the normal event mask. */
+ if ((u16)iocb->aio_buf != iocb->aio_buf)
+ return -EINVAL;
+ /* reject fields that are not defined for poll */
+ if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
+ return -EINVAL;
+
+ req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
+ req->file = fget(iocb->aio_fildes);
+ if (unlikely(!req->file))
+ return -EBADF;
+ if (!file_has_poll_mask(req->file))
+ goto out_fail;
+
+ req->head = req->file->f_op->get_poll_head(req->file, req->events);
+ if (!req->head)
+ goto out_fail;
+ if (IS_ERR(req->head)) {
+ mask = EPOLLERR;
+ goto done;
+ }
+
+ init_waitqueue_func_entry(&req->wait, aio_poll_wake);
+ aiocb->ki_cancel = aio_poll_cancel;
+
+ spin_lock_irq(&ctx->ctx_lock);
+ spin_lock(&req->head->lock);
+ mask = req->file->f_op->poll_mask(req->file, req->events);
+ if (!mask) {
+ __add_wait_queue(req->head, &req->wait);
+ list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
+ }
+ spin_unlock(&req->head->lock);
+ spin_unlock_irq(&ctx->ctx_lock);
+done:
+ if (mask)
+ __aio_poll_complete(aiocb, mask);
+ return 0;
+out_fail:
+ fput(req->file);
+ return -EINVAL; /* same as no support for IOCB_CMD_POLL */
+}
+
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
- struct iocb *iocb, bool compat)
+ bool compat)
{
struct aio_kiocb *req;
- struct file *file;
+ struct iocb iocb;
ssize_t ret;
+ if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
+ return -EFAULT;
+
/* enforce forwards compatibility on users */
- if (unlikely(iocb->aio_reserved2)) {
+ if (unlikely(iocb.aio_reserved2)) {
pr_debug("EINVAL: reserve field set\n");
return -EINVAL;
}
/* prevent overflows */
if (unlikely(
- (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
- (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
- ((ssize_t)iocb->aio_nbytes < 0)
+ (iocb.aio_buf != (unsigned long)iocb.aio_buf) ||
+ (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) ||
+ ((ssize_t)iocb.aio_nbytes < 0)
)) {
pr_debug("EINVAL: overflow check\n");
return -EINVAL;
@@ -1570,37 +1749,19 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
if (unlikely(!req))
return -EAGAIN;
- req->common.ki_filp = file = fget(iocb->aio_fildes);
- if (unlikely(!req->common.ki_filp)) {
- ret = -EBADF;
- goto out_put_req;
- }
- req->common.ki_pos = iocb->aio_offset;
- req->common.ki_complete = aio_complete;
- req->common.ki_flags = iocb_flags(req->common.ki_filp);
- req->common.ki_hint = file_write_hint(file);
-
- if (iocb->aio_flags & IOCB_FLAG_RESFD) {
+ if (iocb.aio_flags & IOCB_FLAG_RESFD) {
/*
* If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
* instance of the file* now. The file descriptor must be
* an eventfd() fd, and will be signaled for each completed
* event using the eventfd_signal() function.
*/
- req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
+ req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd);
if (IS_ERR(req->ki_eventfd)) {
ret = PTR_ERR(req->ki_eventfd);
req->ki_eventfd = NULL;
goto out_put_req;
}
-
- req->common.ki_flags |= IOCB_EVENTFD;
- }
-
- ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags);
- if (unlikely(ret)) {
- pr_debug("EINVAL: aio_rw_flags\n");
- goto out_put_req;
}
ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
@@ -1610,41 +1771,67 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
}
req->ki_user_iocb = user_iocb;
- req->ki_user_data = iocb->aio_data;
+ req->ki_user_data = iocb.aio_data;
- get_file(file);
- switch (iocb->aio_lio_opcode) {
+ switch (iocb.aio_lio_opcode) {
case IOCB_CMD_PREAD:
- ret = aio_read(&req->common, iocb, false, compat);
+ ret = aio_read(&req->rw, &iocb, false, compat);
break;
case IOCB_CMD_PWRITE:
- ret = aio_write(&req->common, iocb, false, compat);
+ ret = aio_write(&req->rw, &iocb, false, compat);
break;
case IOCB_CMD_PREADV:
- ret = aio_read(&req->common, iocb, true, compat);
+ ret = aio_read(&req->rw, &iocb, true, compat);
break;
case IOCB_CMD_PWRITEV:
- ret = aio_write(&req->common, iocb, true, compat);
+ ret = aio_write(&req->rw, &iocb, true, compat);
+ break;
+ case IOCB_CMD_FSYNC:
+ ret = aio_fsync(&req->fsync, &iocb, false);
+ break;
+ case IOCB_CMD_FDSYNC:
+ ret = aio_fsync(&req->fsync, &iocb, true);
+ break;
+ case IOCB_CMD_POLL:
+ ret = aio_poll(req, &iocb);
break;
default:
- pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
+ pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode);
ret = -EINVAL;
break;
}
- fput(file);
- if (ret && ret != -EIOCBQUEUED)
+ /*
+ * If ret is 0, we'd either done aio_complete() ourselves or have
+ * arranged for that to be done asynchronously. Anything non-zero
+ * means that we need to destroy req ourselves.
+ */
+ if (ret)
goto out_put_req;
return 0;
out_put_req:
put_reqs_available(ctx, 1);
percpu_ref_put(&ctx->reqs);
- kiocb_free(req);
+ if (req->ki_eventfd)
+ eventfd_ctx_put(req->ki_eventfd);
+ kmem_cache_free(kiocb_cachep, req);
return ret;
}
-static long do_io_submit(aio_context_t ctx_id, long nr,
- struct iocb __user *__user *iocbpp, bool compat)
+/* sys_io_submit:
+ * Queue the nr iocbs pointed to by iocbpp for processing. Returns
+ * the number of iocbs queued. May return -EINVAL if the aio_context
+ * specified by ctx_id is invalid, if nr is < 0, if the iocb at
+ * *iocbpp[0] is not properly initialized, if the operation specified
+ * is invalid for the file descriptor in the iocb. May fail with
+ * -EFAULT if any of the data structures point to invalid data. May
+ * fail with -EBADF if the file descriptor specified in the first
+ * iocb is invalid. May fail with -EAGAIN if insufficient resources
+ * are available to queue any iocbs. Will return 0 if nr is 0. Will
+ * fail with -ENOSYS if not implemented.
+ */
+SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
+ struct iocb __user * __user *, iocbpp)
{
struct kioctx *ctx;
long ret = 0;
@@ -1654,39 +1841,25 @@ static long do_io_submit(aio_context_t ctx_id, long nr,
if (unlikely(nr < 0))
return -EINVAL;
- if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))
- nr = LONG_MAX/sizeof(*iocbpp);
-
- if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
- return -EFAULT;
-
ctx = lookup_ioctx(ctx_id);
if (unlikely(!ctx)) {
pr_debug("EINVAL: invalid context id\n");
return -EINVAL;
}
- blk_start_plug(&plug);
+ if (nr > ctx->nr_events)
+ nr = ctx->nr_events;
- /*
- * AKPM: should this return a partial result if some of the IOs were
- * successfully submitted?
- */
- for (i=0; i<nr; i++) {
+ blk_start_plug(&plug);
+ for (i = 0; i < nr; i++) {
struct iocb __user *user_iocb;
- struct iocb tmp;
- if (unlikely(__get_user(user_iocb, iocbpp + i))) {
+ if (unlikely(get_user(user_iocb, iocbpp + i))) {
ret = -EFAULT;
break;
}
- if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) {
- ret = -EFAULT;
- break;
- }
-
- ret = io_submit_one(ctx, user_iocb, &tmp, compat);
+ ret = io_submit_one(ctx, user_iocb, false);
if (ret)
break;
}
@@ -1696,59 +1869,44 @@ static long do_io_submit(aio_context_t ctx_id, long nr,
return i ? i : ret;
}
-/* sys_io_submit:
- * Queue the nr iocbs pointed to by iocbpp for processing. Returns
- * the number of iocbs queued. May return -EINVAL if the aio_context
- * specified by ctx_id is invalid, if nr is < 0, if the iocb at
- * *iocbpp[0] is not properly initialized, if the operation specified
- * is invalid for the file descriptor in the iocb. May fail with
- * -EFAULT if any of the data structures point to invalid data. May
- * fail with -EBADF if the file descriptor specified in the first
- * iocb is invalid. May fail with -EAGAIN if insufficient resources
- * are available to queue any iocbs. Will return 0 if nr is 0. Will
- * fail with -ENOSYS if not implemented.
- */
-SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
- struct iocb __user * __user *, iocbpp)
-{
- return do_io_submit(ctx_id, nr, iocbpp, 0);
-}
-
#ifdef CONFIG_COMPAT
-static inline long
-copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
+COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
+ int, nr, compat_uptr_t __user *, iocbpp)
{
- compat_uptr_t uptr;
- int i;
+ struct kioctx *ctx;
+ long ret = 0;
+ int i = 0;
+ struct blk_plug plug;
- for (i = 0; i < nr; ++i) {
- if (get_user(uptr, ptr32 + i))
- return -EFAULT;
- if (put_user(compat_ptr(uptr), ptr64 + i))
- return -EFAULT;
+ if (unlikely(nr < 0))
+ return -EINVAL;
+
+ ctx = lookup_ioctx(ctx_id);
+ if (unlikely(!ctx)) {
+ pr_debug("EINVAL: invalid context id\n");
+ return -EINVAL;
}
- return 0;
-}
-#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *))
+ if (nr > ctx->nr_events)
+ nr = ctx->nr_events;
-COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
- int, nr, u32 __user *, iocb)
-{
- struct iocb __user * __user *iocb64;
- long ret;
+ blk_start_plug(&plug);
+ for (i = 0; i < nr; i++) {
+ compat_uptr_t user_iocb;
- if (unlikely(nr < 0))
- return -EINVAL;
+ if (unlikely(get_user(user_iocb, iocbpp + i))) {
+ ret = -EFAULT;
+ break;
+ }
- if (nr > MAX_AIO_SUBMITS)
- nr = MAX_AIO_SUBMITS;
+ ret = io_submit_one(ctx, compat_ptr(user_iocb), true);
+ if (ret)
+ break;
+ }
+ blk_finish_plug(&plug);
- iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
- ret = copy_iocb(nr, iocb, iocb64);
- if (!ret)
- ret = do_io_submit(ctx_id, nr, iocb64, 1);
- return ret;
+ percpu_ref_put(&ctx->users);
+ return i ? i : ret;
}
#endif
@@ -1756,15 +1914,12 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
* Finds a given iocb for cancellation.
*/
static struct aio_kiocb *
-lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key)
+lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb)
{
struct aio_kiocb *kiocb;
assert_spin_locked(&ctx->ctx_lock);
- if (key != KIOCB_KEY)
- return NULL;
-
/* TODO: use a hash or array, this sucks. */
list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
if (kiocb->ki_user_iocb == iocb)
@@ -1788,25 +1943,24 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
{
struct kioctx *ctx;
struct aio_kiocb *kiocb;
+ int ret = -EINVAL;
u32 key;
- int ret;
- ret = get_user(key, &iocb->aio_key);
- if (unlikely(ret))
+ if (unlikely(get_user(key, &iocb->aio_key)))
return -EFAULT;
+ if (unlikely(key != KIOCB_KEY))
+ return -EINVAL;
ctx = lookup_ioctx(ctx_id);
if (unlikely(!ctx))
return -EINVAL;
spin_lock_irq(&ctx->ctx_lock);
-
- kiocb = lookup_kiocb(ctx, iocb, key);
- if (kiocb)
- ret = kiocb_cancel(kiocb);
- else
- ret = -EINVAL;
-
+ kiocb = lookup_kiocb(ctx, iocb);
+ if (kiocb) {
+ ret = kiocb->ki_cancel(&kiocb->rw);
+ list_del_init(&kiocb->ki_list);
+ }
spin_unlock_irq(&ctx->ctx_lock);
if (!ret) {
@@ -1861,13 +2015,60 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
struct timespec __user *, timeout)
{
struct timespec64 ts;
+ int ret;
+
+ if (timeout && unlikely(get_timespec64(&ts, timeout)))
+ return -EFAULT;
+
+ ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
+ if (!ret && signal_pending(current))
+ ret = -EINTR;
+ return ret;
+}
+
+SYSCALL_DEFINE6(io_pgetevents,
+ aio_context_t, ctx_id,
+ long, min_nr,
+ long, nr,
+ struct io_event __user *, events,
+ struct timespec __user *, timeout,
+ const struct __aio_sigset __user *, usig)
+{
+ struct __aio_sigset ksig = { NULL, };
+ sigset_t ksigmask, sigsaved;
+ struct timespec64 ts;
+ int ret;
+
+ if (timeout && unlikely(get_timespec64(&ts, timeout)))
+ return -EFAULT;
+
+ if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
+ return -EFAULT;
- if (timeout) {
- if (unlikely(get_timespec64(&ts, timeout)))
+ if (ksig.sigmask) {
+ if (ksig.sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+ if (copy_from_user(&ksigmask, ksig.sigmask, sizeof(ksigmask)))
return -EFAULT;
+ sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+ sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+ }
+
+ ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
+ if (signal_pending(current)) {
+ if (ksig.sigmask) {
+ current->saved_sigmask = sigsaved;
+ set_restore_sigmask();
+ }
+
+ if (!ret)
+ ret = -ERESTARTNOHAND;
+ } else {
+ if (ksig.sigmask)
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
}
- return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
+ return ret;
}
#ifdef CONFIG_COMPAT
@@ -1878,13 +2079,64 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
struct compat_timespec __user *, timeout)
{
struct timespec64 t;
+ int ret;
+
+ if (timeout && compat_get_timespec64(&t, timeout))
+ return -EFAULT;
+
+ ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
+ if (!ret && signal_pending(current))
+ ret = -EINTR;
+ return ret;
+}
+
+
+struct __compat_aio_sigset {
+ compat_sigset_t __user *sigmask;
+ compat_size_t sigsetsize;
+};
+
+COMPAT_SYSCALL_DEFINE6(io_pgetevents,
+ compat_aio_context_t, ctx_id,
+ compat_long_t, min_nr,
+ compat_long_t, nr,
+ struct io_event __user *, events,
+ struct compat_timespec __user *, timeout,
+ const struct __compat_aio_sigset __user *, usig)
+{
+ struct __compat_aio_sigset ksig = { NULL, };
+ sigset_t ksigmask, sigsaved;
+ struct timespec64 t;
+ int ret;
+
+ if (timeout && compat_get_timespec64(&t, timeout))
+ return -EFAULT;
+
+ if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
+ return -EFAULT;
- if (timeout) {
- if (compat_get_timespec64(&t, timeout))
+ if (ksig.sigmask) {
+ if (ksig.sigsetsize != sizeof(compat_sigset_t))
+ return -EINVAL;
+ if (get_compat_sigset(&ksigmask, ksig.sigmask))
return -EFAULT;
+ sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+ sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+ }
+ ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
+ if (signal_pending(current)) {
+ if (ksig.sigmask) {
+ current->saved_sigmask = sigsaved;
+ set_restore_sigmask();
+ }
+ if (!ret)
+ ret = -ERESTARTNOHAND;
+ } else {
+ if (ksig.sigmask)
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
}
- return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
+ return ret;
}
#endif
diff --git a/fs/attr.c b/fs/attr.c
index 12ffdb6fb63c..d0b4d34878fb 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -18,6 +18,32 @@
#include <linux/evm.h>
#include <linux/ima.h>
+static bool chown_ok(const struct inode *inode, kuid_t uid)
+{
+ if (uid_eq(current_fsuid(), inode->i_uid) &&
+ uid_eq(uid, inode->i_uid))
+ return true;
+ if (capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+ return true;
+ if (uid_eq(inode->i_uid, INVALID_UID) &&
+ ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
+ return true;
+ return false;
+}
+
+static bool chgrp_ok(const struct inode *inode, kgid_t gid)
+{
+ if (uid_eq(current_fsuid(), inode->i_uid) &&
+ (in_group_p(gid) || gid_eq(gid, inode->i_gid)))
+ return true;
+ if (capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+ return true;
+ if (gid_eq(inode->i_gid, INVALID_GID) &&
+ ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
+ return true;
+ return false;
+}
+
/**
* setattr_prepare - check if attribute changes to a dentry are allowed
* @dentry: dentry to check
@@ -52,17 +78,11 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr)
goto kill_priv;
/* Make sure a caller can chown. */
- if ((ia_valid & ATTR_UID) &&
- (!uid_eq(current_fsuid(), inode->i_uid) ||
- !uid_eq(attr->ia_uid, inode->i_uid)) &&
- !capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+ if ((ia_valid & ATTR_UID) && !chown_ok(inode, attr->ia_uid))
return -EPERM;
/* Make sure caller can chgrp. */
- if ((ia_valid & ATTR_GID) &&
- (!uid_eq(current_fsuid(), inode->i_uid) ||
- (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) &&
- !capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+ if ((ia_valid & ATTR_GID) && !chgrp_ok(inode, attr->ia_gid))
return -EPERM;
/* Make sure a caller can chmod. */
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 82e8f6edfb48..b12e37f27530 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -749,7 +749,7 @@ static int autofs4_dir_mkdir(struct inode *dir,
autofs4_del_active(dentry);
- inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555);
+ inode = autofs4_get_inode(dir->i_sb, S_IFDIR | mode);
if (!inode)
return -ENOMEM;
d_add(dentry, inode);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index a0c57c37fa21..be9c3dc048ab 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -19,9 +19,6 @@
*/
static autofs_wqt_t autofs4_next_wait_queue = 1;
-/* These are the signals we allow interrupting a pending mount */
-#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT))
-
void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
{
struct autofs_wait_queue *wq, *nwq;
@@ -486,29 +483,7 @@ int autofs4_wait(struct autofs_sb_info *sbi,
* wq->name.name is NULL iff the lock is already released
* or the mount has been made catatonic.
*/
- if (wq->name.name) {
- /* Block all but "shutdown" signals while waiting */
- unsigned long shutdown_sigs_mask;
- unsigned long irqflags;
- sigset_t oldset;
-
- spin_lock_irqsave(&current->sighand->siglock, irqflags);
- oldset = current->blocked;
- shutdown_sigs_mask = SHUTDOWN_SIGS & ~oldset.sig[0];
- siginitsetinv(&current->blocked, shutdown_sigs_mask);
- recalc_sigpending();
- spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
-
- wait_event_interruptible(wq->queue, wq->name.name == NULL);
-
- spin_lock_irqsave(&current->sighand->siglock, irqflags);
- current->blocked = oldset;
- recalc_sigpending();
- spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
- } else {
- pr_debug("skipped sleeping\n");
- }
-
+ wait_event_killable(wq->queue, wq->name.name == NULL);
status = wq->status;
/*
@@ -574,7 +549,7 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok
kfree(wq->name.name);
wq->name.name = NULL; /* Do not wait on this queue */
wq->status = status;
- wake_up_interruptible(&wq->queue);
+ wake_up(&wq->queue);
if (!--wq->wait_ctr)
kfree(wq);
mutex_unlock(&sbi->wq_mutex);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index af2832aaeec5..4700b4534439 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -198,23 +198,16 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
if (ret == BEFS_BT_NOT_FOUND) {
befs_debug(sb, "<--- %s %pd not found", __func__, dentry);
- d_add(dentry, NULL);
- return ERR_PTR(-ENOENT);
-
+ inode = NULL;
} else if (ret != BEFS_OK || offset == 0) {
befs_error(sb, "<--- %s Error", __func__);
- return ERR_PTR(-ENODATA);
+ inode = ERR_PTR(-ENODATA);
+ } else {
+ inode = befs_iget(dir->i_sb, (ino_t) offset);
}
-
- inode = befs_iget(dir->i_sb, (ino_t) offset);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
-
- d_add(dentry, inode);
-
befs_debug(sb, "<--- %s", __func__);
- return NULL;
+ return d_splice_alias(inode, dentry);
}
static int
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index ee832ca5f734..f32f21c3bbc7 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -21,10 +21,9 @@
#define dprintf(x...)
#endif
-static int bfs_add_entry(struct inode *dir, const unsigned char *name,
- int namelen, int ino);
+static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino);
static struct buffer_head *bfs_find_entry(struct inode *dir,
- const unsigned char *name, int namelen,
+ const struct qstr *child,
struct bfs_dirent **res_dir);
static int bfs_readdir(struct file *f, struct dir_context *ctx)
@@ -111,8 +110,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
mark_inode_dirty(inode);
bfs_dump_imap("create", s);
- err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len,
- inode->i_ino);
+ err = bfs_add_entry(dir, &dentry->d_name, inode->i_ino);
if (err) {
inode_dec_link_count(inode);
mutex_unlock(&info->bfs_lock);
@@ -136,19 +134,14 @@ static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry,
return ERR_PTR(-ENAMETOOLONG);
mutex_lock(&info->bfs_lock);
- bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
+ bh = bfs_find_entry(dir, &dentry->d_name, &de);
if (bh) {
unsigned long ino = (unsigned long)le16_to_cpu(de->ino);
brelse(bh);
inode = bfs_iget(dir->i_sb, ino);
- if (IS_ERR(inode)) {
- mutex_unlock(&info->bfs_lock);
- return ERR_CAST(inode);
- }
}
mutex_unlock(&info->bfs_lock);
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
}
static int bfs_link(struct dentry *old, struct inode *dir,
@@ -159,8 +152,7 @@ static int bfs_link(struct dentry *old, struct inode *dir,
int err;
mutex_lock(&info->bfs_lock);
- err = bfs_add_entry(dir, new->d_name.name, new->d_name.len,
- inode->i_ino);
+ err = bfs_add_entry(dir, &new->d_name, inode->i_ino);
if (err) {
mutex_unlock(&info->bfs_lock);
return err;
@@ -183,7 +175,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
struct bfs_sb_info *info = BFS_SB(inode->i_sb);
mutex_lock(&info->bfs_lock);
- bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
+ bh = bfs_find_entry(dir, &dentry->d_name, &de);
if (!bh || (le16_to_cpu(de->ino) != inode->i_ino))
goto out_brelse;
@@ -228,27 +220,21 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
info = BFS_SB(old_inode->i_sb);
mutex_lock(&info->bfs_lock);
- old_bh = bfs_find_entry(old_dir,
- old_dentry->d_name.name,
- old_dentry->d_name.len, &old_de);
+ old_bh = bfs_find_entry(old_dir, &old_dentry->d_name, &old_de);
if (!old_bh || (le16_to_cpu(old_de->ino) != old_inode->i_ino))
goto end_rename;
error = -EPERM;
new_inode = d_inode(new_dentry);
- new_bh = bfs_find_entry(new_dir,
- new_dentry->d_name.name,
- new_dentry->d_name.len, &new_de);
+ new_bh = bfs_find_entry(new_dir, &new_dentry->d_name, &new_de);
if (new_bh && !new_inode) {
brelse(new_bh);
new_bh = NULL;
}
if (!new_bh) {
- error = bfs_add_entry(new_dir,
- new_dentry->d_name.name,
- new_dentry->d_name.len,
+ error = bfs_add_entry(new_dir, &new_dentry->d_name,
old_inode->i_ino);
if (error)
goto end_rename;
@@ -278,9 +264,10 @@ const struct inode_operations bfs_dir_inops = {
.rename = bfs_rename,
};
-static int bfs_add_entry(struct inode *dir, const unsigned char *name,
- int namelen, int ino)
+static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino)
{
+ const unsigned char *name = child->name;
+ int namelen = child->len;
struct buffer_head *bh;
struct bfs_dirent *de;
int block, sblock, eblock, off, pos;
@@ -332,12 +319,14 @@ static inline int bfs_namecmp(int len, const unsigned char *name,
}
static struct buffer_head *bfs_find_entry(struct inode *dir,
- const unsigned char *name, int namelen,
+ const struct qstr *child,
struct bfs_dirent **res_dir)
{
unsigned long block = 0, offset = 0;
struct buffer_head *bh = NULL;
struct bfs_dirent *de;
+ const unsigned char *name = child->name;
+ int namelen = child->len;
*res_dir = NULL;
if (namelen > BFS_NAMELEN)
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index ce1824f47ba6..c3deb2e35f20 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -330,6 +330,7 @@ beyond_if:
#ifdef __alpha__
regs->gp = ex.a_gpvalue;
#endif
+ finalize_exec(bprm);
start_thread(regs, ex.a_entry, current->mm->start_stack);
return 0;
}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index bdb201230bae..4ad6f669fe34 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -377,6 +377,11 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
} else
map_addr = vm_mmap(filep, addr, size, prot, type, off);
+ if ((type & MAP_FIXED_NOREPLACE) &&
+ PTR_ERR((void *)map_addr) == -EEXIST)
+ pr_info("%d (%s): Uhuuh, elf segment at %px requested but the memory is mapped already\n",
+ task_pid_nr(current), current->comm, (void *)addr);
+
return(map_addr);
}
@@ -575,7 +580,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
elf_prot |= PROT_EXEC;
vaddr = eppnt->p_vaddr;
if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
- elf_type |= MAP_FIXED;
+ elf_type |= MAP_FIXED_NOREPLACE;
else if (no_base && interp_elf_ex->e_type == ET_DYN)
load_addr = -vaddr;
@@ -890,7 +895,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
the correct location in memory. */
for(i = 0, elf_ppnt = elf_phdata;
i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
- int elf_prot = 0, elf_flags;
+ int elf_prot = 0, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE;
unsigned long k, vaddr;
unsigned long total_size = 0;
@@ -922,6 +927,13 @@ static int load_elf_binary(struct linux_binprm *bprm)
*/
}
}
+
+ /*
+ * Some binaries have overlapping elf segments and then
+ * we have to forcefully map over an existing mapping
+ * e.g. over this newly established brk mapping.
+ */
+ elf_fixed = MAP_FIXED;
}
if (elf_ppnt->p_flags & PF_R)
@@ -939,7 +951,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
* the ET_DYN load_addr calculations, proceed normally.
*/
if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
- elf_flags |= MAP_FIXED;
+ elf_flags |= elf_fixed;
} else if (loc->elf_ex.e_type == ET_DYN) {
/*
* This logic is run once for the first LOAD Program
@@ -975,7 +987,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
load_bias = ELF_ET_DYN_BASE;
if (current->flags & PF_RANDOMIZE)
load_bias += arch_mmap_rnd();
- elf_flags |= MAP_FIXED;
+ elf_flags |= elf_fixed;
} else
load_bias = 0;
@@ -1155,6 +1167,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
ELF_PLAT_INIT(regs, reloc_func_desc);
#endif
+ finalize_exec(bprm);
start_thread(regs, elf_entry, bprm->p);
retval = 0;
out:
@@ -1234,7 +1247,7 @@ static int load_elf_library(struct file *file)
(eppnt->p_filesz +
ELF_PAGEOFFSET(eppnt->p_vaddr)),
PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
+ MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_DENYWRITE,
(eppnt->p_offset -
ELF_PAGEOFFSET(eppnt->p_vaddr)));
if (error != ELF_PAGESTART(eppnt->p_vaddr))
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 429326b6e2e7..d90993adeffa 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -463,6 +463,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
dynaddr);
#endif
+ finalize_exec(bprm);
/* everything is now ready... get the userspace context ready to roll */
entryaddr = interp_params.entry_addr ?: exec_params.entry_addr;
start_thread(regs, entryaddr, current->mm->start_stack);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 5d6b94475f27..82a48e830018 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -994,6 +994,7 @@ static int load_flat_binary(struct linux_binprm *bprm)
FLAT_PLAT_INIT(regs);
#endif
+ finalize_exec(bprm);
pr_debug("start_thread(regs=0x%p, entry=0x%lx, start_stack=0x%lx)\n",
regs, start_addr, current->mm->start_stack);
start_thread(regs, start_addr, current->mm->start_stack);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index fe09ef9c21f3..bef6934b6189 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -272,7 +272,7 @@ struct blkdev_dio {
struct bio bio;
};
-static struct bio_set *blkdev_dio_pool __read_mostly;
+static struct bio_set blkdev_dio_pool;
static void blkdev_bio_end_io(struct bio *bio)
{
@@ -334,7 +334,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
(bdev_logical_block_size(bdev) - 1))
return -EINVAL;
- bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, blkdev_dio_pool);
+ bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
bio_get(bio); /* extra ref for the completion handler */
dio = container_of(bio, struct blkdev_dio, bio);
@@ -432,10 +432,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
static __init int blkdev_init(void)
{
- blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS);
- if (!blkdev_dio_pool)
- return -ENOMEM;
- return 0;
+ return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS);
}
module_init(blkdev_init);
@@ -1322,25 +1319,30 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
* check_disk_size_change - checks for disk size change and adjusts bdev size.
* @disk: struct gendisk to check
* @bdev: struct bdev to adjust.
+ * @verbose: if %true log a message about a size change if there is any
*
* This routine checks to see if the bdev size does not match the disk size
- * and adjusts it if it differs.
+ * and adjusts it if it differs. When shrinking the bdev size, its all caches
+ * are freed.
*/
-void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
+void check_disk_size_change(struct gendisk *disk, struct block_device *bdev,
+ bool verbose)
{
loff_t disk_size, bdev_size;
disk_size = (loff_t)get_capacity(disk) << 9;
bdev_size = i_size_read(bdev->bd_inode);
if (disk_size != bdev_size) {
- printk(KERN_INFO
- "%s: detected capacity change from %lld to %lld\n",
- disk->disk_name, bdev_size, disk_size);
+ if (verbose) {
+ printk(KERN_INFO
+ "%s: detected capacity change from %lld to %lld\n",
+ disk->disk_name, bdev_size, disk_size);
+ }
i_size_write(bdev->bd_inode, disk_size);
- flush_disk(bdev, false);
+ if (bdev_size > disk_size)
+ flush_disk(bdev, false);
}
}
-EXPORT_SYMBOL(check_disk_size_change);
/**
* revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
@@ -1362,7 +1364,7 @@ int revalidate_disk(struct gendisk *disk)
return ret;
mutex_lock(&bdev->bd_mutex);
- check_disk_size_change(disk, bdev);
+ check_disk_size_change(disk, bdev, ret == 0);
bdev->bd_invalidated = 0;
mutex_unlock(&bdev->bd_mutex);
bdput(bdev);
@@ -1946,11 +1948,6 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
static int blkdev_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- if (dax_mapping(mapping)) {
- struct block_device *bdev = I_BDEV(mapping->host);
-
- return dax_writeback_mapping_range(mapping, bdev, wbc);
- }
return generic_writepages(mapping, wbc);
}
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 167e5dc7eadd..23537bc8c827 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
config BTRFS_FS
tristate "Btrfs filesystem support"
select LIBCRC32C
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 0066d95b133f..15e1dfef56a5 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Red Hat. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index d5540749f0e5..d522494698fa 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
* Copyright (C) 2014 Fujitsu. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/kthread.h>
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index fc957e00cef1..7861c9feba5f 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -1,24 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2007 Oracle. All rights reserved.
* Copyright (C) 2014 Fujitsu. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __BTRFS_ASYNC_THREAD_
-#define __BTRFS_ASYNC_THREAD_
+#ifndef BTRFS_ASYNC_THREAD_H
+#define BTRFS_ASYNC_THREAD_H
+
#include <linux/workqueue.h>
struct btrfs_fs_info;
@@ -85,4 +73,5 @@ void btrfs_set_work_high_priority(struct btrfs_work *work);
struct btrfs_fs_info *btrfs_work_owner(const struct btrfs_work *work);
struct btrfs_fs_info *btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
+
#endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 571024bc632e..0a8e2e29a66b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2011 STRATO. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/mm.h>
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 0a30028d5196..54d58988483a 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2011 STRATO. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __BTRFS_BACKREF__
-#define __BTRFS_BACKREF__
+#ifndef BTRFS_BACKREF_H
+#define BTRFS_BACKREF_H
#include <linux/btrfs.h>
#include "ulist.h"
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ca15be569d69..7e075343daa5 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __BTRFS_I__
-#define __BTRFS_I__
+#ifndef BTRFS_INODE_H
+#define BTRFS_INODE_H
#include <linux/hash.h>
#include "extent_map.h"
@@ -32,17 +19,17 @@
* ordered operations list so that we make sure to flush out any
* new data the application may have written before commit.
*/
-#define BTRFS_INODE_ORDERED_DATA_CLOSE 0
-#define BTRFS_INODE_ORPHAN_META_RESERVED 1
-#define BTRFS_INODE_DUMMY 2
-#define BTRFS_INODE_IN_DEFRAG 3
-#define BTRFS_INODE_HAS_ORPHAN_ITEM 4
-#define BTRFS_INODE_HAS_ASYNC_EXTENT 5
-#define BTRFS_INODE_NEEDS_FULL_SYNC 6
-#define BTRFS_INODE_COPY_EVERYTHING 7
-#define BTRFS_INODE_IN_DELALLOC_LIST 8
-#define BTRFS_INODE_READDIO_NEED_LOCK 9
-#define BTRFS_INODE_HAS_PROPS 10
+enum {
+ BTRFS_INODE_ORDERED_DATA_CLOSE = 0,
+ BTRFS_INODE_DUMMY,
+ BTRFS_INODE_IN_DEFRAG,
+ BTRFS_INODE_HAS_ASYNC_EXTENT,
+ BTRFS_INODE_NEEDS_FULL_SYNC,
+ BTRFS_INODE_COPY_EVERYTHING,
+ BTRFS_INODE_IN_DELALLOC_LIST,
+ BTRFS_INODE_READDIO_NEED_LOCK,
+ BTRFS_INODE_HAS_PROPS,
+};
/* in memory btrfs inode */
struct btrfs_inode {
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 3baebbc021c5..dc062b195c46 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) STRATO AG 2011. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
/*
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
index 2de58a99ee92..9bf4359cc44c 100644
--- a/fs/btrfs/check-integrity.h
+++ b/fs/btrfs/check-integrity.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) STRATO AG 2011. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#if !defined(__BTRFS_CHECK_INTEGRITY__)
-#define __BTRFS_CHECK_INTEGRITY__
+#ifndef BTRFS_CHECK_INTEGRITY_H
+#define BTRFS_CHECK_INTEGRITY_H
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 562c3e633403..d3e447b45bf7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/kernel.h>
@@ -458,7 +445,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
break;
rcu_read_lock();
- page = radix_tree_lookup(&mapping->page_tree, pg_index);
+ page = radix_tree_lookup(&mapping->i_pages, pg_index);
rcu_read_unlock();
if (page && !radix_tree_exceptional_entry(page)) {
misses++;
@@ -1003,12 +990,7 @@ static void __free_workspace(int type, struct list_head *workspace,
btrfs_compress_op[idx]->free_workspace(workspace);
atomic_dec(total_ws);
wake:
- /*
- * Make sure counter is updated before we wake up waiters.
- */
- smp_mb();
- if (waitqueue_active(ws_wait))
- wake_up(ws_wait);
+ cond_wake_up(ws_wait);
}
static void free_workspace(int type, struct list_head *ws)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index ce796557a918..ddda9b80bf20 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -1,23 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __BTRFS_COMPRESSION_
-#define __BTRFS_COMPRESSION_
+#ifndef BTRFS_COMPRESSION_H
+#define BTRFS_COMPRESSION_H
+
+#include <linux/sizes.h>
/*
* We want to make sure that amount of RAM required to uncompress an extent is
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a2c9d21176e2..4bc326df472e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007,2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/sched.h>
@@ -2343,7 +2330,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
no_skips = 1;
t = path->nodes[i];
- if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
+ if (i >= lowest_unlock && i > skip_level) {
btrfs_tree_unlock_rw(t, path->locks[i]);
path->locks[i] = 0;
if (write_lock_level &&
@@ -2445,14 +2432,11 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
btrfs_unlock_up_safe(p, level + 1);
btrfs_set_path_blocking(p);
- free_extent_buffer(tmp);
if (p->reada != READA_NONE)
reada_for_search(fs_info, p, level, slot, key->objectid);
- btrfs_release_path(p);
-
ret = -EAGAIN;
- tmp = read_tree_block(fs_info, blocknr, 0, parent_level - 1,
+ tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
&first_key);
if (!IS_ERR(tmp)) {
/*
@@ -2461,12 +2445,14 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
* and give up so that our caller doesn't loop forever
* on our EAGAINs.
*/
- if (!btrfs_buffer_uptodate(tmp, 0, 0))
+ if (!extent_buffer_uptodate(tmp))
ret = -EIO;
free_extent_buffer(tmp);
} else {
ret = PTR_ERR(tmp);
}
+
+ btrfs_release_path(p);
return ret;
}
@@ -2612,6 +2598,78 @@ int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
return 0;
}
+static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
+ struct btrfs_path *p,
+ int write_lock_level)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *b;
+ int root_lock;
+ int level = 0;
+
+ /* We try very hard to do read locks on the root */
+ root_lock = BTRFS_READ_LOCK;
+
+ if (p->search_commit_root) {
+ /* The commit roots are read only so we always do read locks */
+ if (p->need_commit_sem)
+ down_read(&fs_info->commit_root_sem);
+ b = root->commit_root;
+ extent_buffer_get(b);
+ level = btrfs_header_level(b);
+ if (p->need_commit_sem)
+ up_read(&fs_info->commit_root_sem);
+ /*
+ * Ensure that all callers have set skip_locking when
+ * p->search_commit_root = 1.
+ */
+ ASSERT(p->skip_locking == 1);
+
+ goto out;
+ }
+
+ if (p->skip_locking) {
+ b = btrfs_root_node(root);
+ level = btrfs_header_level(b);
+ goto out;
+ }
+
+ /*
+ * If the level is set to maximum, we can skip trying to get the read
+ * lock.
+ */
+ if (write_lock_level < BTRFS_MAX_LEVEL) {
+ /*
+ * We don't know the level of the root node until we actually
+ * have it read locked
+ */
+ b = btrfs_read_lock_root_node(root);
+ level = btrfs_header_level(b);
+ if (level > write_lock_level)
+ goto out;
+
+ /* Whoops, must trade for write lock */
+ btrfs_tree_read_unlock(b);
+ free_extent_buffer(b);
+ }
+
+ b = btrfs_lock_root_node(root);
+ root_lock = BTRFS_WRITE_LOCK;
+
+ /* The level might have changed, check again */
+ level = btrfs_header_level(b);
+
+out:
+ p->nodes[level] = b;
+ if (!p->skip_locking)
+ p->locks[level] = root_lock;
+ /*
+ * Callers are responsible for dropping b's references.
+ */
+ return b;
+}
+
+
/*
* btrfs_search_slot - look for a key in a tree and perform necessary
* modifications to preserve tree invariants.
@@ -2648,7 +2706,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int err;
int level;
int lowest_unlock = 1;
- int root_lock;
/* everything at write_lock_level or lower must be write locked */
int write_lock_level = 0;
u8 lowest_level = 0;
@@ -2686,50 +2743,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
again:
prev_cmp = -1;
- /*
- * we try very hard to do read locks on the root
- */
- root_lock = BTRFS_READ_LOCK;
- level = 0;
- if (p->search_commit_root) {
- /*
- * the commit roots are read only
- * so we always do read locks
- */
- if (p->need_commit_sem)
- down_read(&fs_info->commit_root_sem);
- b = root->commit_root;
- extent_buffer_get(b);
- level = btrfs_header_level(b);
- if (p->need_commit_sem)
- up_read(&fs_info->commit_root_sem);
- if (!p->skip_locking)
- btrfs_tree_read_lock(b);
- } else {
- if (p->skip_locking) {
- b = btrfs_root_node(root);
- level = btrfs_header_level(b);
- } else {
- /* we don't know the level of the root node
- * until we actually have it read locked
- */
- b = btrfs_read_lock_root_node(root);
- level = btrfs_header_level(b);
- if (level <= write_lock_level) {
- /* whoops, must trade for write lock */
- btrfs_tree_read_unlock(b);
- free_extent_buffer(b);
- b = btrfs_lock_root_node(root);
- root_lock = BTRFS_WRITE_LOCK;
-
- /* the level might have changed, check again */
- level = btrfs_header_level(b);
- }
- }
- }
- p->nodes[level] = b;
- if (!p->skip_locking)
- p->locks[level] = root_lock;
+ b = btrfs_search_slot_get_root(root, p, write_lock_level);
while (b) {
level = btrfs_header_level(b);
@@ -5427,12 +5441,24 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
down_read(&fs_info->commit_root_sem);
left_level = btrfs_header_level(left_root->commit_root);
left_root_level = left_level;
- left_path->nodes[left_level] = left_root->commit_root;
+ left_path->nodes[left_level] =
+ btrfs_clone_extent_buffer(left_root->commit_root);
+ if (!left_path->nodes[left_level]) {
+ up_read(&fs_info->commit_root_sem);
+ ret = -ENOMEM;
+ goto out;
+ }
extent_buffer_get(left_path->nodes[left_level]);
right_level = btrfs_header_level(right_root->commit_root);
right_root_level = right_level;
- right_path->nodes[right_level] = right_root->commit_root;
+ right_path->nodes[right_level] =
+ btrfs_clone_extent_buffer(right_root->commit_root);
+ if (!right_path->nodes[right_level]) {
+ up_read(&fs_info->commit_root_sem);
+ ret = -ENOMEM;
+ goto out;
+ }
extent_buffer_get(right_path->nodes[right_level]);
up_read(&fs_info->commit_root_sem);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0eb55825862a..f4bf7874c24a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __BTRFS_CTREE__
-#define __BTRFS_CTREE__
+#ifndef BTRFS_CTREE_H
+#define BTRFS_CTREE_H
#include <linux/mm.h>
#include <linux/sched/signal.h>
@@ -472,6 +459,25 @@ struct btrfs_block_rsv {
unsigned short full;
unsigned short type;
unsigned short failfast;
+
+ /*
+ * Qgroup equivalent for @size @reserved
+ *
+ * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care
+ * about things like csum size nor how many tree blocks it will need to
+ * reserve.
+ *
+ * Qgroup cares more about net change of the extent usage.
+ *
+ * So for one newly inserted file extent, in worst case it will cause
+ * leaf split and level increase, nodesize for each file extent is
+ * already too much.
+ *
+ * In short, qgroup_size/reserved is the upper limit of possible needed
+ * qgroup metadata reservation.
+ */
+ u64 qgroup_rsv_size;
+ u64 qgroup_rsv_reserved;
};
/*
@@ -727,6 +733,18 @@ struct btrfs_delayed_root;
*/
#define BTRFS_FS_EXCL_OP 16
+/*
+ * To info transaction_kthread we need an immediate commit so it doesn't
+ * need to wait for commit_interval
+ */
+#define BTRFS_FS_NEED_ASYNC_COMMIT 17
+
+/*
+ * Indicate that balance has been set up from the ioctl and is in the main
+ * phase. The fs_info::balance_ctl is initialized.
+ */
+#define BTRFS_FS_BALANCE_RUNNING 18
+
struct btrfs_fs_info {
u8 fsid[BTRFS_FSID_SIZE];
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
@@ -826,7 +844,6 @@ struct btrfs_fs_info {
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
struct mutex chunk_mutex;
- struct mutex volume_mutex;
/*
* this is taken to make sure we don't set block groups ro after
@@ -992,7 +1009,6 @@ struct btrfs_fs_info {
/* restriper state */
spinlock_t balance_lock;
struct mutex balance_mutex;
- atomic_t balance_running;
atomic_t balance_pause_req;
atomic_t balance_cancel_req;
struct btrfs_balance_control *balance_ctl;
@@ -1207,9 +1223,6 @@ struct btrfs_root {
spinlock_t log_extents_lock[2];
struct list_head logged_list[2];
- spinlock_t orphan_lock;
- atomic_t orphan_inodes;
- struct btrfs_block_rsv *orphan_block_rsv;
int orphan_cleanup_state;
spinlock_t inode_lock;
@@ -2752,13 +2765,9 @@ void btrfs_delalloc_release_space(struct inode *inode,
void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
u64 len);
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
-int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode);
-void btrfs_orphan_release_metadata(struct btrfs_inode *inode);
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
- int nitems,
- u64 *qgroup_reserved, bool use_global_rsv);
+ int nitems, bool use_global_rsv);
void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
@@ -2816,7 +2825,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
void check_system_chunk(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, const u64 type);
u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
- struct btrfs_fs_info *info, u64 start, u64 end);
+ u64 start, u64 end);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
@@ -3030,11 +3039,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
/* uuid-tree.c */
-int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
u64 subid);
-int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
+int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
u64 subid);
int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
@@ -3151,18 +3158,6 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
struct extent_map *em);
/* inode.c */
-struct btrfs_delalloc_work {
- struct inode *inode;
- int delay_iput;
- struct completion completion;
- struct list_head list;
- struct btrfs_work work;
-};
-
-struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
- int delay_iput);
-void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
-
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
struct page *page, size_t pg_offset, u64 start,
u64 len, int create);
@@ -3170,6 +3165,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
u64 *ram_bytes);
+void __btrfs_del_delalloc_inode(struct btrfs_root *root,
+ struct btrfs_inode *inode);
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
@@ -3179,10 +3176,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
const char *name, int name_len, int add_backref, u64 index);
-int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *dir, u64 objectid,
- const char *name, int name_len);
+int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry);
int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
int front);
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
@@ -3190,9 +3184,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct inode *inode, u64 new_size,
u32 min_type);
-int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
- int nr);
+int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state, int dedupe);
@@ -3226,10 +3219,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
int btrfs_orphan_add(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
-void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
-void btrfs_invalidate_inodes(struct btrfs_root *root);
void btrfs_add_delayed_iput(struct inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
int btrfs_prealloc_file_range(struct inode *inode, int mode,
@@ -3248,14 +3238,14 @@ void btrfs_test_inode_set_ops(struct inode *inode);
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int btrfs_ioctl_get_supported_features(void __user *arg);
-void btrfs_update_iflags(struct inode *inode);
+void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
int btrfs_is_empty_uuid(u8 *uuid);
int btrfs_defrag_file(struct inode *inode, struct file *file,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_pages);
void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
-void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
struct file *dst_file, u64 dst_loff);
@@ -3752,4 +3742,27 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
#endif
return 0;
}
+
+static inline void cond_wake_up(struct wait_queue_head *wq)
+{
+ /*
+ * This implies a full smp_mb barrier, see comments for
+ * waitqueue_active why.
+ */
+ if (wq_has_sleeper(wq))
+ wake_up(wq);
+}
+
+static inline void cond_wake_up_nomb(struct wait_queue_head *wq)
+{
+ /*
+ * Special case for conditional wakeup where the barrier required for
+ * waitqueue_active is implied by some of the preceding code. Eg. one
+ * of such atomic operations (atomic_dec_and_return, ...), or a
+ * unlock/lock sequence, etc.
+ */
+ if (waitqueue_active(wq))
+ wake_up(wq);
+}
+
#endif
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
index 83ebfe28da9e..90281a7a35a8 100644
--- a/fs/btrfs/dedupe.h
+++ b/fs/btrfs/dedupe.h
@@ -1,24 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2016 Fujitsu. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __BTRFS_DEDUPE__
-#define __BTRFS_DEDUPE__
+#ifndef BTRFS_DEDUPE_H
+#define BTRFS_DEDUPE_H
/* later in-band dedupe will expand this struct */
struct btrfs_dedupe_hash;
+
#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 86ec2edc05e8..fe6caa7e698b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2011 Fujitsu. All rights reserved.
* Written by Miao Xie <miaox@cn.fujitsu.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/slab.h>
@@ -473,13 +460,10 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root)
{
int seq = atomic_inc_return(&delayed_root->items_seq);
- /*
- * atomic_dec_return implies a barrier for waitqueue_active
- */
+ /* atomic_dec_return implies a barrier */
if ((atomic_dec_return(&delayed_root->items) <
- BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) &&
- waitqueue_active(&delayed_root->wait))
- wake_up(&delayed_root->wait);
+ BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0))
+ cond_wake_up_nomb(&delayed_root->wait);
}
static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
@@ -569,6 +553,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
dst_rsv = &fs_info->delayed_block_rsv;
num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
+
+ /*
+ * Here we migrate space rsv from transaction rsv, since have already
+ * reserved space when starting a transaction. So no need to reserve
+ * qgroup space here.
+ */
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
if (!ret) {
trace_btrfs_space_reservation(fs_info, "delayed_item",
@@ -590,7 +580,10 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
return;
rsv = &fs_info->delayed_block_rsv;
- btrfs_qgroup_convert_reserved_meta(root, item->bytes_reserved);
+ /*
+ * Check btrfs_delayed_item_reserve_metadata() to see why we don't need
+ * to release/reserve qgroup space.
+ */
trace_btrfs_space_reservation(fs_info, "delayed_item",
item->key.objectid, item->bytes_reserved,
0);
@@ -615,9 +608,6 @@ static int btrfs_delayed_inode_reserve_metadata(
num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
- ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
- if (ret < 0)
- return ret;
/*
* btrfs_dirty_inode will update the inode under btrfs_join_transaction
* which doesn't reserve space for speed. This is a problem since we
@@ -629,6 +619,10 @@ static int btrfs_delayed_inode_reserve_metadata(
*/
if (!src_rsv || (!trans->bytes_reserved &&
src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
+ ret = btrfs_qgroup_reserve_meta_prealloc(root,
+ fs_info->nodesize, true);
+ if (ret < 0)
+ return ret;
ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
BTRFS_RESERVE_NO_FLUSH);
/*
@@ -647,6 +641,8 @@ static int btrfs_delayed_inode_reserve_metadata(
"delayed_inode",
btrfs_ino(inode),
num_bytes, 1);
+ } else {
+ btrfs_qgroup_free_meta_prealloc(root, fs_info->nodesize);
}
return ret;
}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 100a91e26b55..ca7a97f3ab6b 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -1,24 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2011 Fujitsu. All rights reserved.
* Written by Miao Xie <miaox@cn.fujitsu.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __DELAYED_TREE_OPERATION_H
-#define __DELAYED_TREE_OPERATION_H
+#ifndef BTRFS_DELAYED_INODE_H
+#define BTRFS_DELAYED_INODE_H
#include <linux/rbtree.h>
#include <linux/spinlock.h>
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 2677257c149d..03dec673d12a 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2009 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/sched.h>
@@ -299,10 +286,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans,
}
void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_node *ref;
struct rb_node *node;
u64 seq = 0;
@@ -336,9 +323,7 @@ again:
}
}
-int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_root *delayed_refs,
- u64 seq)
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
{
struct seq_list *elem;
int ret = 0;
@@ -349,10 +334,9 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
struct seq_list, list);
if (seq >= elem->seq) {
btrfs_debug(fs_info,
- "holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)",
+ "holding back delayed_ref %#x.%x, lowest is %#x.%x",
(u32)(seq >> 32), (u32)seq,
- (u32)(elem->seq >> 32), (u32)elem->seq,
- delayed_refs);
+ (u32)(elem->seq >> 32), (u32)elem->seq);
ret = 1;
}
}
@@ -542,31 +526,20 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
spin_unlock(&existing->lock);
}
-/*
- * helper function to actually insert a head node into the rbtree.
- * this does all the dirty work in terms of maintaining the correct
- * overall modification count.
- */
-static noinline struct btrfs_delayed_ref_head *
-add_delayed_ref_head(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *head_ref,
- struct btrfs_qgroup_extent_record *qrecord,
- u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved,
- int action, int is_data, int *qrecord_inserted_ret,
- int *old_ref_mod, int *new_ref_mod)
+static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
+ struct btrfs_qgroup_extent_record *qrecord,
+ u64 bytenr, u64 num_bytes, u64 ref_root,
+ u64 reserved, int action, bool is_data,
+ bool is_system)
{
- struct btrfs_delayed_ref_head *existing;
- struct btrfs_delayed_ref_root *delayed_refs;
int count_mod = 1;
int must_insert_reserved = 0;
- int qrecord_inserted = 0;
/* If reserved is provided, it must be a data extent. */
BUG_ON(!is_data && reserved);
/*
- * the head node stores the sum of all the mods, so dropping a ref
+ * The head node stores the sum of all the mods, so dropping a ref
* should drop the sum in the head node by one.
*/
if (action == BTRFS_UPDATE_DELAYED_HEAD)
@@ -575,12 +548,11 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
count_mod = -1;
/*
- * BTRFS_ADD_DELAYED_EXTENT means that we need to update
- * the reserved accounting when the extent is finally added, or
- * if a later modification deletes the delayed ref without ever
- * inserting the extent into the extent allocation tree.
- * ref->must_insert_reserved is the flag used to record
- * that accounting mods are required.
+ * BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved
+ * accounting when the extent is finally added, or if a later
+ * modification deletes the delayed ref without ever inserting the
+ * extent into the extent allocation tree. ref->must_insert_reserved
+ * is the flag used to record that accounting mods are required.
*
* Once we record must_insert_reserved, switch the action to
* BTRFS_ADD_DELAYED_REF because other special casing is not required.
@@ -590,14 +562,13 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
else
must_insert_reserved = 0;
- delayed_refs = &trans->transaction->delayed_refs;
-
refcount_set(&head_ref->refs, 1);
head_ref->bytenr = bytenr;
head_ref->num_bytes = num_bytes;
head_ref->ref_mod = count_mod;
head_ref->must_insert_reserved = must_insert_reserved;
head_ref->is_data = is_data;
+ head_ref->is_system = is_system;
head_ref->ref_tree = RB_ROOT;
INIT_LIST_HEAD(&head_ref->ref_add_list);
RB_CLEAR_NODE(&head_ref->href_node);
@@ -608,7 +579,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
- /* Record qgroup extent info if provided */
if (qrecord) {
if (ref_root && reserved) {
head_ref->qgroup_ref_root = ref_root;
@@ -618,20 +588,44 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
qrecord->bytenr = bytenr;
qrecord->num_bytes = num_bytes;
qrecord->old_roots = NULL;
+ }
+}
+
+/*
+ * helper function to actually insert a head node into the rbtree.
+ * this does all the dirty work in terms of maintaining the correct
+ * overall modification count.
+ */
+static noinline struct btrfs_delayed_ref_head *
+add_delayed_ref_head(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head_ref,
+ struct btrfs_qgroup_extent_record *qrecord,
+ int action, int *qrecord_inserted_ret,
+ int *old_ref_mod, int *new_ref_mod)
+{
+ struct btrfs_delayed_ref_head *existing;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int qrecord_inserted = 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
- if(btrfs_qgroup_trace_extent_nolock(fs_info,
+ /* Record qgroup extent info if provided */
+ if (qrecord) {
+ if (btrfs_qgroup_trace_extent_nolock(trans->fs_info,
delayed_refs, qrecord))
kfree(qrecord);
else
qrecord_inserted = 1;
}
- trace_add_delayed_ref_head(fs_info, head_ref, action);
+ trace_add_delayed_ref_head(trans->fs_info, head_ref, action);
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
if (existing) {
- WARN_ON(ref_root && reserved && existing->qgroup_ref_root
+ WARN_ON(qrecord && head_ref->qgroup_ref_root
+ && head_ref->qgroup_reserved
+ && existing->qgroup_ref_root
&& existing->qgroup_reserved);
update_existing_head_ref(delayed_refs, existing, head_ref,
old_ref_mod);
@@ -644,8 +638,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
} else {
if (old_ref_mod)
*old_ref_mod = 0;
- if (is_data && count_mod < 0)
- delayed_refs->pending_csums += num_bytes;
+ if (head_ref->is_data && head_ref->ref_mod < 0)
+ delayed_refs->pending_csums += head_ref->num_bytes;
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
@@ -655,90 +649,48 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
*qrecord_inserted_ret = qrecord_inserted;
if (new_ref_mod)
*new_ref_mod = head_ref->total_ref_mod;
- return head_ref;
-}
-
-/*
- * helper to insert a delayed tree ref into the rbtree.
- */
-static noinline void
-add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *head_ref,
- struct btrfs_delayed_ref_node *ref, u64 bytenr,
- u64 num_bytes, u64 parent, u64 ref_root, int level,
- int action)
-{
- struct btrfs_delayed_tree_ref *full_ref;
- struct btrfs_delayed_ref_root *delayed_refs;
- u64 seq = 0;
- int ret;
-
- if (action == BTRFS_ADD_DELAYED_EXTENT)
- action = BTRFS_ADD_DELAYED_REF;
-
- if (is_fstree(ref_root))
- seq = atomic64_read(&fs_info->tree_mod_seq);
- delayed_refs = &trans->transaction->delayed_refs;
- /* first set the basic ref node struct up */
- refcount_set(&ref->refs, 1);
- ref->bytenr = bytenr;
- ref->num_bytes = num_bytes;
- ref->ref_mod = 1;
- ref->action = action;
- ref->is_head = 0;
- ref->in_tree = 1;
- ref->seq = seq;
- RB_CLEAR_NODE(&ref->ref_node);
- INIT_LIST_HEAD(&ref->add_list);
-
- full_ref = btrfs_delayed_node_to_tree_ref(ref);
- full_ref->parent = parent;
- full_ref->root = ref_root;
- if (parent)
- ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
- else
- ref->type = BTRFS_TREE_BLOCK_REF_KEY;
- full_ref->level = level;
-
- trace_add_delayed_tree_ref(fs_info, ref, full_ref, action);
-
- ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref);
-
- /*
- * XXX: memory should be freed at the same level allocated.
- * But bad practice is anywhere... Follow it now. Need cleanup.
- */
- if (ret > 0)
- kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
+ return head_ref;
}
/*
- * helper to insert a delayed data ref into the rbtree.
+ * init_delayed_ref_common - Initialize the structure which represents a
+ * modification to a an extent.
+ *
+ * @fs_info: Internal to the mounted filesystem mount structure.
+ *
+ * @ref: The structure which is going to be initialized.
+ *
+ * @bytenr: The logical address of the extent for which a modification is
+ * going to be recorded.
+ *
+ * @num_bytes: Size of the extent whose modification is being recorded.
+ *
+ * @ref_root: The id of the root where this modification has originated, this
+ * can be either one of the well-known metadata trees or the
+ * subvolume id which references this extent.
+ *
+ * @action: Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or
+ * BTRFS_ADD_DELAYED_EXTENT
+ *
+ * @ref_type: Holds the type of the extent which is being recorded, can be
+ * one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY
+ * when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/
+ * BTRFS_EXTENT_DATA_REF_KEY when recording data extent
*/
-static noinline void
-add_delayed_data_ref(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *head_ref,
- struct btrfs_delayed_ref_node *ref, u64 bytenr,
- u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
- u64 offset, int action)
+static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_node *ref,
+ u64 bytenr, u64 num_bytes, u64 ref_root,
+ int action, u8 ref_type)
{
- struct btrfs_delayed_data_ref *full_ref;
- struct btrfs_delayed_ref_root *delayed_refs;
u64 seq = 0;
- int ret;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
- delayed_refs = &trans->transaction->delayed_refs;
-
if (is_fstree(ref_root))
seq = atomic64_read(&fs_info->tree_mod_seq);
- /* first set the basic ref node struct up */
refcount_set(&ref->refs, 1);
ref->bytenr = bytenr;
ref->num_bytes = num_bytes;
@@ -747,25 +699,9 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
ref->is_head = 0;
ref->in_tree = 1;
ref->seq = seq;
+ ref->type = ref_type;
RB_CLEAR_NODE(&ref->ref_node);
INIT_LIST_HEAD(&ref->add_list);
-
- full_ref = btrfs_delayed_node_to_data_ref(ref);
- full_ref->parent = parent;
- full_ref->root = ref_root;
- if (parent)
- ref->type = BTRFS_SHARED_DATA_REF_KEY;
- else
- ref->type = BTRFS_EXTENT_DATA_REF_KEY;
-
- full_ref->objectid = owner;
- full_ref->offset = offset;
-
- trace_add_delayed_data_ref(fs_info, ref, full_ref, action);
-
- ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref);
- if (ret > 0)
- kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
}
/*
@@ -785,12 +721,25 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
int qrecord_inserted;
+ bool is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID);
+ int ret;
+ u8 ref_type;
BUG_ON(extent_op && extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
if (!ref)
return -ENOMEM;
+ if (parent)
+ ref_type = BTRFS_SHARED_BLOCK_REF_KEY;
+ else
+ ref_type = BTRFS_TREE_BLOCK_REF_KEY;
+ init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
+ ref_root, action, ref_type);
+ ref->root = ref_root;
+ ref->parent = parent;
+ ref->level = level;
+
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref)
goto free_ref;
@@ -802,6 +751,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
goto free_head_ref;
}
+ init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
+ ref_root, 0, action, false, is_system);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -811,15 +762,19 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record,
- bytenr, num_bytes, 0, 0, action, 0,
- &qrecord_inserted, old_ref_mod,
- new_ref_mod);
+ head_ref = add_delayed_ref_head(trans, head_ref, record,
+ action, &qrecord_inserted,
+ old_ref_mod, new_ref_mod);
- add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
- num_bytes, parent, ref_root, level, action);
+ ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
+ trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
+ action == BTRFS_ADD_DELAYED_EXTENT ?
+ BTRFS_ADD_DELAYED_REF : action);
+ if (ret > 0)
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+
if (qrecord_inserted)
btrfs_qgroup_trace_extent_post(fs_info, record);
@@ -848,11 +803,25 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
int qrecord_inserted;
+ int ret;
+ u8 ref_type;
ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
if (!ref)
return -ENOMEM;
+ if (parent)
+ ref_type = BTRFS_SHARED_DATA_REF_KEY;
+ else
+ ref_type = BTRFS_EXTENT_DATA_REF_KEY;
+ init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
+ ref_root, action, ref_type);
+ ref->root = ref_root;
+ ref->parent = parent;
+ ref->objectid = owner;
+ ref->offset = offset;
+
+
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref) {
kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
@@ -870,6 +839,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
}
}
+ init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
+ reserved, action, true, false);
head_ref->extent_op = NULL;
delayed_refs = &trans->transaction->delayed_refs;
@@ -879,16 +850,20 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record,
- bytenr, num_bytes, ref_root, reserved,
- action, 1, &qrecord_inserted,
+ head_ref = add_delayed_ref_head(trans, head_ref, record,
+ action, &qrecord_inserted,
old_ref_mod, new_ref_mod);
- add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
- num_bytes, parent, ref_root, owner, offset,
- action);
+ ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
+ trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
+ action == BTRFS_ADD_DELAYED_EXTENT ?
+ BTRFS_ADD_DELAYED_REF : action);
+ if (ret > 0)
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+
+
if (qrecord_inserted)
return btrfs_qgroup_trace_extent_post(fs_info, record);
return 0;
@@ -906,14 +881,16 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
if (!head_ref)
return -ENOMEM;
+ init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
+ BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data,
+ false);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- add_delayed_ref_head(fs_info, trans, head_ref, NULL, bytenr,
- num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD,
- extent_op->is_data, NULL, NULL, NULL);
+ add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
+ NULL, NULL, NULL);
spin_unlock(&delayed_refs->lock);
return 0;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 9e3e5aff0937..ea1aecb6a50d 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -1,22 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __DELAYED_REF__
-#define __DELAYED_REF__
+
+#ifndef BTRFS_DELAYED_REF_H
+#define BTRFS_DELAYED_REF_H
#include <linux/refcount.h>
@@ -139,6 +127,7 @@ struct btrfs_delayed_ref_head {
*/
unsigned int must_insert_reserved:1;
unsigned int is_data:1;
+ unsigned int is_system:1;
unsigned int processing:1;
};
@@ -262,7 +251,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op);
void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
@@ -280,9 +268,7 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
struct btrfs_delayed_ref_head *
btrfs_select_ref_head(struct btrfs_trans_handle *trans);
-int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_root *delayed_refs,
- u64 seq);
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
/*
* helper functions to cast a node into its container
@@ -298,4 +284,5 @@ btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node)
{
return container_of(node, struct btrfs_delayed_data_ref, node);
}
+
#endif
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 0d203633bb96..e2ba0419297a 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -1,20 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) STRATO AG 2012. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
+
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
@@ -45,8 +33,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
struct btrfs_device *srcdev,
struct btrfs_device *tgtdev);
static int btrfs_dev_replace_kthread(void *data);
-static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
-
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
{
@@ -191,6 +177,105 @@ out:
}
/*
+ * Initialize a new device for device replace target from a given source dev
+ * and path.
+ *
+ * Return 0 and new device in @device_out, otherwise return < 0
+ */
+static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+ const char *device_path,
+ struct btrfs_device *srcdev,
+ struct btrfs_device **device_out)
+{
+ struct btrfs_device *device;
+ struct block_device *bdev;
+ struct list_head *devices;
+ struct rcu_string *name;
+ u64 devid = BTRFS_DEV_REPLACE_DEVID;
+ int ret = 0;
+
+ *device_out = NULL;
+ if (fs_info->fs_devices->seeding) {
+ btrfs_err(fs_info, "the filesystem is a seed filesystem!");
+ return -EINVAL;
+ }
+
+ bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
+ fs_info->bdev_holder);
+ if (IS_ERR(bdev)) {
+ btrfs_err(fs_info, "target device %s is invalid!", device_path);
+ return PTR_ERR(bdev);
+ }
+
+ filemap_write_and_wait(bdev->bd_inode->i_mapping);
+
+ devices = &fs_info->fs_devices->devices;
+ list_for_each_entry(device, devices, dev_list) {
+ if (device->bdev == bdev) {
+ btrfs_err(fs_info,
+ "target device is in the filesystem!");
+ ret = -EEXIST;
+ goto error;
+ }
+ }
+
+
+ if (i_size_read(bdev->bd_inode) <
+ btrfs_device_get_total_bytes(srcdev)) {
+ btrfs_err(fs_info,
+ "target device is smaller than source device!");
+ ret = -EINVAL;
+ goto error;
+ }
+
+
+ device = btrfs_alloc_device(NULL, &devid, NULL);
+ if (IS_ERR(device)) {
+ ret = PTR_ERR(device);
+ goto error;
+ }
+
+ name = rcu_string_strdup(device_path, GFP_KERNEL);
+ if (!name) {
+ btrfs_free_device(device);
+ ret = -ENOMEM;
+ goto error;
+ }
+ rcu_assign_pointer(device->name, name);
+
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ device->generation = 0;
+ device->io_width = fs_info->sectorsize;
+ device->io_align = fs_info->sectorsize;
+ device->sector_size = fs_info->sectorsize;
+ device->total_bytes = btrfs_device_get_total_bytes(srcdev);
+ device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
+ device->bytes_used = btrfs_device_get_bytes_used(srcdev);
+ device->commit_total_bytes = srcdev->commit_total_bytes;
+ device->commit_bytes_used = device->bytes_used;
+ device->fs_info = fs_info;
+ device->bdev = bdev;
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+ device->mode = FMODE_EXCL;
+ device->dev_stats_valid = 1;
+ set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
+ device->fs_devices = fs_info->fs_devices;
+ list_add(&device->dev_list, &fs_info->fs_devices->devices);
+ fs_info->fs_devices->num_devices++;
+ fs_info->fs_devices->open_devices++;
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ *device_out = device;
+ return 0;
+
+error:
+ blkdev_put(bdev, FMODE_EXCL);
+ return ret;
+}
+
+/*
* called from commit_transaction. Writes changed device replace state to
* disk.
*/
@@ -329,18 +414,13 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgt_device = NULL;
struct btrfs_device *src_device = NULL;
- /* the disk copy procedure reuses the scrub code */
- mutex_lock(&fs_info->volume_mutex);
ret = btrfs_find_device_by_devspec(fs_info, srcdevid,
srcdev_name, &src_device);
- if (ret) {
- mutex_unlock(&fs_info->volume_mutex);
+ if (ret)
return ret;
- }
ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
src_device, &tgt_device);
- mutex_unlock(&fs_info->volume_mutex);
if (ret)
return ret;
@@ -372,7 +452,6 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
dev_replace->cont_reading_from_srcdev_mode = read_src;
WARN_ON(!src_device);
dev_replace->srcdev = src_device;
- WARN_ON(!tgt_device);
dev_replace->tgtdev = tgt_device;
btrfs_info_in_rcu(fs_info,
@@ -515,7 +594,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, -1);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
@@ -530,7 +609,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
ret = btrfs_commit_transaction(trans);
WARN_ON(ret);
- mutex_lock(&uuid_mutex);
/* keep away write_all_supers() during the finishing procedure */
mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
@@ -557,7 +635,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_dev_replace_write_unlock(dev_replace);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- mutex_unlock(&uuid_mutex);
btrfs_rm_dev_replace_blocked(fs_info);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
@@ -608,7 +685,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
*/
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- mutex_unlock(&uuid_mutex);
/* replace the sysfs entry */
btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
@@ -812,7 +888,17 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
}
btrfs_dev_replace_write_unlock(dev_replace);
- WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
+ /*
+ * This could collide with a paused balance, but the exclusive op logic
+ * should never allow both to start and pause. We don't want to allow
+ * dev-replace to start anyway.
+ */
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ btrfs_info(fs_info,
+ "cannot resume dev-replace, other exclusive operation running");
+ return 0;
+ }
+
task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
return PTR_ERR_OR_ZERO(task);
}
@@ -822,6 +908,7 @@ static int btrfs_dev_replace_kthread(void *data)
struct btrfs_fs_info *fs_info = data;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
u64 progress;
+ int ret;
progress = btrfs_dev_replace_progress(fs_info);
progress = div_u64(progress, 10);
@@ -832,23 +919,14 @@ static int btrfs_dev_replace_kthread(void *data)
btrfs_dev_name(dev_replace->tgtdev),
(unsigned int)progress);
- btrfs_dev_replace_continue_on_mount(fs_info);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
-
- return 0;
-}
-
-static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- int ret;
-
ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
dev_replace->committed_cursor_left,
btrfs_device_get_total_bytes(dev_replace->srcdev),
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
WARN_ON(ret);
+
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
return 0;
}
@@ -928,9 +1006,9 @@ void btrfs_dev_replace_clear_lock_blocking(
ASSERT(atomic_read(&dev_replace->read_locks) > 0);
ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
read_lock(&dev_replace->lock);
- if (atomic_dec_and_test(&dev_replace->blocking_readers) &&
- waitqueue_active(&dev_replace->read_lock_wq))
- wake_up(&dev_replace->read_lock_wq);
+ /* Barrier implied by atomic_dec_and_test */
+ if (atomic_dec_and_test(&dev_replace->blocking_readers))
+ cond_wake_up_nomb(&dev_replace->read_lock_wq);
}
void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
@@ -941,9 +1019,7 @@ void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
percpu_counter_sub(&fs_info->bio_counter, amount);
-
- if (waitqueue_active(&fs_info->replace_wait))
- wake_up(&fs_info->replace_wait);
+ cond_wake_up_nomb(&fs_info->replace_wait);
}
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 8566a02ef222..b6d4206188bb 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) STRATO AG 2012. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#if !defined(__BTRFS_DEV_REPLACE__)
-#define __BTRFS_DEV_REPLACE__
+#ifndef BTRFS_DEV_REPLACE_H
+#define BTRFS_DEV_REPLACE_H
struct btrfs_ioctl_dev_replace_args;
@@ -48,4 +35,5 @@ static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
{
atomic64_inc(stat_value);
}
+
#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 29e967b2c667..39e9766d1cbd 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include "ctree.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07b5e6f7df67..205092dc9390 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
@@ -68,7 +55,6 @@
static const struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
-static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info);
@@ -429,7 +415,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
static int verify_level_key(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb, int level,
- struct btrfs_key *first_key)
+ struct btrfs_key *first_key, u64 parent_transid)
{
int found_level;
struct btrfs_key found_key;
@@ -449,6 +435,14 @@ static int verify_level_key(struct btrfs_fs_info *fs_info,
if (!first_key)
return 0;
+ /*
+ * For live tree block (new tree blocks in current transaction),
+ * we need proper lock context to avoid race, which is impossible here.
+ * So we only checks tree blocks which is read from disk, whose
+ * generation <= fs_info->last_trans_committed.
+ */
+ if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
+ return 0;
if (found_level)
btrfs_node_key_to_cpu(eb, &found_key, 0);
else
@@ -459,10 +453,11 @@ static int verify_level_key(struct btrfs_fs_info *fs_info,
if (ret) {
WARN_ON(1);
btrfs_err(fs_info,
-"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) has=(%llu, %u, %llu)",
- eb->start, first_key->objectid, first_key->type,
- first_key->offset, found_key.objectid,
- found_key.type, found_key.offset);
+"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
+ eb->start, parent_transid, first_key->objectid,
+ first_key->type, first_key->offset,
+ found_key.objectid, found_key.type,
+ found_key.offset);
}
#endif
return ret;
@@ -498,7 +493,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
parent_transid, 0))
ret = -EIO;
else if (verify_level_key(fs_info, eb, level,
- first_key))
+ first_key, parent_transid))
ret = -EUCLEAN;
else
break;
@@ -1190,7 +1185,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->inode_tree = RB_ROOT;
INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
root->block_rsv = NULL;
- root->orphan_block_rsv = NULL;
INIT_LIST_HEAD(&root->dirty_list);
INIT_LIST_HEAD(&root->root_list);
@@ -1200,7 +1194,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&root->ordered_root);
INIT_LIST_HEAD(&root->logged_list[0]);
INIT_LIST_HEAD(&root->logged_list[1]);
- spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
spin_lock_init(&root->delalloc_lock);
spin_lock_init(&root->ordered_extent_lock);
@@ -1221,7 +1214,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
atomic_set(&root->log_batch, 0);
- atomic_set(&root->orphan_inodes, 0);
refcount_set(&root->refs, 1);
atomic_set(&root->will_be_snapshotted, 0);
root->log_transid = 0;
@@ -1829,6 +1821,7 @@ static int transaction_kthread(void *arg)
now = get_seconds();
if (cur->state < TRANS_STATE_BLOCKED &&
+ !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) &&
(now < cur->start_time ||
now - cur->start_time < fs_info->commit_interval)) {
spin_unlock(&fs_info->trans_lock);
@@ -2168,7 +2161,6 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
{
spin_lock_init(&fs_info->balance_lock);
mutex_init(&fs_info->balance_mutex);
- atomic_set(&fs_info->balance_running, 0);
atomic_set(&fs_info->balance_pause_req, 0);
atomic_set(&fs_info->balance_cancel_req, 0);
fs_info->balance_ctl = NULL;
@@ -2446,6 +2438,211 @@ out:
return ret;
}
+/*
+ * Real super block validation
+ * NOTE: super csum type and incompat features will not be checked here.
+ *
+ * @sb: super block to check
+ * @mirror_num: the super block number to check its bytenr:
+ * 0 the primary (1st) sb
+ * 1, 2 2nd and 3rd backup copy
+ * -1 skip bytenr check
+ */
+static int validate_super(struct btrfs_fs_info *fs_info,
+ struct btrfs_super_block *sb, int mirror_num)
+{
+ u64 nodesize = btrfs_super_nodesize(sb);
+ u64 sectorsize = btrfs_super_sectorsize(sb);
+ int ret = 0;
+
+ if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
+ btrfs_err(fs_info, "no valid FS found");
+ ret = -EINVAL;
+ }
+ if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
+ btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
+ btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
+ ret = -EINVAL;
+ }
+ if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
+ btrfs_err(fs_info, "tree_root level too big: %d >= %d",
+ btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+ if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
+ btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
+ btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+ if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
+ btrfs_err(fs_info, "log_root level too big: %d >= %d",
+ btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+
+ /*
+ * Check sectorsize and nodesize first, other check will need it.
+ * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
+ */
+ if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
+ sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
+ ret = -EINVAL;
+ }
+ /* Only PAGE SIZE is supported yet */
+ if (sectorsize != PAGE_SIZE) {
+ btrfs_err(fs_info,
+ "sectorsize %llu not supported yet, only support %lu",
+ sectorsize, PAGE_SIZE);
+ ret = -EINVAL;
+ }
+ if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
+ nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
+ ret = -EINVAL;
+ }
+ if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
+ btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
+ le32_to_cpu(sb->__unused_leafsize), nodesize);
+ ret = -EINVAL;
+ }
+
+ /* Root alignment check */
+ if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
+ btrfs_warn(fs_info, "tree_root block unaligned: %llu",
+ btrfs_super_root(sb));
+ ret = -EINVAL;
+ }
+ if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
+ btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
+ btrfs_super_chunk_root(sb));
+ ret = -EINVAL;
+ }
+ if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
+ btrfs_warn(fs_info, "log_root block unaligned: %llu",
+ btrfs_super_log_root(sb));
+ ret = -EINVAL;
+ }
+
+ if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) {
+ btrfs_err(fs_info,
+ "dev_item UUID does not match fsid: %pU != %pU",
+ fs_info->fsid, sb->dev_item.fsid);
+ ret = -EINVAL;
+ }
+
+ /*
+ * Hint to catch really bogus numbers, bitflips or so, more exact checks are
+ * done later
+ */
+ if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
+ btrfs_err(fs_info, "bytes_used is too small %llu",
+ btrfs_super_bytes_used(sb));
+ ret = -EINVAL;
+ }
+ if (!is_power_of_2(btrfs_super_stripesize(sb))) {
+ btrfs_err(fs_info, "invalid stripesize %u",
+ btrfs_super_stripesize(sb));
+ ret = -EINVAL;
+ }
+ if (btrfs_super_num_devices(sb) > (1UL << 31))
+ btrfs_warn(fs_info, "suspicious number of devices: %llu",
+ btrfs_super_num_devices(sb));
+ if (btrfs_super_num_devices(sb) == 0) {
+ btrfs_err(fs_info, "number of devices is 0");
+ ret = -EINVAL;
+ }
+
+ if (mirror_num >= 0 &&
+ btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
+ btrfs_err(fs_info, "super offset mismatch %llu != %u",
+ btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
+ ret = -EINVAL;
+ }
+
+ /*
+ * Obvious sys_chunk_array corruptions, it must hold at least one key
+ * and one chunk
+ */
+ if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+ btrfs_err(fs_info, "system chunk array too big %u > %u",
+ btrfs_super_sys_array_size(sb),
+ BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
+ ret = -EINVAL;
+ }
+ if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
+ + sizeof(struct btrfs_chunk)) {
+ btrfs_err(fs_info, "system chunk array too small %u < %zu",
+ btrfs_super_sys_array_size(sb),
+ sizeof(struct btrfs_disk_key)
+ + sizeof(struct btrfs_chunk));
+ ret = -EINVAL;
+ }
+
+ /*
+ * The generation is a global counter, we'll trust it more than the others
+ * but it's still possible that it's the one that's wrong.
+ */
+ if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
+ btrfs_warn(fs_info,
+ "suspicious: generation < chunk_root_generation: %llu < %llu",
+ btrfs_super_generation(sb),
+ btrfs_super_chunk_root_generation(sb));
+ if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
+ && btrfs_super_cache_generation(sb) != (u64)-1)
+ btrfs_warn(fs_info,
+ "suspicious: generation < cache_generation: %llu < %llu",
+ btrfs_super_generation(sb),
+ btrfs_super_cache_generation(sb));
+
+ return ret;
+}
+
+/*
+ * Validation of super block at mount time.
+ * Some checks already done early at mount time, like csum type and incompat
+ * flags will be skipped.
+ */
+static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
+{
+ return validate_super(fs_info, fs_info->super_copy, 0);
+}
+
+/*
+ * Validation of super block at write time.
+ * Some checks like bytenr check will be skipped as their values will be
+ * overwritten soon.
+ * Extra checks like csum type and incompat flags will be done here.
+ */
+static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
+ struct btrfs_super_block *sb)
+{
+ int ret;
+
+ ret = validate_super(fs_info, sb, -1);
+ if (ret < 0)
+ goto out;
+ if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) {
+ ret = -EUCLEAN;
+ btrfs_err(fs_info, "invalid csum type, has %u want %u",
+ btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
+ goto out;
+ }
+ if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
+ ret = -EUCLEAN;
+ btrfs_err(fs_info,
+ "invalid incompat flags, has 0x%llx valid mask 0x%llx",
+ btrfs_super_incompat_flags(sb),
+ (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
+ goto out;
+ }
+out:
+ if (ret < 0)
+ btrfs_err(fs_info,
+ "super block corruption detected before writing it to disk");
+ return ret;
+}
+
int open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options)
@@ -2605,7 +2802,6 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->chunk_mutex);
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
- mutex_init(&fs_info->volume_mutex);
mutex_init(&fs_info->ro_block_group_mutex);
init_rwsem(&fs_info->commit_root_sem);
init_rwsem(&fs_info->cleanup_work_sem);
@@ -2672,7 +2868,7 @@ int open_ctree(struct super_block *sb,
memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
- ret = btrfs_check_super_valid(fs_info);
+ ret = btrfs_validate_mount_super(fs_info);
if (ret) {
btrfs_err(fs_info, "superblock contains fatal errors");
err = -EINVAL;
@@ -3527,7 +3723,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
if (raid_type == BTRFS_RAID_SINGLE)
continue;
- if (!(flags & btrfs_raid_group[raid_type]))
+ if (!(flags & btrfs_raid_array[raid_type].bg_flag))
continue;
min_tolerated = min(min_tolerated,
btrfs_raid_array[raid_type].
@@ -3607,6 +3803,14 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
flags = btrfs_super_flags(sb);
btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
+ ret = btrfs_validate_write_super(fs_info, sb);
+ if (ret < 0) {
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ btrfs_handle_fs_error(fs_info, -EUCLEAN,
+ "unexpected superblock corruption detected");
+ return -EUCLEAN;
+ }
+
ret = write_dev_supers(dev, sb, max_mirrors);
if (ret)
total_errors++;
@@ -3678,8 +3882,6 @@ static void free_fs_root(struct btrfs_root *root)
{
iput(root->ino_cache_inode);
WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
- btrfs_free_block_rsv(root->fs_info, root->orphan_block_rsv);
- root->orphan_block_rsv = NULL;
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
if (root->subv_writers)
@@ -3770,7 +3972,6 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info)
void close_ctree(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *root = fs_info->tree_root;
int ret;
set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
@@ -3812,7 +4013,8 @@ void close_ctree(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info, "commit super ret %d", ret);
}
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
+ test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
btrfs_error_commit_super(fs_info);
kthread_stop(fs_info->transaction_kthread);
@@ -3821,6 +4023,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
btrfs_free_qgroup_config(fs_info);
+ ASSERT(list_empty(&fs_info->delalloc_roots));
if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
btrfs_info(fs_info, "at unmount delalloc count %lld",
@@ -3864,9 +4067,6 @@ void close_ctree(struct btrfs_fs_info *fs_info)
btrfs_free_stripe_hash_table(fs_info);
btrfs_free_ref_cache(fs_info);
- __btrfs_free_block_rsv(root->orphan_block_rsv);
- root->orphan_block_rsv = NULL;
-
while (!list_empty(&fs_info->pinned_chunks)) {
struct extent_map *em;
@@ -3977,166 +4177,17 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
level, first_key);
}
-static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_super_block *sb = fs_info->super_copy;
- u64 nodesize = btrfs_super_nodesize(sb);
- u64 sectorsize = btrfs_super_sectorsize(sb);
- int ret = 0;
-
- if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
- btrfs_err(fs_info, "no valid FS found");
- ret = -EINVAL;
- }
- if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
- btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
- btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
- ret = -EINVAL;
- }
- if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
- btrfs_err(fs_info, "tree_root level too big: %d >= %d",
- btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
- ret = -EINVAL;
- }
- if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
- btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
- btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
- ret = -EINVAL;
- }
- if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
- btrfs_err(fs_info, "log_root level too big: %d >= %d",
- btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
- ret = -EINVAL;
- }
-
- /*
- * Check sectorsize and nodesize first, other check will need it.
- * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
- */
- if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
- sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
- btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
- ret = -EINVAL;
- }
- /* Only PAGE SIZE is supported yet */
- if (sectorsize != PAGE_SIZE) {
- btrfs_err(fs_info,
- "sectorsize %llu not supported yet, only support %lu",
- sectorsize, PAGE_SIZE);
- ret = -EINVAL;
- }
- if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
- nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
- btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
- ret = -EINVAL;
- }
- if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
- btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
- le32_to_cpu(sb->__unused_leafsize), nodesize);
- ret = -EINVAL;
- }
-
- /* Root alignment check */
- if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
- btrfs_warn(fs_info, "tree_root block unaligned: %llu",
- btrfs_super_root(sb));
- ret = -EINVAL;
- }
- if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
- btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
- btrfs_super_chunk_root(sb));
- ret = -EINVAL;
- }
- if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
- btrfs_warn(fs_info, "log_root block unaligned: %llu",
- btrfs_super_log_root(sb));
- ret = -EINVAL;
- }
-
- if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) {
- btrfs_err(fs_info,
- "dev_item UUID does not match fsid: %pU != %pU",
- fs_info->fsid, sb->dev_item.fsid);
- ret = -EINVAL;
- }
-
- /*
- * Hint to catch really bogus numbers, bitflips or so, more exact checks are
- * done later
- */
- if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
- btrfs_err(fs_info, "bytes_used is too small %llu",
- btrfs_super_bytes_used(sb));
- ret = -EINVAL;
- }
- if (!is_power_of_2(btrfs_super_stripesize(sb))) {
- btrfs_err(fs_info, "invalid stripesize %u",
- btrfs_super_stripesize(sb));
- ret = -EINVAL;
- }
- if (btrfs_super_num_devices(sb) > (1UL << 31))
- btrfs_warn(fs_info, "suspicious number of devices: %llu",
- btrfs_super_num_devices(sb));
- if (btrfs_super_num_devices(sb) == 0) {
- btrfs_err(fs_info, "number of devices is 0");
- ret = -EINVAL;
- }
-
- if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
- btrfs_err(fs_info, "super offset mismatch %llu != %u",
- btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
- ret = -EINVAL;
- }
-
- /*
- * Obvious sys_chunk_array corruptions, it must hold at least one key
- * and one chunk
- */
- if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
- btrfs_err(fs_info, "system chunk array too big %u > %u",
- btrfs_super_sys_array_size(sb),
- BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
- ret = -EINVAL;
- }
- if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
- + sizeof(struct btrfs_chunk)) {
- btrfs_err(fs_info, "system chunk array too small %u < %zu",
- btrfs_super_sys_array_size(sb),
- sizeof(struct btrfs_disk_key)
- + sizeof(struct btrfs_chunk));
- ret = -EINVAL;
- }
-
- /*
- * The generation is a global counter, we'll trust it more than the others
- * but it's still possible that it's the one that's wrong.
- */
- if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
- btrfs_warn(fs_info,
- "suspicious: generation < chunk_root_generation: %llu < %llu",
- btrfs_super_generation(sb),
- btrfs_super_chunk_root_generation(sb));
- if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
- && btrfs_super_cache_generation(sb) != (u64)-1)
- btrfs_warn(fs_info,
- "suspicious: generation < cache_generation: %llu < %llu",
- btrfs_super_generation(sb),
- btrfs_super_cache_generation(sb));
-
- return ret;
-}
-
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
{
+ /* cleanup FS via transaction */
+ btrfs_cleanup_transaction(fs_info);
+
mutex_lock(&fs_info->cleaner_mutex);
btrfs_run_delayed_iputs(fs_info);
mutex_unlock(&fs_info->cleaner_mutex);
down_write(&fs_info->cleanup_work_sem);
up_write(&fs_info->cleanup_work_sem);
-
- /* cleanup FS via transaction */
- btrfs_cleanup_transaction(fs_info);
}
static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
@@ -4261,19 +4312,23 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
list_splice_init(&root->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
+ struct inode *inode = NULL;
btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
delalloc_inodes);
-
- list_del_init(&btrfs_inode->delalloc_inodes);
- clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &btrfs_inode->runtime_flags);
+ __btrfs_del_delalloc_inode(root, btrfs_inode);
spin_unlock(&root->delalloc_lock);
- btrfs_invalidate_inodes(btrfs_inode->root);
-
+ /*
+ * Make sure we get a live inode and that it'll not disappear
+ * meanwhile.
+ */
+ inode = igrab(&btrfs_inode->vfs_inode);
+ if (inode) {
+ invalidate_inode_pages2(inode->i_mapping);
+ iput(inode);
+ }
spin_lock(&root->delalloc_lock);
}
-
spin_unlock(&root->delalloc_lock);
}
@@ -4289,7 +4344,6 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
while (!list_empty(&splice)) {
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
- list_del_init(&root->delalloc_root);
root = btrfs_grab_fs_root(root);
BUG_ON(!root);
spin_unlock(&fs_info->delalloc_root_lock);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 453ea9f5d4e9..1a3d277b027b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __DISKIO__
-#define __DISKIO__
+#ifndef BTRFS_DISK_IO_H
+#define BTRFS_DISK_IO_H
#define BTRFS_SUPER_INFO_OFFSET SZ_64K
#define BTRFS_SUPER_INFO_SIZE 4096
@@ -169,4 +156,5 @@ static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
{
}
#endif
+
#endif
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ddaccad469f8..1f3755b3a37a 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+
#include <linux/fs.h>
#include <linux/types.h>
#include "ctree.h"
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
index 91b3908e7c54..57488ecd7d4e 100644
--- a/fs/btrfs/export.h
+++ b/fs/btrfs/export.h
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
+
#ifndef BTRFS_EXPORT_H
#define BTRFS_EXPORT_H
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e08d0d45af4f..3d9fe58c0080 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1,20 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
+
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
@@ -78,10 +66,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 flags, u64 owner, u64 offset,
struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- u64 parent, u64 root_objectid,
- u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins);
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 flags,
int force);
@@ -268,7 +254,7 @@ static int exclude_super_stripes(struct btrfs_fs_info *fs_info,
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
ret = btrfs_rmap_block(fs_info, cache->key.objectid,
- bytenr, 0, &logical, &nr, &stripe_len);
+ bytenr, &logical, &nr, &stripe_len);
if (ret)
return ret;
@@ -355,8 +341,9 @@ static void fragment_free_space(struct btrfs_block_group_cache *block_group)
* since their free space will be released as soon as the transaction commits.
*/
u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
- struct btrfs_fs_info *info, u64 start, u64 end)
+ u64 start, u64 end)
{
+ struct btrfs_fs_info *info = block_group->fs_info;
u64 extent_start, extent_end, size, total_added = 0;
int ret;
@@ -501,8 +488,7 @@ next:
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
key.type == BTRFS_METADATA_ITEM_KEY) {
- total_found += add_new_free_space(block_group,
- fs_info, last,
+ total_found += add_new_free_space(block_group, last,
key.objectid);
if (key.type == BTRFS_METADATA_ITEM_KEY)
last = key.objectid +
@@ -520,7 +506,7 @@ next:
}
ret = 0;
- total_found += add_new_free_space(block_group, fs_info, last,
+ total_found += add_new_free_space(block_group, last,
block_group->key.objectid +
block_group->key.offset);
caching_ctl->progress = (u64)-1;
@@ -756,12 +742,12 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
}
static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
- u64 owner, u64 root_objectid)
+ bool metadata, u64 root_objectid)
{
struct btrfs_space_info *space_info;
u64 flags;
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ if (metadata) {
if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
flags = BTRFS_BLOCK_GROUP_SYSTEM;
else
@@ -2212,8 +2198,11 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
&old_ref_mod, &new_ref_mod);
}
- if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
- add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid);
+ if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) {
+ bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+
+ add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid);
+ }
return ret;
}
@@ -2440,10 +2429,8 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
{
int ret = 0;
struct btrfs_delayed_tree_ref *ref;
- struct btrfs_key ins;
u64 parent = 0;
u64 ref_root = 0;
- bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
ref = btrfs_delayed_node_to_tree_ref(node);
trace_run_delayed_tree_ref(fs_info, node, ref, node->action);
@@ -2452,15 +2439,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
parent = ref->parent;
ref_root = ref->root;
- ins.objectid = node->bytenr;
- if (skinny_metadata) {
- ins.offset = ref->level;
- ins.type = BTRFS_METADATA_ITEM_KEY;
- } else {
- ins.offset = node->num_bytes;
- ins.type = BTRFS_EXTENT_ITEM_KEY;
- }
-
if (node->ref_mod != 1) {
btrfs_err(fs_info,
"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
@@ -2470,11 +2448,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
}
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
BUG_ON(!extent_op || !extent_op->update_flags);
- ret = alloc_reserved_tree_block(trans, fs_info,
- parent, ref_root,
- extent_op->flags_to_set,
- &extent_op->key,
- ref->level, &ins);
+ ret = alloc_reserved_tree_block(trans, node, extent_op);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, fs_info, node,
parent, ref_root,
@@ -2606,20 +2580,26 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
delayed_refs->num_heads--;
rb_erase(&head->href_node, &delayed_refs->href_root);
RB_CLEAR_NODE(&head->href_node);
- spin_unlock(&delayed_refs->lock);
spin_unlock(&head->lock);
+ spin_unlock(&delayed_refs->lock);
atomic_dec(&delayed_refs->num_entries);
trace_run_delayed_ref_head(fs_info, head, 0);
if (head->total_ref_mod < 0) {
- struct btrfs_block_group_cache *cache;
+ struct btrfs_space_info *space_info;
+ u64 flags;
- cache = btrfs_lookup_block_group(fs_info, head->bytenr);
- ASSERT(cache);
- percpu_counter_add(&cache->space_info->total_bytes_pinned,
+ if (head->is_data)
+ flags = BTRFS_BLOCK_GROUP_DATA;
+ else if (head->is_system)
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
+ else
+ flags = BTRFS_BLOCK_GROUP_METADATA;
+ space_info = __find_space_info(fs_info, flags);
+ ASSERT(space_info);
+ percpu_counter_add(&space_info->total_bytes_pinned,
-head->num_bytes);
- btrfs_put_block_group(cache);
if (head->is_data) {
spin_lock(&delayed_refs->lock);
@@ -2706,17 +2686,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
* insert_inline_extent_backref()).
*/
spin_lock(&locked_ref->lock);
- btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
- locked_ref);
+ btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
- /*
- * locked_ref is the head node, so we have to go one
- * node back for any delayed ref updates
- */
ref = select_delayed_ref(locked_ref);
if (ref && ref->seq &&
- btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
+ btrfs_check_delayed_seq(fs_info, ref->seq)) {
spin_unlock(&locked_ref->lock);
unselect_delayed_ref_head(delayed_refs, locked_ref);
locked_ref = NULL;
@@ -3148,7 +3123,11 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
struct rb_node *node;
int ret = 0;
+ spin_lock(&root->fs_info->trans_lock);
cur_trans = root->fs_info->running_transaction;
+ if (cur_trans)
+ refcount_inc(&cur_trans->use_count);
+ spin_unlock(&root->fs_info->trans_lock);
if (!cur_trans)
return 0;
@@ -3157,6 +3136,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
if (!head) {
spin_unlock(&delayed_refs->lock);
+ btrfs_put_transaction(cur_trans);
return 0;
}
@@ -3173,6 +3153,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head);
+ btrfs_put_transaction(cur_trans);
return -EAGAIN;
}
spin_unlock(&delayed_refs->lock);
@@ -3205,6 +3186,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
}
spin_unlock(&head->lock);
mutex_unlock(&head->mutex);
+ btrfs_put_transaction(cur_trans);
return ret;
}
@@ -3290,7 +3272,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
path = btrfs_alloc_path();
if (!path)
- return -ENOENT;
+ return -ENOMEM;
do {
ret = check_committed_ref(root, path, objectid,
@@ -4025,8 +4007,7 @@ static const char *alloc_name(u64 flags)
};
}
-static int create_space_info(struct btrfs_fs_info *info, u64 flags,
- struct btrfs_space_info **new)
+static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
struct btrfs_space_info *space_info;
@@ -4064,7 +4045,6 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags,
return ret;
}
- *new = space_info;
list_add_rcu(&space_info->list, &info->space_info);
if (flags & BTRFS_BLOCK_GROUP_DATA)
info->data_sinfo = space_info;
@@ -4121,7 +4101,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
* returns target flags in extended format or 0 if restripe for this
* chunk_type is not in progress
*
- * should be called with either volume_mutex or balance_lock held
+ * should be called with balance_lock held
*/
static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
{
@@ -4177,7 +4157,7 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
/* First, mask out the RAID levels which aren't possible */
for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
if (num_devices >= btrfs_raid_array[raid_type].devs_min)
- allowed |= btrfs_raid_group[raid_type];
+ allowed |= btrfs_raid_array[raid_type].bg_flag;
}
allowed &= flags;
@@ -4340,7 +4320,7 @@ commit_trans:
need_commit--;
if (need_commit > 0) {
- btrfs_start_delalloc_roots(fs_info, 0, -1);
+ btrfs_start_delalloc_roots(fs_info, -1);
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
(u64)-1);
}
@@ -4642,6 +4622,7 @@ again:
if (wait_for_alloc) {
mutex_unlock(&fs_info->chunk_mutex);
wait_for_alloc = 0;
+ cond_resched();
goto again;
}
@@ -4676,12 +4657,14 @@ again:
trans->allocating_chunk = false;
spin_lock(&space_info->lock);
- if (ret < 0 && ret != -ENOSPC)
- goto out;
- if (ret)
- space_info->full = 1;
- else
+ if (ret < 0) {
+ if (ret == -ENOSPC)
+ space_info->full = 1;
+ else
+ goto out;
+ } else {
ret = 1;
+ }
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
out:
@@ -4790,7 +4773,7 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
* the filesystem is readonly(all dirty pages are written to
* the disk).
*/
- btrfs_start_delalloc_roots(fs_info, 0, nr_items);
+ btrfs_start_delalloc_roots(fs_info, nr_items);
if (!current->journal_info)
btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
}
@@ -5570,14 +5553,18 @@ again:
static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
- struct btrfs_block_rsv *dest, u64 num_bytes)
+ struct btrfs_block_rsv *dest, u64 num_bytes,
+ u64 *qgroup_to_release_ret)
{
struct btrfs_space_info *space_info = block_rsv->space_info;
+ u64 qgroup_to_release = 0;
u64 ret;
spin_lock(&block_rsv->lock);
- if (num_bytes == (u64)-1)
+ if (num_bytes == (u64)-1) {
num_bytes = block_rsv->size;
+ qgroup_to_release = block_rsv->qgroup_rsv_size;
+ }
block_rsv->size -= num_bytes;
if (block_rsv->reserved >= block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
@@ -5586,6 +5573,13 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
} else {
num_bytes = 0;
}
+ if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
+ qgroup_to_release = block_rsv->qgroup_rsv_reserved -
+ block_rsv->qgroup_rsv_size;
+ block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
+ } else {
+ qgroup_to_release = 0;
+ }
spin_unlock(&block_rsv->lock);
ret = num_bytes;
@@ -5608,6 +5602,8 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
space_info_add_old_bytes(fs_info, space_info,
num_bytes);
}
+ if (qgroup_to_release_ret)
+ *qgroup_to_release_ret = qgroup_to_release;
return ret;
}
@@ -5749,17 +5745,21 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
struct btrfs_root *root = inode->root;
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
u64 num_bytes = 0;
+ u64 qgroup_num_bytes = 0;
int ret = -ENOSPC;
spin_lock(&block_rsv->lock);
if (block_rsv->reserved < block_rsv->size)
num_bytes = block_rsv->size - block_rsv->reserved;
+ if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
+ qgroup_num_bytes = block_rsv->qgroup_rsv_size -
+ block_rsv->qgroup_rsv_reserved;
spin_unlock(&block_rsv->lock);
if (num_bytes == 0)
return 0;
- ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
if (ret)
return ret;
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
@@ -5767,7 +5767,13 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
block_rsv_add_bytes(block_rsv, num_bytes, 0);
trace_btrfs_space_reservation(root->fs_info, "delalloc",
btrfs_ino(inode), num_bytes, 1);
- }
+
+ /* Don't forget to increase qgroup_rsv_reserved */
+ spin_lock(&block_rsv->lock);
+ block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
+ spin_unlock(&block_rsv->lock);
+ } else
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
return ret;
}
@@ -5788,20 +5794,23 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
u64 released = 0;
+ u64 qgroup_to_release = 0;
/*
* Since we statically set the block_rsv->size we just want to say we
* are releasing 0 bytes, and then we'll just get the reservation over
* the size free'd.
*/
- released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0);
+ released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
+ &qgroup_to_release);
if (released > 0)
trace_btrfs_space_reservation(fs_info, "delalloc",
btrfs_ino(inode), released, 0);
if (qgroup_free)
- btrfs_qgroup_free_meta_prealloc(inode->root, released);
+ btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
else
- btrfs_qgroup_convert_reserved_meta(inode->root, released);
+ btrfs_qgroup_convert_reserved_meta(inode->root,
+ qgroup_to_release);
}
void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
@@ -5813,7 +5822,7 @@ void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
if (global_rsv == block_rsv ||
block_rsv->space_info != global_rsv->space_info)
global_rsv = NULL;
- block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes);
+ block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL);
}
static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -5893,7 +5902,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
{
block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
- (u64)-1);
+ (u64)-1, NULL);
WARN_ON(fs_info->trans_block_rsv.size > 0);
WARN_ON(fs_info->trans_block_rsv.reserved > 0);
WARN_ON(fs_info->chunk_block_rsv.size > 0);
@@ -5917,48 +5926,10 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
WARN_ON_ONCE(!list_empty(&trans->new_bgs));
block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
- trans->chunk_bytes_reserved);
+ trans->chunk_bytes_reserved, NULL);
trans->chunk_bytes_reserved = 0;
}
-/* Can only return 0 or -ENOSPC */
-int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
- struct btrfs_root *root = inode->root;
- /*
- * We always use trans->block_rsv here as we will have reserved space
- * for our orphan when starting the transaction, using get_block_rsv()
- * here will sometimes make us choose the wrong block rsv as we could be
- * doing a reloc inode for a non refcounted root.
- */
- struct btrfs_block_rsv *src_rsv = trans->block_rsv;
- struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
-
- /*
- * We need to hold space in order to delete our orphan item once we've
- * added it, so this takes the reservation so we can release it later
- * when we are truly done with the orphan item.
- */
- u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
-
- trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
- num_bytes, 1);
- return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
-}
-
-void btrfs_orphan_release_metadata(struct btrfs_inode *inode)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
- struct btrfs_root *root = inode->root;
- u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
-
- trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
- num_bytes, 0);
- btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes);
-}
-
/*
* btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
* root: the root of the parent directory
@@ -5976,7 +5947,6 @@ void btrfs_orphan_release_metadata(struct btrfs_inode *inode)
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int items,
- u64 *qgroup_reserved,
bool use_global_rsv)
{
u64 num_bytes;
@@ -5994,8 +5964,6 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
num_bytes = 0;
}
- *qgroup_reserved = num_bytes;
-
num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
rsv->space_info = __find_space_info(fs_info,
BTRFS_BLOCK_GROUP_METADATA);
@@ -6005,8 +5973,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (ret == -ENOSPC && use_global_rsv)
ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
- if (ret && *qgroup_reserved)
- btrfs_qgroup_free_meta_prealloc(root, *qgroup_reserved);
+ if (ret && num_bytes)
+ btrfs_qgroup_free_meta_prealloc(root, num_bytes);
return ret;
}
@@ -6022,6 +5990,7 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
{
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
u64 reserve_size = 0;
+ u64 qgroup_rsv_size = 0;
u64 csum_leaves;
unsigned outstanding_extents;
@@ -6034,9 +6003,17 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
inode->csum_bytes);
reserve_size += btrfs_calc_trans_metadata_size(fs_info,
csum_leaves);
+ /*
+ * For qgroup rsv, the calculation is very simple:
+ * account one nodesize for each outstanding extent
+ *
+ * This is overestimating in most cases.
+ */
+ qgroup_rsv_size = outstanding_extents * fs_info->nodesize;
spin_lock(&block_rsv->lock);
block_rsv->size = reserve_size;
+ block_rsv->qgroup_rsv_size = qgroup_rsv_size;
spin_unlock(&block_rsv->lock);
}
@@ -6317,6 +6294,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
spin_lock(&info->unused_bgs_lock);
if (list_empty(&cache->bg_list)) {
btrfs_get_block_group(cache);
+ trace_btrfs_add_unused_block_group(cache);
list_add_tail(&cache->bg_list,
&info->unused_bgs);
}
@@ -6474,6 +6452,7 @@ int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
struct btrfs_key key;
int found_type;
int i;
+ int ret = 0;
if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
return 0;
@@ -6490,10 +6469,12 @@ int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
continue;
key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
- __exclude_logged_extent(fs_info, key.objectid, key.offset);
+ ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
+ if (ret)
+ break;
}
- return 0;
+ return ret;
}
static void
@@ -7085,7 +7066,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
- ret = add_to_free_space_tree(trans, info, bytenr, num_bytes);
+ ret = add_to_free_space_tree(trans, bytenr, num_bytes);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -7229,7 +7210,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
}
out:
if (pin)
- add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf),
+ add_pinned_bytes(fs_info, buf->len, true,
root->root_key.objectid);
if (last_ref) {
@@ -7283,8 +7264,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
&old_ref_mod, &new_ref_mod);
}
- if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
- add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);
+ if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) {
+ bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+
+ add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid);
+ }
return ret;
}
@@ -7336,24 +7320,6 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
return ret;
}
-static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
- [BTRFS_RAID_RAID10] = "raid10",
- [BTRFS_RAID_RAID1] = "raid1",
- [BTRFS_RAID_DUP] = "dup",
- [BTRFS_RAID_RAID0] = "raid0",
- [BTRFS_RAID_SINGLE] = "single",
- [BTRFS_RAID_RAID5] = "raid5",
- [BTRFS_RAID_RAID6] = "raid6",
-};
-
-static const char *get_raid_name(enum btrfs_raid_types type)
-{
- if (type >= BTRFS_NR_RAID_TYPES)
- return NULL;
-
- return btrfs_raid_type_names[type];
-}
-
enum btrfs_loop_type {
LOOP_CACHING_NOWAIT = 0,
LOOP_CACHING_WAIT = 1,
@@ -7625,7 +7591,7 @@ have_block_group:
if (offset) {
/* we have a block, we're done */
spin_unlock(&last_ptr->refill_lock);
- trace_btrfs_reserve_extent_cluster(fs_info,
+ trace_btrfs_reserve_extent_cluster(
used_block_group,
search_start, num_bytes);
if (used_block_group != block_group) {
@@ -7698,7 +7664,7 @@ refill_cluster:
if (offset) {
/* we found one, proceed */
spin_unlock(&last_ptr->refill_lock);
- trace_btrfs_reserve_extent_cluster(fs_info,
+ trace_btrfs_reserve_extent_cluster(
block_group, search_start,
num_bytes);
goto checks;
@@ -7798,8 +7764,7 @@ checks:
ins->objectid = search_start;
ins->offset = num_bytes;
- trace_btrfs_reserve_extent(fs_info, block_group,
- search_start, num_bytes);
+ trace_btrfs_reserve_extent(block_group, search_start, num_bytes);
btrfs_release_block_group(block_group, delalloc);
break;
loop:
@@ -8147,8 +8112,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
- ins->offset);
+ ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
if (ret)
return ret;
@@ -8163,37 +8127,52 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
}
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- u64 parent, u64 root_objectid,
- u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins)
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_extent_item *extent_item;
+ struct btrfs_key extent_key;
struct btrfs_tree_block_info *block_info;
struct btrfs_extent_inline_ref *iref;
struct btrfs_path *path;
struct extent_buffer *leaf;
+ struct btrfs_delayed_tree_ref *ref;
u32 size = sizeof(*extent_item) + sizeof(*iref);
- u64 num_bytes = ins->offset;
+ u64 num_bytes;
+ u64 flags = extent_op->flags_to_set;
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
- if (!skinny_metadata)
+ ref = btrfs_delayed_node_to_tree_ref(node);
+
+ extent_key.objectid = node->bytenr;
+ if (skinny_metadata) {
+ extent_key.offset = ref->level;
+ extent_key.type = BTRFS_METADATA_ITEM_KEY;
+ num_bytes = fs_info->nodesize;
+ } else {
+ extent_key.offset = node->num_bytes;
+ extent_key.type = BTRFS_EXTENT_ITEM_KEY;
size += sizeof(*block_info);
+ num_bytes = node->num_bytes;
+ }
path = btrfs_alloc_path();
if (!path) {
- btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
+ btrfs_free_and_pin_reserved_extent(fs_info,
+ extent_key.objectid,
fs_info->nodesize);
return -ENOMEM;
}
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
- ins, size);
+ &extent_key, size);
if (ret) {
btrfs_free_path(path);
- btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
+ btrfs_free_and_pin_reserved_extent(fs_info,
+ extent_key.objectid,
fs_info->nodesize);
return ret;
}
@@ -8208,42 +8187,41 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (skinny_metadata) {
iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
- num_bytes = fs_info->nodesize;
} else {
block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
- btrfs_set_tree_block_key(leaf, block_info, key);
- btrfs_set_tree_block_level(leaf, block_info, level);
+ btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
+ btrfs_set_tree_block_level(leaf, block_info, ref->level);
iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
}
- if (parent > 0) {
+ if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
btrfs_set_extent_inline_ref_type(leaf, iref,
BTRFS_SHARED_BLOCK_REF_KEY);
- btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
} else {
btrfs_set_extent_inline_ref_type(leaf, iref,
BTRFS_TREE_BLOCK_REF_KEY);
- btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
}
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+ ret = remove_from_free_space_tree(trans, extent_key.objectid,
num_bytes);
if (ret)
return ret;
- ret = update_block_group(trans, fs_info, ins->objectid,
+ ret = update_block_group(trans, fs_info, extent_key.objectid,
fs_info->nodesize, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
- ins->objectid, ins->offset);
+ extent_key.objectid, extent_key.offset);
BUG();
}
- trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid,
+ trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
fs_info->nodesize);
return ret;
}
@@ -8414,7 +8392,7 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u32 blocksize)
{
block_rsv_add_bytes(block_rsv, blocksize, 0);
- block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
+ block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
}
/*
@@ -10136,8 +10114,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
} else if (btrfs_block_group_used(&cache->item) == 0) {
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
- add_new_free_space(cache, info,
- found_key.objectid,
+ add_new_free_space(cache, found_key.objectid,
found_key.objectid +
found_key.offset);
free_excluded_extents(info, cache);
@@ -10167,6 +10144,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
/* Should always be true but just in case. */
if (list_empty(&cache->bg_list)) {
btrfs_get_block_group(cache);
+ trace_btrfs_add_unused_block_group(cache);
list_add_tail(&cache->bg_list,
&info->unused_bgs);
}
@@ -10232,7 +10210,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
key.offset);
if (ret)
btrfs_abort_transaction(trans, ret);
- add_block_group_free_space(trans, fs_info, block_group);
+ add_block_group_free_space(trans, block_group);
/* already aborted the transaction if it failed. */
next:
list_del_init(&block_group->bg_list);
@@ -10273,7 +10251,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
return ret;
}
- add_new_free_space(cache, fs_info, chunk_offset, chunk_offset + size);
+ add_new_free_space(cache, chunk_offset, chunk_offset + size);
free_excluded_extents(fs_info, cache);
@@ -10354,6 +10332,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
BUG_ON(!block_group);
BUG_ON(!block_group->ro);
+ trace_btrfs_remove_block_group(block_group);
/*
* Free the reserved super bytes from this block group before
* remove it.
@@ -10611,7 +10590,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
mutex_unlock(&fs_info->chunk_mutex);
- ret = remove_block_group_free_space(trans, fs_info, block_group);
+ ret = remove_block_group_free_space(trans, block_group);
if (ret)
goto out;
@@ -10718,6 +10697,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* the ro check in case balance is currently acting on
* this block group.
*/
+ trace_btrfs_skip_unused_block_group(block_group);
spin_unlock(&block_group->lock);
up_write(&space_info->groups_sem);
goto next;
@@ -10840,7 +10820,6 @@ next:
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
- struct btrfs_space_info *space_info;
struct btrfs_super_block *disk_super;
u64 features;
u64 flags;
@@ -10856,21 +10835,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
mixed = 1;
flags = BTRFS_BLOCK_GROUP_SYSTEM;
- ret = create_space_info(fs_info, flags, &space_info);
+ ret = create_space_info(fs_info, flags);
if (ret)
goto out;
if (mixed) {
flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
- ret = create_space_info(fs_info, flags, &space_info);
+ ret = create_space_info(fs_info, flags);
} else {
flags = BTRFS_BLOCK_GROUP_METADATA;
- ret = create_space_info(fs_info, flags, &space_info);
+ ret = create_space_info(fs_info, flags);
if (ret)
goto out;
flags = BTRFS_BLOCK_GROUP_DATA;
- ret = create_space_info(fs_info, flags, &space_info);
+ ret = create_space_info(fs_info, flags);
}
out:
return ret;
@@ -11055,12 +11034,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
{
percpu_counter_dec(&root->subv_writers->counter);
- /*
- * Make sure counter is updated before we wake up waiters.
- */
- smp_mb();
- if (waitqueue_active(&root->subv_writers->wait))
- wake_up(&root->subv_writers->wait);
+ cond_wake_up(&root->subv_writers->wait);
}
int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 47a8fe9d22e8..51fc015c7d2c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
@@ -25,7 +26,7 @@
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
-static struct bio_set *btrfs_bioset;
+static struct bio_set btrfs_bioset;
static inline bool extent_state_in_tree(const struct extent_state *state)
{
@@ -161,20 +162,18 @@ int __init extent_io_init(void)
if (!extent_buffer_cache)
goto free_state_cache;
- btrfs_bioset = bioset_create(BIO_POOL_SIZE,
- offsetof(struct btrfs_io_bio, bio),
- BIOSET_NEED_BVECS);
- if (!btrfs_bioset)
+ if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
+ offsetof(struct btrfs_io_bio, bio),
+ BIOSET_NEED_BVECS))
goto free_buffer_cache;
- if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE))
+ if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
goto free_bioset;
return 0;
free_bioset:
- bioset_free(btrfs_bioset);
- btrfs_bioset = NULL;
+ bioset_exit(&btrfs_bioset);
free_buffer_cache:
kmem_cache_destroy(extent_buffer_cache);
@@ -197,8 +196,7 @@ void __cold extent_io_exit(void)
rcu_barrier();
kmem_cache_destroy(extent_state_cache);
kmem_cache_destroy(extent_buffer_cache);
- if (btrfs_bioset)
- bioset_free(btrfs_bioset);
+ bioset_exit(&btrfs_bioset);
}
void extent_io_tree_init(struct extent_io_tree *tree,
@@ -2678,7 +2676,7 @@ struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte)
{
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset);
+ bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset);
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = first_byte >> 9;
btrfs_io_bio_init(btrfs_io_bio(bio));
@@ -2691,7 +2689,7 @@ struct bio *btrfs_bio_clone(struct bio *bio)
struct bio *new;
/* Bio allocation backed by a bioset does not fail */
- new = bio_clone_fast(bio, GFP_NOFS, btrfs_bioset);
+ new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
btrfs_bio = btrfs_io_bio(new);
btrfs_io_bio_init(btrfs_bio);
btrfs_bio->iter = bio->bi_iter;
@@ -2703,7 +2701,7 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
struct bio *bio;
/* Bio allocation backed by a bioset does not fail */
- bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset);
+ bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
btrfs_io_bio_init(btrfs_io_bio(bio));
return bio;
}
@@ -2714,7 +2712,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
struct btrfs_io_bio *btrfs_bio;
/* this will never fail when it's backed by a bioset */
- bio = bio_clone_fast(orig, GFP_NOFS, btrfs_bioset);
+ bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
ASSERT(bio);
btrfs_bio = btrfs_io_bio(bio);
@@ -3963,11 +3961,11 @@ retry:
done_index = page->index;
/*
- * At this point we hold neither mapping->tree_lock nor
- * lock on the page itself: the page may be truncated or
- * invalidated (changing page->mapping to NULL), or even
- * swizzled back from swapper_space to tmpfs file
- * mapping
+ * At this point we hold neither the i_pages lock nor
+ * the page lock: the page may be truncated or
+ * invalidated (changing page->mapping to NULL),
+ * or even swizzled back from swapper_space to
+ * tmpfs file mapping
*/
if (!trylock_page(page)) {
flush_write_bio(epd);
@@ -4108,14 +4106,13 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
return ret;
}
-int extent_writepages(struct extent_io_tree *tree,
- struct address_space *mapping,
+int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
int ret = 0;
struct extent_page_data epd = {
.bio = NULL,
- .tree = tree,
+ .tree = &BTRFS_I(mapping->host)->io_tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@@ -4125,9 +4122,8 @@ int extent_writepages(struct extent_io_tree *tree,
return ret;
}
-int extent_readpages(struct extent_io_tree *tree,
- struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+int extent_readpages(struct address_space *mapping, struct list_head *pages,
+ unsigned nr_pages)
{
struct bio *bio = NULL;
unsigned page_idx;
@@ -4135,6 +4131,7 @@ int extent_readpages(struct extent_io_tree *tree,
struct page *pagepool[16];
struct page *page;
struct extent_map *em_cached = NULL;
+ struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
int nr = 0;
u64 prev_em_start = (u64)-1;
@@ -4201,8 +4198,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
* are locked or under IO and drops the related state bits if it is safe
* to drop the page.
*/
-static int try_release_extent_state(struct extent_map_tree *map,
- struct extent_io_tree *tree,
+static int try_release_extent_state(struct extent_io_tree *tree,
struct page *page, gfp_t mask)
{
u64 start = page_offset(page);
@@ -4237,13 +4233,13 @@ static int try_release_extent_state(struct extent_map_tree *map,
* in the range corresponding to the page, both state records and extent
* map records are removed
*/
-int try_release_extent_mapping(struct extent_map_tree *map,
- struct extent_io_tree *tree, struct page *page,
- gfp_t mask)
+int try_release_extent_mapping(struct page *page, gfp_t mask)
{
struct extent_map *em;
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
+ struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
+ struct extent_map_tree *map = &BTRFS_I(page->mapping->host)->extent_tree;
if (gfpflags_allow_blocking(mask) &&
page->mapping->host->i_size > SZ_16M) {
@@ -4277,7 +4273,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
free_extent_map(em);
}
}
- return try_release_extent_state(map, tree, page, mask);
+ return try_release_extent_state(tree, page, mask);
}
/*
@@ -5174,13 +5170,13 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(!PagePrivate(page));
clear_page_dirty_for_io(page);
- spin_lock_irq(&page->mapping->tree_lock);
+ xa_lock_irq(&page->mapping->i_pages);
if (!PageDirty(page)) {
- radix_tree_tag_clear(&page->mapping->page_tree,
+ radix_tree_tag_clear(&page->mapping->i_pages,
page_index(page),
PAGECACHE_TAG_DIRTY);
}
- spin_unlock_irq(&page->mapping->tree_lock);
+ xa_unlock_irq(&page->mapping->i_pages);
ClearPageError(page);
unlock_page(page);
}
@@ -5619,46 +5615,6 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
}
}
-void le_bitmap_set(u8 *map, unsigned int start, int len)
-{
- u8 *p = map + BIT_BYTE(start);
- const unsigned int size = start + len;
- int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE);
- u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start);
-
- while (len - bits_to_set >= 0) {
- *p |= mask_to_set;
- len -= bits_to_set;
- bits_to_set = BITS_PER_BYTE;
- mask_to_set = ~0;
- p++;
- }
- if (len) {
- mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
- *p |= mask_to_set;
- }
-}
-
-void le_bitmap_clear(u8 *map, unsigned int start, int len)
-{
- u8 *p = map + BIT_BYTE(start);
- const unsigned int size = start + len;
- int bits_to_clear = BITS_PER_BYTE - (start % BITS_PER_BYTE);
- u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(start);
-
- while (len - bits_to_clear >= 0) {
- *p &= ~mask_to_clear;
- len -= bits_to_clear;
- bits_to_clear = BITS_PER_BYTE;
- mask_to_clear = ~0;
- p++;
- }
- if (len) {
- mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
- *p &= ~mask_to_clear;
- }
-}
-
/*
* eb_bitmap_offset() - calculate the page and offset of the byte containing the
* given bit number
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b77d84909863..0bfd4aeb822d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __EXTENTIO__
-#define __EXTENTIO__
+
+#ifndef BTRFS_EXTENT_IO_H
+#define BTRFS_EXTENT_IO_H
#include <linux/rbtree.h>
#include <linux/refcount.h>
@@ -78,14 +79,6 @@
#define BITMAP_LAST_BYTE_MASK(nbits) \
(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
-static inline int le_test_bit(int nr, const u8 *addr)
-{
- return 1U & (addr[BIT_BYTE(nr)] >> (nr & (BITS_PER_BYTE-1)));
-}
-
-void le_bitmap_set(u8 *map, unsigned int start, int len);
-void le_bitmap_clear(u8 *map, unsigned int start, int len);
-
struct extent_state;
struct btrfs_root;
struct btrfs_inode;
@@ -277,9 +270,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
int create);
void extent_io_tree_init(struct extent_io_tree *tree, void *private_data);
-int try_release_extent_mapping(struct extent_map_tree *map,
- struct extent_io_tree *tree, struct page *page,
- gfp_t mask);
+int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached);
@@ -420,14 +411,12 @@ int extent_invalidatepage(struct extent_io_tree *tree,
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode);
-int extent_writepages(struct extent_io_tree *tree,
- struct address_space *mapping,
+int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
-int extent_readpages(struct extent_io_tree *tree,
- struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages);
+int extent_readpages(struct address_space *mapping, struct list_head *pages,
+ unsigned nr_pages);
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
void set_page_extent_mapped(struct page *page);
@@ -572,4 +561,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode,
#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
+
#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 53a0633c6ef7..6648d55e5339 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -517,6 +518,7 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
/**
* btrfs_add_extent_mapping - add extent mapping into em_tree
+ * @fs_info - used for tracepoint
* @em_tree - the extent tree into which we want to insert the extent mapping
* @em_in - extent we are inserting
* @start - start of the logical range btrfs_get_extent() is requesting
@@ -534,7 +536,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
* Return 0 on success, otherwise -EEXIST.
*
*/
-int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
+int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree,
struct extent_map **em_in, u64 start, u64 len)
{
int ret;
@@ -552,7 +555,7 @@ int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
existing = search_extent_mapping(em_tree, start, len);
- trace_btrfs_handle_em_exist(existing, em, start, len);
+ trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
/*
* existing will always be non-NULL, since there must be
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index f6f8ba114977..25d985e7532a 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __EXTENTMAP__
-#define __EXTENTMAP__
+
+#ifndef BTRFS_EXTENT_MAP_H
+#define BTRFS_EXTENT_MAP_H
#include <linux/rbtree.h>
#include <linux/refcount.h>
@@ -91,6 +92,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
-int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
+int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree,
struct extent_map **em_in, u64 start, u64 len);
+
#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index fdcb41002623..f9dd6d1836a3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/bio.h>
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f247300170e5..f660ba1e5e58 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
@@ -1761,7 +1748,7 @@ again:
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
lockstart, lockend, &cached_state);
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes,
- (ret != 0));
+ true);
if (ret) {
btrfs_drop_pages(pages, num_pages);
break;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d0dde9e6afd7..d5f80cb300be 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2008 Red Hat. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/pagemap.h>
@@ -266,10 +253,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
truncate_pagecache(inode, 0);
/*
- * We don't need an orphan item because truncating the free space cache
- * will never be split across transactions.
- * We don't need to check for -EAGAIN because we're a free space
- * cache inode
+ * We skip the throttling logic for free space cache inodes, so we don't
+ * need to check for -EAGAIN.
*/
ret = btrfs_truncate_inode_items(trans, root, inode,
0, BTRFS_EXTENT_DATA_KEY);
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 79eca4cabb1c..15e30b93db0d 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2009 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __BTRFS_FREE_SPACE_CACHE
-#define __BTRFS_FREE_SPACE_CACHE
+#ifndef BTRFS_FREE_SPACE_CACHE_H
+#define BTRFS_FREE_SPACE_CACHE_H
struct btrfs_free_space {
struct rb_node offset_index;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index af36a6a971fe..b5950aacd697 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2015 Facebook. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/kernel.h>
@@ -25,7 +12,6 @@
#include "transaction.h"
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path);
@@ -58,11 +44,10 @@ void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache)
}
static int add_new_free_space_info(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path)
{
- struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_root *root = trans->fs_info->free_space_root;
struct btrfs_free_space_info *info;
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -151,10 +136,11 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize)
return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE);
}
-static u8 *alloc_bitmap(u32 bitmap_size)
+static unsigned long *alloc_bitmap(u32 bitmap_size)
{
- u8 *ret;
+ unsigned long *ret;
unsigned int nofs_flag;
+ u32 bitmap_rounded_size = round_up(bitmap_size, sizeof(unsigned long));
/*
* GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse
@@ -165,21 +151,42 @@ static u8 *alloc_bitmap(u32 bitmap_size)
* know that recursion is unsafe.
*/
nofs_flag = memalloc_nofs_save();
- ret = kvzalloc(bitmap_size, GFP_KERNEL);
+ ret = kvzalloc(bitmap_rounded_size, GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
return ret;
}
+static void le_bitmap_set(unsigned long *map, unsigned int start, int len)
+{
+ u8 *p = ((u8 *)map) + BIT_BYTE(start);
+ const unsigned int size = start + len;
+ int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE);
+ u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start);
+
+ while (len - bits_to_set >= 0) {
+ *p |= mask_to_set;
+ len -= bits_to_set;
+ bits_to_set = BITS_PER_BYTE;
+ mask_to_set = ~0;
+ p++;
+ }
+ if (len) {
+ mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
+ *p |= mask_to_set;
+ }
+}
+
int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->free_space_root;
struct btrfs_free_space_info *info;
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
- u8 *bitmap, *bitmap_cursor;
+ unsigned long *bitmap;
+ char *bitmap_cursor;
u64 start, end;
u64 bitmap_range, i;
u32 bitmap_size, flags, expected_extent_count;
@@ -268,7 +275,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
goto out;
}
- bitmap_cursor = bitmap;
+ bitmap_cursor = (char *)bitmap;
bitmap_range = fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
i = start;
while (i < end) {
@@ -309,21 +316,18 @@ out:
}
int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->free_space_root;
struct btrfs_free_space_info *info;
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
- u8 *bitmap;
+ unsigned long *bitmap;
u64 start, end;
- /* Initialize to silence GCC. */
- u64 extent_start = 0;
- u64 offset;
u32 bitmap_size, flags, expected_extent_count;
- int prev_bit = 0, bit, bitnr;
+ unsigned long nrbits, start_bit, end_bit;
u32 extent_count = 0;
int done = 0, nr;
int ret;
@@ -361,7 +365,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
break;
} else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
unsigned long ptr;
- u8 *bitmap_cursor;
+ char *bitmap_cursor;
u32 bitmap_pos, data_size;
ASSERT(found_key.objectid >= start);
@@ -371,7 +375,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
bitmap_pos = div_u64(found_key.objectid - start,
fs_info->sectorsize *
BITS_PER_BYTE);
- bitmap_cursor = bitmap + bitmap_pos;
+ bitmap_cursor = ((char *)bitmap) + bitmap_pos;
data_size = free_space_bitmap_size(found_key.offset,
fs_info->sectorsize);
@@ -405,32 +409,16 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- offset = start;
- bitnr = 0;
- while (offset < end) {
- bit = !!le_test_bit(bitnr, bitmap);
- if (prev_bit == 0 && bit == 1) {
- extent_start = offset;
- } else if (prev_bit == 1 && bit == 0) {
- key.objectid = extent_start;
- key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
- key.offset = offset - extent_start;
-
- ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
- if (ret)
- goto out;
- btrfs_release_path(path);
+ nrbits = div_u64(block_group->key.offset, block_group->fs_info->sectorsize);
+ start_bit = find_next_bit_le(bitmap, nrbits, 0);
- extent_count++;
- }
- prev_bit = bit;
- offset += fs_info->sectorsize;
- bitnr++;
- }
- if (prev_bit == 1) {
- key.objectid = extent_start;
+ while (start_bit < nrbits) {
+ end_bit = find_next_zero_bit_le(bitmap, nrbits, start_bit);
+ ASSERT(start_bit < end_bit);
+
+ key.objectid = start + start_bit * block_group->fs_info->sectorsize;
key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
- key.offset = end - extent_start;
+ key.offset = (end_bit - start_bit) * block_group->fs_info->sectorsize;
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
if (ret)
@@ -438,6 +426,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
extent_count++;
+
+ start_bit = find_next_bit_le(bitmap, nrbits, end_bit);
}
if (extent_count != expected_extent_count) {
@@ -459,7 +449,6 @@ out:
}
static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path,
int new_extents)
@@ -472,7 +461,8 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
if (new_extents == 0)
return 0;
- info = search_free_space_info(trans, fs_info, block_group, path, 1);
+ info = search_free_space_info(trans, trans->fs_info, block_group, path,
+ 1);
if (IS_ERR(info)) {
ret = PTR_ERR(info);
goto out;
@@ -487,12 +477,10 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
extent_count > block_group->bitmap_high_thresh) {
- ret = convert_free_space_to_bitmaps(trans, fs_info, block_group,
- path);
+ ret = convert_free_space_to_bitmaps(trans, block_group, path);
} else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
extent_count < block_group->bitmap_low_thresh) {
- ret = convert_free_space_to_extents(trans, fs_info, block_group,
- path);
+ ret = convert_free_space_to_extents(trans, block_group, path);
}
out:
@@ -589,12 +577,11 @@ static int free_space_next_bitmap(struct btrfs_trans_handle *trans,
* the bitmap.
*/
static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path,
u64 start, u64 size, int remove)
{
- struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_root *root = block_group->fs_info->free_space_root;
struct btrfs_key key;
u64 end = start + size;
u64 cur_start, cur_size;
@@ -695,7 +682,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+ ret = update_free_space_extent_count(trans, block_group, path,
new_extents);
out:
@@ -703,12 +690,11 @@ out:
}
static int remove_free_space_extent(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path,
u64 start, u64 size)
{
- struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_root *root = trans->fs_info->free_space_root;
struct btrfs_key key;
u64 found_start, found_end;
u64 end = start + size;
@@ -782,7 +768,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+ ret = update_free_space_extent_count(trans, block_group, path,
new_extents);
out:
@@ -790,7 +776,6 @@ out:
}
int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, u64 start, u64 size)
{
@@ -799,36 +784,35 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
int ret;
if (block_group->needs_free_space) {
- ret = __add_block_group_free_space(trans, fs_info, block_group,
- path);
+ ret = __add_block_group_free_space(trans, block_group, path);
if (ret)
return ret;
}
- info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+ info = search_free_space_info(NULL, trans->fs_info, block_group, path,
+ 0);
if (IS_ERR(info))
return PTR_ERR(info);
flags = btrfs_free_space_flags(path->nodes[0], info);
btrfs_release_path(path);
if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
- return modify_free_space_bitmap(trans, fs_info, block_group,
- path, start, size, 1);
+ return modify_free_space_bitmap(trans, block_group, path,
+ start, size, 1);
} else {
- return remove_free_space_extent(trans, fs_info, block_group,
- path, start, size);
+ return remove_free_space_extent(trans, block_group, path,
+ start, size);
}
}
int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
u64 start, u64 size)
{
struct btrfs_block_group_cache *block_group;
struct btrfs_path *path;
int ret;
- if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
path = btrfs_alloc_path();
@@ -837,7 +821,7 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
goto out;
}
- block_group = btrfs_lookup_block_group(fs_info, start);
+ block_group = btrfs_lookup_block_group(trans->fs_info, start);
if (!block_group) {
ASSERT(0);
ret = -ENOENT;
@@ -845,8 +829,8 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
}
mutex_lock(&block_group->free_space_lock);
- ret = __remove_from_free_space_tree(trans, fs_info, block_group, path,
- start, size);
+ ret = __remove_from_free_space_tree(trans, block_group, path, start,
+ size);
mutex_unlock(&block_group->free_space_lock);
btrfs_put_block_group(block_group);
@@ -858,12 +842,11 @@ out:
}
static int add_free_space_extent(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path,
u64 start, u64 size)
{
- struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_root *root = trans->fs_info->free_space_root;
struct btrfs_key key, new_key;
u64 found_start, found_end;
u64 end = start + size;
@@ -978,7 +961,7 @@ insert:
goto out;
btrfs_release_path(path);
- ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+ ret = update_free_space_extent_count(trans, block_group, path,
new_extents);
out:
@@ -986,17 +969,16 @@ out:
}
int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, u64 start, u64 size)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_free_space_info *info;
u32 flags;
int ret;
if (block_group->needs_free_space) {
- ret = __add_block_group_free_space(trans, fs_info, block_group,
- path);
+ ret = __add_block_group_free_space(trans, block_group, path);
if (ret)
return ret;
}
@@ -1008,23 +990,22 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
- return modify_free_space_bitmap(trans, fs_info, block_group,
- path, start, size, 0);
+ return modify_free_space_bitmap(trans, block_group, path,
+ start, size, 0);
} else {
- return add_free_space_extent(trans, fs_info, block_group, path,
- start, size);
+ return add_free_space_extent(trans, block_group, path, start,
+ size);
}
}
int add_to_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
u64 start, u64 size)
{
struct btrfs_block_group_cache *block_group;
struct btrfs_path *path;
int ret;
- if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
path = btrfs_alloc_path();
@@ -1033,7 +1014,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans,
goto out;
}
- block_group = btrfs_lookup_block_group(fs_info, start);
+ block_group = btrfs_lookup_block_group(trans->fs_info, start);
if (!block_group) {
ASSERT(0);
ret = -ENOENT;
@@ -1041,8 +1022,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans,
}
mutex_lock(&block_group->free_space_lock);
- ret = __add_to_free_space_tree(trans, fs_info, block_group, path, start,
- size);
+ ret = __add_to_free_space_tree(trans, block_group, path, start, size);
mutex_unlock(&block_group->free_space_lock);
btrfs_put_block_group(block_group);
@@ -1059,10 +1039,9 @@ out:
* through the normal add/remove hooks.
*/
static int populate_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group)
{
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_root *extent_root = trans->fs_info->extent_root;
struct btrfs_path *path, *path2;
struct btrfs_key key;
u64 start, end;
@@ -1079,7 +1058,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- ret = add_new_free_space_info(trans, fs_info, block_group, path2);
+ ret = add_new_free_space_info(trans, block_group, path2);
if (ret)
goto out;
@@ -1112,7 +1091,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
break;
if (start < key.objectid) {
- ret = __add_to_free_space_tree(trans, fs_info,
+ ret = __add_to_free_space_tree(trans,
block_group,
path2, start,
key.objectid -
@@ -1122,7 +1101,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
}
start = key.objectid;
if (key.type == BTRFS_METADATA_ITEM_KEY)
- start += fs_info->nodesize;
+ start += trans->fs_info->nodesize;
else
start += key.offset;
} else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
@@ -1137,8 +1116,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
break;
}
if (start < end) {
- ret = __add_to_free_space_tree(trans, fs_info, block_group,
- path2, start, end - start);
+ ret = __add_to_free_space_tree(trans, block_group, path2,
+ start, end - start);
if (ret)
goto out_locked;
}
@@ -1178,7 +1157,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
while (node) {
block_group = rb_entry(node, struct btrfs_block_group_cache,
cache_node);
- ret = populate_free_space_tree(trans, fs_info, block_group);
+ ret = populate_free_space_tree(trans, block_group);
if (ret)
goto abort;
node = rb_next(node);
@@ -1282,7 +1261,6 @@ abort:
}
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path)
{
@@ -1290,19 +1268,19 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
block_group->needs_free_space = 0;
- ret = add_new_free_space_info(trans, fs_info, block_group, path);
+ ret = add_new_free_space_info(trans, block_group, path);
if (ret)
return ret;
- return __add_to_free_space_tree(trans, fs_info, block_group, path,
+ return __add_to_free_space_tree(trans, block_group, path,
block_group->key.objectid,
block_group->key.offset);
}
int add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *path = NULL;
int ret = 0;
@@ -1319,7 +1297,7 @@ int add_block_group_free_space(struct btrfs_trans_handle *trans,
goto out;
}
- ret = __add_block_group_free_space(trans, fs_info, block_group, path);
+ ret = __add_block_group_free_space(trans, block_group, path);
out:
btrfs_free_path(path);
@@ -1330,10 +1308,9 @@ out:
}
int remove_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group)
{
- struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_root *root = trans->fs_info->free_space_root;
struct btrfs_path *path;
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
@@ -1341,7 +1318,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
int done = 0, nr;
int ret;
- if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
if (block_group->needs_free_space) {
@@ -1452,7 +1429,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
extent_start = offset;
} else if (prev_bit == 1 && bit == 0) {
total_found += add_new_free_space(block_group,
- fs_info,
extent_start,
offset);
if (total_found > CACHING_CTL_WAKE_UP) {
@@ -1466,8 +1442,8 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
}
}
if (prev_bit == 1) {
- total_found += add_new_free_space(block_group, fs_info,
- extent_start, end);
+ total_found += add_new_free_space(block_group, extent_start,
+ end);
extent_count++;
}
@@ -1524,8 +1500,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
caching_ctl->progress = key.objectid;
- total_found += add_new_free_space(block_group, fs_info,
- key.objectid,
+ total_found += add_new_free_space(block_group, key.objectid,
key.objectid + key.offset);
if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
index ba3787df43c3..3133651d7d70 100644
--- a/fs/btrfs/free-space-tree.h
+++ b/fs/btrfs/free-space-tree.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2015 Facebook. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __BTRFS_FREE_SPACE_TREE
-#define __BTRFS_FREE_SPACE_TREE
+#ifndef BTRFS_FREE_SPACE_TREE_H
+#define BTRFS_FREE_SPACE_TREE_H
/*
* The default size for new free space bitmap items. The last bitmap in a block
@@ -32,16 +19,12 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info);
int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info);
int load_free_space_tree(struct btrfs_caching_control *caching_ctl);
int add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group);
int remove_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group);
int add_to_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
u64 start, u64 size);
int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
u64 start, u64 size);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -51,19 +34,15 @@ search_free_space_info(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, int cow);
int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, u64 start, u64 size);
int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, u64 start, u64 size);
int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path);
int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path);
int free_space_test_bit(struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 1d5631ef2738..a8956a3c9e05 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include "ctree.h"
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 9409dcc7020d..12fcd8897c33 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/delay.h>
diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h
index 6734ec92a1e9..7a962811dffe 100644
--- a/fs/btrfs/inode-map.h
+++ b/fs/btrfs/inode-map.h
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __BTRFS_INODE_MAP
-#define __BTRFS_INODE_MAP
+
+#ifndef BTRFS_INODE_MAP_H
+#define BTRFS_INODE_MAP_H
void btrfs_init_free_ino_ctl(struct btrfs_root *root);
void btrfs_unpin_free_ino(struct btrfs_root *root);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1f091c2358a4..89b208201783 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/kernel.h>
@@ -44,6 +31,7 @@
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
+#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -1030,8 +1018,10 @@ static noinline int cow_file_range(struct inode *inode,
ram_size, /* ram_bytes */
BTRFS_COMPRESS_NONE, /* compress_type */
BTRFS_ORDERED_REGULAR /* type */);
- if (IS_ERR(em))
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
goto out_reserve;
+ }
free_extent_map(em);
ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
@@ -1168,13 +1158,10 @@ static noinline void async_cow_submit(struct btrfs_work *work)
nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
PAGE_SHIFT;
- /*
- * atomic_sub_return implies a barrier for waitqueue_active
- */
+ /* atomic_sub_return implies a barrier */
if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
- 5 * SZ_1M &&
- waitqueue_active(&fs_info->async_submit_wait))
- wake_up(&fs_info->async_submit_wait);
+ 5 * SZ_1M)
+ cond_wake_up_nomb(&fs_info->async_submit_wait);
if (async_cow->inode)
submit_compressed_extents(async_cow->inode, async_cow);
@@ -1385,6 +1372,13 @@ next_slot:
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi))
goto out_check;
+ /*
+ * Do the same check as in btrfs_cross_ref_exist but
+ * without the unnecessary search.
+ */
+ if (btrfs_file_extent_generation(leaf, fi) <=
+ btrfs_root_last_snapshot(&root->root_item))
+ goto out_check;
if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
goto out_check;
if (btrfs_extent_readonly(fs_info, disk_bytenr))
@@ -1754,24 +1748,32 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
spin_unlock(&root->delalloc_lock);
}
-static void btrfs_del_delalloc_inode(struct btrfs_root *root,
- struct btrfs_inode *inode)
+
+void __btrfs_del_delalloc_inode(struct btrfs_root *root,
+ struct btrfs_inode *inode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
- spin_lock(&root->delalloc_lock);
if (!list_empty(&inode->delalloc_inodes)) {
list_del_init(&inode->delalloc_inodes);
clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
&inode->runtime_flags);
root->nr_delalloc_inodes--;
if (!root->nr_delalloc_inodes) {
+ ASSERT(list_empty(&root->delalloc_inodes));
spin_lock(&fs_info->delalloc_root_lock);
BUG_ON(list_empty(&root->delalloc_root));
list_del_init(&root->delalloc_root);
spin_unlock(&fs_info->delalloc_root_lock);
}
}
+}
+
+static void btrfs_del_delalloc_inode(struct btrfs_root *root,
+ struct btrfs_inode *inode)
+{
+ spin_lock(&root->delalloc_lock);
+ __btrfs_del_delalloc_inode(root, inode);
spin_unlock(&root->delalloc_lock);
}
@@ -3163,6 +3165,9 @@ out:
/* once for the tree */
btrfs_put_ordered_extent(ordered_extent);
+ /* Try to release some metadata so we don't get an OOM but don't wait */
+ btrfs_btree_balance_dirty_nodelay(fs_info);
+
return ret;
}
@@ -3305,177 +3310,31 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
}
/*
- * This is called in transaction commit time. If there are no orphan
- * files in the subvolume, it removes orphan item and frees block_rsv
- * structure.
- */
-void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_rsv *block_rsv;
- int ret;
-
- if (atomic_read(&root->orphan_inodes) ||
- root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
- return;
-
- spin_lock(&root->orphan_lock);
- if (atomic_read(&root->orphan_inodes)) {
- spin_unlock(&root->orphan_lock);
- return;
- }
-
- if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
- spin_unlock(&root->orphan_lock);
- return;
- }
-
- block_rsv = root->orphan_block_rsv;
- root->orphan_block_rsv = NULL;
- spin_unlock(&root->orphan_lock);
-
- if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
- btrfs_root_refs(&root->root_item) > 0) {
- ret = btrfs_del_orphan_item(trans, fs_info->tree_root,
- root->root_key.objectid);
- if (ret)
- btrfs_abort_transaction(trans, ret);
- else
- clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
- &root->state);
- }
-
- if (block_rsv) {
- WARN_ON(block_rsv->size > 0);
- btrfs_free_block_rsv(fs_info, block_rsv);
- }
-}
-
-/*
- * This creates an orphan entry for the given inode in case something goes
- * wrong in the middle of an unlink/truncate.
- *
- * NOTE: caller of this function should reserve 5 units of metadata for
- * this function.
+ * This creates an orphan entry for the given inode in case something goes wrong
+ * in the middle of an unlink.
*/
int btrfs_orphan_add(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
+ struct btrfs_inode *inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
- struct btrfs_root *root = inode->root;
- struct btrfs_block_rsv *block_rsv = NULL;
- int reserve = 0;
- bool insert = false;
int ret;
- if (!root->orphan_block_rsv) {
- block_rsv = btrfs_alloc_block_rsv(fs_info,
- BTRFS_BLOCK_RSV_TEMP);
- if (!block_rsv)
- return -ENOMEM;
- }
-
- if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
- &inode->runtime_flags))
- insert = true;
-
- if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
- &inode->runtime_flags))
- reserve = 1;
-
- spin_lock(&root->orphan_lock);
- /* If someone has created ->orphan_block_rsv, be happy to use it. */
- if (!root->orphan_block_rsv) {
- root->orphan_block_rsv = block_rsv;
- } else if (block_rsv) {
- btrfs_free_block_rsv(fs_info, block_rsv);
- block_rsv = NULL;
- }
-
- if (insert)
- atomic_inc(&root->orphan_inodes);
- spin_unlock(&root->orphan_lock);
-
- /* grab metadata reservation from transaction handle */
- if (reserve) {
- ret = btrfs_orphan_reserve_metadata(trans, inode);
- ASSERT(!ret);
- if (ret) {
- /*
- * dec doesn't need spin_lock as ->orphan_block_rsv
- * would be released only if ->orphan_inodes is
- * zero.
- */
- atomic_dec(&root->orphan_inodes);
- clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
- &inode->runtime_flags);
- if (insert)
- clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
- &inode->runtime_flags);
- return ret;
- }
- }
-
- /* insert an orphan item to track this unlinked/truncated file */
- if (insert) {
- ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
- if (ret) {
- if (reserve) {
- clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
- &inode->runtime_flags);
- btrfs_orphan_release_metadata(inode);
- }
- /*
- * btrfs_orphan_commit_root may race with us and set
- * ->orphan_block_rsv to zero, in order to avoid that,
- * decrease ->orphan_inodes after everything is done.
- */
- atomic_dec(&root->orphan_inodes);
- if (ret != -EEXIST) {
- clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
- &inode->runtime_flags);
- btrfs_abort_transaction(trans, ret);
- return ret;
- }
- }
- ret = 0;
+ ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
+ if (ret && ret != -EEXIST) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
return 0;
}
/*
- * We have done the truncate/delete so we can go ahead and remove the orphan
- * item for this particular inode.
+ * We have done the delete so we can go ahead and remove the orphan item for
+ * this particular inode.
*/
static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
- struct btrfs_root *root = inode->root;
- int delete_item = 0;
- int ret = 0;
-
- if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
- &inode->runtime_flags))
- delete_item = 1;
-
- if (delete_item && trans)
- ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
-
- if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
- &inode->runtime_flags))
- btrfs_orphan_release_metadata(inode);
-
- /*
- * btrfs_orphan_commit_root may race with us and set ->orphan_block_rsv
- * to zero, in order to avoid that, decrease ->orphan_inodes after
- * everything is done.
- */
- if (delete_item)
- atomic_dec(&root->orphan_inodes);
-
- return ret;
+ return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
}
/*
@@ -3491,7 +3350,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
struct btrfs_trans_handle *trans;
struct inode *inode;
u64 last_objectid = 0;
- int ret = 0, nr_unlink = 0, nr_truncate = 0;
+ int ret = 0, nr_unlink = 0;
if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
return 0;
@@ -3591,12 +3450,31 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
key.offset = found_key.objectid - 1;
continue;
}
+
}
+
/*
- * Inode is already gone but the orphan item is still there,
- * kill the orphan item.
+ * If we have an inode with links, there are a couple of
+ * possibilities. Old kernels (before v3.12) used to create an
+ * orphan item for truncate indicating that there were possibly
+ * extent items past i_size that needed to be deleted. In v3.12,
+ * truncate was changed to update i_size in sync with the extent
+ * items, but the (useless) orphan item was still created. Since
+ * v4.18, we don't create the orphan item for truncate at all.
+ *
+ * So, this item could mean that we need to do a truncate, but
+ * only if this filesystem was last used on a pre-v3.12 kernel
+ * and was not cleanly unmounted. The odds of that are quite
+ * slim, and it's a pain to do the truncate now, so just delete
+ * the orphan item.
+ *
+ * It's also possible that this orphan item was supposed to be
+ * deleted but wasn't. The inode number may have been reused,
+ * but either way, we can delete the orphan item.
*/
- if (ret == -ENOENT) {
+ if (ret == -ENOENT || inode->i_nlink) {
+ if (!ret)
+ iput(inode);
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -3612,42 +3490,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
continue;
}
- /*
- * add this inode to the orphan list so btrfs_orphan_del does
- * the proper thing when we hit it
- */
- set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
- &BTRFS_I(inode)->runtime_flags);
- atomic_inc(&root->orphan_inodes);
-
- /* if we have links, this was a truncate, lets do that */
- if (inode->i_nlink) {
- if (WARN_ON(!S_ISREG(inode->i_mode))) {
- iput(inode);
- continue;
- }
- nr_truncate++;
-
- /* 1 for the orphan item deletion. */
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- iput(inode);
- ret = PTR_ERR(trans);
- goto out;
- }
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
- btrfs_end_transaction(trans);
- if (ret) {
- iput(inode);
- goto out;
- }
-
- ret = btrfs_truncate(inode, false);
- if (ret)
- btrfs_orphan_del(NULL, BTRFS_I(inode));
- } else {
- nr_unlink++;
- }
+ nr_unlink++;
/* this will do delete_inode and everything for us */
iput(inode);
@@ -3659,12 +3502,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
- if (root->orphan_block_rsv)
- btrfs_block_rsv_release(fs_info, root->orphan_block_rsv,
- (u64)-1);
-
- if (root->orphan_block_rsv ||
- test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
+ if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
trans = btrfs_join_transaction(root);
if (!IS_ERR(trans))
btrfs_end_transaction(trans);
@@ -3672,8 +3510,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
if (nr_unlink)
btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
- if (nr_truncate)
- btrfs_debug(fs_info, "truncated %d orphans", nr_truncate);
out:
if (ret)
@@ -3936,7 +3772,7 @@ cache_acl:
break;
}
- btrfs_update_iflags(inode);
+ btrfs_sync_inode_flags_to_i_flags(inode);
return 0;
make_bad:
@@ -4250,7 +4086,7 @@ out:
return ret;
}
-int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *dir, u64 objectid,
const char *name, int name_len)
@@ -4331,6 +4167,262 @@ out:
return ret;
}
+/*
+ * Helper to check if the subvolume references other subvolumes or if it's
+ * default.
+ */
+static noinline int may_destroy_subvol(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ u64 dir_id;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* Make sure this root isn't set as the default subvol */
+ dir_id = btrfs_super_root_dir(fs_info->super_copy);
+ di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
+ dir_id, "default", 7, 0);
+ if (di && !IS_ERR(di)) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
+ if (key.objectid == root->root_key.objectid) {
+ ret = -EPERM;
+ btrfs_err(fs_info,
+ "deleting default subvolume %llu is not allowed",
+ key.objectid);
+ goto out;
+ }
+ btrfs_release_path(path);
+ }
+
+ key.objectid = root->root_key.objectid;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0);
+
+ ret = 0;
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid == root->root_key.objectid &&
+ key.type == BTRFS_ROOT_REF_KEY)
+ ret = -ENOTEMPTY;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/* Delete all dentries for inodes belonging to the root */
+static void btrfs_prune_dentries(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct rb_node *node;
+ struct rb_node *prev;
+ struct btrfs_inode *entry;
+ struct inode *inode;
+ u64 objectid = 0;
+
+ if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+
+ spin_lock(&root->inode_lock);
+again:
+ node = root->inode_tree.rb_node;
+ prev = NULL;
+ while (node) {
+ prev = node;
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+ if (objectid < btrfs_ino(BTRFS_I(&entry->vfs_inode)))
+ node = node->rb_left;
+ else if (objectid > btrfs_ino(BTRFS_I(&entry->vfs_inode)))
+ node = node->rb_right;
+ else
+ break;
+ }
+ if (!node) {
+ while (prev) {
+ entry = rb_entry(prev, struct btrfs_inode, rb_node);
+ if (objectid <= btrfs_ino(BTRFS_I(&entry->vfs_inode))) {
+ node = prev;
+ break;
+ }
+ prev = rb_next(prev);
+ }
+ }
+ while (node) {
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+ objectid = btrfs_ino(BTRFS_I(&entry->vfs_inode)) + 1;
+ inode = igrab(&entry->vfs_inode);
+ if (inode) {
+ spin_unlock(&root->inode_lock);
+ if (atomic_read(&inode->i_count) > 1)
+ d_prune_aliases(inode);
+ /*
+ * btrfs_drop_inode will have it removed from the inode
+ * cache when its usage count hits zero.
+ */
+ iput(inode);
+ cond_resched();
+ spin_lock(&root->inode_lock);
+ goto again;
+ }
+
+ if (cond_resched_lock(&root->inode_lock))
+ goto again;
+
+ node = rb_next(node);
+ }
+ spin_unlock(&root->inode_lock);
+}
+
+int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct inode *inode = d_inode(dentry);
+ struct btrfs_root *dest = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_block_rsv block_rsv;
+ u64 root_flags;
+ int ret;
+ int err;
+
+ /*
+ * Don't allow to delete a subvolume with send in progress. This is
+ * inside the inode lock so the error handling that has to drop the bit
+ * again is not run concurrently.
+ */
+ spin_lock(&dest->root_item_lock);
+ root_flags = btrfs_root_flags(&dest->root_item);
+ if (dest->send_in_progress == 0) {
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags | BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
+ } else {
+ spin_unlock(&dest->root_item_lock);
+ btrfs_warn(fs_info,
+ "attempt to delete subvolume %llu during send",
+ dest->root_key.objectid);
+ return -EPERM;
+ }
+
+ down_write(&fs_info->subvol_sem);
+
+ err = may_destroy_subvol(dest);
+ if (err)
+ goto out_up_write;
+
+ btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
+ /*
+ * One for dir inode,
+ * two for dir entries,
+ * two for root ref/backref.
+ */
+ err = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
+ if (err)
+ goto out_up_write;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_release;
+ }
+ trans->block_rsv = &block_rsv;
+ trans->bytes_reserved = block_rsv.size;
+
+ btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
+
+ ret = btrfs_unlink_subvol(trans, root, dir,
+ dest->root_key.objectid,
+ dentry->d_name.name,
+ dentry->d_name.len);
+ if (ret) {
+ err = ret;
+ btrfs_abort_transaction(trans, ret);
+ goto out_end_trans;
+ }
+
+ btrfs_record_root_in_trans(trans, dest);
+
+ memset(&dest->root_item.drop_progress, 0,
+ sizeof(dest->root_item.drop_progress));
+ dest->root_item.drop_level = 0;
+ btrfs_set_root_refs(&dest->root_item, 0);
+
+ if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
+ ret = btrfs_insert_orphan_item(trans,
+ fs_info->tree_root,
+ dest->root_key.objectid);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ err = ret;
+ goto out_end_trans;
+ }
+ }
+
+ ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
+ BTRFS_UUID_KEY_SUBVOL,
+ dest->root_key.objectid);
+ if (ret && ret != -ENOENT) {
+ btrfs_abort_transaction(trans, ret);
+ err = ret;
+ goto out_end_trans;
+ }
+ if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
+ ret = btrfs_uuid_tree_remove(trans,
+ dest->root_item.received_uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ dest->root_key.objectid);
+ if (ret && ret != -ENOENT) {
+ btrfs_abort_transaction(trans, ret);
+ err = ret;
+ goto out_end_trans;
+ }
+ }
+
+out_end_trans:
+ trans->block_rsv = NULL;
+ trans->bytes_reserved = 0;
+ ret = btrfs_end_transaction(trans);
+ if (ret && !err)
+ err = ret;
+ inode->i_flags |= S_DEAD;
+out_release:
+ btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+out_up_write:
+ up_write(&fs_info->subvol_sem);
+ if (err) {
+ spin_lock(&dest->root_item_lock);
+ root_flags = btrfs_root_flags(&dest->root_item);
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
+ } else {
+ d_invalidate(dentry);
+ btrfs_prune_dentries(dest);
+ ASSERT(dest->send_in_progress == 0);
+
+ /* the last ref */
+ if (dest->ino_cache_inode) {
+ iput(dest->ino_cache_inode);
+ dest->ino_cache_inode = NULL;
+ }
+ }
+
+ return err;
+}
+
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
@@ -4342,7 +4434,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID)
- return -EPERM;
+ return btrfs_delete_subvolume(dir, dentry);
trans = __unlink_start_trans(dir);
if (IS_ERR(trans))
@@ -4454,7 +4546,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
int pending_del_slot = 0;
int extent_type = -1;
int ret;
- int err = 0;
u64 ino = btrfs_ino(BTRFS_I(inode));
u64 bytes_deleted = 0;
bool be_nice = false;
@@ -4506,22 +4597,19 @@ search_again:
* up a huge file in a single leaf. Most of the time that
* bytes_deleted is > 0, it will be huge by the time we get here
*/
- if (be_nice && bytes_deleted > SZ_32M) {
- if (btrfs_should_end_transaction(trans)) {
- err = -EAGAIN;
- goto error;
- }
+ if (be_nice && bytes_deleted > SZ_32M &&
+ btrfs_should_end_transaction(trans)) {
+ ret = -EAGAIN;
+ goto out;
}
-
path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
if (ret > 0) {
+ ret = 0;
/* there are no items in the tree for us to truncate, we're
* done
*/
@@ -4632,7 +4720,7 @@ search_again:
* We have to bail so the last_size is set to
* just before this extent.
*/
- err = NEED_TRUNCATE_BLOCK;
+ ret = NEED_TRUNCATE_BLOCK;
break;
}
@@ -4671,7 +4759,10 @@ delete:
extent_num_bytes, 0,
btrfs_header_owner(leaf),
ino, extent_offset);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
if (btrfs_should_throttle_delayed_refs(trans, fs_info))
btrfs_async_run_delayed_refs(fs_info,
trans->delayed_ref_updates * 2,
@@ -4699,7 +4790,7 @@ delete:
pending_del_nr);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto error;
+ break;
}
pending_del_nr = 0;
}
@@ -4710,8 +4801,8 @@ delete:
trans->delayed_ref_updates = 0;
ret = btrfs_run_delayed_refs(trans,
updates * 2);
- if (ret && !err)
- err = ret;
+ if (ret)
+ break;
}
}
/*
@@ -4719,8 +4810,8 @@ delete:
* and let the transaction restart
*/
if (should_end) {
- err = -EAGAIN;
- goto error;
+ ret = -EAGAIN;
+ break;
}
goto search_again;
} else {
@@ -4728,32 +4819,37 @@ delete:
}
}
out:
- if (pending_del_nr) {
- ret = btrfs_del_items(trans, root, path, pending_del_slot,
+ if (ret >= 0 && pending_del_nr) {
+ int err;
+
+ err = btrfs_del_items(trans, root, path, pending_del_slot,
pending_del_nr);
- if (ret)
- btrfs_abort_transaction(trans, ret);
+ if (err) {
+ btrfs_abort_transaction(trans, err);
+ ret = err;
+ }
}
-error:
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
ASSERT(last_size >= new_size);
- if (!err && last_size > new_size)
+ if (!ret && last_size > new_size)
last_size = new_size;
btrfs_ordered_update_i_size(inode, last_size, NULL);
}
btrfs_free_path(path);
- if (be_nice && bytes_deleted > SZ_32M) {
+ if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) {
unsigned long updates = trans->delayed_ref_updates;
+ int err;
+
if (updates) {
trans->delayed_ref_updates = 0;
- ret = btrfs_run_delayed_refs(trans, updates * 2);
- if (ret && !err)
- err = ret;
+ err = btrfs_run_delayed_refs(trans, updates * 2);
+ if (err)
+ ret = err;
}
}
- return err;
+ return ret;
}
/*
@@ -5095,30 +5191,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
&BTRFS_I(inode)->runtime_flags);
- /*
- * 1 for the orphan item we're going to add
- * 1 for the orphan item deletion.
- */
- trans = btrfs_start_transaction(root, 2);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- /*
- * We need to do this in case we fail at _any_ point during the
- * actual truncate. Once we do the truncate_setsize we could
- * invalidate pages which forces any outstanding ordered io to
- * be instantly completed which will give us extents that need
- * to be truncated. If we fail to get an orphan inode down we
- * could have left over extents that were never meant to live,
- * so we need to guarantee from this point on that everything
- * will be consistent.
- */
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
- btrfs_end_transaction(trans);
- if (ret)
- return ret;
-
- /* we don't support swapfiles, so vmtruncate shouldn't fail */
truncate_setsize(inode, newsize);
/* Disable nonlocked read DIO to avoid the end less truncate */
@@ -5130,29 +5202,16 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
if (ret && inode->i_nlink) {
int err;
- /* To get a stable disk_i_size */
- err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
- if (err) {
- btrfs_orphan_del(NULL, BTRFS_I(inode));
- return err;
- }
-
/*
- * failed to truncate, disk_i_size is only adjusted down
- * as we remove extents, so it should represent the true
- * size of the inode, so reset the in memory size and
- * delete our orphan entry.
+ * Truncate failed, so fix up the in-memory size. We
+ * adjusted disk_i_size down as we removed extents, so
+ * wait for disk_i_size to be stable and then update the
+ * in-memory size to match.
*/
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- btrfs_orphan_del(NULL, BTRFS_I(inode));
- return ret;
- }
- i_size_write(inode, BTRFS_I(inode)->disk_i_size);
- err = btrfs_orphan_del(trans, BTRFS_I(inode));
+ err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (err)
- btrfs_abort_transaction(trans, err);
- btrfs_end_transaction(trans);
+ return err;
+ i_size_write(inode, BTRFS_I(inode)->disk_i_size);
}
}
@@ -5282,13 +5341,52 @@ static void evict_inode_truncate_pages(struct inode *inode)
spin_unlock(&io_tree->lock);
}
+static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ u64 min_size)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ int failures = 0;
+
+ for (;;) {
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ ret = btrfs_block_rsv_refill(root, rsv, min_size,
+ BTRFS_RESERVE_FLUSH_LIMIT);
+
+ if (ret && ++failures > 2) {
+ btrfs_warn(fs_info,
+ "could not allocate space for a delete; will truncate on mount");
+ return ERR_PTR(-ENOSPC);
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans) || !ret)
+ return trans;
+
+ /*
+ * Try to steal from the global reserve if there is space for
+ * it.
+ */
+ if (!btrfs_check_space_for_delayed_refs(trans, fs_info) &&
+ !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0))
+ return trans;
+
+ /* If not, commit and try again. */
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+}
+
void btrfs_evict_inode(struct inode *inode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_block_rsv *rsv, *global_rsv;
- int steal_from_global = 0;
+ struct btrfs_block_rsv *rsv;
u64 min_size;
int ret;
@@ -5309,21 +5407,16 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_is_free_space_inode(BTRFS_I(inode))))
goto no_delete;
- if (is_bad_inode(inode)) {
- btrfs_orphan_del(NULL, BTRFS_I(inode));
+ if (is_bad_inode(inode))
goto no_delete;
- }
/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
if (!special_file(inode->i_mode))
btrfs_wait_ordered_range(inode, 0, (u64)-1);
btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
- if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
- BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
- &BTRFS_I(inode)->runtime_flags));
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
goto no_delete;
- }
if (inode->i_nlink > 0) {
BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
@@ -5332,130 +5425,63 @@ void btrfs_evict_inode(struct inode *inode)
}
ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
- if (ret) {
- btrfs_orphan_del(NULL, BTRFS_I(inode));
+ if (ret)
goto no_delete;
- }
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
- if (!rsv) {
- btrfs_orphan_del(NULL, BTRFS_I(inode));
+ if (!rsv)
goto no_delete;
- }
rsv->size = min_size;
rsv->failfast = 1;
- global_rsv = &fs_info->global_block_rsv;
btrfs_i_size_write(BTRFS_I(inode), 0);
- /*
- * This is a bit simpler than btrfs_truncate since we've already
- * reserved our space for our orphan item in the unlink, so we just
- * need to reserve some slack space in case we add bytes and update
- * inode item when doing the truncate.
- */
while (1) {
- ret = btrfs_block_rsv_refill(root, rsv, min_size,
- BTRFS_RESERVE_FLUSH_LIMIT);
-
- /*
- * Try and steal from the global reserve since we will
- * likely not use this space anyway, we want to try as
- * hard as possible to get this to work.
- */
- if (ret)
- steal_from_global++;
- else
- steal_from_global = 0;
- ret = 0;
-
- /*
- * steal_from_global == 0: we reserved stuff, hooray!
- * steal_from_global == 1: we didn't reserve stuff, boo!
- * steal_from_global == 2: we've committed, still not a lot of
- * room but maybe we'll have room in the global reserve this
- * time.
- * steal_from_global == 3: abandon all hope!
- */
- if (steal_from_global > 2) {
- btrfs_warn(fs_info,
- "Could not get space for a delete, will truncate on mount %d",
- ret);
- btrfs_orphan_del(NULL, BTRFS_I(inode));
- btrfs_free_block_rsv(fs_info, rsv);
- goto no_delete;
- }
-
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- btrfs_orphan_del(NULL, BTRFS_I(inode));
- btrfs_free_block_rsv(fs_info, rsv);
- goto no_delete;
- }
-
- /*
- * We can't just steal from the global reserve, we need to make
- * sure there is room to do it, if not we need to commit and try
- * again.
- */
- if (steal_from_global) {
- if (!btrfs_check_space_for_delayed_refs(trans, fs_info))
- ret = btrfs_block_rsv_migrate(global_rsv, rsv,
- min_size, 0);
- else
- ret = -ENOSPC;
- }
-
- /*
- * Couldn't steal from the global reserve, we have too much
- * pending stuff built up, commit the transaction and try it
- * again.
- */
- if (ret) {
- ret = btrfs_commit_transaction(trans);
- if (ret) {
- btrfs_orphan_del(NULL, BTRFS_I(inode));
- btrfs_free_block_rsv(fs_info, rsv);
- goto no_delete;
- }
- continue;
- } else {
- steal_from_global = 0;
- }
+ trans = evict_refill_and_join(root, rsv, min_size);
+ if (IS_ERR(trans))
+ goto free_rsv;
trans->block_rsv = rsv;
ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
- if (ret != -ENOSPC && ret != -EAGAIN)
- break;
-
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
- trans = NULL;
btrfs_btree_balance_dirty(fs_info);
+ if (ret && ret != -ENOSPC && ret != -EAGAIN)
+ goto free_rsv;
+ else if (!ret)
+ break;
}
- btrfs_free_block_rsv(fs_info, rsv);
-
/*
- * Errors here aren't a big deal, it just means we leave orphan items
- * in the tree. They will be cleaned up on the next mount.
+ * Errors here aren't a big deal, it just means we leave orphan items in
+ * the tree. They will be cleaned up on the next mount. If the inode
+ * number gets reused, cleanup deletes the orphan item without doing
+ * anything, and unlink reuses the existing orphan item.
+ *
+ * If it turns out that we are dropping too many of these, we might want
+ * to add a mechanism for retrying these after a commit.
*/
- if (ret == 0) {
- trans->block_rsv = root->orphan_block_rsv;
+ trans = evict_refill_and_join(root, rsv, min_size);
+ if (!IS_ERR(trans)) {
+ trans->block_rsv = rsv;
btrfs_orphan_del(trans, BTRFS_I(inode));
- } else {
- btrfs_orphan_del(NULL, BTRFS_I(inode));
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ btrfs_end_transaction(trans);
}
- trans->block_rsv = &fs_info->trans_block_rsv;
if (!(root == fs_info->tree_root ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode)));
- btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(fs_info);
+free_rsv:
+ btrfs_free_block_rsv(fs_info, rsv);
no_delete:
+ /*
+ * If we didn't successfully delete, the orphan item will still be in
+ * the tree and we'll retry on the next mount. Again, we might also want
+ * to retry these periodically in the future.
+ */
btrfs_remove_delayed_node(BTRFS_I(inode));
clear_inode(inode);
}
@@ -5631,69 +5657,6 @@ static void inode_tree_del(struct inode *inode)
}
}
-void btrfs_invalidate_inodes(struct btrfs_root *root)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct rb_node *node;
- struct rb_node *prev;
- struct btrfs_inode *entry;
- struct inode *inode;
- u64 objectid = 0;
-
- if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
- WARN_ON(btrfs_root_refs(&root->root_item) != 0);
-
- spin_lock(&root->inode_lock);
-again:
- node = root->inode_tree.rb_node;
- prev = NULL;
- while (node) {
- prev = node;
- entry = rb_entry(node, struct btrfs_inode, rb_node);
-
- if (objectid < btrfs_ino(BTRFS_I(&entry->vfs_inode)))
- node = node->rb_left;
- else if (objectid > btrfs_ino(BTRFS_I(&entry->vfs_inode)))
- node = node->rb_right;
- else
- break;
- }
- if (!node) {
- while (prev) {
- entry = rb_entry(prev, struct btrfs_inode, rb_node);
- if (objectid <= btrfs_ino(BTRFS_I(&entry->vfs_inode))) {
- node = prev;
- break;
- }
- prev = rb_next(prev);
- }
- }
- while (node) {
- entry = rb_entry(node, struct btrfs_inode, rb_node);
- objectid = btrfs_ino(BTRFS_I(&entry->vfs_inode)) + 1;
- inode = igrab(&entry->vfs_inode);
- if (inode) {
- spin_unlock(&root->inode_lock);
- if (atomic_read(&inode->i_count) > 1)
- d_prune_aliases(inode);
- /*
- * btrfs_drop_inode will have it removed from
- * the inode cache when its usage count
- * hits zero.
- */
- iput(inode);
- cond_resched();
- spin_lock(&root->inode_lock);
- goto again;
- }
-
- if (cond_resched_lock(&root->inode_lock))
- goto again;
-
- node = rb_next(node);
- }
- spin_unlock(&root->inode_lock);
-}
static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
@@ -5855,11 +5818,6 @@ static int btrfs_dentry_delete(const struct dentry *dentry)
return 0;
}
-static void btrfs_dentry_release(struct dentry *dentry)
-{
- kfree(dentry->d_fsdata);
-}
-
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
@@ -5918,11 +5876,13 @@ static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
struct dir_entry *entry = addr;
char *name = (char *)(entry + 1);
- ctx->pos = entry->offset;
- if (!dir_emit(ctx, name, entry->name_len, entry->ino,
- entry->type))
+ ctx->pos = get_unaligned(&entry->offset);
+ if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
+ get_unaligned(&entry->ino),
+ get_unaligned(&entry->type)))
return 1;
- addr += sizeof(struct dir_entry) + entry->name_len;
+ addr += sizeof(struct dir_entry) +
+ get_unaligned(&entry->name_len);
ctx->pos++;
}
return 0;
@@ -6012,14 +5972,15 @@ again:
}
entry = addr;
- entry->name_len = name_len;
+ put_unaligned(name_len, &entry->name_len);
name_ptr = (char *)(entry + 1);
read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
name_len);
- entry->type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
+ put_unaligned(btrfs_filetype_table[btrfs_dir_type(leaf, di)],
+ &entry->type);
btrfs_dir_item_key_to_cpu(leaf, di, &location);
- entry->ino = location.objectid;
- entry->offset = found_key.offset;
+ put_unaligned(location.objectid, &entry->ino);
+ put_unaligned(found_key.offset, &entry->offset);
entries++;
addr += sizeof(struct dir_entry) + name_len;
total_len += sizeof(struct dir_entry) + name_len;
@@ -6272,7 +6233,7 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
}
- btrfs_update_iflags(inode);
+ btrfs_sync_inode_flags_to_i_flags(inode);
}
static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
@@ -6588,8 +6549,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
goto out_unlock_inode;
} else {
btrfs_update_inode(trans, root, inode);
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
}
out_unlock:
@@ -6665,8 +6625,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
goto out_unlock_inode;
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
out_unlock:
btrfs_end_transaction(trans);
@@ -6709,8 +6668,9 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
* 2 items for inode and inode ref
* 2 items for dir items
* 1 item for parent inode
+ * 1 item for orphan item deletion if O_TMPFILE
*/
- trans = btrfs_start_transaction(root, 5);
+ trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
trans = NULL;
@@ -6811,12 +6771,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
if (err)
goto out_fail_inode;
- d_instantiate(dentry, inode);
- /*
- * mkdir is special. We're unlocking after we call d_instantiate
- * to avoid a race with nfsd calling d_instantiate.
- */
- unlock_new_inode(inode);
+ d_instantiate_new(dentry, inode);
drop_on_err = 0;
out_fail:
@@ -7092,7 +7047,7 @@ insert:
err = 0;
write_lock(&em_tree->lock);
- err = btrfs_add_extent_mapping(em_tree, &em, start, len);
+ err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
out:
@@ -7377,6 +7332,14 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
btrfs_file_extent_other_encoding(leaf, fi))
goto out;
+ /*
+ * Do the same check as in btrfs_cross_ref_exist but without the
+ * unnecessary search.
+ */
+ if (btrfs_file_extent_generation(leaf, fi) <=
+ btrfs_root_last_snapshot(&root->root_item))
+ goto out;
+
backref_offset = btrfs_file_extent_offset(leaf, fi);
if (orig_start) {
@@ -7577,6 +7540,125 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
return em;
}
+
+static int btrfs_get_blocks_direct_read(struct extent_map *em,
+ struct buffer_head *bh_result,
+ struct inode *inode,
+ u64 start, u64 len)
+{
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ return -ENOENT;
+
+ len = min(len, em->len - (start - em->start));
+
+ bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+ inode->i_blkbits;
+ bh_result->b_size = len;
+ bh_result->b_bdev = em->bdev;
+ set_buffer_mapped(bh_result);
+
+ return 0;
+}
+
+static int btrfs_get_blocks_direct_write(struct extent_map **map,
+ struct buffer_head *bh_result,
+ struct inode *inode,
+ struct btrfs_dio_data *dio_data,
+ u64 start, u64 len)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct extent_map *em = *map;
+ int ret = 0;
+
+ /*
+ * We don't allocate a new extent in the following cases
+ *
+ * 1) The inode is marked as NODATACOW. In this case we'll just use the
+ * existing extent.
+ * 2) The extent is marked as PREALLOC. We're good to go here and can
+ * just use the extent.
+ *
+ */
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ em->block_start != EXTENT_MAP_HOLE)) {
+ int type;
+ u64 block_start, orig_start, orig_block_len, ram_bytes;
+
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ type = BTRFS_ORDERED_PREALLOC;
+ else
+ type = BTRFS_ORDERED_NOCOW;
+ len = min(len, em->len - (start - em->start));
+ block_start = em->block_start + (start - em->start);
+
+ if (can_nocow_extent(inode, start, &len, &orig_start,
+ &orig_block_len, &ram_bytes) == 1 &&
+ btrfs_inc_nocow_writers(fs_info, block_start)) {
+ struct extent_map *em2;
+
+ em2 = btrfs_create_dio_extent(inode, start, len,
+ orig_start, block_start,
+ len, orig_block_len,
+ ram_bytes, type);
+ btrfs_dec_nocow_writers(fs_info, block_start);
+ if (type == BTRFS_ORDERED_PREALLOC) {
+ free_extent_map(em);
+ *map = em = em2;
+ }
+
+ if (em2 && IS_ERR(em2)) {
+ ret = PTR_ERR(em2);
+ goto out;
+ }
+ /*
+ * For inode marked NODATACOW or extent marked PREALLOC,
+ * use the existing or preallocated extent, so does not
+ * need to adjust btrfs_space_info's bytes_may_use.
+ */
+ btrfs_free_reserved_data_space_noquota(inode, start,
+ len);
+ goto skip_cow;
+ }
+ }
+
+ /* this will cow the extent */
+ len = bh_result->b_size;
+ free_extent_map(em);
+ *map = em = btrfs_new_extent_direct(inode, start, len);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+
+ len = min(len, em->len - (start - em->start));
+
+skip_cow:
+ bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+ inode->i_blkbits;
+ bh_result->b_size = len;
+ bh_result->b_bdev = em->bdev;
+ set_buffer_mapped(bh_result);
+
+ if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ set_buffer_new(bh_result);
+
+ /*
+ * Need to update the i_size under the extent lock so buffered
+ * readers will get the updated i_size when we unlock.
+ */
+ if (!dio_data->overwrite && start + len > i_size_read(inode))
+ i_size_write(inode, start + len);
+
+ WARN_ON(dio_data->reserve < len);
+ dio_data->reserve -= len;
+ dio_data->unsubmitted_oe_range_end = start + len;
+ current->journal_info = dio_data;
+out:
+ return ret;
+}
+
static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
@@ -7645,116 +7727,36 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
goto unlock_err;
}
- /* Just a good old fashioned hole, return */
- if (!create && (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- free_extent_map(em);
- goto unlock_err;
- }
-
- /*
- * We don't allocate a new extent in the following cases
- *
- * 1) The inode is marked as NODATACOW. In this case we'll just use the
- * existing extent.
- * 2) The extent is marked as PREALLOC. We're good to go here and can
- * just use the extent.
- *
- */
- if (!create) {
- len = min(len, em->len - (start - em->start));
- lockstart = start + len;
- goto unlock;
- }
-
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
- ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
- em->block_start != EXTENT_MAP_HOLE)) {
- int type;
- u64 block_start, orig_start, orig_block_len, ram_bytes;
-
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- type = BTRFS_ORDERED_PREALLOC;
- else
- type = BTRFS_ORDERED_NOCOW;
- len = min(len, em->len - (start - em->start));
- block_start = em->block_start + (start - em->start);
-
- if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes) == 1 &&
- btrfs_inc_nocow_writers(fs_info, block_start)) {
- struct extent_map *em2;
-
- em2 = btrfs_create_dio_extent(inode, start, len,
- orig_start, block_start,
- len, orig_block_len,
- ram_bytes, type);
- btrfs_dec_nocow_writers(fs_info, block_start);
- if (type == BTRFS_ORDERED_PREALLOC) {
- free_extent_map(em);
- em = em2;
- }
- if (em2 && IS_ERR(em2)) {
- ret = PTR_ERR(em2);
- goto unlock_err;
- }
- /*
- * For inode marked NODATACOW or extent marked PREALLOC,
- * use the existing or preallocated extent, so does not
- * need to adjust btrfs_space_info's bytes_may_use.
- */
- btrfs_free_reserved_data_space_noquota(inode,
- start, len);
- goto unlock;
- }
- }
-
- /*
- * this will cow the extent, reset the len in case we changed
- * it above
- */
- len = bh_result->b_size;
- free_extent_map(em);
- em = btrfs_new_extent_direct(inode, start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto unlock_err;
- }
- len = min(len, em->len - (start - em->start));
-unlock:
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = em->bdev;
- set_buffer_mapped(bh_result);
if (create) {
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- set_buffer_new(bh_result);
+ ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
+ dio_data, start, len);
+ if (ret < 0)
+ goto unlock_err;
+ /* clear and unlock the entire range */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ unlock_bits, 1, 0, &cached_state);
+ } else {
+ ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
+ start, len);
+ /* Can be negative only if we read from a hole */
+ if (ret < 0) {
+ ret = 0;
+ free_extent_map(em);
+ goto unlock_err;
+ }
/*
- * Need to update the i_size under the extent lock so buffered
- * readers will get the updated i_size when we unlock.
+ * We need to unlock only the end area that we aren't using.
+ * The rest is going to be unlocked by the endio routine.
*/
- if (!dio_data->overwrite && start + len > i_size_read(inode))
- i_size_write(inode, start + len);
-
- WARN_ON(dio_data->reserve < len);
- dio_data->reserve -= len;
- dio_data->unsubmitted_oe_range_end = start + len;
- current->journal_info = dio_data;
- }
-
- /*
- * In the case of write we need to clear and unlock the entire range,
- * in the case of read we need to unlock only the end area that we
- * aren't using if there is any left over space.
- */
- if (lockstart < lockend) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, unlock_bits, 1, 0,
- &cached_state);
- } else {
- free_extent_state(cached_state);
+ lockstart = start + bh_result->b_size;
+ if (lockstart < lockend) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, unlock_bits, 1, 0,
+ &cached_state);
+ } else {
+ free_extent_state(cached_state);
+ }
}
free_extent_map(em);
@@ -8140,7 +8142,6 @@ static void __endio_write_update_ordered(struct inode *inode,
u64 ordered_offset = offset;
u64 ordered_bytes = bytes;
u64 last_offset;
- int ret;
if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
wq = fs_info->endio_freespace_worker;
@@ -8150,32 +8151,31 @@ static void __endio_write_update_ordered(struct inode *inode,
func = btrfs_endio_write_helper;
}
-again:
- last_offset = ordered_offset;
- ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
- &ordered_offset,
- ordered_bytes,
- uptodate);
- if (!ret)
- goto out_test;
-
- btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL);
- btrfs_queue_work(wq, &ordered->work);
-out_test:
- /*
- * If btrfs_dec_test_ordered_pending does not find any ordered extent
- * in the range, we can exit.
- */
- if (ordered_offset == last_offset)
- return;
- /*
- * our bio might span multiple ordered extents. If we haven't
- * completed the accounting for the whole dio, go back and try again
- */
- if (ordered_offset < offset + bytes) {
- ordered_bytes = offset + bytes - ordered_offset;
- ordered = NULL;
- goto again;
+ while (ordered_offset < offset + bytes) {
+ last_offset = ordered_offset;
+ if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
+ &ordered_offset,
+ ordered_bytes,
+ uptodate)) {
+ btrfs_init_work(&ordered->work, func,
+ finish_ordered_fn,
+ NULL, NULL);
+ btrfs_queue_work(wq, &ordered->work);
+ }
+ /*
+ * If btrfs_dec_test_ordered_pending does not find any ordered
+ * extent in the range, we can exit.
+ */
+ if (ordered_offset == last_offset)
+ return;
+ /*
+ * Our bio might span multiple ordered extents. In this case
+ * we keep goin until we have accounted the whole dio.
+ */
+ if (ordered_offset < offset + bytes) {
+ ordered_bytes = offset + bytes - ordered_offset;
+ ordered = NULL;
+ }
}
}
@@ -8714,29 +8714,19 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
static int btrfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct extent_io_tree *tree;
-
- tree = &BTRFS_I(mapping->host)->io_tree;
- return extent_writepages(tree, mapping, wbc);
+ return extent_writepages(mapping, wbc);
}
static int
btrfs_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
- struct extent_io_tree *tree;
- tree = &BTRFS_I(mapping->host)->io_tree;
- return extent_readpages(tree, mapping, pages, nr_pages);
+ return extent_readpages(mapping, pages, nr_pages);
}
+
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
- struct extent_io_tree *tree;
- struct extent_map_tree *map;
- int ret;
-
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- map = &BTRFS_I(page->mapping->host)->extent_tree;
- ret = try_release_extent_mapping(map, tree, page, gfp_flags);
+ int ret = try_release_extent_mapping(page, gfp_flags);
if (ret == 1) {
ClearPagePrivate(page);
set_page_private(page, 0);
@@ -8877,8 +8867,8 @@ again:
*
* We are not allowed to take the i_mutex here so we have to play games to
* protect against truncate races as the page could now be beyond EOF. Because
- * vmtruncate() writes the inode size before removing pages, once we have the
- * page lock we can determine safely if the page is beyond EOF. If it is not
+ * truncate_setsize() writes the inode size before removing pages, once we have
+ * the page lock we can determine safely if the page is beyond EOF. If it is not
* beyond EOF, then the page is guaranteed safe against truncation until we
* unlock the page.
*/
@@ -9040,8 +9030,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv;
- int ret = 0;
- int err = 0;
+ int ret;
struct btrfs_trans_handle *trans;
u64 mask = fs_info->sectorsize - 1;
u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
@@ -9054,39 +9043,31 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
}
/*
- * Yes ladies and gentlemen, this is indeed ugly. The fact is we have
- * 3 things going on here
- *
- * 1) We need to reserve space for our orphan item and the space to
- * delete our orphan item. Lord knows we don't want to have a dangling
- * orphan item because we didn't reserve space to remove it.
+ * Yes ladies and gentlemen, this is indeed ugly. We have a couple of
+ * things going on here:
*
- * 2) We need to reserve space to update our inode.
+ * 1) We need to reserve space to update our inode.
*
- * 3) We need to have something to cache all the space that is going to
+ * 2) We need to have something to cache all the space that is going to
* be free'd up by the truncate operation, but also have some slack
* space reserved in case it uses space during the truncate (thank you
* very much snapshotting).
*
- * And we need these to all be separate. The fact is we can use a lot of
+ * And we need these to be separate. The fact is we can use a lot of
* space doing the truncate, and we have no earthly idea how much space
* we will use, so we need the truncate reservation to be separate so it
- * doesn't end up using space reserved for updating the inode or
- * removing the orphan item. We also need to be able to stop the
- * transaction and start a new one, which means we need to be able to
- * update the inode several times, and we have no idea of knowing how
- * many times that will be, so we can't just reserve 1 item for the
- * entirety of the operation, so that has to be done separately as well.
- * Then there is the orphan item, which does indeed need to be held on
- * to for the whole operation, and we need nobody to touch this reserved
- * space except the orphan code.
+ * doesn't end up using space reserved for updating the inode. We also
+ * need to be able to stop the transaction and start a new one, which
+ * means we need to be able to update the inode several times, and we
+ * have no idea of knowing how many times that will be, so we can't just
+ * reserve 1 item for the entirety of the operation, so that has to be
+ * done separately as well.
*
* So that leaves us with
*
- * 1) root->orphan_block_rsv - for the orphan deletion.
- * 2) rsv - for the truncate reservation, which we will steal from the
+ * 1) rsv - for the truncate reservation, which we will steal from the
* transaction reservation.
- * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
+ * 2) fs_info->trans_block_rsv - this will have 1 items worth left for
* updating the inode.
*/
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
@@ -9101,7 +9082,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
*/
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out;
}
@@ -9125,23 +9106,19 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
inode->i_size,
BTRFS_EXTENT_DATA_KEY);
trans->block_rsv = &fs_info->trans_block_rsv;
- if (ret != -ENOSPC && ret != -EAGAIN) {
- err = ret;
+ if (ret != -ENOSPC && ret != -EAGAIN)
break;
- }
ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- err = ret;
+ if (ret)
break;
- }
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans)) {
- ret = err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
trans = NULL;
break;
}
@@ -9174,29 +9151,23 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
}
- if (ret == 0 && inode->i_nlink > 0) {
- trans->block_rsv = root->orphan_block_rsv;
- ret = btrfs_orphan_del(trans, BTRFS_I(inode));
- if (ret)
- err = ret;
- }
-
if (trans) {
+ int ret2;
+
trans->block_rsv = &fs_info->trans_block_rsv;
- ret = btrfs_update_inode(trans, root, inode);
- if (ret && !err)
- err = ret;
+ ret2 = btrfs_update_inode(trans, root, inode);
+ if (ret2 && !ret)
+ ret = ret2;
- ret = btrfs_end_transaction(trans);
+ ret2 = btrfs_end_transaction(trans);
+ if (ret2 && !ret)
+ ret = ret2;
btrfs_btree_balance_dirty(fs_info);
}
out:
btrfs_free_block_rsv(fs_info, rsv);
- if (ret && !err)
- err = ret;
-
- return err;
+ return ret;
}
/*
@@ -9332,13 +9303,6 @@ void btrfs_destroy_inode(struct inode *inode)
if (!root)
goto free;
- if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
- &BTRFS_I(inode)->runtime_flags)) {
- btrfs_info(fs_info, "inode %llu still on the orphan list",
- btrfs_ino(BTRFS_I(inode)));
- atomic_dec(&root->orphan_inodes);
- }
-
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
if (!ordered)
@@ -9972,6 +9936,13 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
}
+struct btrfs_delalloc_work {
+ struct inode *inode;
+ struct completion completion;
+ struct list_head list;
+ struct btrfs_work work;
+};
+
static void btrfs_run_delalloc_work(struct btrfs_work *work)
{
struct btrfs_delalloc_work *delalloc_work;
@@ -9985,15 +9956,11 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work)
&BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping);
- if (delalloc_work->delay_iput)
- btrfs_add_delayed_iput(inode);
- else
- iput(inode);
+ iput(inode);
complete(&delalloc_work->completion);
}
-struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
- int delay_iput)
+static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
{
struct btrfs_delalloc_work *work;
@@ -10004,7 +9971,6 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- work->delay_iput = delay_iput;
WARN_ON_ONCE(!inode);
btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
btrfs_run_delalloc_work, NULL, NULL);
@@ -10012,18 +9978,11 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
return work;
}
-void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
-{
- wait_for_completion(&work->completion);
- kfree(work);
-}
-
/*
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
- int nr)
+static int start_delalloc_inodes(struct btrfs_root *root, int nr)
{
struct btrfs_inode *binode;
struct inode *inode;
@@ -10051,12 +10010,9 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
}
spin_unlock(&root->delalloc_lock);
- work = btrfs_alloc_delalloc_work(inode, delay_iput);
+ work = btrfs_alloc_delalloc_work(inode);
if (!work) {
- if (delay_iput)
- btrfs_add_delayed_iput(inode);
- else
- iput(inode);
+ iput(inode);
ret = -ENOMEM;
goto out;
}
@@ -10074,10 +10030,11 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
out:
list_for_each_entry_safe(work, next, &works, list) {
list_del_init(&work->list);
- btrfs_wait_and_free_delalloc_work(work);
+ wait_for_completion(&work->completion);
+ kfree(work);
}
- if (!list_empty_careful(&splice)) {
+ if (!list_empty(&splice)) {
spin_lock(&root->delalloc_lock);
list_splice_tail(&splice, &root->delalloc_inodes);
spin_unlock(&root->delalloc_lock);
@@ -10086,7 +10043,7 @@ out:
return ret;
}
-int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+int btrfs_start_delalloc_inodes(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -10094,14 +10051,13 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;
- ret = __start_delalloc_inodes(root, delay_iput, -1);
+ ret = start_delalloc_inodes(root, -1);
if (ret > 0)
ret = 0;
return ret;
}
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
- int nr)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
{
struct btrfs_root *root;
struct list_head splice;
@@ -10124,7 +10080,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = __start_delalloc_inodes(root, delay_iput, nr);
+ ret = start_delalloc_inodes(root, nr);
btrfs_put_fs_root(root);
if (ret < 0)
goto out;
@@ -10139,7 +10095,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
ret = 0;
out:
- if (!list_empty_careful(&splice)) {
+ if (!list_empty(&splice)) {
spin_lock(&fs_info->delalloc_root_lock);
list_splice_tail(&splice, &fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
@@ -10259,8 +10215,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
goto out_unlock_inode;
}
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
out_unlock:
btrfs_end_transaction(trans);
@@ -10678,5 +10633,4 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
const struct dentry_operations btrfs_dentry_operations = {
.d_delete = btrfs_dentry_delete,
- .d_release = btrfs_dentry_release,
};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b2db3988813f..d29992f7dc63 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/kernel.h>
@@ -106,20 +93,22 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
int no_time_update);
/* Mask out flags that are inappropriate for the given type of inode. */
-static unsigned int btrfs_mask_flags(umode_t mode, unsigned int flags)
+static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
+ unsigned int flags)
{
- if (S_ISDIR(mode))
+ if (S_ISDIR(inode->i_mode))
return flags;
- else if (S_ISREG(mode))
+ else if (S_ISREG(inode->i_mode))
return flags & ~FS_DIRSYNC_FL;
else
return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
}
/*
- * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl.
+ * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
+ * ioctl.
*/
-static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
+static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags)
{
unsigned int iflags = 0;
@@ -149,20 +138,20 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
/*
* Update inode->i_flags based on the btrfs internal flags.
*/
-void btrfs_update_iflags(struct inode *inode)
+void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
{
- struct btrfs_inode *ip = BTRFS_I(inode);
+ struct btrfs_inode *binode = BTRFS_I(inode);
unsigned int new_fl = 0;
- if (ip->flags & BTRFS_INODE_SYNC)
+ if (binode->flags & BTRFS_INODE_SYNC)
new_fl |= S_SYNC;
- if (ip->flags & BTRFS_INODE_IMMUTABLE)
+ if (binode->flags & BTRFS_INODE_IMMUTABLE)
new_fl |= S_IMMUTABLE;
- if (ip->flags & BTRFS_INODE_APPEND)
+ if (binode->flags & BTRFS_INODE_APPEND)
new_fl |= S_APPEND;
- if (ip->flags & BTRFS_INODE_NOATIME)
+ if (binode->flags & BTRFS_INODE_NOATIME)
new_fl |= S_NOATIME;
- if (ip->flags & BTRFS_INODE_DIRSYNC)
+ if (binode->flags & BTRFS_INODE_DIRSYNC)
new_fl |= S_DIRSYNC;
set_mask_bits(&inode->i_flags,
@@ -172,15 +161,16 @@ void btrfs_update_iflags(struct inode *inode)
static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
{
- struct btrfs_inode *ip = BTRFS_I(file_inode(file));
- unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
+ struct btrfs_inode *binode = BTRFS_I(file_inode(file));
+ unsigned int flags = btrfs_inode_flags_to_fsflags(binode->flags);
if (copy_to_user(arg, &flags, sizeof(flags)))
return -EFAULT;
return 0;
}
-static int check_flags(unsigned int flags)
+/* Check if @flags are a supported and valid set of FS_*_FL flags */
+static int check_fsflags(unsigned int flags)
{
if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
FS_NOATIME_FL | FS_NODUMP_FL | \
@@ -199,13 +189,13 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_inode *ip = BTRFS_I(inode);
- struct btrfs_root *root = ip->root;
+ struct btrfs_inode *binode = BTRFS_I(inode);
+ struct btrfs_root *root = binode->root;
struct btrfs_trans_handle *trans;
- unsigned int flags, oldflags;
+ unsigned int fsflags, old_fsflags;
int ret;
- u64 ip_oldflags;
- unsigned int i_oldflags;
+ u64 old_flags;
+ unsigned int old_i_flags;
umode_t mode;
if (!inode_owner_or_capable(inode))
@@ -214,10 +204,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
if (btrfs_root_readonly(root))
return -EROFS;
- if (copy_from_user(&flags, arg, sizeof(flags)))
+ if (copy_from_user(&fsflags, arg, sizeof(fsflags)))
return -EFAULT;
- ret = check_flags(flags);
+ ret = check_fsflags(fsflags);
if (ret)
return ret;
@@ -227,44 +217,44 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
inode_lock(inode);
- ip_oldflags = ip->flags;
- i_oldflags = inode->i_flags;
+ old_flags = binode->flags;
+ old_i_flags = inode->i_flags;
mode = inode->i_mode;
- flags = btrfs_mask_flags(inode->i_mode, flags);
- oldflags = btrfs_flags_to_ioctl(ip->flags);
- if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+ fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
+ old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
+ if ((fsflags ^ old_fsflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
if (!capable(CAP_LINUX_IMMUTABLE)) {
ret = -EPERM;
goto out_unlock;
}
}
- if (flags & FS_SYNC_FL)
- ip->flags |= BTRFS_INODE_SYNC;
+ if (fsflags & FS_SYNC_FL)
+ binode->flags |= BTRFS_INODE_SYNC;
else
- ip->flags &= ~BTRFS_INODE_SYNC;
- if (flags & FS_IMMUTABLE_FL)
- ip->flags |= BTRFS_INODE_IMMUTABLE;
+ binode->flags &= ~BTRFS_INODE_SYNC;
+ if (fsflags & FS_IMMUTABLE_FL)
+ binode->flags |= BTRFS_INODE_IMMUTABLE;
else
- ip->flags &= ~BTRFS_INODE_IMMUTABLE;
- if (flags & FS_APPEND_FL)
- ip->flags |= BTRFS_INODE_APPEND;
+ binode->flags &= ~BTRFS_INODE_IMMUTABLE;
+ if (fsflags & FS_APPEND_FL)
+ binode->flags |= BTRFS_INODE_APPEND;
else
- ip->flags &= ~BTRFS_INODE_APPEND;
- if (flags & FS_NODUMP_FL)
- ip->flags |= BTRFS_INODE_NODUMP;
+ binode->flags &= ~BTRFS_INODE_APPEND;
+ if (fsflags & FS_NODUMP_FL)
+ binode->flags |= BTRFS_INODE_NODUMP;
else
- ip->flags &= ~BTRFS_INODE_NODUMP;
- if (flags & FS_NOATIME_FL)
- ip->flags |= BTRFS_INODE_NOATIME;
+ binode->flags &= ~BTRFS_INODE_NODUMP;
+ if (fsflags & FS_NOATIME_FL)
+ binode->flags |= BTRFS_INODE_NOATIME;
else
- ip->flags &= ~BTRFS_INODE_NOATIME;
- if (flags & FS_DIRSYNC_FL)
- ip->flags |= BTRFS_INODE_DIRSYNC;
+ binode->flags &= ~BTRFS_INODE_NOATIME;
+ if (fsflags & FS_DIRSYNC_FL)
+ binode->flags |= BTRFS_INODE_DIRSYNC;
else
- ip->flags &= ~BTRFS_INODE_DIRSYNC;
- if (flags & FS_NOCOW_FL) {
+ binode->flags &= ~BTRFS_INODE_DIRSYNC;
+ if (fsflags & FS_NOCOW_FL) {
if (S_ISREG(mode)) {
/*
* It's safe to turn csums off here, no extents exist.
@@ -272,10 +262,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
* status of the file and will not set it.
*/
if (inode->i_size == 0)
- ip->flags |= BTRFS_INODE_NODATACOW
- | BTRFS_INODE_NODATASUM;
+ binode->flags |= BTRFS_INODE_NODATACOW
+ | BTRFS_INODE_NODATASUM;
} else {
- ip->flags |= BTRFS_INODE_NODATACOW;
+ binode->flags |= BTRFS_INODE_NODATACOW;
}
} else {
/*
@@ -283,10 +273,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
*/
if (S_ISREG(mode)) {
if (inode->i_size == 0)
- ip->flags &= ~(BTRFS_INODE_NODATACOW
+ binode->flags &= ~(BTRFS_INODE_NODATACOW
| BTRFS_INODE_NODATASUM);
} else {
- ip->flags &= ~BTRFS_INODE_NODATACOW;
+ binode->flags &= ~BTRFS_INODE_NODATACOW;
}
}
@@ -295,18 +285,18 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
* flag may be changed automatically if compression code won't make
* things smaller.
*/
- if (flags & FS_NOCOMP_FL) {
- ip->flags &= ~BTRFS_INODE_COMPRESS;
- ip->flags |= BTRFS_INODE_NOCOMPRESS;
+ if (fsflags & FS_NOCOMP_FL) {
+ binode->flags &= ~BTRFS_INODE_COMPRESS;
+ binode->flags |= BTRFS_INODE_NOCOMPRESS;
ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
if (ret && ret != -ENODATA)
goto out_drop;
- } else if (flags & FS_COMPR_FL) {
+ } else if (fsflags & FS_COMPR_FL) {
const char *comp;
- ip->flags |= BTRFS_INODE_COMPRESS;
- ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ binode->flags |= BTRFS_INODE_COMPRESS;
+ binode->flags &= ~BTRFS_INODE_NOCOMPRESS;
comp = btrfs_compress_type2str(fs_info->compress_type);
if (!comp || comp[0] == 0)
@@ -321,7 +311,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
if (ret && ret != -ENODATA)
goto out_drop;
- ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
+ binode->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
}
trans = btrfs_start_transaction(root, 1);
@@ -330,7 +320,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
goto out_drop;
}
- btrfs_update_iflags(inode);
+ btrfs_sync_inode_flags_to_i_flags(inode);
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
ret = btrfs_update_inode(trans, root, inode);
@@ -338,8 +328,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
btrfs_end_transaction(trans);
out_drop:
if (ret) {
- ip->flags = ip_oldflags;
- inode->i_flags = i_oldflags;
+ binode->flags = old_flags;
+ inode->i_flags = old_i_flags;
}
out_unlock:
@@ -348,6 +338,148 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
return ret;
}
+/*
+ * Translate btrfs internal inode flags to xflags as expected by the
+ * FS_IOC_FSGETXATT ioctl. Filter only the supported ones, unknown flags are
+ * silently dropped.
+ */
+static unsigned int btrfs_inode_flags_to_xflags(unsigned int flags)
+{
+ unsigned int xflags = 0;
+
+ if (flags & BTRFS_INODE_APPEND)
+ xflags |= FS_XFLAG_APPEND;
+ if (flags & BTRFS_INODE_IMMUTABLE)
+ xflags |= FS_XFLAG_IMMUTABLE;
+ if (flags & BTRFS_INODE_NOATIME)
+ xflags |= FS_XFLAG_NOATIME;
+ if (flags & BTRFS_INODE_NODUMP)
+ xflags |= FS_XFLAG_NODUMP;
+ if (flags & BTRFS_INODE_SYNC)
+ xflags |= FS_XFLAG_SYNC;
+
+ return xflags;
+}
+
+/* Check if @flags are a supported and valid set of FS_XFLAGS_* flags */
+static int check_xflags(unsigned int flags)
+{
+ if (flags & ~(FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE | FS_XFLAG_NOATIME |
+ FS_XFLAG_NODUMP | FS_XFLAG_SYNC))
+ return -EOPNOTSUPP;
+ return 0;
+}
+
+/*
+ * Set the xflags from the internal inode flags. The remaining items of fsxattr
+ * are zeroed.
+ */
+static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg)
+{
+ struct btrfs_inode *binode = BTRFS_I(file_inode(file));
+ struct fsxattr fa;
+
+ memset(&fa, 0, sizeof(fa));
+ fa.fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags);
+
+ if (copy_to_user(arg, &fa, sizeof(fa)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_inode *binode = BTRFS_I(inode);
+ struct btrfs_root *root = binode->root;
+ struct btrfs_trans_handle *trans;
+ struct fsxattr fa;
+ unsigned old_flags;
+ unsigned old_i_flags;
+ int ret = 0;
+
+ if (!inode_owner_or_capable(inode))
+ return -EPERM;
+
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
+ memset(&fa, 0, sizeof(fa));
+ if (copy_from_user(&fa, arg, sizeof(fa)))
+ return -EFAULT;
+
+ ret = check_xflags(fa.fsx_xflags);
+ if (ret)
+ return ret;
+
+ if (fa.fsx_extsize != 0 || fa.fsx_projid != 0 || fa.fsx_cowextsize != 0)
+ return -EOPNOTSUPP;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ inode_lock(inode);
+
+ old_flags = binode->flags;
+ old_i_flags = inode->i_flags;
+
+ /* We need the capabilities to change append-only or immutable inode */
+ if (((old_flags & (BTRFS_INODE_APPEND | BTRFS_INODE_IMMUTABLE)) ||
+ (fa.fsx_xflags & (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE))) &&
+ !capable(CAP_LINUX_IMMUTABLE)) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ if (fa.fsx_xflags & FS_XFLAG_SYNC)
+ binode->flags |= BTRFS_INODE_SYNC;
+ else
+ binode->flags &= ~BTRFS_INODE_SYNC;
+ if (fa.fsx_xflags & FS_XFLAG_IMMUTABLE)
+ binode->flags |= BTRFS_INODE_IMMUTABLE;
+ else
+ binode->flags &= ~BTRFS_INODE_IMMUTABLE;
+ if (fa.fsx_xflags & FS_XFLAG_APPEND)
+ binode->flags |= BTRFS_INODE_APPEND;
+ else
+ binode->flags &= ~BTRFS_INODE_APPEND;
+ if (fa.fsx_xflags & FS_XFLAG_NODUMP)
+ binode->flags |= BTRFS_INODE_NODUMP;
+ else
+ binode->flags &= ~BTRFS_INODE_NODUMP;
+ if (fa.fsx_xflags & FS_XFLAG_NOATIME)
+ binode->flags |= BTRFS_INODE_NOATIME;
+ else
+ binode->flags &= ~BTRFS_INODE_NOATIME;
+
+ /* 1 item for the inode */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_unlock;
+ }
+
+ btrfs_sync_inode_flags_to_i_flags(inode);
+ inode_inc_iversion(inode);
+ inode->i_ctime = current_time(inode);
+ ret = btrfs_update_inode(trans, root, inode);
+
+ btrfs_end_transaction(trans);
+
+out_unlock:
+ if (ret) {
+ binode->flags = old_flags;
+ inode->i_flags = old_i_flags;
+ }
+
+ inode_unlock(inode);
+ mnt_drop_write_file(file);
+
+ return ret;
+}
+
static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
{
struct inode *inode = file_inode(file);
@@ -437,7 +569,6 @@ static noinline int create_subvol(struct inode *dir,
u64 objectid;
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
- u64 qgroup_reserved;
uuid_le new_uuid;
root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
@@ -462,8 +593,7 @@ static noinline int create_subvol(struct inode *dir,
* The same as the snapshot creation, please see the comment
* of create_snapshot().
*/
- ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
- 8, &qgroup_reserved, false);
+ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false);
if (ret)
goto fail_free;
@@ -586,7 +716,7 @@ static noinline int create_subvol(struct inode *dir,
btrfs_ino(BTRFS_I(dir)), index, name, namelen);
BUG_ON(ret);
- ret = btrfs_uuid_tree_add(trans, fs_info, root_item->uuid,
+ ret = btrfs_uuid_tree_add(trans, root_item->uuid,
BTRFS_UUID_KEY_SUBVOL, objectid);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -653,7 +783,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
wait_event(root->subv_writers->wait,
percpu_counter_sum(&root->subv_writers->counter) == 0);
- ret = btrfs_start_delalloc_inodes(root, 0);
+ ret = btrfs_start_delalloc_inodes(root);
if (ret)
goto dec_and_free;
@@ -671,7 +801,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
*/
ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
&pending_snapshot->block_rsv, 8,
- &pending_snapshot->qgroup_reserved,
false);
if (ret)
goto dec_and_free;
@@ -1470,7 +1599,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
}
- mutex_lock(&fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
@@ -1578,7 +1706,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,
out_free:
kfree(vol_args);
out:
- mutex_unlock(&fs_info->volume_mutex);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
mnt_drop_write_file(file);
return ret;
@@ -1845,60 +1972,6 @@ out:
return ret;
}
-/*
- * helper to check if the subvolume references other subvolumes
- */
-static noinline int may_destroy_subvol(struct btrfs_root *root)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
- struct btrfs_dir_item *di;
- struct btrfs_key key;
- u64 dir_id;
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- /* Make sure this root isn't set as the default subvol */
- dir_id = btrfs_super_root_dir(fs_info->super_copy);
- di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
- dir_id, "default", 7, 0);
- if (di && !IS_ERR(di)) {
- btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
- if (key.objectid == root->root_key.objectid) {
- ret = -EPERM;
- btrfs_err(fs_info,
- "deleting default subvolume %llu is not allowed",
- key.objectid);
- goto out;
- }
- btrfs_release_path(path);
- }
-
- key.objectid = root->root_key.objectid;
- key.type = BTRFS_ROOT_REF_KEY;
- key.offset = (u64)-1;
-
- ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
- BUG_ON(ret == 0);
-
- ret = 0;
- if (path->slots[0] > 0) {
- path->slots[0]--;
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid == root->root_key.objectid &&
- key.type == BTRFS_ROOT_REF_KEY)
- ret = -ENOTEMPTY;
- }
-out:
- btrfs_free_path(path);
- return ret;
-}
-
static noinline int key_in_sk(struct btrfs_key *key,
struct btrfs_ioctl_search_key *sk)
{
@@ -2079,7 +2152,7 @@ static noinline int search_ioctl(struct inode *inode,
root = btrfs_read_fs_root_no_name(info, &key);
if (IS_ERR(root)) {
btrfs_free_path(path);
- return -ENOENT;
+ return PTR_ERR(root);
}
}
@@ -2213,8 +2286,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(info, &key);
if (IS_ERR(root)) {
- btrfs_err(info, "could not find root %llu", tree_id);
- ret = -ENOENT;
+ ret = PTR_ERR(root);
goto out;
}
@@ -2269,6 +2341,165 @@ out:
return ret;
}
+static int btrfs_search_path_in_tree_user(struct inode *inode,
+ struct btrfs_ioctl_ino_lookup_user_args *args)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct super_block *sb = inode->i_sb;
+ struct btrfs_key upper_limit = BTRFS_I(inode)->location;
+ u64 treeid = BTRFS_I(inode)->root->root_key.objectid;
+ u64 dirid = args->dirid;
+ unsigned long item_off;
+ unsigned long item_len;
+ struct btrfs_inode_ref *iref;
+ struct btrfs_root_ref *rref;
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key, key2;
+ struct extent_buffer *leaf;
+ struct inode *temp_inode;
+ char *ptr;
+ int slot;
+ int len;
+ int total_len = 0;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * If the bottom subvolume does not exist directly under upper_limit,
+ * construct the path in from the bottom up.
+ */
+ if (dirid != upper_limit.objectid) {
+ ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];
+
+ key.objectid = treeid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+
+ key.objectid = dirid;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+ while (1) {
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = btrfs_previous_item(root, path, dirid,
+ BTRFS_INODE_REF_KEY);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref);
+ len = btrfs_inode_ref_name_len(leaf, iref);
+ ptr -= len + 1;
+ total_len += len + 1;
+ if (ptr < args->path) {
+ ret = -ENAMETOOLONG;
+ goto out;
+ }
+
+ *(ptr + len) = '/';
+ read_extent_buffer(leaf, ptr,
+ (unsigned long)(iref + 1), len);
+
+ /* Check the read+exec permission of this directory */
+ ret = btrfs_previous_item(root, path, dirid,
+ BTRFS_INODE_ITEM_KEY);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key2, slot);
+ if (key2.objectid != dirid) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ temp_inode = btrfs_iget(sb, &key2, root, NULL);
+ ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC);
+ iput(temp_inode);
+ if (ret) {
+ ret = -EACCES;
+ goto out;
+ }
+
+ if (key.offset == upper_limit.objectid)
+ break;
+ if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {
+ ret = -EACCES;
+ goto out;
+ }
+
+ btrfs_release_path(path);
+ key.objectid = key.offset;
+ key.offset = (u64)-1;
+ dirid = key.objectid;
+ }
+
+ memmove(args->path, ptr, total_len);
+ args->path[total_len] = '\0';
+ btrfs_release_path(path);
+ }
+
+ /* Get the bottom subvolume's name from ROOT_REF */
+ root = fs_info->tree_root;
+ key.objectid = treeid;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = args->treeid;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ item_off = btrfs_item_ptr_offset(leaf, slot);
+ item_len = btrfs_item_size_nr(leaf, slot);
+ /* Check if dirid in ROOT_REF corresponds to passed dirid */
+ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+ if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Copy subvolume's name */
+ item_off += sizeof(struct btrfs_root_ref);
+ item_len -= sizeof(struct btrfs_root_ref);
+ read_extent_buffer(leaf, args->name, item_off, item_len);
+ args->name[item_len] = 0;
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
static noinline int btrfs_ioctl_ino_lookup(struct file *file,
void __user *argp)
{
@@ -2311,6 +2542,265 @@ out:
return ret;
}
+/*
+ * Version of ino_lookup ioctl (unprivileged)
+ *
+ * The main differences from ino_lookup ioctl are:
+ *
+ * 1. Read + Exec permission will be checked using inode_permission() during
+ * path construction. -EACCES will be returned in case of failure.
+ * 2. Path construction will be stopped at the inode number which corresponds
+ * to the fd with which this ioctl is called. If constructed path does not
+ * exist under fd's inode, -EACCES will be returned.
+ * 3. The name of bottom subvolume is also searched and filled.
+ */
+static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
+{
+ struct btrfs_ioctl_ino_lookup_user_args *args;
+ struct inode *inode;
+ int ret;
+
+ args = memdup_user(argp, sizeof(*args));
+ if (IS_ERR(args))
+ return PTR_ERR(args);
+
+ inode = file_inode(file);
+
+ if (args->dirid == BTRFS_FIRST_FREE_OBJECTID &&
+ BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * The subvolume does not exist under fd with which this is
+ * called
+ */
+ kfree(args);
+ return -EACCES;
+ }
+
+ ret = btrfs_search_path_in_tree_user(inode, args);
+
+ if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+ ret = -EFAULT;
+
+ kfree(args);
+ return ret;
+}
+
+/* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */
+static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
+{
+ struct btrfs_ioctl_get_subvol_info_args *subvol_info;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_root_item *root_item;
+ struct btrfs_root_ref *rref;
+ struct extent_buffer *leaf;
+ unsigned long item_off;
+ unsigned long item_len;
+ struct inode *inode;
+ int slot;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL);
+ if (!subvol_info) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+
+ inode = file_inode(file);
+ fs_info = BTRFS_I(inode)->root->fs_info;
+
+ /* Get root_item of inode's subvolume */
+ key.objectid = BTRFS_I(inode)->root->root_key.objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ root_item = &root->root_item;
+
+ subvol_info->treeid = key.objectid;
+
+ subvol_info->generation = btrfs_root_generation(root_item);
+ subvol_info->flags = btrfs_root_flags(root_item);
+
+ memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE);
+ memcpy(subvol_info->parent_uuid, root_item->parent_uuid,
+ BTRFS_UUID_SIZE);
+ memcpy(subvol_info->received_uuid, root_item->received_uuid,
+ BTRFS_UUID_SIZE);
+
+ subvol_info->ctransid = btrfs_root_ctransid(root_item);
+ subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime);
+ subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime);
+
+ subvol_info->otransid = btrfs_root_otransid(root_item);
+ subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime);
+ subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime);
+
+ subvol_info->stransid = btrfs_root_stransid(root_item);
+ subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime);
+ subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime);
+
+ subvol_info->rtransid = btrfs_root_rtransid(root_item);
+ subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime);
+ subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime);
+
+ if (key.objectid != BTRFS_FS_TREE_OBJECTID) {
+ /* Search root tree for ROOT_BACKREF of this subvolume */
+ root = fs_info->tree_root;
+
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (path->slots[0] >=
+ btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid == subvol_info->treeid &&
+ key.type == BTRFS_ROOT_BACKREF_KEY) {
+ subvol_info->parent_id = key.offset;
+
+ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+ subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref);
+
+ item_off = btrfs_item_ptr_offset(leaf, slot)
+ + sizeof(struct btrfs_root_ref);
+ item_len = btrfs_item_size_nr(leaf, slot)
+ - sizeof(struct btrfs_root_ref);
+ read_extent_buffer(leaf, subvol_info->name,
+ item_off, item_len);
+ } else {
+ ret = -ENOENT;
+ goto out;
+ }
+ }
+
+ if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
+ ret = -EFAULT;
+
+out:
+ btrfs_free_path(path);
+ kzfree(subvol_info);
+ return ret;
+}
+
+/*
+ * Return ROOT_REF information of the subvolume containing this inode
+ * except the subvolume name.
+ */
+static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp)
+{
+ struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
+ struct btrfs_root_ref *rref;
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct inode *inode;
+ u64 objectid;
+ int slot;
+ int ret;
+ u8 found;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ rootrefs = memdup_user(argp, sizeof(*rootrefs));
+ if (IS_ERR(rootrefs)) {
+ btrfs_free_path(path);
+ return PTR_ERR(rootrefs);
+ }
+
+ inode = file_inode(file);
+ root = BTRFS_I(inode)->root->fs_info->tree_root;
+ objectid = BTRFS_I(inode)->root->root_key.objectid;
+
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = rootrefs->min_treeid;
+ found = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (path->slots[0] >=
+ btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+ while (1) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY) {
+ ret = 0;
+ goto out;
+ }
+
+ if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) {
+ ret = -EOVERFLOW;
+ goto out;
+ }
+
+ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+ rootrefs->rootref[found].treeid = key.offset;
+ rootrefs->rootref[found].dirid =
+ btrfs_root_ref_dirid(leaf, rref);
+ found++;
+
+ ret = btrfs_next_item(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
+out:
+ if (!ret || ret == -EOVERFLOW) {
+ rootrefs->num_items = found;
+ /* update min_treeid for next search */
+ if (found)
+ rootrefs->min_treeid =
+ rootrefs->rootref[found - 1].treeid + 1;
+ if (copy_to_user(argp, rootrefs, sizeof(*rootrefs)))
+ ret = -EFAULT;
+ }
+
+ kfree(rootrefs);
+ btrfs_free_path(path);
+
+ return ret;
+}
+
static noinline int btrfs_ioctl_snap_destroy(struct file *file,
void __user *arg)
{
@@ -2322,12 +2812,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *dest = NULL;
struct btrfs_ioctl_vol_args *vol_args;
- struct btrfs_trans_handle *trans;
- struct btrfs_block_rsv block_rsv;
- u64 root_flags;
- u64 qgroup_reserved;
int namelen;
- int ret;
int err = 0;
if (!S_ISDIR(dir->i_mode))
@@ -2411,133 +2896,11 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
}
inode_lock(inode);
-
- /*
- * Don't allow to delete a subvolume with send in progress. This is
- * inside the i_mutex so the error handling that has to drop the bit
- * again is not run concurrently.
- */
- spin_lock(&dest->root_item_lock);
- root_flags = btrfs_root_flags(&dest->root_item);
- if (dest->send_in_progress == 0) {
- btrfs_set_root_flags(&dest->root_item,
- root_flags | BTRFS_ROOT_SUBVOL_DEAD);
- spin_unlock(&dest->root_item_lock);
- } else {
- spin_unlock(&dest->root_item_lock);
- btrfs_warn(fs_info,
- "Attempt to delete subvolume %llu during send",
- dest->root_key.objectid);
- err = -EPERM;
- goto out_unlock_inode;
- }
-
- down_write(&fs_info->subvol_sem);
-
- err = may_destroy_subvol(dest);
- if (err)
- goto out_up_write;
-
- btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
- /*
- * One for dir inode, two for dir entries, two for root
- * ref/backref.
- */
- err = btrfs_subvolume_reserve_metadata(root, &block_rsv,
- 5, &qgroup_reserved, true);
- if (err)
- goto out_up_write;
-
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
- goto out_release;
- }
- trans->block_rsv = &block_rsv;
- trans->bytes_reserved = block_rsv.size;
-
- btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
-
- ret = btrfs_unlink_subvol(trans, root, dir,
- dest->root_key.objectid,
- dentry->d_name.name,
- dentry->d_name.len);
- if (ret) {
- err = ret;
- btrfs_abort_transaction(trans, ret);
- goto out_end_trans;
- }
-
- btrfs_record_root_in_trans(trans, dest);
-
- memset(&dest->root_item.drop_progress, 0,
- sizeof(dest->root_item.drop_progress));
- dest->root_item.drop_level = 0;
- btrfs_set_root_refs(&dest->root_item, 0);
-
- if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
- ret = btrfs_insert_orphan_item(trans,
- fs_info->tree_root,
- dest->root_key.objectid);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- err = ret;
- goto out_end_trans;
- }
- }
-
- ret = btrfs_uuid_tree_rem(trans, fs_info, dest->root_item.uuid,
- BTRFS_UUID_KEY_SUBVOL,
- dest->root_key.objectid);
- if (ret && ret != -ENOENT) {
- btrfs_abort_transaction(trans, ret);
- err = ret;
- goto out_end_trans;
- }
- if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
- ret = btrfs_uuid_tree_rem(trans, fs_info,
- dest->root_item.received_uuid,
- BTRFS_UUID_KEY_RECEIVED_SUBVOL,
- dest->root_key.objectid);
- if (ret && ret != -ENOENT) {
- btrfs_abort_transaction(trans, ret);
- err = ret;
- goto out_end_trans;
- }
- }
-
-out_end_trans:
- trans->block_rsv = NULL;
- trans->bytes_reserved = 0;
- ret = btrfs_end_transaction(trans);
- if (ret && !err)
- err = ret;
- inode->i_flags |= S_DEAD;
-out_release:
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
-out_up_write:
- up_write(&fs_info->subvol_sem);
- if (err) {
- spin_lock(&dest->root_item_lock);
- root_flags = btrfs_root_flags(&dest->root_item);
- btrfs_set_root_flags(&dest->root_item,
- root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
- spin_unlock(&dest->root_item_lock);
- }
-out_unlock_inode:
+ err = btrfs_delete_subvolume(dir, dentry);
inode_unlock(inode);
- if (!err) {
- d_invalidate(dentry);
- btrfs_invalidate_inodes(dest);
+ if (!err)
d_delete(dentry);
- ASSERT(dest->send_in_progress == 0);
- /* the last ref */
- if (dest->ino_cache_inode) {
- iput(dest->ino_cache_inode);
- dest->ino_cache_inode = NULL;
- }
- }
out_dput:
dput(dentry);
out_unlock_dir:
@@ -2626,7 +2989,6 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
- mutex_lock(&fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
@@ -2641,7 +3003,6 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
kfree(vol_args);
out:
- mutex_unlock(&fs_info->volume_mutex);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
return ret;
}
@@ -2667,8 +3028,10 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
}
/* Check for compatibility reject unknown flags */
- if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED)
- return -EOPNOTSUPP;
+ if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
@@ -2967,8 +3330,6 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp)
put_page(pg);
}
}
- kfree(cmp->src_pages);
- kfree(cmp->dst_pages);
}
static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
@@ -2977,40 +3338,14 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
{
int ret;
int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT;
- struct page **src_pgarr, **dst_pgarr;
- /*
- * We must gather up all the pages before we initiate our
- * extent locking. We use an array for the page pointers. Size
- * of the array is bounded by len, which is in turn bounded by
- * BTRFS_MAX_DEDUPE_LEN.
- */
- src_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
- dst_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
- if (!src_pgarr || !dst_pgarr) {
- kfree(src_pgarr);
- kfree(dst_pgarr);
- return -ENOMEM;
- }
cmp->num_pages = num_pages;
- cmp->src_pages = src_pgarr;
- cmp->dst_pages = dst_pgarr;
-
- /*
- * If deduping ranges in the same inode, locking rules make it mandatory
- * to always lock pages in ascending order to avoid deadlocks with
- * concurrent tasks (such as starting writeback/delalloc).
- */
- if (src == dst && dst_loff < loff) {
- swap(src_pgarr, dst_pgarr);
- swap(loff, dst_loff);
- }
- ret = gather_extent_pages(src, src_pgarr, cmp->num_pages, loff);
+ ret = gather_extent_pages(src, cmp->src_pages, num_pages, loff);
if (ret)
goto out;
- ret = gather_extent_pages(dst, dst_pgarr, cmp->num_pages, dst_loff);
+ ret = gather_extent_pages(dst, cmp->dst_pages, num_pages, dst_loff);
out:
if (ret)
@@ -3080,31 +3415,23 @@ static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen,
return 0;
}
-static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
- struct inode *dst, u64 dst_loff)
+static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
+ struct inode *dst, u64 dst_loff,
+ struct cmp_pages *cmp)
{
int ret;
u64 len = olen;
- struct cmp_pages cmp;
bool same_inode = (src == dst);
u64 same_lock_start = 0;
u64 same_lock_len = 0;
- if (len == 0)
- return 0;
-
- if (same_inode)
- inode_lock(src);
- else
- btrfs_double_inode_lock(src, dst);
-
ret = extent_same_check_offsets(src, loff, &len, olen);
if (ret)
- goto out_unlock;
+ return ret;
ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
if (ret)
- goto out_unlock;
+ return ret;
if (same_inode) {
/*
@@ -3121,32 +3448,21 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
* allow an unaligned length so long as it ends at
* i_size.
*/
- if (len != olen) {
- ret = -EINVAL;
- goto out_unlock;
- }
+ if (len != olen)
+ return -EINVAL;
/* Check for overlapping ranges */
- if (dst_loff + len > loff && dst_loff < loff + len) {
- ret = -EINVAL;
- goto out_unlock;
- }
+ if (dst_loff + len > loff && dst_loff < loff + len)
+ return -EINVAL;
same_lock_start = min_t(u64, loff, dst_loff);
same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start;
}
- /* don't make the dst file partly checksummed */
- if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
again:
- ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp);
+ ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, cmp);
if (ret)
- goto out_unlock;
+ return ret;
if (same_inode)
ret = lock_extent_range(src, same_lock_start, same_lock_len,
@@ -3167,7 +3483,7 @@ again:
* Ranges in the io trees already unlocked. Now unlock all
* pages before waiting for all IO to complete.
*/
- btrfs_cmp_data_free(&cmp);
+ btrfs_cmp_data_free(cmp);
if (same_inode) {
btrfs_wait_ordered_range(src, same_lock_start,
same_lock_len);
@@ -3180,12 +3496,12 @@ again:
ASSERT(ret == 0);
if (WARN_ON(ret)) {
/* ranges in the io trees already unlocked */
- btrfs_cmp_data_free(&cmp);
+ btrfs_cmp_data_free(cmp);
return ret;
}
/* pass original length for comparison so we stay within i_size */
- ret = btrfs_cmp_data(olen, &cmp);
+ ret = btrfs_cmp_data(olen, cmp);
if (ret == 0)
ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
@@ -3195,18 +3511,91 @@ again:
else
btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
- btrfs_cmp_data_free(&cmp);
+ btrfs_cmp_data_free(cmp);
+
+ return ret;
+}
+
+#define BTRFS_MAX_DEDUPE_LEN SZ_16M
+
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
+ struct inode *dst, u64 dst_loff)
+{
+ int ret;
+ struct cmp_pages cmp;
+ int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT;
+ bool same_inode = (src == dst);
+ u64 i, tail_len, chunk_count;
+
+ if (olen == 0)
+ return 0;
+
+ if (same_inode)
+ inode_lock(src);
+ else
+ btrfs_double_inode_lock(src, dst);
+
+ /* don't make the dst file partly checksummed */
+ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
+ (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
+ chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
+ if (chunk_count == 0)
+ num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT;
+
+ /*
+ * If deduping ranges in the same inode, locking rules make it
+ * mandatory to always lock pages in ascending order to avoid deadlocks
+ * with concurrent tasks (such as starting writeback/delalloc).
+ */
+ if (same_inode && dst_loff < loff)
+ swap(loff, dst_loff);
+
+ /*
+ * We must gather up all the pages before we initiate our extent
+ * locking. We use an array for the page pointers. Size of the array is
+ * bounded by len, which is in turn bounded by BTRFS_MAX_DEDUPE_LEN.
+ */
+ cmp.src_pages = kvmalloc_array(num_pages, sizeof(struct page *),
+ GFP_KERNEL | __GFP_ZERO);
+ cmp.dst_pages = kvmalloc_array(num_pages, sizeof(struct page *),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!cmp.src_pages || !cmp.dst_pages) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+
+ for (i = 0; i < chunk_count; i++) {
+ ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
+ dst, dst_loff, &cmp);
+ if (ret)
+ goto out_unlock;
+
+ loff += BTRFS_MAX_DEDUPE_LEN;
+ dst_loff += BTRFS_MAX_DEDUPE_LEN;
+ }
+
+ if (tail_len > 0)
+ ret = btrfs_extent_same_range(src, loff, tail_len, dst,
+ dst_loff, &cmp);
+
out_unlock:
if (same_inode)
inode_unlock(src);
else
btrfs_double_inode_unlock(src, dst);
+out_free:
+ kvfree(cmp.src_pages);
+ kvfree(cmp.dst_pages);
+
return ret;
}
-#define BTRFS_MAX_DEDUPE_LEN SZ_16M
-
ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
struct file *dst_file, u64 dst_loff)
{
@@ -3215,9 +3604,6 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
ssize_t res;
- if (olen > BTRFS_MAX_DEDUPE_LEN)
- olen = BTRFS_MAX_DEDUPE_LEN;
-
if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
/*
* Btrfs does not support blocksize < page_size. As a
@@ -3839,11 +4225,6 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
src->i_sb != inode->i_sb)
return -EXDEV;
- /* don't make the dst file partly checksummed */
- if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
- return -EINVAL;
-
if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
return -EISDIR;
@@ -3853,6 +4234,13 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
inode_lock(src);
}
+ /* don't make the dst file partly checksummed */
+ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
+ (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
/* determine range to clone */
ret = -EINVAL;
if (off + len > src->i_size || off + len < off)
@@ -4020,8 +4408,8 @@ out:
return ret;
}
-void btrfs_get_block_group_info(struct list_head *groups_list,
- struct btrfs_ioctl_space_info *space)
+static void get_block_group_info(struct list_head *groups_list,
+ struct btrfs_ioctl_space_info *space)
{
struct btrfs_block_group_cache *block_group;
@@ -4137,8 +4525,8 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
down_read(&info->groups_sem);
for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
if (!list_empty(&info->block_groups[c])) {
- btrfs_get_block_group_info(
- &info->block_groups[c], &space);
+ get_block_group_info(&info->block_groups[c],
+ &space);
memcpy(dest, &space, sizeof(space));
dest++;
space_args.total_spaces++;
@@ -4503,14 +4891,14 @@ out_loi:
return ret;
}
-void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
bargs->flags = bctl->flags;
- if (atomic_read(&fs_info->balance_running))
+ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags))
bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
if (atomic_read(&fs_info->balance_pause_req))
bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
@@ -4521,13 +4909,9 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
- if (lock) {
- spin_lock(&fs_info->balance_lock);
- memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
- spin_unlock(&fs_info->balance_lock);
- } else {
- memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
- }
+ spin_lock(&fs_info->balance_lock);
+ memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+ spin_unlock(&fs_info->balance_lock);
}
static long btrfs_ioctl_balance(struct file *file, void __user *arg)
@@ -4548,7 +4932,6 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
again:
if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
- mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
need_unlock = true;
goto locked;
@@ -4563,21 +4946,22 @@ again:
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl) {
/* this is either (2) or (3) */
- if (!atomic_read(&fs_info->balance_running)) {
+ if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
mutex_unlock(&fs_info->balance_mutex);
- if (!mutex_trylock(&fs_info->volume_mutex))
- goto again;
+ /*
+ * Lock released to allow other waiters to continue,
+ * we'll reexamine the status again.
+ */
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl &&
- !atomic_read(&fs_info->balance_running)) {
+ !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
/* this is (3) */
need_unlock = false;
goto locked;
}
mutex_unlock(&fs_info->balance_mutex);
- mutex_unlock(&fs_info->volume_mutex);
goto again;
} else {
/* this is (2) */
@@ -4630,7 +5014,6 @@ locked:
goto out_bargs;
}
- bctl->fs_info = fs_info;
if (arg) {
memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
@@ -4649,14 +5032,14 @@ locked:
do_balance:
/*
- * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP
- * goes to to btrfs_balance. bctl is freed in __cancel_balance,
- * or, if restriper was paused all the way until unmount, in
- * free_fs_info. The flag is cleared in __cancel_balance.
+ * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to
+ * btrfs_balance. bctl is freed in reset_balance_state, or, if
+ * restriper was paused all the way until unmount, in free_fs_info.
+ * The flag should be cleared after reset_balance_state.
*/
need_unlock = false;
- ret = btrfs_balance(bctl, bargs);
+ ret = btrfs_balance(fs_info, bctl, bargs);
bctl = NULL;
if (arg) {
@@ -4670,7 +5053,6 @@ out_bargs:
kfree(bargs);
out_unlock:
mutex_unlock(&fs_info->balance_mutex);
- mutex_unlock(&fs_info->volume_mutex);
if (need_unlock)
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
out:
@@ -4714,7 +5096,7 @@ static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info,
goto out;
}
- update_ioctl_balance_args(fs_info, 1, bargs);
+ btrfs_update_ioctl_balance_args(fs_info, bargs);
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
@@ -5051,8 +5433,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
BTRFS_UUID_SIZE);
if (received_uuid_changed &&
!btrfs_is_empty_uuid(root_item->received_uuid)) {
- ret = btrfs_uuid_tree_rem(trans, fs_info,
- root_item->received_uuid,
+ ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
root->root_key.objectid);
if (ret && ret != -ENOENT) {
@@ -5076,7 +5457,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
goto out;
}
if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
- ret = btrfs_uuid_tree_add(trans, fs_info, sa->uuid,
+ ret = btrfs_uuid_tree_add(trans, sa->uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
root->root_key.objectid);
if (ret < 0 && ret != -EEXIST) {
@@ -5510,7 +5891,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SYNC: {
int ret;
- ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, -1);
if (ret)
return ret;
ret = btrfs_sync_fs(inode->i_sb, 1);
@@ -5578,6 +5959,16 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_get_features(file, argp);
case BTRFS_IOC_SET_FEATURES:
return btrfs_ioctl_set_features(file, argp);
+ case FS_IOC_FSGETXATTR:
+ return btrfs_ioctl_fsgetxattr(file, argp);
+ case FS_IOC_FSSETXATTR:
+ return btrfs_ioctl_fssetxattr(file, argp);
+ case BTRFS_IOC_GET_SUBVOL_INFO:
+ return btrfs_ioctl_get_subvol_info(file, argp);
+ case BTRFS_IOC_GET_SUBVOL_ROOTREF:
+ return btrfs_ioctl_get_subvol_rootref(file, argp);
+ case BTRFS_IOC_INO_LOOKUP_USER:
+ return btrfs_ioctl_ino_lookup_user(file, argp);
}
return -ENOTTY;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 621083f8932c..1da768e5ef75 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -1,20 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
+
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
@@ -78,22 +66,16 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
write_lock(&eb->lock);
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_inc(&eb->spinning_writers);
- /*
- * atomic_dec_and_test implies a barrier for waitqueue_active
- */
- if (atomic_dec_and_test(&eb->blocking_writers) &&
- waitqueue_active(&eb->write_lock_wq))
- wake_up(&eb->write_lock_wq);
+ /* atomic_dec_and_test implies a barrier */
+ if (atomic_dec_and_test(&eb->blocking_writers))
+ cond_wake_up_nomb(&eb->write_lock_wq);
} else if (rw == BTRFS_READ_LOCK_BLOCKING) {
BUG_ON(atomic_read(&eb->blocking_readers) == 0);
read_lock(&eb->lock);
atomic_inc(&eb->spinning_readers);
- /*
- * atomic_dec_and_test implies a barrier for waitqueue_active
- */
- if (atomic_dec_and_test(&eb->blocking_readers) &&
- waitqueue_active(&eb->read_lock_wq))
- wake_up(&eb->read_lock_wq);
+ /* atomic_dec_and_test implies a barrier */
+ if (atomic_dec_and_test(&eb->blocking_readers))
+ cond_wake_up_nomb(&eb->read_lock_wq);
}
}
@@ -233,12 +215,9 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
}
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->blocking_readers) == 0);
- /*
- * atomic_dec_and_test implies a barrier for waitqueue_active
- */
- if (atomic_dec_and_test(&eb->blocking_readers) &&
- waitqueue_active(&eb->read_lock_wq))
- wake_up(&eb->read_lock_wq);
+ /* atomic_dec_and_test implies a barrier */
+ if (atomic_dec_and_test(&eb->blocking_readers))
+ cond_wake_up_nomb(&eb->read_lock_wq);
atomic_dec(&eb->read_locks);
}
@@ -287,12 +266,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
if (blockers) {
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_dec(&eb->blocking_writers);
- /*
- * Make sure counter is updated before we wake up waiters.
- */
+ /* Use the lighter barrier after atomic */
smp_mb__after_atomic();
- if (waitqueue_active(&eb->write_lock_wq))
- wake_up(&eb->write_lock_wq);
+ cond_wake_up_nomb(&eb->write_lock_wq);
} else {
WARN_ON(atomic_read(&eb->spinning_writers) != 1);
atomic_dec(&eb->spinning_writers);
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index c44a9d5f5362..29135def468e 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __BTRFS_LOCKING_
-#define __BTRFS_LOCKING_
+#ifndef BTRFS_LOCKING_H
+#define BTRFS_LOCKING_H
#define BTRFS_WRITE_LOCK 1
#define BTRFS_READ_LOCK 2
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 1c7f7f70caf4..b6a4cc178bee 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/kernel.h>
@@ -30,6 +17,43 @@
#define LZO_LEN 4
+/*
+ * Btrfs LZO compression format
+ *
+ * Regular and inlined LZO compressed data extents consist of:
+ *
+ * 1. Header
+ * Fixed size. LZO_LEN (4) bytes long, LE32.
+ * Records the total size (including the header) of compressed data.
+ *
+ * 2. Segment(s)
+ * Variable size. Each segment includes one segment header, followd by data
+ * payload.
+ * One regular LZO compressed extent can have one or more segments.
+ * For inlined LZO compressed extent, only one segment is allowed.
+ * One segment represents at most one page of uncompressed data.
+ *
+ * 2.1 Segment header
+ * Fixed size. LZO_LEN (4) bytes long, LE32.
+ * Records the total size of the segment (not including the header).
+ * Segment header never crosses page boundary, thus it's possible to
+ * have at most 3 padding zeros at the end of the page.
+ *
+ * 2.2 Data Payload
+ * Variable size. Size up limit should be lzo1x_worst_compress(PAGE_SIZE)
+ * which is 4419 for a 4KiB page.
+ *
+ * Example:
+ * Page 1:
+ * 0 0x2 0x4 0x6 0x8 0xa 0xc 0xe 0x10
+ * 0x0000 | Header | SegHdr 01 | Data payload 01 ... |
+ * ...
+ * 0x0ff0 | SegHdr N | Data payload N ... |00|
+ * ^^ padding zeros
+ * Page 2:
+ * 0x1000 | SegHdr N+1| Data payload N+1 ... |
+ */
+
struct workspace {
void *mem;
void *buf; /* where decompressed data goes */
@@ -271,6 +295,7 @@ static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
unsigned long working_bytes;
size_t in_len;
size_t out_len;
+ const size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE);
unsigned long in_offset;
unsigned long in_page_bytes_left;
unsigned long tot_in;
@@ -284,10 +309,22 @@ static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
data_in = kmap(pages_in[0]);
tot_len = read_compress_length(data_in);
+ /*
+ * Compressed data header check.
+ *
+ * The real compressed size can't exceed the maximum extent length, and
+ * all pages should be used (whole unused page with just the segment
+ * header is not possible). If this happens it means the compressed
+ * extent is corrupted.
+ */
+ if (tot_len > min_t(size_t, BTRFS_MAX_COMPRESSED, srclen) ||
+ tot_len < srclen - PAGE_SIZE) {
+ ret = -EUCLEAN;
+ goto done;
+ }
tot_in = LZO_LEN;
in_offset = LZO_LEN;
- tot_len = min_t(size_t, srclen, tot_len);
in_page_bytes_left = PAGE_SIZE - LZO_LEN;
tot_out = 0;
@@ -298,6 +335,17 @@ static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
in_offset += LZO_LEN;
tot_in += LZO_LEN;
+ /*
+ * Segment header check.
+ *
+ * The segment length must not exceed the maximum LZO
+ * compression size, nor the total compressed size.
+ */
+ if (in_len > max_segment_len || tot_in + in_len > tot_len) {
+ ret = -EUCLEAN;
+ goto done;
+ }
+
tot_in += in_len;
working_bytes = in_len;
may_late_unmap = need_unmap = false;
@@ -348,7 +396,7 @@ cont:
}
}
- out_len = lzo1x_worst_compress(PAGE_SIZE);
+ out_len = max_segment_len;
ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
&out_len);
if (need_unmap)
@@ -382,15 +430,24 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
struct workspace *workspace = list_entry(ws, struct workspace, list);
size_t in_len;
size_t out_len;
+ size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE);
int ret = 0;
char *kaddr;
unsigned long bytes;
- BUG_ON(srclen < LZO_LEN);
+ if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)
+ return -EUCLEAN;
+ in_len = read_compress_length(data_in);
+ if (in_len != srclen)
+ return -EUCLEAN;
data_in += LZO_LEN;
in_len = read_compress_length(data_in);
+ if (in_len != srclen - LZO_LEN * 2) {
+ ret = -EUCLEAN;
+ goto out;
+ }
data_in += LZO_LEN;
out_len = PAGE_SIZE;
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
index 1b10a3cd1195..75246f2f56ba 100644
--- a/fs/btrfs/math.h
+++ b/fs/btrfs/math.h
@@ -1,25 +1,11 @@
-
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2012 Fujitsu. All rights reserved.
* Written by Miao Xie <miaox@cn.fujitsu.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __BTRFS_MATH_H
-#define __BTRFS_MATH_H
+#ifndef BTRFS_MATH_H
+#define BTRFS_MATH_H
#include <asm/div64.h>
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 661cc3db0c7c..2e1a1694a33d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/slab.h>
@@ -356,11 +343,8 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
- /*
- * Implicit memory barrier after test_and_set_bit
- */
- if (waitqueue_active(&entry->wait))
- wake_up(&entry->wait);
+ /* test_and_set_bit implies a barrier */
+ cond_wake_up_nomb(&entry->wait);
} else {
ret = 1;
}
@@ -423,11 +407,8 @@ have_entry:
if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
- /*
- * Implicit memory barrier after test_and_set_bit
- */
- if (waitqueue_active(&entry->wait))
- wake_up(&entry->wait);
+ /* test_and_set_bit implies a barrier */
+ cond_wake_up_nomb(&entry->wait);
} else {
ret = 1;
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 4a1672a13ba6..3be443fb3001 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __BTRFS_ORDERED_DATA__
-#define __BTRFS_ORDERED_DATA__
+#ifndef BTRFS_ORDERED_DATA_H
+#define BTRFS_ORDERED_DATA_H
/* one of these per inode */
struct btrfs_ordered_inode_tree {
@@ -218,4 +205,5 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
int __init ordered_data_init(void);
void __cold ordered_data_exit(void);
+
#endif
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 47767d5b8f0b..aa534108c1e2 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2008 Red Hat. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include "ctree.h"
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 4a8770485f77..a4e11cf04671 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include "ctree.h"
@@ -179,6 +166,25 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
}
}
+/*
+ * Helper to output refs and locking status of extent buffer. Useful to debug
+ * race condition related problems.
+ */
+static void print_eb_refs_lock(struct extent_buffer *eb)
+{
+#ifdef CONFIG_BTRFS_DEBUG
+ btrfs_info(eb->fs_info,
+"refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u",
+ atomic_read(&eb->refs), atomic_read(&eb->write_locks),
+ atomic_read(&eb->read_locks),
+ atomic_read(&eb->blocking_writers),
+ atomic_read(&eb->blocking_readers),
+ atomic_read(&eb->spinning_writers),
+ atomic_read(&eb->spinning_readers),
+ eb->lock_owner, current->pid);
+#endif
+}
+
void btrfs_print_leaf(struct extent_buffer *l)
{
struct btrfs_fs_info *fs_info;
@@ -202,9 +208,11 @@ void btrfs_print_leaf(struct extent_buffer *l)
fs_info = l->fs_info;
nr = btrfs_header_nritems(l);
- btrfs_info(fs_info, "leaf %llu total ptrs %d free space %d",
- btrfs_header_bytenr(l), nr,
- btrfs_leaf_free_space(fs_info, l));
+ btrfs_info(fs_info,
+ "leaf %llu gen %llu total ptrs %d free space %d owner %llu",
+ btrfs_header_bytenr(l), btrfs_header_generation(l), nr,
+ btrfs_leaf_free_space(fs_info, l), btrfs_header_owner(l));
+ print_eb_refs_lock(l);
for (i = 0 ; i < nr ; i++) {
item = btrfs_item_nr(i);
btrfs_item_key_to_cpu(l, &key, i);
@@ -338,7 +346,7 @@ void btrfs_print_leaf(struct extent_buffer *l)
}
}
-void btrfs_print_tree(struct extent_buffer *c)
+void btrfs_print_tree(struct extent_buffer *c, bool follow)
{
struct btrfs_fs_info *fs_info;
int i; u32 nr;
@@ -355,15 +363,20 @@ void btrfs_print_tree(struct extent_buffer *c)
return;
}
btrfs_info(fs_info,
- "node %llu level %d total ptrs %d free spc %u",
- btrfs_header_bytenr(c), level, nr,
- (u32)BTRFS_NODEPTRS_PER_BLOCK(fs_info) - nr);
+ "node %llu level %d gen %llu total ptrs %d free spc %u owner %llu",
+ btrfs_header_bytenr(c), level, btrfs_header_generation(c),
+ nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(fs_info) - nr,
+ btrfs_header_owner(c));
+ print_eb_refs_lock(c);
for (i = 0; i < nr; i++) {
btrfs_node_key_to_cpu(c, &key, i);
- pr_info("\tkey %d (%llu %u %llu) block %llu\n",
+ pr_info("\tkey %d (%llu %u %llu) block %llu gen %llu\n",
i, key.objectid, key.type, key.offset,
- btrfs_node_blockptr(c, i));
+ btrfs_node_blockptr(c, i),
+ btrfs_node_ptr_generation(c, i));
}
+ if (!follow)
+ return;
for (i = 0; i < nr; i++) {
struct btrfs_key first_key;
struct extent_buffer *next;
@@ -385,7 +398,7 @@ void btrfs_print_tree(struct extent_buffer *c)
if (btrfs_header_level(next) !=
level - 1)
BUG();
- btrfs_print_tree(next);
+ btrfs_print_tree(next, follow);
free_extent_buffer(next);
}
}
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index 3afd508ed8c5..e6bb38fd75ad 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -1,23 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __PRINT_TREE_
-#define __PRINT_TREE_
+#ifndef BTRFS_PRINT_TREE_H
+#define BTRFS_PRINT_TREE_H
+
void btrfs_print_leaf(struct extent_buffer *l);
-void btrfs_print_tree(struct extent_buffer *c);
+void btrfs_print_tree(struct extent_buffer *c, bool follow);
+
#endif
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 5859f7d3cf3e..dc6140013ae8 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/hashtable.h>
@@ -393,6 +380,7 @@ static int prop_compression_apply(struct inode *inode,
const char *value,
size_t len)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int type;
if (len == 0) {
@@ -403,14 +391,17 @@ static int prop_compression_apply(struct inode *inode,
return 0;
}
- if (!strncmp("lzo", value, 3))
+ if (!strncmp("lzo", value, 3)) {
type = BTRFS_COMPRESS_LZO;
- else if (!strncmp("zlib", value, 4))
+ btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+ } else if (!strncmp("zlib", value, 4)) {
type = BTRFS_COMPRESS_ZLIB;
- else if (!strncmp("zstd", value, len))
+ } else if (!strncmp("zstd", value, len)) {
type = BTRFS_COMPRESS_ZSTD;
- else
+ btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+ } else {
return -EINVAL;
+ }
BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index 100f18829d50..618815b4f9d5 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __BTRFS_PROPS_H
-#define __BTRFS_PROPS_H
+#ifndef BTRFS_PROPS_H
+#define BTRFS_PROPS_H
#include "ctree.h"
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index f583f13ff26e..1874a6d2e6f5 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2011 STRATO. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/sched.h>
@@ -24,6 +11,7 @@
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
+#include <linux/sizes.h>
#include "ctree.h"
#include "transaction.h"
@@ -1894,8 +1882,8 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
- trace_qgroup_update_counters(fs_info, qg->qgroupid,
- cur_old_count, cur_new_count);
+ trace_qgroup_update_counters(fs_info, qg, cur_old_count,
+ cur_new_count);
/* Rfer update part */
if (cur_old_count == 0 && cur_new_count > 0) {
@@ -2026,8 +2014,8 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
BUG_ON(!fs_info->quota_root);
- trace_btrfs_qgroup_account_extent(fs_info, bytenr, num_bytes,
- nr_old_roots, nr_new_roots);
+ trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
+ num_bytes, nr_old_roots, nr_new_roots);
qgroups = ulist_alloc(GFP_NOFS);
if (!qgroups) {
@@ -2388,8 +2376,21 @@ out:
return ret;
}
-static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
+/*
+ * Two limits to commit transaction in advance.
+ *
+ * For RATIO, it will be 1/RATIO of the remaining limit
+ * (excluding data and prealloc meta) as threshold.
+ * For SIZE, it will be in byte unit as threshold.
+ */
+#define QGROUP_PERTRANS_RATIO 32
+#define QGROUP_PERTRANS_SIZE SZ_32M
+static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
+ const struct btrfs_qgroup *qg, u64 num_bytes)
{
+ u64 limit;
+ u64 threshold;
+
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
return false;
@@ -2398,6 +2399,31 @@ static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
return false;
+ /*
+ * Even if we passed the check, it's better to check if reservation
+ * for meta_pertrans is pushing us near limit.
+ * If there is too much pertrans reservation or it's near the limit,
+ * let's try commit transaction to free some, using transaction_kthread
+ */
+ if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
+ BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
+ if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
+ limit = qg->max_excl;
+ else
+ limit = qg->max_rfer;
+ threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] -
+ qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) /
+ QGROUP_PERTRANS_RATIO;
+ threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE);
+
+ /*
+ * Use transaction_kthread to commit transaction, so we no
+ * longer need to bother nested transaction nor lock context.
+ */
+ if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold)
+ btrfs_commit_transaction_locksafe(fs_info);
+ }
+
return true;
}
@@ -2447,7 +2473,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
qg = unode_aux_to_qgroup(unode);
- if (enforce && !qgroup_check_limits(qg, num_bytes)) {
+ if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) {
ret = -EDQUOT;
goto out;
}
@@ -2554,6 +2580,21 @@ out:
}
/*
+ * Check if the leaf is the last leaf. Which means all node pointers
+ * are at their last position.
+ */
+static bool is_last_leaf(struct btrfs_path *path)
+{
+ int i;
+
+ for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
+ if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
+ return false;
+ }
+ return true;
+}
+
+/*
* returns < 0 on error, 0 when more leafs are to be scanned.
* returns 1 when done.
*/
@@ -2564,8 +2605,8 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
struct btrfs_key found;
struct extent_buffer *scratch_leaf = NULL;
struct ulist *roots = NULL;
- struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
u64 num_bytes;
+ bool done;
int slot;
int ret;
@@ -2594,12 +2635,12 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
mutex_unlock(&fs_info->qgroup_rescan_lock);
return ret;
}
+ done = is_last_leaf(path);
btrfs_item_key_to_cpu(path->nodes[0], &found,
btrfs_header_nritems(path->nodes[0]) - 1);
fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
- btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
if (!scratch_leaf) {
ret = -ENOMEM;
@@ -2638,8 +2679,9 @@ out:
btrfs_tree_read_unlock_blocking(scratch_leaf);
free_extent_buffer(scratch_leaf);
}
- btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ if (done && !ret)
+ ret = 1;
return ret;
}
@@ -2655,6 +2697,12 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
path = btrfs_alloc_path();
if (!path)
goto out;
+ /*
+ * Rescan should only search for commit root, and any later difference
+ * should be recorded by qgroup
+ */
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
err = 0;
while (!err && !btrfs_fs_closing(fs_info)) {
@@ -2734,26 +2782,36 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
{
int ret = 0;
- if (!init_flags &&
- (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
- !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
- ret = -EINVAL;
- goto err;
+ if (!init_flags) {
+ /* we're resuming qgroup rescan at mount time */
+ if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN))
+ btrfs_warn(fs_info,
+ "qgroup rescan init failed, qgroup is not enabled");
+ else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
+ btrfs_warn(fs_info,
+ "qgroup rescan init failed, qgroup rescan is not queued");
+ return -EINVAL;
}
mutex_lock(&fs_info->qgroup_rescan_lock);
spin_lock(&fs_info->qgroup_lock);
if (init_flags) {
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ btrfs_warn(fs_info,
+ "qgroup rescan is already in progress");
ret = -EINPROGRESS;
- else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
+ } else if (!(fs_info->qgroup_flags &
+ BTRFS_QGROUP_STATUS_FLAG_ON)) {
+ btrfs_warn(fs_info,
+ "qgroup rescan init failed, qgroup is not enabled");
ret = -EINVAL;
+ }
if (ret) {
spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
- goto err;
+ return ret;
}
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
}
@@ -2772,13 +2830,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
btrfs_init_work(&fs_info->qgroup_rescan_work,
btrfs_qgroup_rescan_helper,
btrfs_qgroup_rescan_worker, NULL, NULL);
-
- if (ret) {
-err:
- btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret);
- return ret;
- }
-
return 0;
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index e63e2d497a8e..d60dd06445ce 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2014 Facebook. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __BTRFS_QGROUP__
-#define __BTRFS_QGROUP__
+#ifndef BTRFS_QGROUP_H
+#define BTRFS_QGROUP_H
#include "ulist.h"
#include "delayed-ref.h"
@@ -341,4 +328,5 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
void btrfs_qgroup_check_reserved_leak(struct inode *inode);
-#endif /* __BTRFS_QGROUP__ */
+
+#endif
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index c3a2bc8af675..5e4ad134b9ad 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2012 Fusion-io All rights reserved.
* Copyright (C) 2012 Intel Corp. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
+
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/bio.h>
@@ -175,6 +163,12 @@ struct btrfs_raid_bio {
* bitmap to record which horizontal stripe has data
*/
unsigned long *dbitmap;
+
+ /* allocated with real_stripes-many pointers for finish_*() calls */
+ void **finish_pointers;
+
+ /* allocated with stripe_npages-many bits for finish_*() calls */
+ unsigned long *finish_pbitmap;
};
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
@@ -993,9 +987,14 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
void *p;
- rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
- DIV_ROUND_UP(stripe_npages, BITS_PER_LONG) *
- sizeof(long), GFP_NOFS);
+ rbio = kzalloc(sizeof(*rbio) +
+ sizeof(*rbio->stripe_pages) * num_pages +
+ sizeof(*rbio->bio_pages) * num_pages +
+ sizeof(*rbio->finish_pointers) * real_stripes +
+ sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
+ sizeof(*rbio->finish_pbitmap) *
+ BITS_TO_LONGS(stripe_npages),
+ GFP_NOFS);
if (!rbio)
return ERR_PTR(-ENOMEM);
@@ -1017,13 +1016,20 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
atomic_set(&rbio->stripes_pending, 0);
/*
- * the stripe_pages and bio_pages array point to the extra
+ * the stripe_pages, bio_pages, etc arrays point to the extra
* memory we allocated past the end of the rbio
*/
p = rbio + 1;
- rbio->stripe_pages = p;
- rbio->bio_pages = p + sizeof(struct page *) * num_pages;
- rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
+#define CONSUME_ALLOC(ptr, count) do { \
+ ptr = p; \
+ p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
+ } while (0)
+ CONSUME_ALLOC(rbio->stripe_pages, num_pages);
+ CONSUME_ALLOC(rbio->bio_pages, num_pages);
+ CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
+ CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
+ CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
+#undef CONSUME_ALLOC
if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
nr_data = real_stripes - 1;
@@ -1192,7 +1198,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
struct btrfs_bio *bbio = rbio->bbio;
- void *pointers[rbio->real_stripes];
+ void **pointers = rbio->finish_pointers;
int nr_data = rbio->nr_data;
int stripe;
int pagenr;
@@ -2362,8 +2368,8 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int need_check)
{
struct btrfs_bio *bbio = rbio->bbio;
- void *pointers[rbio->real_stripes];
- DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
+ void **pointers = rbio->finish_pointers;
+ unsigned long *pbitmap = rbio->finish_pbitmap;
int nr_data = rbio->nr_data;
int stripe;
int pagenr;
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 4ee4fe346838..f5d4c13a8dbc 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -1,24 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2012 Fusion-io All rights reserved.
* Copyright (C) 2012 Intel Corp. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __BTRFS_RAID56__
-#define __BTRFS_RAID56__
+#ifndef BTRFS_RAID56_H
+#define BTRFS_RAID56_H
+
static inline int nr_parity_stripes(struct map_lookup *map)
{
if (map->type & BTRFS_BLOCK_GROUP_RAID5)
@@ -65,4 +53,5 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
+
#endif
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
index 9e111e4576d4..a97dc74a4d3d 100644
--- a/fs/btrfs/rcu-string.h
+++ b/fs/btrfs/rcu-string.h
@@ -1,21 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2012 Red Hat. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
+#ifndef BTRFS_RCU_STRING_H
+#define BTRFS_RCU_STRING_H
+
struct rcu_string {
struct rcu_head rcu;
char str[0];
@@ -54,3 +44,5 @@ static inline void rcu_string_free(struct rcu_string *str)
struct rcu_string *__str = rcu_dereference(rcu_str); \
__str->str; \
})
+
+#endif
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index a52dd12af648..40f1bcef394d 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2011 STRATO. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/sched.h>
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 35fab67dcbe8..e5b9e596bb92 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2014 Facebook. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/sched.h>
diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h
index 3bf02ce0e1e2..b7d2a4edfdb7 100644
--- a/fs/btrfs/ref-verify.h
+++ b/fs/btrfs/ref-verify.h
@@ -1,22 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2014 Facebook. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __REF_VERIFY__
-#define __REF_VERIFY__
+
+#ifndef BTRFS_REF_VERIFY_H
+#define BTRFS_REF_VERIFY_H
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info);
@@ -59,4 +47,5 @@ static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info)
}
#endif /* CONFIG_BTRFS_FS_REF_VERIFY */
-#endif /* _REF_VERIFY__ */
+
+#endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4874c09f6d3c..879b76fa881a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2009 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/sched.h>
@@ -1854,7 +1841,7 @@ again:
old_bytenr = btrfs_node_blockptr(parent, slot);
blocksize = fs_info->nodesize;
old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
- btrfs_node_key_to_cpu(parent, &key, slot);
+ btrfs_node_key_to_cpu(parent, &first_key, slot);
if (level <= max_level) {
eb = path->nodes[level];
@@ -4312,7 +4299,7 @@ out:
return inode;
}
-static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
+static struct reloc_control *alloc_reloc_control(void)
{
struct reloc_control *rc;
@@ -4357,7 +4344,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info,
DESCRIBE_FLAG(RAID5, "raid5");
DESCRIBE_FLAG(RAID6, "raid6");
if (flags)
- snprintf(buf, buf - bp + sizeof(buf), "|0x%llx", flags);
+ snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags);
#undef DESCRIBE_FLAG
}
@@ -4379,7 +4366,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
int rw = 0;
int err = 0;
- rc = alloc_reloc_control(fs_info);
+ rc = alloc_reloc_control();
if (!rc)
return -ENOMEM;
@@ -4575,7 +4562,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
if (list_empty(&reloc_roots))
goto out;
- rc = alloc_reloc_control(fs_info);
+ rc = alloc_reloc_control();
if (!rc) {
err = -ENOMEM;
goto out;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index aab0194efe46..6db3bda44aa5 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/err.h>
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 1a2066ac6fe7..a59005862010 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2011, 2012 STRATO. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/blkdev.h>
@@ -3997,6 +3984,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
spin_lock(&fs_info->unused_bgs_lock);
if (list_empty(&cache->bg_list)) {
btrfs_get_block_group(cache);
+ trace_btrfs_add_unused_block_group(cache);
list_add_tail(&cache->bg_list,
&fs_info->unused_bgs);
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 1f5748c7d1c7..c47f62b19226 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2012 Alexander Block. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/bsearch.h>
@@ -248,6 +235,7 @@ struct orphan_dir_info {
struct rb_node node;
u64 ino;
u64 gen;
+ u64 last_dir_index_offset;
};
struct name_cache_entry {
@@ -2857,12 +2845,6 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
struct rb_node *parent = NULL;
struct orphan_dir_info *entry, *odi;
- odi = kmalloc(sizeof(*odi), GFP_KERNEL);
- if (!odi)
- return ERR_PTR(-ENOMEM);
- odi->ino = dir_ino;
- odi->gen = 0;
-
while (*p) {
parent = *p;
entry = rb_entry(parent, struct orphan_dir_info, node);
@@ -2871,11 +2853,17 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
} else if (dir_ino > entry->ino) {
p = &(*p)->rb_right;
} else {
- kfree(odi);
return entry;
}
}
+ odi = kmalloc(sizeof(*odi), GFP_KERNEL);
+ if (!odi)
+ return ERR_PTR(-ENOMEM);
+ odi->ino = dir_ino;
+ odi->gen = 0;
+ odi->last_dir_index_offset = 0;
+
rb_link_node(&odi->node, parent, p);
rb_insert_color(&odi->node, &sctx->orphan_dirs);
return odi;
@@ -2930,6 +2918,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
struct btrfs_key found_key;
struct btrfs_key loc;
struct btrfs_dir_item *di;
+ struct orphan_dir_info *odi = NULL;
/*
* Don't try to rmdir the top/root subvolume dir.
@@ -2944,6 +2933,11 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
+
+ odi = get_orphan_dir_info(sctx, dir);
+ if (odi)
+ key.offset = odi->last_dir_index_offset;
+
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
@@ -2971,30 +2965,33 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
dm = get_waiting_dir_move(sctx, loc.objectid);
if (dm) {
- struct orphan_dir_info *odi;
-
odi = add_orphan_dir_info(sctx, dir);
if (IS_ERR(odi)) {
ret = PTR_ERR(odi);
goto out;
}
odi->gen = dir_gen;
+ odi->last_dir_index_offset = found_key.offset;
dm->rmdir_ino = dir;
ret = 0;
goto out;
}
if (loc.objectid > send_progress) {
- struct orphan_dir_info *odi;
-
- odi = get_orphan_dir_info(sctx, dir);
- free_orphan_dir_info(sctx, odi);
+ odi = add_orphan_dir_info(sctx, dir);
+ if (IS_ERR(odi)) {
+ ret = PTR_ERR(odi);
+ goto out;
+ }
+ odi->gen = dir_gen;
+ odi->last_dir_index_offset = found_key.offset;
ret = 0;
goto out;
}
path->slots[0]++;
}
+ free_orphan_dir_info(sctx, odi);
ret = 1;
@@ -3272,13 +3269,16 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
if (rmdir_ino) {
struct orphan_dir_info *odi;
+ u64 gen;
odi = get_orphan_dir_info(sctx, rmdir_ino);
if (!odi) {
/* already deleted */
goto finish;
}
- ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino);
+ gen = odi->gen;
+
+ ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino);
if (ret < 0)
goto out;
if (!ret)
@@ -3289,13 +3289,12 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
ret = -ENOMEM;
goto out;
}
- ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
+ ret = get_cur_path(sctx, rmdir_ino, gen, name);
if (ret < 0)
goto out;
ret = send_rmdir(sctx, name);
if (ret < 0)
goto out;
- free_orphan_dir_info(sctx, odi);
}
finish:
@@ -5249,6 +5248,10 @@ static int send_write_or_clone(struct send_ctx *sctx,
len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
}
+ if (offset >= sctx->cur_inode_size) {
+ ret = 0;
+ goto out;
+ }
if (offset + len > sctx->cur_inode_size)
len = sctx->cur_inode_size - offset;
if (len == 0) {
@@ -6463,7 +6466,7 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
*/
if (root->send_in_progress < 0)
btrfs_err(root->fs_info,
- "send_in_progres unbalanced %d root %llu",
+ "send_in_progress unbalanced %d root %llu",
root->send_in_progress, root->root_key.objectid);
spin_unlock(&root->root_item_lock);
}
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 3aa4bc55754f..ead397f7034f 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -1,22 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2012 Alexander Block. All rights reserved.
* Copyright (C) 2012 STRATO. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
+#ifndef BTRFS_SEND_H
+#define BTRFS_SEND_H
+
#include "ctree.h"
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
@@ -132,3 +122,5 @@ enum {
#ifdef __KERNEL__
long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg);
#endif
+
+#endif
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index 5e2b92d83617..b7b4acb12833 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/highmem.h>
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 170baef49fae..81107ad49f3a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/blkdev.h>
@@ -336,6 +323,7 @@ enum {
Opt_ssd, Opt_nossd,
Opt_ssd_spread, Opt_nossd_spread,
Opt_subvol,
+ Opt_subvol_empty,
Opt_subvolid,
Opt_thread_pool,
Opt_treelog, Opt_notreelog,
@@ -401,6 +389,7 @@ static const match_table_t tokens = {
{Opt_ssd_spread, "ssd_spread"},
{Opt_nossd_spread, "nossd_spread"},
{Opt_subvol, "subvol=%s"},
+ {Opt_subvol_empty, "subvol="},
{Opt_subvolid, "subvolid=%s"},
{Opt_thread_pool, "thread_pool=%u"},
{Opt_treelog, "treelog"},
@@ -474,6 +463,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_opt(info->mount_opt, DEGRADED);
break;
case Opt_subvol:
+ case Opt_subvol_empty:
case Opt_subvolid:
case Opt_subvolrootid:
case Opt_device:
@@ -1795,10 +1785,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
}
ret = btrfs_parse_options(fs_info, data, *flags);
- if (ret) {
- ret = -EINVAL;
+ if (ret)
goto restore;
- }
btrfs_remount_begin(fs_info, old_opts, *flags);
btrfs_resize_thread_pool(fs_info,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index ca067471cd46..4a4e960c7c66 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/sched.h>
@@ -223,12 +210,42 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
NULL
};
+/*
+ * Features which depend on feature bits and may differ between each fs.
+ *
+ * /sys/fs/btrfs/features lists all available features of this kernel while
+ * /sys/fs/btrfs/UUID/features shows features of the fs which are enabled or
+ * can be changed online.
+ */
static const struct attribute_group btrfs_feature_attr_group = {
.name = "features",
.is_visible = btrfs_feature_visible,
.attrs = btrfs_supported_feature_attrs,
};
+static ssize_t rmdir_subvol_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "0\n");
+}
+BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show);
+
+static struct attribute *btrfs_supported_static_feature_attrs[] = {
+ BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
+ NULL
+};
+
+/*
+ * Features which only depend on kernel version.
+ *
+ * These are listed in /sys/fs/btrfs/features along with
+ * btrfs_feature_attr_group
+ */
+static const struct attribute_group btrfs_static_feature_attr_group = {
+ .name = "features",
+ .attrs = btrfs_supported_static_feature_attrs,
+};
+
static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
{
u64 val;
@@ -527,10 +544,11 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
}
#define NUM_FEATURE_BITS 64
-static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13];
-static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS];
+#define BTRFS_FEATURE_NAME_MAX 13
+static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX];
+static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS];
-static const u64 supported_feature_masks[3] = {
+static const u64 supported_feature_masks[FEAT_MAX] = {
[FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP,
[FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
[FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP,
@@ -602,7 +620,7 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
return;
}
- list_for_each_entry(fs_devs, fs_uuids, list) {
+ list_for_each_entry(fs_devs, fs_uuids, fs_list) {
__btrfs_sysfs_remove_fsid(fs_devs);
}
}
@@ -622,7 +640,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL);
}
-const char * const btrfs_feature_set_names[3] = {
+const char * const btrfs_feature_set_names[FEAT_MAX] = {
[FEAT_COMPAT] = "compat",
[FEAT_COMPAT_RO] = "compat_ro",
[FEAT_INCOMPAT] = "incompat",
@@ -686,7 +704,7 @@ static void init_feature_attrs(void)
if (fa->kobj_attr.attr.name)
continue;
- snprintf(name, 13, "%s:%u",
+ snprintf(name, BTRFS_FEATURE_NAME_MAX, "%s:%u",
btrfs_feature_set_names[set], i);
fa->kobj_attr.attr.name = name;
@@ -913,8 +931,15 @@ int __init btrfs_init_sysfs(void)
ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
if (ret)
goto out2;
+ ret = sysfs_merge_group(&btrfs_kset->kobj,
+ &btrfs_static_feature_attr_group);
+ if (ret)
+ goto out_remove_group;
return 0;
+
+out_remove_group:
+ sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
out2:
debugfs_remove_recursive(btrfs_debugfs_root_dentry);
out1:
@@ -925,6 +950,8 @@ out1:
void __cold btrfs_exit_sysfs(void)
{
+ sysfs_unmerge_group(&btrfs_kset->kobj,
+ &btrfs_static_feature_attr_group);
sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
kset_unregister(btrfs_kset);
debugfs_remove_recursive(btrfs_debugfs_root_dentry);
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 80457f31c29f..c6ee600aff89 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BTRFS_SYSFS_H_
-#define _BTRFS_SYSFS_H_
+
+#ifndef BTRFS_SYSFS_H
+#define BTRFS_SYSFS_H
/*
* Data exported through sysfs
@@ -8,7 +9,7 @@
extern u64 btrfs_debugfs_test;
enum btrfs_feature_set {
- FEAT_COMPAT,
+ FEAT_COMPAT = 0,
FEAT_COMPAT_RO,
FEAT_INCOMPAT,
FEAT_MAX
@@ -76,7 +77,7 @@ attr_to_btrfs_feature_attr(struct attribute *attr)
}
char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
-extern const char * const btrfs_feature_set_names[3];
+extern const char * const btrfs_feature_set_names[FEAT_MAX];
extern struct kobj_type space_info_ktype;
extern struct kobj_type btrfs_raid_ktype;
int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
@@ -90,4 +91,4 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
u64 bit, enum btrfs_feature_set set);
-#endif /* _BTRFS_SYSFS_H_ */
+#endif
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index e74278170806..db72b3b6209e 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2013 Fusion IO. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
@@ -232,11 +219,13 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache)
kfree(cache);
}
-void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans)
+void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
{
memset(trans, 0, sizeof(*trans));
trans->transid = 1;
trans->type = __TRANS_DUMMY;
+ trans->fs_info = fs_info;
}
int btrfs_run_sanity_tests(void)
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index bc0615bac3cc..70ff9f9d86a1 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -1,28 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2013 Fusion IO. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __BTRFS_TESTS
-#define __BTRFS_TESTS
+#ifndef BTRFS_TESTS_H
+#define BTRFS_TESTS_H
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_run_sanity_tests(void);
-#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
+#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__)
+#define test_err(fmt, ...) pr_err("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__)
struct btrfs_root;
struct btrfs_trans_handle;
@@ -41,7 +29,8 @@ void btrfs_free_dummy_root(struct btrfs_root *root);
struct btrfs_block_group_cache *
btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length);
void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache);
-void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans);
+void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
#else
static inline int btrfs_run_sanity_tests(void)
{
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index b9142c614114..7d72eab6d32c 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2013 Fusion IO. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/slab.h>
@@ -39,31 +26,31 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
u32 value_len = strlen(value);
int ret = 0;
- test_msg("Running btrfs_split_item tests\n");
+ test_msg("running btrfs_split_item tests");
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
- test_msg("Could not allocate fs_info\n");
+ test_err("could not allocate fs_info");
return -ENOMEM;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
- test_msg("Could not allocate root\n");
+ test_err("could not allocate root");
ret = PTR_ERR(root);
goto out;
}
path = btrfs_alloc_path();
if (!path) {
- test_msg("Could not allocate path\n");
+ test_err("could not allocate path");
ret = -ENOMEM;
goto out;
}
path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize);
if (!eb) {
- test_msg("Could not allocate dummy buffer\n");
+ test_err("could not allocate dummy buffer");
ret = -ENOMEM;
goto out;
}
@@ -88,7 +75,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
*/
ret = btrfs_split_item(NULL, root, path, &key, 17);
if (ret) {
- test_msg("Split item failed %d\n", ret);
+ test_err("split item failed %d", ret);
goto out;
}
@@ -99,14 +86,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
btrfs_item_key_to_cpu(eb, &key, 0);
if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
key.offset != 0) {
- test_msg("Invalid key at slot 0\n");
+ test_err("invalid key at slot 0");
ret = -EINVAL;
goto out;
}
item = btrfs_item_nr(0);
if (btrfs_item_size(eb, item) != strlen(split1)) {
- test_msg("Invalid len in the first split\n");
+ test_err("invalid len in the first split");
ret = -EINVAL;
goto out;
}
@@ -114,8 +101,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
strlen(split1));
if (memcmp(buf, split1, strlen(split1))) {
- test_msg("Data in the buffer doesn't match what it should "
- "in the first split have='%.*s' want '%s'\n",
+ test_err(
+"data in the buffer doesn't match what it should in the first split have='%.*s' want '%s'",
(int)strlen(split1), buf, split1);
ret = -EINVAL;
goto out;
@@ -124,14 +111,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
btrfs_item_key_to_cpu(eb, &key, 1);
if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
key.offset != 3) {
- test_msg("Invalid key at slot 1\n");
+ test_err("invalid key at slot 1");
ret = -EINVAL;
goto out;
}
item = btrfs_item_nr(1);
if (btrfs_item_size(eb, item) != strlen(split2)) {
- test_msg("Invalid len in the second split\n");
+ test_err("invalid len in the second split");
ret = -EINVAL;
goto out;
}
@@ -139,8 +126,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
strlen(split2));
if (memcmp(buf, split2, strlen(split2))) {
- test_msg("Data in the buffer doesn't match what it should "
- "in the second split\n");
+ test_err(
+ "data in the buffer doesn't match what it should in the second split");
ret = -EINVAL;
goto out;
}
@@ -149,21 +136,21 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
/* Do it again so we test memmoving the other items in the leaf */
ret = btrfs_split_item(NULL, root, path, &key, 4);
if (ret) {
- test_msg("Second split item failed %d\n", ret);
+ test_err("second split item failed %d", ret);
goto out;
}
btrfs_item_key_to_cpu(eb, &key, 0);
if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
key.offset != 0) {
- test_msg("Invalid key at slot 0\n");
+ test_err("invalid key at slot 0");
ret = -EINVAL;
goto out;
}
item = btrfs_item_nr(0);
if (btrfs_item_size(eb, item) != strlen(split3)) {
- test_msg("Invalid len in the first split\n");
+ test_err("invalid len in the first split");
ret = -EINVAL;
goto out;
}
@@ -171,8 +158,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
strlen(split3));
if (memcmp(buf, split3, strlen(split3))) {
- test_msg("Data in the buffer doesn't match what it should "
- "in the third split");
+ test_err(
+ "data in the buffer doesn't match what it should in the third split");
ret = -EINVAL;
goto out;
}
@@ -180,14 +167,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
btrfs_item_key_to_cpu(eb, &key, 1);
if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
key.offset != 1) {
- test_msg("Invalid key at slot 1\n");
+ test_err("invalid key at slot 1");
ret = -EINVAL;
goto out;
}
item = btrfs_item_nr(1);
if (btrfs_item_size(eb, item) != strlen(split4)) {
- test_msg("Invalid len in the second split\n");
+ test_err("invalid len in the second split");
ret = -EINVAL;
goto out;
}
@@ -195,8 +182,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
strlen(split4));
if (memcmp(buf, split4, strlen(split4))) {
- test_msg("Data in the buffer doesn't match what it should "
- "in the fourth split\n");
+ test_err(
+ "data in the buffer doesn't match what it should in the fourth split");
ret = -EINVAL;
goto out;
}
@@ -204,14 +191,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
btrfs_item_key_to_cpu(eb, &key, 2);
if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
key.offset != 3) {
- test_msg("Invalid key at slot 2\n");
+ test_err("invalid key at slot 2");
ret = -EINVAL;
goto out;
}
item = btrfs_item_nr(2);
if (btrfs_item_size(eb, item) != strlen(split2)) {
- test_msg("Invalid len in the second split\n");
+ test_err("invalid len in the second split");
ret = -EINVAL;
goto out;
}
@@ -219,8 +206,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 2),
strlen(split2));
if (memcmp(buf, split2, strlen(split2))) {
- test_msg("Data in the buffer doesn't match what it should "
- "in the last chunk\n");
+ test_err(
+ "data in the buffer doesn't match what it should in the last chunk");
ret = -EINVAL;
goto out;
}
@@ -233,6 +220,6 @@ out:
int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize)
{
- test_msg("Running extent buffer operation tests\n");
+ test_msg("running extent buffer operation tests");
return test_btrfs_split_item(sectorsize, nodesize);
}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 2e7f64a3b22b..d9269a531a4d 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2013 Fusion IO. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/pagemap.h>
@@ -59,7 +46,9 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
cond_resched();
loops++;
if (loops > 100000) {
- printk(KERN_ERR "stuck in a loop, start %Lu, end %Lu, nr_pages %lu, ret %d\n", start, end, nr_pages, ret);
+ printk(KERN_ERR
+ "stuck in a loop, start %llu, end %llu, nr_pages %lu, ret %d\n",
+ start, end, nr_pages, ret);
break;
}
}
@@ -79,11 +68,11 @@ static int test_find_delalloc(u32 sectorsize)
u64 found;
int ret = -EINVAL;
- test_msg("Running find delalloc tests\n");
+ test_msg("running find delalloc tests");
inode = btrfs_new_test_inode();
if (!inode) {
- test_msg("Failed to allocate test inode\n");
+ test_err("failed to allocate test inode");
return -ENOMEM;
}
@@ -97,7 +86,7 @@ static int test_find_delalloc(u32 sectorsize)
for (index = 0; index < (total_dirty >> PAGE_SHIFT); index++) {
page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
if (!page) {
- test_msg("Failed to allocate test page\n");
+ test_err("failed to allocate test page");
ret = -ENOMEM;
goto out;
}
@@ -120,11 +109,11 @@ static int test_find_delalloc(u32 sectorsize)
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
- test_msg("Should have found at least one delalloc\n");
+ test_err("should have found at least one delalloc");
goto out_bits;
}
if (start != 0 || end != (sectorsize - 1)) {
- test_msg("Expected start 0 end %u, got start %llu end %llu\n",
+ test_err("expected start 0 end %u, got start %llu end %llu",
sectorsize - 1, start, end);
goto out_bits;
}
@@ -142,7 +131,7 @@ static int test_find_delalloc(u32 sectorsize)
locked_page = find_lock_page(inode->i_mapping,
test_start >> PAGE_SHIFT);
if (!locked_page) {
- test_msg("Couldn't find the locked page\n");
+ test_err("couldn't find the locked page");
goto out_bits;
}
set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL);
@@ -151,17 +140,17 @@ static int test_find_delalloc(u32 sectorsize)
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
- test_msg("Couldn't find delalloc in our range\n");
+ test_err("couldn't find delalloc in our range");
goto out_bits;
}
if (start != test_start || end != max_bytes - 1) {
- test_msg("Expected start %Lu end %Lu, got start %Lu, end "
- "%Lu\n", test_start, max_bytes - 1, start, end);
+ test_err("expected start %llu end %llu, got start %llu, end %llu",
+ test_start, max_bytes - 1, start, end);
goto out_bits;
}
if (process_page_range(inode, start, end,
PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
- test_msg("There were unlocked pages in the range\n");
+ test_err("there were unlocked pages in the range");
goto out_bits;
}
unlock_extent(&tmp, start, end);
@@ -177,7 +166,7 @@ static int test_find_delalloc(u32 sectorsize)
locked_page = find_lock_page(inode->i_mapping, test_start >>
PAGE_SHIFT);
if (!locked_page) {
- test_msg("Couldn't find the locked page\n");
+ test_err("couldn't find the locked page");
goto out_bits;
}
start = test_start;
@@ -185,11 +174,11 @@ static int test_find_delalloc(u32 sectorsize)
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (found) {
- test_msg("Found range when we shouldn't have\n");
+ test_err("found range when we shouldn't have");
goto out_bits;
}
if (end != (u64)-1) {
- test_msg("Did not return the proper end offset\n");
+ test_err("did not return the proper end offset");
goto out_bits;
}
@@ -206,17 +195,17 @@ static int test_find_delalloc(u32 sectorsize)
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
- test_msg("Didn't find our range\n");
+ test_err("didn't find our range");
goto out_bits;
}
if (start != test_start || end != total_dirty - 1) {
- test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
+ test_err("expected start %llu end %llu, got start %llu end %llu",
test_start, total_dirty - 1, start, end);
goto out_bits;
}
if (process_page_range(inode, start, end,
PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
- test_msg("Pages in range were not all locked\n");
+ test_err("pages in range were not all locked");
goto out_bits;
}
unlock_extent(&tmp, start, end);
@@ -228,7 +217,7 @@ static int test_find_delalloc(u32 sectorsize)
page = find_get_page(inode->i_mapping,
(max_bytes + SZ_1M) >> PAGE_SHIFT);
if (!page) {
- test_msg("Couldn't find our page\n");
+ test_err("couldn't find our page");
goto out_bits;
}
ClearPageDirty(page);
@@ -247,18 +236,17 @@ static int test_find_delalloc(u32 sectorsize)
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
- test_msg("Didn't find our range\n");
+ test_err("didn't find our range");
goto out_bits;
}
if (start != test_start && end != test_start + PAGE_SIZE - 1) {
- test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
- test_start, test_start + PAGE_SIZE - 1, start,
- end);
+ test_err("expected start %llu end %llu, got start %llu end %llu",
+ test_start, test_start + PAGE_SIZE - 1, start, end);
goto out_bits;
}
if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED |
PROCESS_UNLOCK)) {
- test_msg("Pages in range were not all locked\n");
+ test_err("pages in range were not all locked");
goto out_bits;
}
ret = 0;
@@ -284,14 +272,14 @@ static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb,
bit = !!test_bit(i, bitmap);
bit1 = !!extent_buffer_test_bit(eb, 0, i);
if (bit1 != bit) {
- test_msg("Bits do not match\n");
+ test_err("bits do not match");
return -EINVAL;
}
bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE,
i % BITS_PER_BYTE);
if (bit1 != bit) {
- test_msg("Offset bits do not match\n");
+ test_err("offset bits do not match");
return -EINVAL;
}
}
@@ -308,7 +296,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
memset(bitmap, 0, len);
memzero_extent_buffer(eb, 0, len);
if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
- test_msg("Bitmap was not zeroed\n");
+ test_err("bitmap was not zeroed");
return -EINVAL;
}
@@ -316,7 +304,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
ret = check_eb_bitmap(bitmap, eb, len);
if (ret) {
- test_msg("Setting all bits failed\n");
+ test_err("setting all bits failed");
return ret;
}
@@ -324,7 +312,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE);
ret = check_eb_bitmap(bitmap, eb, len);
if (ret) {
- test_msg("Clearing all bits failed\n");
+ test_err("clearing all bits failed");
return ret;
}
@@ -337,7 +325,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
sizeof(long) * BITS_PER_BYTE);
ret = check_eb_bitmap(bitmap, eb, len);
if (ret) {
- test_msg("Setting straddling pages failed\n");
+ test_err("setting straddling pages failed");
return ret;
}
@@ -350,7 +338,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
sizeof(long) * BITS_PER_BYTE);
ret = check_eb_bitmap(bitmap, eb, len);
if (ret) {
- test_msg("Clearing straddling pages failed\n");
+ test_err("clearing straddling pages failed");
return ret;
}
}
@@ -374,7 +362,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
ret = check_eb_bitmap(bitmap, eb, len);
if (ret) {
- test_msg("Random bit pattern failed\n");
+ test_err("random bit pattern failed");
return ret;
}
@@ -389,7 +377,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
struct extent_buffer *eb;
int ret;
- test_msg("Running extent buffer bitmap tests\n");
+ test_msg("running extent buffer bitmap tests");
/*
* In ppc64, sectorsize can be 64K, thus 4 * 64K will be larger than
@@ -402,13 +390,13 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
bitmap = kmalloc(len, GFP_KERNEL);
if (!bitmap) {
- test_msg("Couldn't allocate test bitmap\n");
+ test_err("couldn't allocate test bitmap");
return -ENOMEM;
}
eb = __alloc_dummy_extent_buffer(fs_info, 0, len);
if (!eb) {
- test_msg("Couldn't allocate test extent buffer\n");
+ test_err("couldn't allocate test extent buffer");
kfree(bitmap);
return -ENOMEM;
}
@@ -421,7 +409,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
free_extent_buffer(eb);
eb = __alloc_dummy_extent_buffer(NULL, nodesize / 2, len);
if (!eb) {
- test_msg("Couldn't allocate test extent buffer\n");
+ test_err("couldn't allocate test extent buffer");
kfree(bitmap);
return -ENOMEM;
}
@@ -437,7 +425,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
{
int ret;
- test_msg("Running extent I/O tests\n");
+ test_msg("running extent I/O tests");
ret = test_find_delalloc(sectorsize);
if (ret)
@@ -445,6 +433,6 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
ret = test_eb_bitmaps(sectorsize, nodesize);
out:
- test_msg("Extent I/O tests finished\n");
+ test_msg("extent I/O tests finished");
return ret;
}
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index c23bd00bdd92..385a5316e4bf 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2017 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/types.h>
@@ -32,8 +19,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
#ifdef CONFIG_BTRFS_DEBUG
if (refcount_read(&em->refs) != 1) {
- test_msg(
-"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d\n",
+ test_err(
+"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d",
em->start, em->len, em->block_start,
em->block_len, refcount_read(&em->refs));
@@ -60,7 +47,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
* ->add_extent_mapping(0, 16K)
* -> #handle -EEXIST
*/
-static void test_case_1(struct extent_map_tree *em_tree)
+static void test_case_1(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree)
{
struct extent_map *em;
u64 start = 0;
@@ -103,14 +91,14 @@ static void test_case_1(struct extent_map_tree *em_tree)
em->len = len;
em->block_start = start;
em->block_len = len;
- ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
if (ret)
- test_msg("case1 [%llu %llu]: ret %d\n", start, start + len, ret);
+ test_err("case1 [%llu %llu]: ret %d", start, start + len, ret);
if (em &&
(em->start != 0 || extent_map_end(em) != SZ_16K ||
em->block_start != 0 || em->block_len != SZ_16K))
- test_msg(
-"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n",
+ test_err(
+"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
start, start + len, ret, em->start, em->len,
em->block_start, em->block_len);
free_extent_map(em);
@@ -125,7 +113,8 @@ out:
* Reading the inline ending up with EEXIST, ie. read an inline
* extent and discard page cache and read it again.
*/
-static void test_case_2(struct extent_map_tree *em_tree)
+static void test_case_2(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree)
{
struct extent_map *em;
int ret;
@@ -166,14 +155,14 @@ static void test_case_2(struct extent_map_tree *em_tree)
em->len = SZ_1K;
em->block_start = EXTENT_MAP_INLINE;
em->block_len = (u64)-1;
- ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
if (ret)
- test_msg("case2 [0 1K]: ret %d\n", ret);
+ test_err("case2 [0 1K]: ret %d", ret);
if (em &&
(em->start != 0 || extent_map_end(em) != SZ_1K ||
em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1))
- test_msg(
-"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n",
+ test_err(
+"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
ret, em->start, em->len, em->block_start,
em->block_len);
free_extent_map(em);
@@ -182,7 +171,8 @@ out:
free_extent_map_tree(em_tree);
}
-static void __test_case_3(struct extent_map_tree *em_tree, u64 start)
+static void __test_case_3(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree, u64 start)
{
struct extent_map *em;
u64 len = SZ_4K;
@@ -211,9 +201,9 @@ static void __test_case_3(struct extent_map_tree *em_tree, u64 start)
em->len = SZ_16K;
em->block_start = 0;
em->block_len = SZ_16K;
- ret = btrfs_add_extent_mapping(em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
if (ret)
- test_msg("case3 [0x%llx 0x%llx): ret %d\n",
+ test_err("case3 [0x%llx 0x%llx): ret %d",
start, start + len, ret);
/*
* Since bytes within em are contiguous, em->block_start is identical to
@@ -222,8 +212,8 @@ static void __test_case_3(struct extent_map_tree *em_tree, u64 start)
if (em &&
(start < em->start || start + len > extent_map_end(em) ||
em->start != em->block_start || em->len != em->block_len))
- test_msg(
-"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n",
+ test_err(
+"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
start, start + len, ret, em->start, em->len,
em->block_start, em->block_len);
free_extent_map(em);
@@ -248,14 +238,16 @@ out:
* -> add_extent_mapping()
* -> add_extent_mapping()
*/
-static void test_case_3(struct extent_map_tree *em_tree)
+static void test_case_3(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree)
{
- __test_case_3(em_tree, 0);
- __test_case_3(em_tree, SZ_8K);
- __test_case_3(em_tree, (12 * 1024ULL));
+ __test_case_3(fs_info, em_tree, 0);
+ __test_case_3(fs_info, em_tree, SZ_8K);
+ __test_case_3(fs_info, em_tree, (12 * 1024ULL));
}
-static void __test_case_4(struct extent_map_tree *em_tree, u64 start)
+static void __test_case_4(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree, u64 start)
{
struct extent_map *em;
u64 len = SZ_4K;
@@ -296,14 +288,14 @@ static void __test_case_4(struct extent_map_tree *em_tree, u64 start)
em->len = SZ_32K;
em->block_start = 0;
em->block_len = SZ_32K;
- ret = btrfs_add_extent_mapping(em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
if (ret)
- test_msg("case4 [0x%llx 0x%llx): ret %d\n",
+ test_err("case4 [0x%llx 0x%llx): ret %d",
start, len, ret);
if (em &&
(start < em->start || start + len > extent_map_end(em)))
- test_msg(
-"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n",
+ test_err(
+"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
start, len, ret, em->start, em->len, em->block_start,
em->block_len);
free_extent_map(em);
@@ -337,30 +329,45 @@ out:
* # handle -EEXIST when adding
* # [0, 32K)
*/
-static void test_case_4(struct extent_map_tree *em_tree)
+static void test_case_4(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree)
{
- __test_case_4(em_tree, 0);
- __test_case_4(em_tree, SZ_4K);
+ __test_case_4(fs_info, em_tree, 0);
+ __test_case_4(fs_info, em_tree, SZ_4K);
}
int btrfs_test_extent_map(void)
{
+ struct btrfs_fs_info *fs_info = NULL;
struct extent_map_tree *em_tree;
- test_msg("Running extent_map tests\n");
+ test_msg("running extent_map tests");
+
+ /*
+ * Note: the fs_info is not set up completely, we only need
+ * fs_info::fsid for the tracepoint.
+ */
+ fs_info = btrfs_alloc_dummy_fs_info(PAGE_SIZE, PAGE_SIZE);
+ if (!fs_info) {
+ test_msg("Couldn't allocate dummy fs info");
+ return -ENOMEM;
+ }
em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL);
if (!em_tree)
/* Skip the test on error. */
- return 0;
+ goto out;
extent_map_tree_init(em_tree);
- test_case_1(em_tree);
- test_case_2(em_tree);
- test_case_3(em_tree);
- test_case_4(em_tree);
+ test_case_1(fs_info, em_tree);
+ test_case_2(fs_info, em_tree);
+ test_case_3(fs_info, em_tree);
+ test_case_4(fs_info, em_tree);
kfree(em_tree);
+out:
+ btrfs_free_dummy_fs_info(fs_info);
+
return 0;
}
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index eca6412d42bd..5c2f77e9439b 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2013 Fusion IO. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/slab.h>
@@ -33,63 +20,63 @@ static int test_extents(struct btrfs_block_group_cache *cache)
{
int ret = 0;
- test_msg("Running extent only tests\n");
+ test_msg("running extent only tests");
/* First just make sure we can remove an entire entry */
ret = btrfs_add_free_space(cache, 0, SZ_4M);
if (ret) {
- test_msg("Error adding initial extents %d\n", ret);
+ test_err("error adding initial extents %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 0, SZ_4M);
if (ret) {
- test_msg("Error removing extent %d\n", ret);
+ test_err("error removing extent %d", ret);
return ret;
}
if (test_check_exists(cache, 0, SZ_4M)) {
- test_msg("Full remove left some lingering space\n");
+ test_err("full remove left some lingering space");
return -1;
}
/* Ok edge and middle cases now */
ret = btrfs_add_free_space(cache, 0, SZ_4M);
if (ret) {
- test_msg("Error adding half extent %d\n", ret);
+ test_err("error adding half extent %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_1M);
if (ret) {
- test_msg("Error removing tail end %d\n", ret);
+ test_err("error removing tail end %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 0, SZ_1M);
if (ret) {
- test_msg("Error removing front end %d\n", ret);
+ test_err("error removing front end %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, SZ_2M, 4096);
if (ret) {
- test_msg("Error removing middle piece %d\n", ret);
+ test_err("error removing middle piece %d", ret);
return ret;
}
if (test_check_exists(cache, 0, SZ_1M)) {
- test_msg("Still have space at the front\n");
+ test_err("still have space at the front");
return -1;
}
if (test_check_exists(cache, SZ_2M, 4096)) {
- test_msg("Still have space in the middle\n");
+ test_err("still have space in the middle");
return -1;
}
if (test_check_exists(cache, 3 * SZ_1M, SZ_1M)) {
- test_msg("Still have space at the end\n");
+ test_err("still have space at the end");
return -1;
}
@@ -105,34 +92,34 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache,
u64 next_bitmap_offset;
int ret;
- test_msg("Running bitmap only tests\n");
+ test_msg("running bitmap only tests");
ret = test_add_free_space_entry(cache, 0, SZ_4M, 1);
if (ret) {
- test_msg("Couldn't create a bitmap entry %d\n", ret);
+ test_err("couldn't create a bitmap entry %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 0, SZ_4M);
if (ret) {
- test_msg("Error removing bitmap full range %d\n", ret);
+ test_err("error removing bitmap full range %d", ret);
return ret;
}
if (test_check_exists(cache, 0, SZ_4M)) {
- test_msg("Left some space in bitmap\n");
+ test_err("left some space in bitmap");
return -1;
}
ret = test_add_free_space_entry(cache, 0, SZ_4M, 1);
if (ret) {
- test_msg("Couldn't add to our bitmap entry %d\n", ret);
+ test_err("couldn't add to our bitmap entry %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, SZ_1M, SZ_2M);
if (ret) {
- test_msg("Couldn't remove middle chunk %d\n", ret);
+ test_err("couldn't remove middle chunk %d", ret);
return ret;
}
@@ -146,19 +133,19 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache,
ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M,
SZ_4M, 1);
if (ret) {
- test_msg("Couldn't add space that straddles two bitmaps %d\n",
+ test_err("couldn't add space that straddles two bitmaps %d",
ret);
return ret;
}
ret = btrfs_remove_free_space(cache, next_bitmap_offset - SZ_1M, SZ_2M);
if (ret) {
- test_msg("Couldn't remove overlapping space %d\n", ret);
+ test_err("couldn't remove overlapping space %d", ret);
return ret;
}
if (test_check_exists(cache, next_bitmap_offset - SZ_1M, SZ_2M)) {
- test_msg("Left some space when removing overlapping\n");
+ test_err("left some space when removing overlapping");
return -1;
}
@@ -174,7 +161,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
u64 bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize);
int ret;
- test_msg("Running bitmap and extent tests\n");
+ test_msg("running bitmap and extent tests");
/*
* First let's do something simple, an extent at the same offset as the
@@ -183,42 +170,42 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
*/
ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, 1);
if (ret) {
- test_msg("Couldn't create bitmap entry %d\n", ret);
+ test_err("couldn't create bitmap entry %d", ret);
return ret;
}
ret = test_add_free_space_entry(cache, 0, SZ_1M, 0);
if (ret) {
- test_msg("Couldn't add extent entry %d\n", ret);
+ test_err("couldn't add extent entry %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 0, SZ_1M);
if (ret) {
- test_msg("Couldn't remove extent entry %d\n", ret);
+ test_err("couldn't remove extent entry %d", ret);
return ret;
}
if (test_check_exists(cache, 0, SZ_1M)) {
- test_msg("Left remnants after our remove\n");
+ test_err("left remnants after our remove");
return -1;
}
/* Now to add back the extent entry and remove from the bitmap */
ret = test_add_free_space_entry(cache, 0, SZ_1M, 0);
if (ret) {
- test_msg("Couldn't re-add extent entry %d\n", ret);
+ test_err("couldn't re-add extent entry %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, SZ_4M, SZ_1M);
if (ret) {
- test_msg("Couldn't remove from bitmap %d\n", ret);
+ test_err("couldn't remove from bitmap %d", ret);
return ret;
}
if (test_check_exists(cache, SZ_4M, SZ_1M)) {
- test_msg("Left remnants in the bitmap\n");
+ test_err("left remnants in the bitmap");
return -1;
}
@@ -228,18 +215,18 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
*/
ret = test_add_free_space_entry(cache, SZ_1M, SZ_4M, 1);
if (ret) {
- test_msg("Couldn't add to a bitmap %d\n", ret);
+ test_err("couldn't add to a bitmap %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, SZ_512K, 3 * SZ_1M);
if (ret) {
- test_msg("Couldn't remove overlapping space %d\n", ret);
+ test_err("couldn't remove overlapping space %d", ret);
return ret;
}
if (test_check_exists(cache, SZ_512K, 3 * SZ_1M)) {
- test_msg("Left over pieces after removing overlapping\n");
+ test_err("left over pieces after removing overlapping");
return -1;
}
@@ -248,24 +235,24 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
/* Now with the extent entry offset into the bitmap */
ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1);
if (ret) {
- test_msg("Couldn't add space to the bitmap %d\n", ret);
+ test_err("couldn't add space to the bitmap %d", ret);
return ret;
}
ret = test_add_free_space_entry(cache, SZ_2M, SZ_2M, 0);
if (ret) {
- test_msg("Couldn't add extent to the cache %d\n", ret);
+ test_err("couldn't add extent to the cache %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_4M);
if (ret) {
- test_msg("Problem removing overlapping space %d\n", ret);
+ test_err("problem removing overlapping space %d", ret);
return ret;
}
if (test_check_exists(cache, 3 * SZ_1M, SZ_4M)) {
- test_msg("Left something behind when removing space");
+ test_err("left something behind when removing space");
return -1;
}
@@ -282,25 +269,25 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
__btrfs_remove_free_space_cache(cache->free_space_ctl);
ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1);
if (ret) {
- test_msg("Couldn't add bitmap %d\n", ret);
+ test_err("couldn't add bitmap %d", ret);
return ret;
}
ret = test_add_free_space_entry(cache, bitmap_offset - SZ_1M,
5 * SZ_1M, 0);
if (ret) {
- test_msg("Couldn't add extent entry %d\n", ret);
+ test_err("couldn't add extent entry %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, bitmap_offset + SZ_1M, 5 * SZ_1M);
if (ret) {
- test_msg("Failed to free our space %d\n", ret);
+ test_err("failed to free our space %d", ret);
return ret;
}
if (test_check_exists(cache, bitmap_offset + SZ_1M, 5 * SZ_1M)) {
- test_msg("Left stuff over\n");
+ test_err("left stuff over");
return -1;
}
@@ -314,19 +301,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
*/
ret = test_add_free_space_entry(cache, SZ_1M, SZ_2M, 1);
if (ret) {
- test_msg("Couldn't add bitmap entry %d\n", ret);
+ test_err("couldn't add bitmap entry %d", ret);
return ret;
}
ret = test_add_free_space_entry(cache, 3 * SZ_1M, SZ_1M, 0);
if (ret) {
- test_msg("Couldn't add extent entry %d\n", ret);
+ test_err("couldn't add extent entry %d", ret);
return ret;
}
ret = btrfs_remove_free_space(cache, SZ_1M, 3 * SZ_1M);
if (ret) {
- test_msg("Error removing bitmap and extent overlapping %d\n", ret);
+ test_err("error removing bitmap and extent overlapping %d", ret);
return ret;
}
@@ -348,12 +335,14 @@ check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache,
const int num_bitmaps)
{
if (cache->free_space_ctl->free_extents != num_extents) {
- test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
+ test_err(
+ "incorrect # of extent entries in the cache: %d, expected %d",
cache->free_space_ctl->free_extents, num_extents);
return -EINVAL;
}
if (cache->free_space_ctl->total_bitmaps != num_bitmaps) {
- test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
+ test_err(
+ "incorrect # of extent entries in the cache: %d, expected %d",
cache->free_space_ctl->total_bitmaps, num_bitmaps);
return -EINVAL;
}
@@ -371,7 +360,7 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache)
* allocate.
*/
if (cache->free_space_ctl->free_space != 0) {
- test_msg("Cache free space is not 0\n");
+ test_err("cache free space is not 0");
return -EINVAL;
}
@@ -379,7 +368,7 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache)
offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0,
&max_extent_size);
if (offset != 0) {
- test_msg("Space allocation did not fail, returned offset: %llu",
+ test_err("space allocation did not fail, returned offset: %llu",
offset);
return -EINVAL;
}
@@ -415,7 +404,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
};
const struct btrfs_free_space_op *orig_free_space_ops;
- test_msg("Running space stealing from bitmap to extent\n");
+ test_msg("running space stealing from bitmap to extent");
/*
* For this test, we want to ensure we end up with an extent entry
@@ -443,7 +432,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
ret = test_add_free_space_entry(cache, SZ_128M - SZ_256K, SZ_128K, 0);
if (ret) {
- test_msg("Couldn't add extent entry %d\n", ret);
+ test_err("couldn't add extent entry %d", ret);
return ret;
}
@@ -451,7 +440,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
ret = test_add_free_space_entry(cache, SZ_128M + SZ_512K,
SZ_128M - SZ_512K, 1);
if (ret) {
- test_msg("Couldn't add bitmap entry %d\n", ret);
+ test_err("couldn't add bitmap entry %d", ret);
return ret;
}
@@ -470,17 +459,17 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
SZ_128M + 768 * SZ_1K,
SZ_128M - 768 * SZ_1K);
if (ret) {
- test_msg("Failed to free part of bitmap space %d\n", ret);
+ test_err("failed to free part of bitmap space %d", ret);
return ret;
}
/* Confirm that only those 2 ranges are marked as free. */
if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_128K)) {
- test_msg("Free space range missing\n");
+ test_err("free space range missing");
return -ENOENT;
}
if (!test_check_exists(cache, SZ_128M + SZ_512K, SZ_256K)) {
- test_msg("Free space range missing\n");
+ test_err("free space range missing");
return -ENOENT;
}
@@ -490,7 +479,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
if (test_check_exists(cache, SZ_128M + 768 * SZ_1K,
SZ_128M - 768 * SZ_1K)) {
- test_msg("Bitmap region not removed from space cache\n");
+ test_err("bitmap region not removed from space cache");
return -EINVAL;
}
@@ -499,7 +488,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
* covered by the bitmap, isn't marked as free.
*/
if (test_check_exists(cache, SZ_128M + SZ_256K, SZ_256K)) {
- test_msg("Invalid bitmap region marked as free\n");
+ test_err("invalid bitmap region marked as free");
return -EINVAL;
}
@@ -508,7 +497,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
* by the bitmap too, isn't marked as free either.
*/
if (test_check_exists(cache, SZ_128M, SZ_256K)) {
- test_msg("Invalid bitmap region marked as free\n");
+ test_err("invalid bitmap region marked as free");
return -EINVAL;
}
@@ -519,12 +508,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
ret = btrfs_add_free_space(cache, SZ_128M, SZ_512K);
if (ret) {
- test_msg("Error adding free space: %d\n", ret);
+ test_err("error adding free space: %d", ret);
return ret;
}
/* Confirm the region is marked as free. */
if (!test_check_exists(cache, SZ_128M, SZ_512K)) {
- test_msg("Bitmap region not marked as free\n");
+ test_err("bitmap region not marked as free");
return -ENOENT;
}
@@ -544,7 +533,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, sectorsize);
if (ret) {
- test_msg("Error adding free space: %d\n", ret);
+ test_err("error adding free space: %d", ret);
return ret;
}
@@ -563,12 +552,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
ret = btrfs_add_free_space(cache, SZ_128M - SZ_128K, SZ_128K);
if (ret) {
- test_msg("Error adding free space: %d\n", ret);
+ test_err("error adding free space: %d", ret);
return ret;
}
/* Confirm the region is marked as free. */
if (!test_check_exists(cache, SZ_128M - SZ_128K, SZ_128K)) {
- test_msg("Extent region not marked as free\n");
+ test_err("extent region not marked as free");
return -ENOENT;
}
@@ -596,12 +585,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
* allocate the whole free space at once.
*/
if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_1M)) {
- test_msg("Expected region not marked as free\n");
+ test_err("expected region not marked as free");
return -ENOENT;
}
if (cache->free_space_ctl->free_space != (SZ_1M + sectorsize)) {
- test_msg("Cache free space is not 1Mb + %u\n", sectorsize);
+ test_err("cache free space is not 1Mb + %u", sectorsize);
return -EINVAL;
}
@@ -609,7 +598,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
0, SZ_1M, 0,
&max_extent_size);
if (offset != (SZ_128M - SZ_256K)) {
- test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
+ test_err(
+ "failed to allocate 1Mb from space cache, returned offset is: %llu",
offset);
return -EINVAL;
}
@@ -623,7 +613,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
return ret;
if (cache->free_space_ctl->free_space != sectorsize) {
- test_msg("Cache free space is not %u\n", sectorsize);
+ test_err("cache free space is not %u", sectorsize);
return -EINVAL;
}
@@ -631,7 +621,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
0, sectorsize, 0,
&max_extent_size);
if (offset != (SZ_128M + SZ_16M)) {
- test_msg("Failed to allocate %u, returned offset : %llu\n",
+ test_err("failed to allocate %u, returned offset : %llu",
sectorsize, offset);
return -EINVAL;
}
@@ -653,14 +643,14 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
ret = test_add_free_space_entry(cache, SZ_128M + SZ_128K, SZ_128K, 0);
if (ret) {
- test_msg("Couldn't add extent entry %d\n", ret);
+ test_err("couldn't add extent entry %d", ret);
return ret;
}
/* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */
ret = test_add_free_space_entry(cache, 0, SZ_128M - SZ_512K, 1);
if (ret) {
- test_msg("Couldn't add bitmap entry %d\n", ret);
+ test_err("couldn't add bitmap entry %d", ret);
return ret;
}
@@ -677,17 +667,17 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
ret = btrfs_remove_free_space(cache, 0, SZ_128M - 768 * SZ_1K);
if (ret) {
- test_msg("Failed to free part of bitmap space %d\n", ret);
+ test_err("failed to free part of bitmap space %d", ret);
return ret;
}
/* Confirm that only those 2 ranges are marked as free. */
if (!test_check_exists(cache, SZ_128M + SZ_128K, SZ_128K)) {
- test_msg("Free space range missing\n");
+ test_err("free space range missing");
return -ENOENT;
}
if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_256K)) {
- test_msg("Free space range missing\n");
+ test_err("free space range missing");
return -ENOENT;
}
@@ -696,7 +686,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
* as free anymore.
*/
if (test_check_exists(cache, 0, SZ_128M - 768 * SZ_1K)) {
- test_msg("Bitmap region not removed from space cache\n");
+ test_err("bitmap region not removed from space cache");
return -EINVAL;
}
@@ -705,7 +695,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
* covered by the bitmap, isn't marked as free.
*/
if (test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) {
- test_msg("Invalid bitmap region marked as free\n");
+ test_err("invalid bitmap region marked as free");
return -EINVAL;
}
@@ -716,12 +706,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
ret = btrfs_add_free_space(cache, SZ_128M - SZ_512K, SZ_512K);
if (ret) {
- test_msg("Error adding free space: %d\n", ret);
+ test_err("error adding free space: %d", ret);
return ret;
}
/* Confirm the region is marked as free. */
if (!test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) {
- test_msg("Bitmap region not marked as free\n");
+ test_err("bitmap region not marked as free");
return -ENOENT;
}
@@ -741,7 +731,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
ret = btrfs_add_free_space(cache, SZ_32M, 2 * sectorsize);
if (ret) {
- test_msg("Error adding free space: %d\n", ret);
+ test_err("error adding free space: %d", ret);
return ret;
}
@@ -752,12 +742,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
*/
ret = btrfs_add_free_space(cache, SZ_128M, SZ_128K);
if (ret) {
- test_msg("Error adding free space: %d\n", ret);
+ test_err("error adding free space: %d", ret);
return ret;
}
/* Confirm the region is marked as free. */
if (!test_check_exists(cache, SZ_128M, SZ_128K)) {
- test_msg("Extent region not marked as free\n");
+ test_err("extent region not marked as free");
return -ENOENT;
}
@@ -785,19 +775,20 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
* allocate the whole free space at once.
*/
if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_1M)) {
- test_msg("Expected region not marked as free\n");
+ test_err("expected region not marked as free");
return -ENOENT;
}
if (cache->free_space_ctl->free_space != (SZ_1M + 2 * sectorsize)) {
- test_msg("Cache free space is not 1Mb + %u\n", 2 * sectorsize);
+ test_err("cache free space is not 1Mb + %u", 2 * sectorsize);
return -EINVAL;
}
offset = btrfs_find_space_for_alloc(cache, 0, SZ_1M, 0,
&max_extent_size);
if (offset != (SZ_128M - 768 * SZ_1K)) {
- test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
+ test_err(
+ "failed to allocate 1Mb from space cache, returned offset is: %llu",
offset);
return -EINVAL;
}
@@ -811,7 +802,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
return ret;
if (cache->free_space_ctl->free_space != 2 * sectorsize) {
- test_msg("Cache free space is not %u\n", 2 * sectorsize);
+ test_err("cache free space is not %u", 2 * sectorsize);
return -EINVAL;
}
@@ -819,9 +810,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
0, 2 * sectorsize, 0,
&max_extent_size);
if (offset != SZ_32M) {
- test_msg("Failed to allocate %u, offset: %llu\n",
- 2 * sectorsize,
- offset);
+ test_err("failed to allocate %u, offset: %llu",
+ 2 * sectorsize, offset);
return -EINVAL;
}
@@ -842,7 +832,7 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
struct btrfs_root *root = NULL;
int ret = -ENOMEM;
- test_msg("Running btrfs free space cache tests\n");
+ test_msg("running btrfs free space cache tests");
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info)
return -ENOMEM;
@@ -856,7 +846,7 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
cache = btrfs_alloc_dummy_block_group(fs_info,
BITS_PER_BITMAP * sectorsize + PAGE_SIZE);
if (!cache) {
- test_msg("Couldn't run the tests\n");
+ test_err("couldn't run the tests");
btrfs_free_dummy_fs_info(fs_info);
return 0;
}
@@ -884,6 +874,6 @@ out:
btrfs_free_dummy_block_group(cache);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
- test_msg("Free space cache tests finished\n");
+ test_msg("free space cache tests finished");
return ret;
}
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 8444a018cca2..89346da890cf 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2015 Facebook. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/types.h>
@@ -45,7 +32,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
info = search_free_space_info(trans, fs_info, cache, path, 0);
if (IS_ERR(info)) {
- test_msg("Could not find free space info\n");
+ test_err("could not find free space info");
ret = PTR_ERR(info);
goto out;
}
@@ -53,7 +40,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
if (extent_count != num_extents) {
- test_msg("Extent count is wrong\n");
+ test_err("extent count is wrong");
ret = -EINVAL;
goto out;
}
@@ -112,7 +99,7 @@ out:
btrfs_release_path(path);
return ret;
invalid:
- test_msg("Free space tree is invalid\n");
+ test_err("free space tree is invalid");
ret = -EINVAL;
goto out;
}
@@ -130,7 +117,7 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans,
info = search_free_space_info(trans, fs_info, cache, path, 0);
if (IS_ERR(info)) {
- test_msg("Could not find free space info\n");
+ test_err("could not find free space info");
btrfs_release_path(path);
return PTR_ERR(info);
}
@@ -144,15 +131,15 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans,
/* Flip it to the other format and check that for good measure. */
if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
- ret = convert_free_space_to_extents(trans, fs_info, cache, path);
+ ret = convert_free_space_to_extents(trans, cache, path);
if (ret) {
- test_msg("Could not convert to extents\n");
+ test_err("could not convert to extents");
return ret;
}
} else {
- ret = convert_free_space_to_bitmaps(trans, fs_info, cache, path);
+ ret = convert_free_space_to_bitmaps(trans, cache, path);
if (ret) {
- test_msg("Could not convert to bitmaps\n");
+ test_err("could not convert to bitmaps");
return ret;
}
}
@@ -183,11 +170,11 @@ static int test_remove_all(struct btrfs_trans_handle *trans,
const struct free_space_extent extents[] = {};
int ret;
- ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ ret = __remove_from_free_space_tree(trans, cache, path,
cache->key.objectid,
cache->key.offset);
if (ret) {
- test_msg("Could not remove free space\n");
+ test_err("could not remove free space");
return ret;
}
@@ -207,10 +194,10 @@ static int test_remove_beginning(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ ret = __remove_from_free_space_tree(trans, cache, path,
cache->key.objectid, alignment);
if (ret) {
- test_msg("Could not remove free space\n");
+ test_err("could not remove free space");
return ret;
}
@@ -230,12 +217,12 @@ static int test_remove_end(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ ret = __remove_from_free_space_tree(trans, cache, path,
cache->key.objectid +
cache->key.offset - alignment,
alignment);
if (ret) {
- test_msg("Could not remove free space\n");
+ test_err("could not remove free space");
return ret;
}
@@ -256,11 +243,11 @@ static int test_remove_middle(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ ret = __remove_from_free_space_tree(trans, cache, path,
cache->key.objectid + alignment,
alignment);
if (ret) {
- test_msg("Could not remove free space\n");
+ test_err("could not remove free space");
return ret;
}
@@ -279,26 +266,26 @@ static int test_merge_left(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ ret = __remove_from_free_space_tree(trans, cache, path,
cache->key.objectid,
cache->key.offset);
if (ret) {
- test_msg("Could not remove free space\n");
+ test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
- cache->key.objectid, alignment);
+ ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid,
+ alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ ret = __add_to_free_space_tree(trans, cache, path,
cache->key.objectid + alignment,
alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
@@ -317,27 +304,27 @@ static int test_merge_right(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ ret = __remove_from_free_space_tree(trans, cache, path,
cache->key.objectid,
cache->key.offset);
if (ret) {
- test_msg("Could not remove free space\n");
+ test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ ret = __add_to_free_space_tree(trans, cache, path,
cache->key.objectid + 2 * alignment,
alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ ret = __add_to_free_space_tree(trans, cache, path,
cache->key.objectid + alignment,
alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
@@ -356,34 +343,34 @@ static int test_merge_both(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ ret = __remove_from_free_space_tree(trans, cache, path,
cache->key.objectid,
cache->key.offset);
if (ret) {
- test_msg("Could not remove free space\n");
+ test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
- cache->key.objectid, alignment);
+ ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid,
+ alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ ret = __add_to_free_space_tree(trans, cache, path,
cache->key.objectid + 2 * alignment,
alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ ret = __add_to_free_space_tree(trans, cache, path,
cache->key.objectid + alignment,
alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
@@ -404,34 +391,34 @@ static int test_merge_none(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ ret = __remove_from_free_space_tree(trans, cache, path,
cache->key.objectid,
cache->key.offset);
if (ret) {
- test_msg("Could not remove free space\n");
+ test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
- cache->key.objectid, alignment);
+ ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid,
+ alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ ret = __add_to_free_space_tree(trans, cache, path,
cache->key.objectid + 4 * alignment,
alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ ret = __add_to_free_space_tree(trans, cache, path,
cache->key.objectid + 2 * alignment,
alignment);
if (ret) {
- test_msg("Could not add free space\n");
+ test_err("could not add free space");
return ret;
}
@@ -457,14 +444,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
+ test_err("couldn't allocate dummy fs info");
ret = -ENOMEM;
goto out;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
- test_msg("Couldn't allocate dummy root\n");
+ test_err("couldn't allocate dummy root");
ret = PTR_ERR(root);
goto out;
}
@@ -476,7 +463,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
if (!root->node) {
- test_msg("Couldn't allocate dummy buffer\n");
+ test_err("couldn't allocate dummy buffer");
ret = -ENOMEM;
goto out;
}
@@ -486,7 +473,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
cache = btrfs_alloc_dummy_block_group(fs_info, 8 * alignment);
if (!cache) {
- test_msg("Couldn't allocate dummy block group cache\n");
+ test_err("couldn't allocate dummy block group cache");
ret = -ENOMEM;
goto out;
}
@@ -495,26 +482,25 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
cache->needs_free_space = 1;
cache->fs_info = root->fs_info;
- btrfs_init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans, root->fs_info);
path = btrfs_alloc_path();
if (!path) {
- test_msg("Couldn't allocate path\n");
+ test_err("couldn't allocate path");
ret = -ENOMEM;
goto out;
}
- ret = add_block_group_free_space(&trans, root->fs_info, cache);
+ ret = add_block_group_free_space(&trans, cache);
if (ret) {
- test_msg("Could not add block group free space\n");
+ test_err("could not add block group free space");
goto out;
}
if (bitmaps) {
- ret = convert_free_space_to_bitmaps(&trans, root->fs_info,
- cache, path);
+ ret = convert_free_space_to_bitmaps(&trans, cache, path);
if (ret) {
- test_msg("Could not convert block group to bitmaps\n");
+ test_err("could not convert block group to bitmaps");
goto out;
}
}
@@ -523,14 +509,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
if (ret)
goto out;
- ret = remove_block_group_free_space(&trans, root->fs_info, cache);
+ ret = remove_block_group_free_space(&trans, cache);
if (ret) {
- test_msg("Could not remove block group free space\n");
+ test_err("could not remove block group free space");
goto out;
}
if (btrfs_header_nritems(root->node) != 0) {
- test_msg("Free space tree has leftover items\n");
+ test_err("free space tree has leftover items");
ret = -EINVAL;
goto out;
}
@@ -552,14 +538,16 @@ static int run_test_both_formats(test_func_t test_func, u32 sectorsize,
ret = run_test(test_func, 0, sectorsize, nodesize, alignment);
if (ret) {
- test_msg("%pf failed with extents, sectorsize=%u, nodesize=%u, alignment=%u\n",
+ test_err(
+ "%pf failed with extents, sectorsize=%u, nodesize=%u, alignment=%u",
test_func, sectorsize, nodesize, alignment);
test_ret = ret;
}
ret = run_test(test_func, 1, sectorsize, nodesize, alignment);
if (ret) {
- test_msg("%pf failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u\n",
+ test_err(
+ "%pf failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u",
test_func, sectorsize, nodesize, alignment);
test_ret = ret;
}
@@ -590,7 +578,7 @@ int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize)
*/
bitmap_alignment = BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE;
- test_msg("Running free space tree tests\n");
+ test_msg("running free space tree tests");
for (i = 0; i < ARRAY_SIZE(tests); i++) {
int ret;
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 13420cd19ef0..64043f028820 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2013 Fusion IO. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/types.h>
@@ -241,7 +228,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
inode = btrfs_new_test_inode();
if (!inode) {
- test_msg("Couldn't allocate inode\n");
+ test_err("couldn't allocate inode");
return ret;
}
@@ -251,19 +238,19 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
+ test_err("couldn't allocate dummy fs info");
goto out;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
- test_msg("Couldn't allocate root\n");
+ test_err("couldn't allocate root");
goto out;
}
root->node = alloc_dummy_extent_buffer(fs_info, nodesize);
if (!root->node) {
- test_msg("Couldn't allocate dummy buffer\n");
+ test_err("couldn't allocate dummy buffer");
goto out;
}
@@ -281,11 +268,11 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize, 0);
if (IS_ERR(em)) {
em = NULL;
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != EXTENT_MAP_HOLE) {
- test_msg("Expected a hole, got %llu\n", em->block_start);
+ test_err("expected a hole, got %llu", em->block_start);
goto out;
}
free_extent_map(em);
@@ -300,20 +287,21 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != EXTENT_MAP_HOLE) {
- test_msg("Expected a hole, got %llu\n", em->block_start);
+ test_err("expected a hole, got %llu", em->block_start);
goto out;
}
if (em->start != 0 || em->len != 5) {
- test_msg("Unexpected extent wanted start 0 len 5, got start "
- "%llu len %llu\n", em->start, em->len);
+ test_err(
+ "unexpected extent wanted start 0 len 5, got start %llu len %llu",
+ em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
offset = em->start + em->len;
@@ -321,21 +309,22 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != EXTENT_MAP_INLINE) {
- test_msg("Expected an inline, got %llu\n", em->block_start);
+ test_err("expected an inline, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != (sectorsize - 5)) {
- test_msg("Unexpected extent wanted start %llu len 1, got start "
- "%llu len %llu\n", offset, em->start, em->len);
+ test_err(
+ "unexpected extent wanted start %llu len 1, got start %llu len %llu",
+ offset, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
/*
@@ -348,20 +337,21 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != EXTENT_MAP_HOLE) {
- test_msg("Expected a hole, got %llu\n", em->block_start);
+ test_err("expected a hole, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != 4) {
- test_msg("Unexpected extent wanted start %llu len 4, got start "
- "%llu len %llu\n", offset, em->start, em->len);
+ test_err(
+ "unexpected extent wanted start %llu len 4, got start %llu len %llu",
+ offset, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
offset = em->start + em->len;
@@ -370,24 +360,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
/* Regular extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize - 1) {
- test_msg("Unexpected extent wanted start %llu len 4095, got "
- "start %llu len %llu\n", offset, em->start, em->len);
+ test_err(
+ "unexpected extent wanted start %llu len 4095, got start %llu len %llu",
+ offset, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
em->orig_start);
goto out;
}
@@ -397,25 +388,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
/* The next 3 are split extents */
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
- test_msg("Unexpected extent start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent start %llu len %u, got start %llu len %llu",
offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
em->orig_start);
goto out;
}
@@ -426,21 +417,21 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != EXTENT_MAP_HOLE) {
- test_msg("Expected a hole, got %llu\n", em->block_start);
+ test_err("expected a hole, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
offset = em->start + em->len;
@@ -448,31 +439,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, 2 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
if (em->orig_start != orig_start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n",
+ test_err("wrong orig offset, want %llu, have %llu",
orig_start, em->orig_start);
goto out;
}
disk_bytenr += (em->start - orig_start);
if (em->block_start != disk_bytenr) {
- test_msg("Wrong block start, want %llu, have %llu\n",
+ test_err("wrong block start, want %llu, have %llu",
disk_bytenr, em->block_start);
goto out;
}
@@ -482,26 +473,26 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
/* Prealloc extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != prealloc_only) {
- test_msg("Unexpected flags set, want %lu have %lu\n",
+ test_err("unexpected flags set, want %lu have %lu",
prealloc_only, em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
em->orig_start);
goto out;
}
@@ -511,26 +502,26 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
/* The next 3 are a half written prealloc extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != prealloc_only) {
- test_msg("Unexpected flags set, want %lu have %lu\n",
+ test_err("unexpected flags set, want %lu have %lu",
prealloc_only, em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
em->orig_start);
goto out;
}
@@ -541,30 +532,30 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_HOLE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
if (em->orig_start != orig_start) {
- test_msg("Unexpected orig offset, wanted %llu, have %llu\n",
+ test_err("unexpected orig offset, wanted %llu, have %llu",
orig_start, em->orig_start);
goto out;
}
if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
- test_msg("Unexpected block start, wanted %llu, have %llu\n",
+ test_err("unexpected block start, wanted %llu, have %llu",
disk_bytenr + (em->start - em->orig_start),
em->block_start);
goto out;
@@ -574,31 +565,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, 2 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != prealloc_only) {
- test_msg("Unexpected flags set, want %lu have %lu\n",
+ test_err("unexpected flags set, want %lu have %lu",
prealloc_only, em->flags);
goto out;
}
if (em->orig_start != orig_start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n", orig_start,
+ test_err("wrong orig offset, want %llu, have %llu", orig_start,
em->orig_start);
goto out;
}
if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
- test_msg("Unexpected block start, wanted %llu, have %llu\n",
+ test_err("unexpected block start, wanted %llu, have %llu",
disk_bytenr + (em->start - em->orig_start),
em->block_start);
goto out;
@@ -609,31 +600,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
/* Now for the compressed extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u,"
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, 2 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != compressed_only) {
- test_msg("Unexpected flags set, want %lu have %lu\n",
+ test_err("unexpected flags set, want %lu have %lu",
compressed_only, em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n",
+ test_err("wrong orig offset, want %llu, have %llu",
em->start, em->orig_start);
goto out;
}
if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
- test_msg("Unexpected compress type, wanted %d, got %d\n",
+ test_err("unexpected compress type, wanted %d, got %d",
BTRFS_COMPRESS_ZLIB, em->compress_type);
goto out;
}
@@ -643,31 +634,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
/* Split compressed extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u,"
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != compressed_only) {
- test_msg("Unexpected flags set, want %lu have %lu\n",
+ test_err("unexpected flags set, want %lu have %lu",
compressed_only, em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n",
+ test_err("wrong orig offset, want %llu, have %llu",
em->start, em->orig_start);
goto out;
}
if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
- test_msg("Unexpected compress type, wanted %d, got %d\n",
+ test_err("unexpected compress type, wanted %d, got %d",
BTRFS_COMPRESS_ZLIB, em->compress_type);
goto out;
}
@@ -678,25 +669,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
em->orig_start);
goto out;
}
@@ -705,32 +696,32 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != disk_bytenr) {
- test_msg("Block start does not match, want %llu got %llu\n",
+ test_err("block start does not match, want %llu got %llu",
disk_bytenr, em->block_start);
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, 2 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != compressed_only) {
- test_msg("Unexpected flags set, want %lu have %lu\n",
+ test_err("unexpected flags set, want %lu have %lu",
compressed_only, em->flags);
goto out;
}
if (em->orig_start != orig_start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n",
+ test_err("wrong orig offset, want %llu, have %llu",
em->start, orig_start);
goto out;
}
if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
- test_msg("Unexpected compress type, wanted %d, got %d\n",
+ test_err("unexpected compress type, wanted %d, got %d",
BTRFS_COMPRESS_ZLIB, em->compress_type);
goto out;
}
@@ -741,25 +732,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6,
sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
em->orig_start);
goto out;
}
@@ -768,11 +759,11 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != EXTENT_MAP_HOLE) {
- test_msg("Expected a hole extent, got %llu\n", em->block_start);
+ test_err("expected a hole extent, got %llu", em->block_start);
goto out;
}
/*
@@ -781,18 +772,18 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
* test.
*/
if (em->start != offset || em->len != 3 * sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, 3 * sectorsize, em->start, em->len);
goto out;
}
if (em->flags != vacancy_only) {
- test_msg("Unexpected flags set, want %lu have %lu\n",
+ test_err("unexpected flags set, want %lu have %lu",
vacancy_only, em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
em->orig_start);
goto out;
}
@@ -801,25 +792,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start %llu len %u,"
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
offset, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
if (em->orig_start != em->start) {
- test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
em->orig_start);
goto out;
}
@@ -843,7 +834,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
inode = btrfs_new_test_inode();
if (!inode) {
- test_msg("Couldn't allocate inode\n");
+ test_err("couldn't allocate inode");
return ret;
}
@@ -853,19 +844,19 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
+ test_err("couldn't allocate dummy fs info");
goto out;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
- test_msg("Couldn't allocate root\n");
+ test_err("couldn't allocate root");
goto out;
}
root->node = alloc_dummy_extent_buffer(fs_info, nodesize);
if (!root->node) {
- test_msg("Couldn't allocate dummy buffer\n");
+ test_err("couldn't allocate dummy buffer");
goto out;
}
@@ -884,21 +875,21 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != EXTENT_MAP_HOLE) {
- test_msg("Expected a hole, got %llu\n", em->block_start);
+ test_err("expected a hole, got %llu", em->block_start);
goto out;
}
if (em->start != 0 || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start 0 len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start 0 len %u, got start %llu len %llu",
sectorsize, em->start, em->len);
goto out;
}
if (em->flags != vacancy_only) {
- test_msg("Wrong flags, wanted %lu, have %lu\n", vacancy_only,
+ test_err("wrong flags, wanted %lu, have %lu", vacancy_only,
em->flags);
goto out;
}
@@ -907,21 +898,21 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize,
2 * sectorsize, 0);
if (IS_ERR(em)) {
- test_msg("Got an error when we shouldn't have\n");
+ test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != sectorsize) {
- test_msg("Expected a real extent, got %llu\n", em->block_start);
+ test_err("expected a real extent, got %llu", em->block_start);
goto out;
}
if (em->start != sectorsize || em->len != sectorsize) {
- test_msg("Unexpected extent wanted start %u len %u, "
- "got start %llu len %llu\n",
+ test_err(
+ "unexpected extent wanted start %u len %u, got start %llu len %llu",
sectorsize, sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {
- test_msg("Unexpected flags set, wanted 0 got %lu\n",
+ test_err("unexpected flags set, wanted 0 got %lu",
em->flags);
goto out;
}
@@ -944,19 +935,19 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
inode = btrfs_new_test_inode();
if (!inode) {
- test_msg("Couldn't allocate inode\n");
+ test_err("couldn't allocate inode");
return ret;
}
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
+ test_err("couldn't allocate dummy fs info");
goto out;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
- test_msg("Couldn't allocate root\n");
+ test_err("couldn't allocate root");
goto out;
}
@@ -967,12 +958,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0,
NULL, 0);
if (ret) {
- test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
}
if (BTRFS_I(inode)->outstanding_extents != 1) {
ret = -EINVAL;
- test_msg("Miscount, wanted 1, got %u\n",
+ test_err("miscount, wanted 1, got %u",
BTRFS_I(inode)->outstanding_extents);
goto out;
}
@@ -982,12 +973,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
BTRFS_MAX_EXTENT_SIZE + sectorsize - 1,
0, NULL, 0);
if (ret) {
- test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
}
if (BTRFS_I(inode)->outstanding_extents != 2) {
ret = -EINVAL;
- test_msg("Miscount, wanted 2, got %u\n",
+ test_err("miscount, wanted 2, got %u",
BTRFS_I(inode)->outstanding_extents);
goto out;
}
@@ -999,12 +990,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
EXTENT_DELALLOC | EXTENT_DIRTY |
EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
- test_msg("clear_extent_bit returned %d\n", ret);
+ test_err("clear_extent_bit returned %d", ret);
goto out;
}
if (BTRFS_I(inode)->outstanding_extents != 2) {
ret = -EINVAL;
- test_msg("Miscount, wanted 2, got %u\n",
+ test_err("miscount, wanted 2, got %u",
BTRFS_I(inode)->outstanding_extents);
goto out;
}
@@ -1015,12 +1006,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
+ sectorsize - 1,
0, NULL, 0);
if (ret) {
- test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
}
if (BTRFS_I(inode)->outstanding_extents != 2) {
ret = -EINVAL;
- test_msg("Miscount, wanted 2, got %u\n",
+ test_err("miscount, wanted 2, got %u",
BTRFS_I(inode)->outstanding_extents);
goto out;
}
@@ -1033,12 +1024,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
(BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1,
0, NULL, 0);
if (ret) {
- test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
}
if (BTRFS_I(inode)->outstanding_extents != 4) {
ret = -EINVAL;
- test_msg("Miscount, wanted 4, got %u\n",
+ test_err("miscount, wanted 4, got %u",
BTRFS_I(inode)->outstanding_extents);
goto out;
}
@@ -1050,12 +1041,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0);
if (ret) {
- test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
}
if (BTRFS_I(inode)->outstanding_extents != 3) {
ret = -EINVAL;
- test_msg("Miscount, wanted 3, got %u\n",
+ test_err("miscount, wanted 3, got %u",
BTRFS_I(inode)->outstanding_extents);
goto out;
}
@@ -1067,12 +1058,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
- test_msg("clear_extent_bit returned %d\n", ret);
+ test_err("clear_extent_bit returned %d", ret);
goto out;
}
if (BTRFS_I(inode)->outstanding_extents != 4) {
ret = -EINVAL;
- test_msg("Miscount, wanted 4, got %u\n",
+ test_err("miscount, wanted 4, got %u",
BTRFS_I(inode)->outstanding_extents);
goto out;
}
@@ -1085,12 +1076,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0);
if (ret) {
- test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
}
if (BTRFS_I(inode)->outstanding_extents != 3) {
ret = -EINVAL;
- test_msg("Miscount, wanted 3, got %u\n",
+ test_err("miscount, wanted 3, got %u",
BTRFS_I(inode)->outstanding_extents);
goto out;
}
@@ -1100,12 +1091,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
- test_msg("clear_extent_bit returned %d\n", ret);
+ test_err("clear_extent_bit returned %d", ret);
goto out;
}
if (BTRFS_I(inode)->outstanding_extents) {
ret = -EINVAL;
- test_msg("Miscount, wanted 0, got %u\n",
+ test_err("miscount, wanted 0, got %u",
BTRFS_I(inode)->outstanding_extents);
goto out;
}
@@ -1128,14 +1119,14 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize)
set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
- test_msg("Running btrfs_get_extent tests\n");
+ test_msg("running btrfs_get_extent tests");
ret = test_btrfs_get_extent(sectorsize, nodesize);
if (ret)
return ret;
- test_msg("Running hole first btrfs_get_extent test\n");
+ test_msg("running hole first btrfs_get_extent test");
ret = test_hole_first(sectorsize, nodesize);
if (ret)
return ret;
- test_msg("Running outstanding_extents tests\n");
+ test_msg("running outstanding_extents tests");
return test_extent_accounting(sectorsize, nodesize);
}
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 160eb2fba726..ace94db09d29 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2013 Facebook. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/types.h>
@@ -37,7 +24,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info);
int ret;
- btrfs_init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans, NULL);
ins.objectid = bytenr;
ins.type = BTRFS_EXTENT_ITEM_KEY;
@@ -45,14 +32,14 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
path = btrfs_alloc_path();
if (!path) {
- test_msg("Couldn't allocate path\n");
+ test_err("couldn't allocate path");
return -ENOMEM;
}
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(&trans, root, path, &ins, size);
if (ret) {
- test_msg("Couldn't insert ref %d\n", ret);
+ test_err("couldn't insert ref %d", ret);
btrfs_free_path(path);
return ret;
}
@@ -87,7 +74,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
u64 refs;
int ret;
- btrfs_init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans, NULL);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -95,14 +82,14 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
path = btrfs_alloc_path();
if (!path) {
- test_msg("Couldn't allocate path\n");
+ test_err("couldn't allocate path");
return -ENOMEM;
}
path->leave_spinning = 1;
ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
if (ret) {
- test_msg("Couldn't find extent ref\n");
+ test_err("couldn't find extent ref");
btrfs_free_path(path);
return ret;
}
@@ -124,7 +111,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
ret = btrfs_insert_empty_item(&trans, root, path, &key, 0);
if (ret)
- test_msg("Failed to insert backref\n");
+ test_err("failed to insert backref");
btrfs_free_path(path);
return ret;
}
@@ -137,7 +124,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
struct btrfs_path *path;
int ret;
- btrfs_init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans, NULL);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -145,14 +132,14 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
path = btrfs_alloc_path();
if (!path) {
- test_msg("Couldn't allocate path\n");
+ test_err("couldn't allocate path");
return -ENOMEM;
}
path->leave_spinning = 1;
ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
if (ret) {
- test_msg("Didn't find our key %d\n", ret);
+ test_err("didn't find our key %d", ret);
btrfs_free_path(path);
return ret;
}
@@ -171,7 +158,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
u64 refs;
int ret;
- btrfs_init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans, NULL);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -179,14 +166,14 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
path = btrfs_alloc_path();
if (!path) {
- test_msg("Couldn't allocate path\n");
+ test_err("couldn't allocate path");
return -ENOMEM;
}
path->leave_spinning = 1;
ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
if (ret) {
- test_msg("Couldn't find extent ref\n");
+ test_err("couldn't find extent ref");
btrfs_free_path(path);
return ret;
}
@@ -208,7 +195,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
if (ret) {
- test_msg("Couldn't find backref %d\n", ret);
+ test_err("couldn't find backref %d", ret);
btrfs_free_path(path);
return ret;
}
@@ -226,12 +213,12 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
struct ulist *new_roots = NULL;
int ret;
- btrfs_init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans, fs_info);
- test_msg("Qgroup basic add\n");
+ test_msg("qgroup basic add");
ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FS_TREE_OBJECTID);
if (ret) {
- test_msg("Couldn't create a qgroup %d\n", ret);
+ test_err("couldn't create a qgroup %d", ret);
return ret;
}
@@ -244,7 +231,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
false);
if (ret) {
ulist_free(old_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -258,20 +245,20 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
nodesize, old_roots, new_roots);
if (ret) {
- test_msg("Couldn't account space for a qgroup %d\n", ret);
+ test_err("couldn't account space for a qgroup %d", ret);
return ret;
}
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
nodesize, nodesize)) {
- test_msg("Qgroup counts didn't match expected values\n");
+ test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
old_roots = NULL;
@@ -281,7 +268,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
false);
if (ret) {
ulist_free(old_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -294,19 +281,19 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
nodesize, old_roots, new_roots);
if (ret) {
- test_msg("Couldn't account space for a qgroup %d\n", ret);
+ test_err("couldn't account space for a qgroup %d", ret);
return -EINVAL;
}
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, 0, 0)) {
- test_msg("Qgroup counts didn't match expected values\n");
+ test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
@@ -327,9 +314,9 @@ static int test_multiple_refs(struct btrfs_root *root,
struct ulist *new_roots = NULL;
int ret;
- btrfs_init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans, fs_info);
- test_msg("Qgroup multiple refs test\n");
+ test_msg("qgroup multiple refs test");
/*
* We have BTRFS_FS_TREE_OBJECTID created already from the
@@ -337,7 +324,7 @@ static int test_multiple_refs(struct btrfs_root *root,
*/
ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FIRST_FREE_OBJECTID);
if (ret) {
- test_msg("Couldn't create a qgroup %d\n", ret);
+ test_err("couldn't create a qgroup %d", ret);
return ret;
}
@@ -345,7 +332,7 @@ static int test_multiple_refs(struct btrfs_root *root,
false);
if (ret) {
ulist_free(old_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -359,20 +346,20 @@ static int test_multiple_refs(struct btrfs_root *root,
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
nodesize, old_roots, new_roots);
if (ret) {
- test_msg("Couldn't account space for a qgroup %d\n", ret);
+ test_err("couldn't account space for a qgroup %d", ret);
return ret;
}
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
nodesize, nodesize)) {
- test_msg("Qgroup counts didn't match expected values\n");
+ test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
@@ -380,7 +367,7 @@ static int test_multiple_refs(struct btrfs_root *root,
false);
if (ret) {
ulist_free(old_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -394,26 +381,26 @@ static int test_multiple_refs(struct btrfs_root *root,
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
nodesize, old_roots, new_roots);
if (ret) {
- test_msg("Couldn't account space for a qgroup %d\n", ret);
+ test_err("couldn't account space for a qgroup %d", ret);
return ret;
}
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
nodesize, 0)) {
- test_msg("Qgroup counts didn't match expected values\n");
+ test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID,
nodesize, 0)) {
- test_msg("Qgroup counts didn't match expected values\n");
+ test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
@@ -421,7 +408,7 @@ static int test_multiple_refs(struct btrfs_root *root,
false);
if (ret) {
ulist_free(old_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -435,26 +422,26 @@ static int test_multiple_refs(struct btrfs_root *root,
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
- test_msg("Couldn't find old roots: %d\n", ret);
+ test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize,
nodesize, old_roots, new_roots);
if (ret) {
- test_msg("Couldn't account space for a qgroup %d\n", ret);
+ test_err("couldn't account space for a qgroup %d", ret);
return ret;
}
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID,
0, 0)) {
- test_msg("Qgroup counts didn't match expected values\n");
+ test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
nodesize, nodesize)) {
- test_msg("Qgroup counts didn't match expected values\n");
+ test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
@@ -470,13 +457,13 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
+ test_err("couldn't allocate dummy fs info");
return -ENOMEM;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
- test_msg("Couldn't allocate root\n");
+ test_err("couldn't allocate root");
ret = PTR_ERR(root);
goto out;
}
@@ -498,7 +485,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
*/
root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
if (!root->node) {
- test_msg("Couldn't allocate dummy buffer\n");
+ test_err("couldn't allocate dummy buffer");
ret = -ENOMEM;
goto out;
}
@@ -508,7 +495,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
tmp_root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(tmp_root)) {
- test_msg("Couldn't allocate a fs root\n");
+ test_err("couldn't allocate a fs root");
ret = PTR_ERR(tmp_root);
goto out;
}
@@ -517,13 +504,13 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
root->fs_info->fs_root = tmp_root;
ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
if (ret) {
- test_msg("Couldn't insert fs root %d\n", ret);
+ test_err("couldn't insert fs root %d", ret);
goto out;
}
tmp_root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(tmp_root)) {
- test_msg("Couldn't allocate a fs root\n");
+ test_err("couldn't allocate a fs root");
ret = PTR_ERR(tmp_root);
goto out;
}
@@ -531,11 +518,11 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
tmp_root->root_key.objectid = BTRFS_FIRST_FREE_OBJECTID;
ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
if (ret) {
- test_msg("Couldn't insert fs root %d\n", ret);
+ test_err("couldn't insert fs root %d", ret);
goto out;
}
- test_msg("Running qgroup tests\n");
+ test_msg("running qgroup tests");
ret = test_no_shared_qgroup(root, sectorsize, nodesize);
if (ret)
goto out;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5c4cf0f9146b..4485eae41e88 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
@@ -890,12 +877,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
atomic_dec(&cur_trans->num_writers);
extwriter_counter_dec(cur_trans, trans->type);
- /*
- * Make sure counter is updated before we wake up waiters.
- */
- smp_mb();
- if (waitqueue_active(&cur_trans->writer_wait))
- wake_up(&cur_trans->writer_wait);
+ cond_wake_up(&cur_trans->writer_wait);
btrfs_put_transaction(cur_trans);
if (current->journal_info == trans)
@@ -1263,7 +1245,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
btrfs_free_log(trans, root);
btrfs_update_reloc_root(trans, root);
- btrfs_orphan_commit_root(trans, root);
btrfs_save_ino_cache(root, trans);
@@ -1653,15 +1634,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, ret);
goto fail;
}
- ret = btrfs_uuid_tree_add(trans, fs_info, new_uuid.b,
- BTRFS_UUID_KEY_SUBVOL, objectid);
+ ret = btrfs_uuid_tree_add(trans, new_uuid.b, BTRFS_UUID_KEY_SUBVOL,
+ objectid);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
- ret = btrfs_uuid_tree_add(trans, fs_info,
- new_root_item->received_uuid,
+ ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
objectid);
if (ret && ret != -EEXIST) {
@@ -2280,6 +2260,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
cur_trans->state = TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
+ clear_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
spin_lock(&fs_info->trans_lock);
list_del_init(&cur_trans->list);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index b6c94ce33503..94439482a0ec 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __BTRFS_TRANSACTION__
-#define __BTRFS_TRANSACTION__
+#ifndef BTRFS_TRANSACTION_H
+#define BTRFS_TRANSACTION_H
#include <linux/refcount.h>
#include "btrfs_inode.h"
@@ -152,7 +139,6 @@ struct btrfs_pending_snapshot {
struct btrfs_path *path;
/* block reservation for the operation */
struct btrfs_block_rsv block_rsv;
- u64 qgroup_reserved;
/* extra metadata reservation for relocation */
int error;
bool readonly;
@@ -212,6 +198,20 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
int wait_for_unblock);
+
+/*
+ * Try to commit transaction asynchronously, so this is safe to call
+ * even holding a spinlock.
+ *
+ * It's done by informing transaction_kthread to commit transaction without
+ * waiting for commit interval.
+ */
+static inline void btrfs_commit_transaction_locksafe(
+ struct btrfs_fs_info *fs_info)
+{
+ set_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
+ wake_up_process(fs_info->transaction_kthread);
+}
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
int btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
void btrfs_throttle(struct btrfs_fs_info *fs_info);
@@ -228,4 +228,5 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction);
void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+
#endif
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 8871286c1a91..8d40e7dd8c30 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1,17 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) Qu Wenruo 2017. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program.
*/
/*
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index aba542755710..ff043275b784 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -1,21 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) Qu Wenruo 2017. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program.
*/
-#ifndef __BTRFS_TREE_CHECKER__
-#define __BTRFS_TREE_CHECKER__
+#ifndef BTRFS_TREE_CHECKER_H
+#define BTRFS_TREE_CHECKER_H
#include "ctree.h"
#include "extent_io.h"
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index c09dbe4bd6e7..3c0987ab587d 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/sched.h>
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c91babc6aa4b..f8220ec02036 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/sched.h>
@@ -235,11 +222,8 @@ int btrfs_pin_log_trans(struct btrfs_root *root)
void btrfs_end_log_trans(struct btrfs_root *root)
{
if (atomic_dec_and_test(&root->log_writers)) {
- /*
- * Implicit memory barrier after atomic_dec_and_test
- */
- if (waitqueue_active(&root->log_writer_wait))
- wake_up(&root->log_writer_wait);
+ /* atomic_dec_and_test implies a barrier */
+ cond_wake_up_nomb(&root->log_writer_wait);
}
}
@@ -2352,8 +2336,10 @@ again:
nritems = btrfs_header_nritems(path->nodes[0]);
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path);
- if (ret)
+ if (ret == 1)
break;
+ else if (ret < 0)
+ goto out;
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
@@ -2456,13 +2442,41 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
if (ret)
break;
- /* for regular files, make sure corresponding
- * orphan item exist. extents past the new EOF
- * will be truncated later by orphan cleanup.
+ /*
+ * Before replaying extents, truncate the inode to its
+ * size. We need to do it now and not after log replay
+ * because before an fsync we can have prealloc extents
+ * added beyond the inode's i_size. If we did it after,
+ * through orphan cleanup for example, we would drop
+ * those prealloc extents just after replaying them.
*/
if (S_ISREG(mode)) {
- ret = insert_orphan_item(wc->trans, root,
- key.objectid);
+ struct inode *inode;
+ u64 from;
+
+ inode = read_one_inode(root, key.objectid);
+ if (!inode) {
+ ret = -EIO;
+ break;
+ }
+ from = ALIGN(i_size_read(inode),
+ root->fs_info->sectorsize);
+ ret = btrfs_drop_extents(wc->trans, root, inode,
+ from, (u64)-1, 1);
+ /*
+ * If the nlink count is zero here, the iput
+ * will free the inode. We bump it to make
+ * sure it doesn't get freed until the link
+ * count fixup is done.
+ */
+ if (!ret) {
+ if (inode->i_nlink == 0)
+ inc_nlink(inode);
+ /* Update link count and nbytes. */
+ ret = btrfs_update_inode(wc->trans,
+ root, inode);
+ }
+ iput(inode);
if (ret)
break;
}
@@ -2971,11 +2985,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_lock(&log_root_tree->log_mutex);
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
- /*
- * Implicit memory barrier after atomic_dec_and_test
- */
- if (waitqueue_active(&log_root_tree->log_writer_wait))
- wake_up(&log_root_tree->log_writer_wait);
+ /* atomic_dec_and_test implies a barrier */
+ cond_wake_up_nomb(&log_root_tree->log_writer_wait);
}
if (ret) {
@@ -3099,10 +3110,11 @@ out_wake_log_root:
mutex_unlock(&log_root_tree->log_mutex);
/*
- * The barrier before waitqueue_active is implied by mutex_unlock
+ * The barrier before waitqueue_active (in cond_wake_up) is needed so
+ * all the updates above are seen by the woken threads. It might not be
+ * necessary, but proving that seems to be hard.
*/
- if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
- wake_up(&log_root_tree->log_commit_wait[index2]);
+ cond_wake_up(&log_root_tree->log_commit_wait[index2]);
out:
mutex_lock(&root->log_mutex);
btrfs_remove_all_log_ctxs(root, index1, ret);
@@ -3111,10 +3123,11 @@ out:
mutex_unlock(&root->log_mutex);
/*
- * The barrier before waitqueue_active is implied by mutex_unlock
+ * The barrier before waitqueue_active (in cond_wake_up) is needed so
+ * all the updates above are seen by the woken threads. It might not be
+ * necessary, but proving that seems to be hard.
*/
- if (waitqueue_active(&root->log_commit_wait[index1]))
- wake_up(&root->log_commit_wait[index1]);
+ cond_wake_up(&root->log_commit_wait[index1]);
return ret;
}
@@ -3520,8 +3533,11 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
* from this directory and from this transaction
*/
ret = btrfs_next_leaf(root, path);
- if (ret == 1) {
- last_offset = (u64)-1;
+ if (ret) {
+ if (ret == 1)
+ last_offset = (u64)-1;
+ else
+ err = ret;
goto done;
}
btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
@@ -4300,6 +4316,110 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
return ret;
}
+/*
+ * Log all prealloc extents beyond the inode's i_size to make sure we do not
+ * lose them after doing a fast fsync and replaying the log. We scan the
+ * subvolume's root instead of iterating the inode's extent map tree because
+ * otherwise we can log incorrect extent items based on extent map conversion.
+ * That can happen due to the fact that extent maps are merged when they
+ * are not in the extent map tree's list of modified extents.
+ */
+static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_key key;
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_path *dst_path = NULL;
+ u64 last_extent = (u64)-1;
+ int ins_nr = 0;
+ int start_slot;
+ int ret;
+
+ if (!(inode->flags & BTRFS_INODE_PREALLOC))
+ return 0;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = i_size;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ if (ins_nr > 0) {
+ ret = copy_items(trans, inode, dst_path, path,
+ &last_extent, start_slot,
+ ins_nr, 1, 0);
+ if (ret < 0)
+ goto out;
+ ins_nr = 0;
+ }
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid > ino)
+ break;
+ if (WARN_ON_ONCE(key.objectid < ino) ||
+ key.type < BTRFS_EXTENT_DATA_KEY ||
+ key.offset < i_size) {
+ path->slots[0]++;
+ continue;
+ }
+ if (last_extent == (u64)-1) {
+ last_extent = key.offset;
+ /*
+ * Avoid logging extent items logged in past fsync calls
+ * and leading to duplicate keys in the log tree.
+ */
+ do {
+ ret = btrfs_truncate_inode_items(trans,
+ root->log_root,
+ &inode->vfs_inode,
+ i_size,
+ BTRFS_EXTENT_DATA_KEY);
+ } while (ret == -EAGAIN);
+ if (ret)
+ goto out;
+ }
+ if (ins_nr == 0)
+ start_slot = slot;
+ ins_nr++;
+ path->slots[0]++;
+ if (!dst_path) {
+ dst_path = btrfs_alloc_path();
+ if (!dst_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ }
+ if (ins_nr > 0) {
+ ret = copy_items(trans, inode, dst_path, path, &last_extent,
+ start_slot, ins_nr, 1, 0);
+ if (ret > 0)
+ ret = 0;
+ }
+out:
+ btrfs_release_path(path);
+ btrfs_free_path(dst_path);
+ return ret;
+}
+
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
@@ -4342,6 +4462,11 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
if (em->generation <= test_gen)
continue;
+ /* We log prealloc extents beyond eof later. */
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
+ em->start >= i_size_read(&inode->vfs_inode))
+ continue;
+
if (em->start < logged_start)
logged_start = em->start;
if ((em->start + em->len - 1) > logged_end)
@@ -4398,6 +4523,9 @@ process:
up_write(&inode->dio_sem);
btrfs_release_path(path);
+ if (!ret)
+ ret = btrfs_log_prealloc_extents(trans, inode, path);
+
return ret;
}
@@ -4782,6 +4910,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct extent_map_tree *em_tree = &inode->extent_tree;
u64 logged_isize = 0;
bool need_log_inode_item = true;
+ bool xattrs_logged = false;
path = btrfs_alloc_path();
if (!path)
@@ -5083,6 +5212,7 @@ next_key:
err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
if (err)
goto out_unlock;
+ xattrs_logged = true;
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
@@ -5095,6 +5225,11 @@ log_extents:
btrfs_release_path(dst_path);
if (need_log_inode_item) {
err = log_inode_item(trans, log, dst_path, inode);
+ if (!err && !xattrs_logged) {
+ err = btrfs_log_all_xattrs(trans, root, inode, path,
+ dst_path);
+ btrfs_release_path(path);
+ }
if (err)
goto out_unlock;
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 88abc43312a1..122e68b89a5a 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __TREE_LOG_
-#define __TREE_LOG_
+#ifndef BTRFS_TREE_LOG_H
+#define BTRFS_TREE_LOG_H
#include "ctree.h"
#include "transaction.h"
@@ -87,4 +74,5 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
struct dentry *parent);
+
#endif
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index d8edf164f81c..3374c9e9be67 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2011 STRATO AG
* written by Arne Jansen <sensille@gmx.net>
- * Distributed under the GNU GPL license version 2.
*/
#include <linux/slab.h>
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 53c913632733..02fda0a2d4ce 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -1,12 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2011 STRATO AG
* written by Arne Jansen <sensille@gmx.net>
- * Distributed under the GNU GPL license version 2.
- *
*/
-#ifndef __ULIST__
-#define __ULIST__
+#ifndef BTRFS_ULIST_H
+#define BTRFS_ULIST_H
#include <linux/list.h>
#include <linux/rbtree.h>
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 9916f03430bc..3b2ae342e649 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -1,20 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) STRATO AG 2013. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
+
#include <linux/uuid.h>
#include <asm/unaligned.h>
#include "ctree.h"
@@ -91,10 +79,10 @@ out:
return ret;
}
-int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
u64 subid_cpu)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *uuid_root = fs_info->uuid_root;
int ret;
struct btrfs_path *path = NULL;
@@ -156,10 +144,10 @@ out:
return ret;
}
-int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
+int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
u64 subid)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *uuid_root = fs_info->uuid_root;
int ret;
struct btrfs_path *path = NULL;
@@ -251,7 +239,7 @@ static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type,
goto out;
}
- ret = btrfs_uuid_tree_rem(trans, uuid_root->fs_info, uuid, type, subid);
+ ret = btrfs_uuid_tree_remove(trans, uuid, type, subid);
btrfs_end_transaction(trans);
out:
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 93f8f17cacca..e034ad9e23b4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1,20 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
+
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
@@ -52,6 +40,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 1,
.devs_increment = 2,
.ncopies = 2,
+ .raid_name = "raid10",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID10,
+ .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
},
[BTRFS_RAID_RAID1] = {
.sub_stripes = 1,
@@ -61,6 +52,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 1,
.devs_increment = 2,
.ncopies = 2,
+ .raid_name = "raid1",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID1,
+ .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
},
[BTRFS_RAID_DUP] = {
.sub_stripes = 1,
@@ -70,6 +64,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 2,
+ .raid_name = "dup",
+ .bg_flag = BTRFS_BLOCK_GROUP_DUP,
+ .mindev_error = 0,
},
[BTRFS_RAID_RAID0] = {
.sub_stripes = 1,
@@ -79,6 +76,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 1,
+ .raid_name = "raid0",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID0,
+ .mindev_error = 0,
},
[BTRFS_RAID_SINGLE] = {
.sub_stripes = 1,
@@ -88,6 +88,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 1,
+ .raid_name = "single",
+ .bg_flag = 0,
+ .mindev_error = 0,
},
[BTRFS_RAID_RAID5] = {
.sub_stripes = 1,
@@ -97,6 +100,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 1,
.devs_increment = 1,
.ncopies = 2,
+ .raid_name = "raid5",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID5,
+ .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
},
[BTRFS_RAID_RAID6] = {
.sub_stripes = 1,
@@ -106,33 +112,19 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 2,
.devs_increment = 1,
.ncopies = 3,
+ .raid_name = "raid6",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID6,
+ .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
},
};
-const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
- [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
- [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1,
- [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP,
- [BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0,
- [BTRFS_RAID_SINGLE] = 0,
- [BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5,
- [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6,
-};
+const char *get_raid_name(enum btrfs_raid_types type)
+{
+ if (type >= BTRFS_NR_RAID_TYPES)
+ return NULL;
-/*
- * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
- * condition is not met. Zero means there's no corresponding
- * BTRFS_ERROR_DEV_*_NOT_MET value.
- */
-const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
- [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
- [BTRFS_RAID_RAID1] = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
- [BTRFS_RAID_DUP] = 0,
- [BTRFS_RAID_RAID0] = 0,
- [BTRFS_RAID_SINGLE] = 0,
- [BTRFS_RAID_RAID5] = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
- [BTRFS_RAID_RAID6] = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
-};
+ return btrfs_raid_array[type].raid_name;
+}
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
@@ -179,12 +171,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* may be used to exclude some operations from running concurrently without any
* modifications to the list (see write_all_supers)
*
- * volume_mutex
- * ------------
- * coarse lock owned by a mounted filesystem; used to exclude some operations
- * that cannot run in parallel and affect the higher-level properties of the
- * filesystem like: device add/deleting/resize/replace, or balance
- *
* balance_mutex
* -------------
* protects balance structures (status, state) and context accessed from
@@ -209,6 +195,41 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* device_list_mutex
* chunk_mutex
* balance_mutex
+ *
+ *
+ * Exclusive operations, BTRFS_FS_EXCL_OP
+ * ======================================
+ *
+ * Maintains the exclusivity of the following operations that apply to the
+ * whole filesystem and cannot run in parallel.
+ *
+ * - Balance (*)
+ * - Device add
+ * - Device remove
+ * - Device replace (*)
+ * - Resize
+ *
+ * The device operations (as above) can be in one of the following states:
+ *
+ * - Running state
+ * - Paused state
+ * - Completed state
+ *
+ * Only device operations marked with (*) can go into the Paused state for the
+ * following reasons:
+ *
+ * - ioctl (only Balance can be Paused through ioctl)
+ * - filesystem remounted as read-only
+ * - filesystem unmounted and mounted as read-only
+ * - system power-cycle and filesystem mounted as read-only
+ * - filesystem or device errors leading to forced read-only
+ *
+ * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
+ * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
+ * A device operation in Paused or Running state can be canceled or resumed
+ * either by ioctl (Balance only) or when remounted as read-write.
+ * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
+ * completed.
*/
DEFINE_MUTEX(uuid_mutex);
@@ -239,14 +260,14 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
INIT_LIST_HEAD(&fs_devs->devices);
INIT_LIST_HEAD(&fs_devs->resized_devices);
INIT_LIST_HEAD(&fs_devs->alloc_list);
- INIT_LIST_HEAD(&fs_devs->list);
+ INIT_LIST_HEAD(&fs_devs->fs_list);
if (fsid)
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
return fs_devs;
}
-static void free_device(struct btrfs_device *device)
+void btrfs_free_device(struct btrfs_device *device)
{
rcu_string_free(device->name);
bio_put(device->flush_bio);
@@ -261,7 +282,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
device = list_entry(fs_devices->devices.next,
struct btrfs_device, dev_list);
list_del(&device->dev_list);
- free_device(device);
+ btrfs_free_device(device);
}
kfree(fs_devices);
}
@@ -285,8 +306,8 @@ void __exit btrfs_cleanup_fs_uuids(void)
while (!list_empty(&fs_uuids)) {
fs_devices = list_entry(fs_uuids.next,
- struct btrfs_fs_devices, list);
- list_del(&fs_devices->list);
+ struct btrfs_fs_devices, fs_list);
+ list_del(&fs_devices->fs_list);
free_fs_devices(fs_devices);
}
}
@@ -294,7 +315,7 @@ void __exit btrfs_cleanup_fs_uuids(void)
/*
* Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
* Returned struct is not linked onto any lists and must be destroyed using
- * free_device.
+ * btrfs_free_device.
*/
static struct btrfs_device *__alloc_device(void)
{
@@ -339,10 +360,9 @@ static struct btrfs_device *__alloc_device(void)
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
u64 devid, const u8 *uuid)
{
- struct list_head *head = &fs_devices->devices;
struct btrfs_device *dev;
- list_for_each_entry(dev, head, dev_list) {
+ list_for_each_entry(dev, &fs_devices->devices, dev_list) {
if (dev->devid == devid &&
(!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
return dev;
@@ -355,7 +375,7 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
struct btrfs_fs_devices *fs_devices;
- list_for_each_entry(fs_devices, &fs_uuids, list) {
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
return fs_devices;
}
@@ -619,7 +639,7 @@ static void btrfs_free_stale_devices(const char *path,
struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
struct btrfs_device *dev, *tmp_dev;
- list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) {
+ list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, fs_list) {
if (fs_devs->opened)
continue;
@@ -644,13 +664,13 @@ static void btrfs_free_stale_devices(const char *path,
/* delete the stale device */
if (fs_devs->num_devices == 1) {
btrfs_sysfs_remove_fsid(fs_devs);
- list_del(&fs_devs->list);
+ list_del(&fs_devs->fs_list);
free_fs_devices(fs_devs);
break;
} else {
fs_devs->num_devices--;
list_del(&dev->dev_list);
- free_device(dev);
+ btrfs_free_device(dev);
}
}
}
@@ -744,7 +764,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (IS_ERR(fs_devices))
return ERR_CAST(fs_devices);
- list_add(&fs_devices->list, &fs_uuids);
+ list_add(&fs_devices->fs_list, &fs_uuids);
device = NULL;
} else {
@@ -765,7 +785,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
name = rcu_string_strdup(path, GFP_NOFS);
if (!name) {
- free_device(device);
+ btrfs_free_device(device);
return ERR_PTR(-ENOMEM);
}
rcu_assign_pointer(device->name, name);
@@ -878,7 +898,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
name = rcu_string_strdup(orig_dev->name->str,
GFP_KERNEL);
if (!name) {
- free_device(device);
+ btrfs_free_device(device);
goto error;
}
rcu_assign_pointer(device->name, name);
@@ -950,7 +970,7 @@ again:
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
- free_device(device);
+ btrfs_free_device(device);
}
if (fs_devices->seed) {
@@ -968,7 +988,7 @@ static void free_device_rcu(struct rcu_head *head)
struct btrfs_device *device;
device = container_of(head, struct btrfs_device, rcu);
- free_device(device);
+ btrfs_free_device(device);
}
static void btrfs_close_bdev(struct btrfs_device *device)
@@ -1017,7 +1037,7 @@ static void btrfs_prepare_close_one_device(struct btrfs_device *device)
new_device->fs_devices = device->fs_devices;
}
-static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device, *tmp;
struct list_head pending_put;
@@ -1062,7 +1082,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
int ret;
mutex_lock(&uuid_mutex);
- ret = __btrfs_close_devices(fs_devices);
+ ret = close_fs_devices(fs_devices);
if (!fs_devices->opened) {
seed_devices = fs_devices->seed;
fs_devices->seed = NULL;
@@ -1072,23 +1092,22 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
while (seed_devices) {
fs_devices = seed_devices;
seed_devices = fs_devices->seed;
- __btrfs_close_devices(fs_devices);
+ close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
}
return ret;
}
-static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder)
{
- struct list_head *head = &fs_devices->devices;
struct btrfs_device *device;
struct btrfs_device *latest_dev = NULL;
int ret = 0;
flags |= FMODE_EXCL;
- list_for_each_entry(device, head, dev_list) {
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
/* Just open everything we can; ignore failures here */
if (btrfs_open_one_device(fs_devices, device, flags, holder))
continue;
@@ -1127,15 +1146,16 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
{
int ret;
- mutex_lock(&uuid_mutex);
+ mutex_lock(&fs_devices->device_list_mutex);
if (fs_devices->opened) {
fs_devices->opened++;
ret = 0;
} else {
list_sort(NULL, &fs_devices->devices, devid_cmp);
- ret = __btrfs_open_devices(fs_devices, flags, holder);
+ ret = open_fs_devices(fs_devices, flags, holder);
}
- mutex_unlock(&uuid_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
+
return ret;
}
@@ -1213,31 +1233,29 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
*/
bytenr = btrfs_sb_offset(0);
flags |= FMODE_EXCL;
- mutex_lock(&uuid_mutex);
bdev = blkdev_get_by_path(path, flags, holder);
- if (IS_ERR(bdev)) {
- ret = PTR_ERR(bdev);
- goto error;
- }
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
ret = -EINVAL;
goto error_bdev_put;
}
+ mutex_lock(&uuid_mutex);
device = device_list_add(path, disk_super);
if (IS_ERR(device))
ret = PTR_ERR(device);
else
*fs_devices_ret = device->fs_devices;
+ mutex_unlock(&uuid_mutex);
btrfs_release_disk_super(page);
error_bdev_put:
blkdev_put(bdev, flags);
-error:
- mutex_unlock(&uuid_mutex);
+
return ret;
}
@@ -1869,11 +1887,11 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
} while (read_seqretry(&fs_info->profiles_lock, seq));
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
- if (!(all_avail & btrfs_raid_group[i]))
+ if (!(all_avail & btrfs_raid_array[i].bg_flag))
continue;
if (num_devices < btrfs_raid_array[i].devs_min) {
- int ret = btrfs_raid_mindev_error[i];
+ int ret = btrfs_raid_array[i].mindev_error;
if (ret)
return ret;
@@ -1929,13 +1947,13 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 num_devices;
int ret = 0;
- mutex_lock(&fs_info->volume_mutex);
mutex_lock(&uuid_mutex);
- num_devices = fs_info->fs_devices->num_devices;
+ num_devices = fs_devices->num_devices;
btrfs_dev_replace_read_lock(&fs_info->dev_replace);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
WARN_ON(num_devices < 1);
@@ -1998,27 +2016,32 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
* (super_copy) should hold the device list mutex.
*/
+ /*
+ * In normal cases the cur_devices == fs_devices. But in case
+ * of deleting a seed device, the cur_devices should point to
+ * its own fs_devices listed under the fs_devices->seed.
+ */
cur_devices = device->fs_devices;
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_devices->device_list_mutex);
list_del_rcu(&device->dev_list);
- device->fs_devices->num_devices--;
- device->fs_devices->total_devices--;
+ cur_devices->num_devices--;
+ cur_devices->total_devices--;
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
- device->fs_devices->missing_devices--;
+ cur_devices->missing_devices--;
btrfs_assign_next_active_device(fs_info, device, NULL);
if (device->bdev) {
- device->fs_devices->open_devices--;
+ cur_devices->open_devices--;
/* remove sysfs entry */
- btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
+ btrfs_sysfs_rm_device_link(fs_devices, device);
}
num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
/*
* at this point, the device is zero sized and detached from
@@ -2032,8 +2055,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
call_rcu(&device->rcu, free_device_rcu);
if (cur_devices->open_devices == 0) {
- struct btrfs_fs_devices *fs_devices;
- fs_devices = fs_info->fs_devices;
while (fs_devices) {
if (fs_devices->seed == cur_devices) {
fs_devices->seed = cur_devices->seed;
@@ -2042,20 +2063,19 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
fs_devices = fs_devices->seed;
}
cur_devices->seed = NULL;
- __btrfs_close_devices(cur_devices);
+ close_fs_devices(cur_devices);
free_fs_devices(cur_devices);
}
out:
mutex_unlock(&uuid_mutex);
- mutex_unlock(&fs_info->volume_mutex);
return ret;
error_undo:
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
list_add(&device->dev_alloc_list,
- &fs_info->fs_devices->alloc_list);
+ &fs_devices->alloc_list);
device->fs_devices->rw_devices++;
mutex_unlock(&fs_info->chunk_mutex);
}
@@ -2124,7 +2144,7 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
tmp_fs_devices = tmp_fs_devices->seed;
}
fs_devices->seed = NULL;
- __btrfs_close_devices(fs_devices);
+ close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
}
}
@@ -2132,23 +2152,23 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev)
{
- mutex_lock(&uuid_mutex);
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+
WARN_ON(!tgtdev);
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_devices->device_list_mutex);
- btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
+ btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
if (tgtdev->bdev)
- fs_info->fs_devices->open_devices--;
+ fs_devices->open_devices--;
- fs_info->fs_devices->num_devices--;
+ fs_devices->num_devices--;
btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
list_del_rcu(&tgtdev->dev_list);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- mutex_unlock(&uuid_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
/*
* The update_dev_time() with in btrfs_scratch_superblocks()
@@ -2200,10 +2220,6 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
struct btrfs_device *tmp;
devices = &fs_info->fs_devices->devices;
- /*
- * It is safe to read the devices since the volume_mutex
- * is held by the caller.
- */
list_for_each_entry(tmp, devices, dev_list) {
if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&tmp->dev_state) && !tmp->bdev) {
@@ -2271,7 +2287,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
return PTR_ERR(old_devices);
}
- list_add(&old_devices->list, &fs_uuids);
+ list_add(&old_devices->fs_list, &fs_uuids);
memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
seed_devices->opened = 1;
@@ -2582,7 +2598,7 @@ error_trans:
if (trans)
btrfs_end_transaction(trans);
error_free_device:
- free_device(device);
+ btrfs_free_device(device);
error:
blkdev_put(bdev, FMODE_EXCL);
if (seeding_dev && !unlocked) {
@@ -2592,99 +2608,6 @@ error:
return ret;
}
-int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
- const char *device_path,
- struct btrfs_device *srcdev,
- struct btrfs_device **device_out)
-{
- struct btrfs_device *device;
- struct block_device *bdev;
- struct list_head *devices;
- struct rcu_string *name;
- u64 devid = BTRFS_DEV_REPLACE_DEVID;
- int ret = 0;
-
- *device_out = NULL;
- if (fs_info->fs_devices->seeding) {
- btrfs_err(fs_info, "the filesystem is a seed filesystem!");
- return -EINVAL;
- }
-
- bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
- fs_info->bdev_holder);
- if (IS_ERR(bdev)) {
- btrfs_err(fs_info, "target device %s is invalid!", device_path);
- return PTR_ERR(bdev);
- }
-
- filemap_write_and_wait(bdev->bd_inode->i_mapping);
-
- devices = &fs_info->fs_devices->devices;
- list_for_each_entry(device, devices, dev_list) {
- if (device->bdev == bdev) {
- btrfs_err(fs_info,
- "target device is in the filesystem!");
- ret = -EEXIST;
- goto error;
- }
- }
-
-
- if (i_size_read(bdev->bd_inode) <
- btrfs_device_get_total_bytes(srcdev)) {
- btrfs_err(fs_info,
- "target device is smaller than source device!");
- ret = -EINVAL;
- goto error;
- }
-
-
- device = btrfs_alloc_device(NULL, &devid, NULL);
- if (IS_ERR(device)) {
- ret = PTR_ERR(device);
- goto error;
- }
-
- name = rcu_string_strdup(device_path, GFP_KERNEL);
- if (!name) {
- free_device(device);
- ret = -ENOMEM;
- goto error;
- }
- rcu_assign_pointer(device->name, name);
-
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
- device->generation = 0;
- device->io_width = fs_info->sectorsize;
- device->io_align = fs_info->sectorsize;
- device->sector_size = fs_info->sectorsize;
- device->total_bytes = btrfs_device_get_total_bytes(srcdev);
- device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
- device->bytes_used = btrfs_device_get_bytes_used(srcdev);
- device->commit_total_bytes = srcdev->commit_total_bytes;
- device->commit_bytes_used = device->bytes_used;
- device->fs_info = fs_info;
- device->bdev = bdev;
- set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
- set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- device->mode = FMODE_EXCL;
- device->dev_stats_valid = 1;
- set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
- device->fs_devices = fs_info->fs_devices;
- list_add(&device->dev_list, &fs_info->fs_devices->devices);
- fs_info->fs_devices->num_devices++;
- fs_info->fs_devices->open_devices++;
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-
- *device_out = device;
- return ret;
-
-error:
- blkdev_put(bdev, FMODE_EXCL);
- return ret;
-}
-
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
@@ -3285,24 +3208,12 @@ static void update_balance_args(struct btrfs_balance_control *bctl)
}
/*
- * Should be called with both balance and volume mutexes held to
- * serialize other volume operations (add_dev/rm_dev/resize) with
- * restriper. Same goes for unset_balance_control.
+ * Clear the balance status in fs_info and delete the balance item from disk.
*/
-static void set_balance_control(struct btrfs_balance_control *bctl)
-{
- struct btrfs_fs_info *fs_info = bctl->fs_info;
-
- BUG_ON(fs_info->balance_ctl);
-
- spin_lock(&fs_info->balance_lock);
- fs_info->balance_ctl = bctl;
- spin_unlock(&fs_info->balance_lock);
-}
-
-static void unset_balance_control(struct btrfs_fs_info *fs_info)
+static void reset_balance_state(struct btrfs_fs_info *fs_info)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ int ret;
BUG_ON(!fs_info->balance_ctl);
@@ -3311,6 +3222,9 @@ static void unset_balance_control(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->balance_lock);
kfree(bctl);
+ ret = del_balance_item(fs_info);
+ if (ret)
+ btrfs_handle_fs_error(fs_info, ret, NULL);
}
/*
@@ -3847,18 +3761,6 @@ static inline int balance_need_close(struct btrfs_fs_info *fs_info)
atomic_read(&fs_info->balance_cancel_req) == 0);
}
-static void __cancel_balance(struct btrfs_fs_info *fs_info)
-{
- int ret;
-
- unset_balance_control(fs_info);
- ret = del_balance_item(fs_info);
- if (ret)
- btrfs_handle_fs_error(fs_info, ret, NULL);
-
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
-}
-
/* Non-zero return value signifies invalidity */
static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
u64 allowed)
@@ -3869,12 +3771,12 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
}
/*
- * Should be called with both balance and volume mutexes held
+ * Should be called with balance mutexe held
*/
-int btrfs_balance(struct btrfs_balance_control *bctl,
+int btrfs_balance(struct btrfs_fs_info *fs_info,
+ struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs)
{
- struct btrfs_fs_info *fs_info = bctl->fs_info;
u64 meta_target, data_target;
u64 allowed;
int mixed = 0;
@@ -3903,7 +3805,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
!(bctl->flags & BTRFS_BALANCE_METADATA) ||
memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
btrfs_err(fs_info,
- "with mixed groups data and metadata balance options must be the same");
+ "balance: mixed groups data and metadata options must be the same");
ret = -EINVAL;
goto out;
}
@@ -3925,23 +3827,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID6);
if (validate_convert_profile(&bctl->data, allowed)) {
+ int index = btrfs_bg_flags_to_raid_index(bctl->data.target);
+
btrfs_err(fs_info,
- "unable to start balance with target data profile %llu",
- bctl->data.target);
+ "balance: invalid convert data profile %s",
+ get_raid_name(index));
ret = -EINVAL;
goto out;
}
if (validate_convert_profile(&bctl->meta, allowed)) {
+ int index = btrfs_bg_flags_to_raid_index(bctl->meta.target);
+
btrfs_err(fs_info,
- "unable to start balance with target metadata profile %llu",
- bctl->meta.target);
+ "balance: invalid convert metadata profile %s",
+ get_raid_name(index));
ret = -EINVAL;
goto out;
}
if (validate_convert_profile(&bctl->sys, allowed)) {
+ int index = btrfs_bg_flags_to_raid_index(bctl->sys.target);
+
btrfs_err(fs_info,
- "unable to start balance with target system profile %llu",
- bctl->sys.target);
+ "balance: invalid convert system profile %s",
+ get_raid_name(index));
ret = -EINVAL;
goto out;
}
@@ -3962,10 +3870,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
!(bctl->meta.target & allowed))) {
if (bctl->flags & BTRFS_BALANCE_FORCE) {
btrfs_info(fs_info,
- "force reducing metadata integrity");
+ "balance: force reducing metadata integrity");
} else {
btrfs_err(fs_info,
- "balance will reduce metadata integrity, use force if you want this");
+ "balance: reduces metadata integrity, use --force if you want this");
ret = -EINVAL;
goto out;
}
@@ -3979,9 +3887,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
bctl->data.target : fs_info->avail_data_alloc_bits;
if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
+ int meta_index = btrfs_bg_flags_to_raid_index(meta_target);
+ int data_index = btrfs_bg_flags_to_raid_index(data_target);
+
btrfs_warn(fs_info,
- "metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
- meta_target, data_target);
+ "balance: metadata profile %s has lower redundancy than data profile %s",
+ get_raid_name(meta_index), get_raid_name(data_index));
}
ret = insert_balance_item(fs_info, bctl);
@@ -3990,7 +3901,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
BUG_ON(ret == -EEXIST);
- set_balance_control(bctl);
+ BUG_ON(fs_info->balance_ctl);
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl = bctl;
+ spin_unlock(&fs_info->balance_lock);
} else {
BUG_ON(ret != -EEXIST);
spin_lock(&fs_info->balance_lock);
@@ -3998,22 +3912,24 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
spin_unlock(&fs_info->balance_lock);
}
- atomic_inc(&fs_info->balance_running);
+ ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
+ set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
mutex_unlock(&fs_info->balance_mutex);
ret = __btrfs_balance(fs_info);
mutex_lock(&fs_info->balance_mutex);
- atomic_dec(&fs_info->balance_running);
+ clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
if (bargs) {
memset(bargs, 0, sizeof(*bargs));
- update_ioctl_balance_args(fs_info, 0, bargs);
+ btrfs_update_ioctl_balance_args(fs_info, bargs);
}
if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
balance_need_close(fs_info)) {
- __cancel_balance(fs_info);
+ reset_balance_state(fs_info);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
}
wake_up(&fs_info->balance_wait_q);
@@ -4021,11 +3937,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
return ret;
out:
if (bctl->flags & BTRFS_BALANCE_RESUME)
- __cancel_balance(fs_info);
- else {
+ reset_balance_state(fs_info);
+ else
kfree(bctl);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
- }
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+
return ret;
}
@@ -4034,16 +3950,12 @@ static int balance_kthread(void *data)
struct btrfs_fs_info *fs_info = data;
int ret = 0;
- mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
-
if (fs_info->balance_ctl) {
- btrfs_info(fs_info, "continuing balance");
- ret = btrfs_balance(fs_info->balance_ctl, NULL);
+ btrfs_info(fs_info, "balance: resuming");
+ ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
}
-
mutex_unlock(&fs_info->balance_mutex);
- mutex_unlock(&fs_info->volume_mutex);
return ret;
}
@@ -4052,18 +3964,27 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
struct task_struct *tsk;
- spin_lock(&fs_info->balance_lock);
+ mutex_lock(&fs_info->balance_mutex);
if (!fs_info->balance_ctl) {
- spin_unlock(&fs_info->balance_lock);
+ mutex_unlock(&fs_info->balance_mutex);
return 0;
}
- spin_unlock(&fs_info->balance_lock);
+ mutex_unlock(&fs_info->balance_mutex);
if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
- btrfs_info(fs_info, "force skipping balance");
+ btrfs_info(fs_info, "balance: resume skipped");
return 0;
}
+ /*
+ * A ro->rw remount sequence should continue with the paused balance
+ * regardless of who pauses it, system or the user as of now, so set
+ * the resume flag.
+ */
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
+ spin_unlock(&fs_info->balance_lock);
+
tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
return PTR_ERR_OR_ZERO(tsk);
}
@@ -4103,7 +4024,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
- bctl->fs_info = fs_info;
bctl->flags = btrfs_balance_flags(leaf, item);
bctl->flags |= BTRFS_BALANCE_RESUME;
@@ -4114,15 +4034,26 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
btrfs_balance_sys(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
- WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
+ /*
+ * This should never happen, as the paused balance state is recovered
+ * during mount without any chance of other exclusive ops to collide.
+ *
+ * This gives the exclusive op status to balance and keeps in paused
+ * state until user intervention (cancel or umount). If the ownership
+ * cannot be assigned, show a message but do not fail. The balance
+ * is in a paused state and must have fs_info::balance_ctl properly
+ * set up.
+ */
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+ btrfs_warn(fs_info,
+ "balance: cannot set exclusive op status, resume manually");
- mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
-
- set_balance_control(bctl);
-
+ BUG_ON(fs_info->balance_ctl);
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl = bctl;
+ spin_unlock(&fs_info->balance_lock);
mutex_unlock(&fs_info->balance_mutex);
- mutex_unlock(&fs_info->volume_mutex);
out:
btrfs_free_path(path);
return ret;
@@ -4138,16 +4069,16 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
return -ENOTCONN;
}
- if (atomic_read(&fs_info->balance_running)) {
+ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
atomic_inc(&fs_info->balance_pause_req);
mutex_unlock(&fs_info->balance_mutex);
wait_event(fs_info->balance_wait_q,
- atomic_read(&fs_info->balance_running) == 0);
+ !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
mutex_lock(&fs_info->balance_mutex);
/* we are good with balance_ctl ripped off from under us */
- BUG_ON(atomic_read(&fs_info->balance_running));
+ BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
atomic_dec(&fs_info->balance_pause_req);
} else {
ret = -ENOTCONN;
@@ -4159,38 +4090,49 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{
- if (sb_rdonly(fs_info->sb))
- return -EROFS;
-
mutex_lock(&fs_info->balance_mutex);
if (!fs_info->balance_ctl) {
mutex_unlock(&fs_info->balance_mutex);
return -ENOTCONN;
}
+ /*
+ * A paused balance with the item stored on disk can be resumed at
+ * mount time if the mount is read-write. Otherwise it's still paused
+ * and we must not allow cancelling as it deletes the item.
+ */
+ if (sb_rdonly(fs_info->sb)) {
+ mutex_unlock(&fs_info->balance_mutex);
+ return -EROFS;
+ }
+
atomic_inc(&fs_info->balance_cancel_req);
/*
* if we are running just wait and return, balance item is
* deleted in btrfs_balance in this case
*/
- if (atomic_read(&fs_info->balance_running)) {
+ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
mutex_unlock(&fs_info->balance_mutex);
wait_event(fs_info->balance_wait_q,
- atomic_read(&fs_info->balance_running) == 0);
+ !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
mutex_lock(&fs_info->balance_mutex);
} else {
- /* __cancel_balance needs volume_mutex */
mutex_unlock(&fs_info->balance_mutex);
- mutex_lock(&fs_info->volume_mutex);
+ /*
+ * Lock released to allow other waiters to continue, we'll
+ * reexamine the status again.
+ */
mutex_lock(&fs_info->balance_mutex);
- if (fs_info->balance_ctl)
- __cancel_balance(fs_info);
-
- mutex_unlock(&fs_info->volume_mutex);
+ if (fs_info->balance_ctl) {
+ reset_balance_state(fs_info);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_info(fs_info, "balance: canceled");
+ }
}
- BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+ BUG_ON(fs_info->balance_ctl ||
+ test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
atomic_dec(&fs_info->balance_cancel_req);
mutex_unlock(&fs_info->balance_mutex);
return 0;
@@ -4267,8 +4209,7 @@ static int btrfs_uuid_scan_kthread(void *data)
}
update_tree:
if (!btrfs_is_empty_uuid(root_item.uuid)) {
- ret = btrfs_uuid_tree_add(trans, fs_info,
- root_item.uuid,
+ ret = btrfs_uuid_tree_add(trans, root_item.uuid,
BTRFS_UUID_KEY_SUBVOL,
key.objectid);
if (ret < 0) {
@@ -4279,7 +4220,7 @@ update_tree:
}
if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
- ret = btrfs_uuid_tree_add(trans, fs_info,
+ ret = btrfs_uuid_tree_add(trans,
root_item.received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
key.objectid);
@@ -4485,7 +4426,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
if (!path)
return -ENOMEM;
- path->reada = READA_FORWARD;
+ path->reada = READA_BACK;
mutex_lock(&fs_info->chunk_mutex);
@@ -6046,9 +5987,8 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
}
-int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
- u64 chunk_start, u64 physical, u64 devid,
- u64 **logical, int *naddrs, int *stripe_len)
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+ u64 physical, u64 **logical, int *naddrs, int *stripe_len)
{
struct extent_map *em;
struct map_lookup *map;
@@ -6080,8 +6020,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
BUG_ON(!buf); /* -ENOMEM */
for (i = 0; i < map->num_stripes; i++) {
- if (devid && map->stripes[i].dev->devid != devid)
- continue;
if (map->stripes[i].physical > physical ||
map->stripes[i].physical + length <= physical)
continue;
@@ -6413,7 +6351,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
*
* Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
* on error. Returned struct is not linked onto any lists and must be
- * destroyed with free_device.
+ * destroyed with btrfs_free_device.
*/
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
@@ -6436,7 +6374,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
ret = find_next_devid(fs_info, &tmp);
if (ret) {
- free_device(dev);
+ btrfs_free_device(dev);
return ERR_PTR(ret);
}
}
@@ -6687,8 +6625,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
if (IS_ERR(fs_devices))
return fs_devices;
- ret = __btrfs_open_devices(fs_devices, FMODE_READ,
- fs_info->bdev_holder);
+ ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
if (ret) {
free_fs_devices(fs_devices);
fs_devices = ERR_PTR(ret);
@@ -6696,7 +6633,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
}
if (!fs_devices->seeding) {
- __btrfs_close_devices(fs_devices);
+ close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
fs_devices = ERR_PTR(-EINVAL);
goto out;
@@ -7005,6 +6942,10 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
if (!path)
return -ENOMEM;
+ /*
+ * uuid_mutex is needed only if we are mounting a sprout FS
+ * otherwise we don't need it.
+ */
mutex_lock(&uuid_mutex);
mutex_lock(&fs_info->chunk_mutex);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d1fcaea9fef5..5139ec8daf4c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __BTRFS_VOLUMES_
-#define __BTRFS_VOLUMES_
+#ifndef BTRFS_VOLUMES_H
+#define BTRFS_VOLUMES_H
#include <linux/bio.h>
#include <linux/sort.h>
@@ -221,6 +208,7 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+ struct list_head fs_list;
u64 num_devices;
u64 open_devices;
@@ -242,7 +230,6 @@ struct btrfs_fs_devices {
struct list_head resized_devices;
/* devices not currently being allocated */
struct list_head alloc_list;
- struct list_head list;
struct btrfs_fs_devices *seed;
int seeding;
@@ -342,11 +329,12 @@ struct btrfs_raid_attr {
int tolerated_failures; /* max tolerated fail devs */
int devs_increment; /* ndevs has to be a multiple of this */
int ncopies; /* how many copies to data has */
+ int mindev_error; /* error code if min devs requisite is unmet */
+ const char raid_name[8]; /* name of the raid */
+ u64 bg_flag; /* block group flag of the raid */
};
extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
-extern const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES];
-extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES];
struct map_lookup {
u64 type;
@@ -364,8 +352,6 @@ struct map_lookup {
struct btrfs_balance_args;
struct btrfs_balance_progress;
struct btrfs_balance_control {
- struct btrfs_fs_info *fs_info;
-
struct btrfs_balance_args data;
struct btrfs_balance_args meta;
struct btrfs_balance_args sys;
@@ -406,9 +392,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret);
-int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
- u64 chunk_start, u64 physical, u64 devid,
- u64 **logical, int *naddrs, int *stripe_len);
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+ u64 physical, u64 **logical, int *naddrs, int *stripe_len);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
@@ -434,6 +419,7 @@ int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
const u8 *uuid);
+void btrfs_free_device(struct btrfs_device *device);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
const char *device_path, u64 devid);
void __exit btrfs_cleanup_fs_uuids(void);
@@ -444,11 +430,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
u8 *uuid, u8 *fsid);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
-int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
- const char *device_path,
- struct btrfs_device *srcdev,
- struct btrfs_device **device_out);
-int btrfs_balance(struct btrfs_balance_control *bctl,
+int btrfs_balance(struct btrfs_fs_info *fs_info,
+ struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs);
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
@@ -566,6 +549,8 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}
+const char *get_raid_name(enum btrfs_raid_types type);
+
void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index e1e8177deb5e..ea78c3d6dcfc 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Red Hat. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
#include <linux/init.h>
@@ -32,7 +19,6 @@
#include "props.h"
#include "locking.h"
-
int btrfs_getxattr(struct inode *inode, const char *name,
void *buffer, size_t size)
{
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index e215a3212a2a..471fcac6ff55 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2007 Red Hat. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
*/
-#ifndef __XATTR__
-#define __XATTR__
+#ifndef BTRFS_XATTR_H
+#define BTRFS_XATTR_H
#include <linux/xattr.h>
@@ -34,4 +21,4 @@ int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
const struct qstr *qstr);
-#endif /* __XATTR__ */
+#endif
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 2b52950dc2c6..970ff3e35bb3 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2008 Oracle. All rights reserved.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
* Based on jffs2 zlib code:
* Copyright © 2001-2007 Red Hat, Inc.
* Created by David Woodhouse <dwmw2@infradead.org>
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 01a4eab602a3..af6ec59972f5 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -1,16 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2016-present, Facebook, Inc.
* All rights reserved.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
+
#include <linux/bio.h>
#include <linux/err.h>
#include <linux/init.h>
diff --git a/fs/buffer.c b/fs/buffer.c
index 9a73924db22f..cabc045f483d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -185,10 +185,9 @@ EXPORT_SYMBOL(end_buffer_write_sync);
* we get exclusion from try_to_free_buffers with the blockdev mapping's
* private_lock.
*
- * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
+ * Hack idea: for the blockdev mapping, private_lock contention
* may be quite high. This code could TryLock the page, and if that
- * succeeds, there is no need to take private_lock. (But if
- * private_lock is contended then so is mapping->tree_lock).
+ * succeeds, there is no need to take private_lock.
*/
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
@@ -495,35 +494,12 @@ repeat:
return err;
}
-static void do_thaw_one(struct super_block *sb, void *unused)
+void emergency_thaw_bdev(struct super_block *sb)
{
while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
}
-static void do_thaw_all(struct work_struct *work)
-{
- iterate_supers(do_thaw_one, NULL);
- kfree(work);
- printk(KERN_WARNING "Emergency Thaw complete\n");
-}
-
-/**
- * emergency_thaw_all -- forcibly thaw every frozen filesystem
- *
- * Used for emergency unfreeze of all filesystems via SysRq
- */
-void emergency_thaw_all(void)
-{
- struct work_struct *work;
-
- work = kmalloc(sizeof(*work), GFP_ATOMIC);
- if (work) {
- INIT_WORK(work, do_thaw_all);
- schedule_work(work);
- }
-}
-
/**
* sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
* @mapping: the mapping which wants those buffers written
@@ -594,20 +570,21 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
*
* The caller must hold lock_page_memcg().
*/
-static void __set_page_dirty(struct page *page, struct address_space *mapping,
+void __set_page_dirty(struct page *page, struct address_space *mapping,
int warn)
{
unsigned long flags;
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->page_tree,
+ radix_tree_tag_set(&mapping->i_pages,
page_index(page), PAGECACHE_TAG_DIRTY);
}
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
}
+EXPORT_SYMBOL_GPL(__set_page_dirty);
/*
* Add a page to the dirty page list.
@@ -1095,7 +1072,7 @@ __getblk_slow(struct block_device *bdev, sector_t block,
* inode list.
*
* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
- * mapping->tree_lock and mapping->host->i_lock.
+ * i_pages lock and mapping->host->i_lock.
*/
void mark_buffer_dirty(struct buffer_head *bh)
{
@@ -1511,7 +1488,7 @@ void block_invalidatepage(struct page *page, unsigned int offset,
* The get_block cached value has been unconditionally invalidated,
* so real IO is not possible anymore.
*/
- if (offset == 0)
+ if (length == PAGE_SIZE)
try_to_release_page(page, 0);
out:
return;
@@ -3450,120 +3427,6 @@ int bh_submit_read(struct buffer_head *bh)
}
EXPORT_SYMBOL(bh_submit_read);
-/*
- * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
- *
- * Returns the offset within the file on success, and -ENOENT otherwise.
- */
-static loff_t
-page_seek_hole_data(struct page *page, loff_t lastoff, int whence)
-{
- loff_t offset = page_offset(page);
- struct buffer_head *bh, *head;
- bool seek_data = whence == SEEK_DATA;
-
- if (lastoff < offset)
- lastoff = offset;
-
- bh = head = page_buffers(page);
- do {
- offset += bh->b_size;
- if (lastoff >= offset)
- continue;
-
- /*
- * Unwritten extents that have data in the page cache covering
- * them can be identified by the BH_Unwritten state flag.
- * Pages with multiple buffers might have a mix of holes, data
- * and unwritten extents - any buffer with valid data in it
- * should have BH_Uptodate flag set on it.
- */
-
- if ((buffer_unwritten(bh) || buffer_uptodate(bh)) == seek_data)
- return lastoff;
-
- lastoff = offset;
- } while ((bh = bh->b_this_page) != head);
- return -ENOENT;
-}
-
-/*
- * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
- *
- * Within unwritten extents, the page cache determines which parts are holes
- * and which are data: unwritten and uptodate buffer heads count as data;
- * everything else counts as a hole.
- *
- * Returns the resulting offset on successs, and -ENOENT otherwise.
- */
-loff_t
-page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
- int whence)
-{
- pgoff_t index = offset >> PAGE_SHIFT;
- pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
- loff_t lastoff = offset;
- struct pagevec pvec;
-
- if (length <= 0)
- return -ENOENT;
-
- pagevec_init(&pvec);
-
- do {
- unsigned nr_pages, i;
-
- nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
- end - 1);
- if (nr_pages == 0)
- break;
-
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
-
- /*
- * At this point, the page may be truncated or
- * invalidated (changing page->mapping to NULL), or
- * even swizzled back from swapper_space to tmpfs file
- * mapping. However, page->index will not change
- * because we have a reference on the page.
- *
- * If current page offset is beyond where we've ended,
- * we've found a hole.
- */
- if (whence == SEEK_HOLE &&
- lastoff < page_offset(page))
- goto check_range;
-
- lock_page(page);
- if (likely(page->mapping == inode->i_mapping) &&
- page_has_buffers(page)) {
- lastoff = page_seek_hole_data(page, lastoff, whence);
- if (lastoff >= 0) {
- unlock_page(page);
- goto check_range;
- }
- }
- unlock_page(page);
- lastoff = page_offset(page) + PAGE_SIZE;
- }
- pagevec_release(&pvec);
- } while (index < end);
-
- /* When no page at lastoff and we are not done, we found a hole. */
- if (whence != SEEK_HOLE)
- goto not_found;
-
-check_range:
- if (lastoff < offset + length)
- goto out;
-not_found:
- lastoff = -ENOENT;
-out:
- pagevec_release(&pvec);
- return lastoff;
-}
-
void __init buffer_init(void)
{
unsigned long nrpages;
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index e7f16a77a22a..222bc5d8b62c 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -32,7 +32,7 @@ static struct fscache_object *cachefiles_alloc_object(
struct cachefiles_cache *cache;
struct cachefiles_xattr *auxdata;
unsigned keylen, auxlen;
- void *buffer;
+ void *buffer, *p;
char *key;
cache = container_of(_cache, struct cachefiles_cache, cache);
@@ -65,8 +65,12 @@ static struct fscache_object *cachefiles_alloc_object(
if (!buffer)
goto nomem_buffer;
- keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512);
- ASSERTCMP(keylen, <, 512);
+ keylen = cookie->key_len;
+ if (keylen <= sizeof(cookie->inline_key))
+ p = cookie->inline_key;
+ else
+ p = cookie->key;
+ memcpy(buffer + 2, p, keylen);
*(uint16_t *)buffer = keylen;
((char *)buffer)[keylen + 2] = 0;
@@ -80,15 +84,17 @@ static struct fscache_object *cachefiles_alloc_object(
/* get hold of the auxiliary data and prepend the object type */
auxdata = buffer;
- auxlen = 0;
- if (cookie->def->get_aux) {
- auxlen = cookie->def->get_aux(cookie->netfs_data,
- auxdata->data, 511);
- ASSERTCMP(auxlen, <, 511);
+ auxlen = cookie->aux_len;
+ if (auxlen) {
+ if (auxlen <= sizeof(cookie->inline_aux))
+ p = cookie->inline_aux;
+ else
+ p = cookie->aux;
+ memcpy(auxdata->data, p, auxlen);
}
auxdata->len = auxlen + 1;
- auxdata->type = cookie->def->type;
+ auxdata->type = cookie->type;
lookup_data->auxdata = auxdata;
lookup_data->key = key;
@@ -177,10 +183,12 @@ static void cachefiles_lookup_complete(struct fscache_object *_object)
* increment the usage count on an inode object (may fail if unmounting)
*/
static
-struct fscache_object *cachefiles_grab_object(struct fscache_object *_object)
+struct fscache_object *cachefiles_grab_object(struct fscache_object *_object,
+ enum fscache_obj_ref_trace why)
{
struct cachefiles_object *object =
container_of(_object, struct cachefiles_object, fscache);
+ int u;
_enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage));
@@ -188,7 +196,9 @@ struct fscache_object *cachefiles_grab_object(struct fscache_object *_object)
ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
#endif
- atomic_inc(&object->usage);
+ u = atomic_inc_return(&object->usage);
+ trace_cachefiles_ref(object, _object->cookie,
+ (enum cachefiles_obj_ref_trace)why, u);
return &object->fscache;
}
@@ -202,6 +212,7 @@ static void cachefiles_update_object(struct fscache_object *_object)
struct cachefiles_cache *cache;
struct fscache_cookie *cookie;
const struct cred *saved_cred;
+ const void *aux;
unsigned auxlen;
_enter("{OBJ%x}", _object->debug_id);
@@ -216,26 +227,29 @@ static void cachefiles_update_object(struct fscache_object *_object)
}
cookie = object->fscache.cookie;
+ auxlen = cookie->aux_len;
- if (!cookie->def->get_aux) {
+ if (!auxlen) {
fscache_unuse_cookie(_object);
_leave(" [no aux]");
return;
}
- auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp);
+ auxdata = kmalloc(2 + auxlen + 3, cachefiles_gfp);
if (!auxdata) {
fscache_unuse_cookie(_object);
_leave(" [nomem]");
return;
}
- auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
+ aux = (auxlen <= sizeof(cookie->inline_aux)) ?
+ cookie->inline_aux : cookie->aux;
+
+ memcpy(auxdata->data, aux, auxlen);
fscache_unuse_cookie(_object);
- ASSERTCMP(auxlen, <, 511);
auxdata->len = auxlen + 1;
- auxdata->type = cookie->def->type;
+ auxdata->type = cookie->type;
cachefiles_begin_secure(cache, &saved_cred);
cachefiles_update_object_xattr(object, auxdata);
@@ -309,10 +323,12 @@ static void cachefiles_drop_object(struct fscache_object *_object)
/*
* dispose of a reference to an object
*/
-static void cachefiles_put_object(struct fscache_object *_object)
+static void cachefiles_put_object(struct fscache_object *_object,
+ enum fscache_obj_ref_trace why)
{
struct cachefiles_object *object;
struct fscache_cache *cache;
+ int u;
ASSERT(_object);
@@ -328,7 +344,11 @@ static void cachefiles_put_object(struct fscache_object *_object)
ASSERTIFCMP(object->fscache.parent,
object->fscache.parent->n_children, >, 0);
- if (atomic_dec_and_test(&object->usage)) {
+ u = atomic_dec_return(&object->usage);
+ trace_cachefiles_ref(object, _object->cookie,
+ (enum cachefiles_obj_ref_trace)why, u);
+ ASSERTCMP(u, !=, -1);
+ if (u == 0) {
_debug("- kill object OBJ%x", object->fscache.debug_id);
ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
@@ -421,7 +441,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
loff_t oi_size;
int ret;
- _object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size);
+ ni_size = _object->store_limit_l;
_enter("{OBJ%x},[%llu]",
_object->debug_id, (unsigned long long) ni_size);
@@ -493,8 +513,7 @@ static void cachefiles_invalidate_object(struct fscache_operation *op)
cache = container_of(object->fscache.cache,
struct cachefiles_cache, cache);
- op->object->cookie->def->get_attr(op->object->cookie->netfs_data,
- &ni_size);
+ ni_size = op->object->store_limit_l;
_enter("{OBJ%x},[%llu]",
op->object->debug_id, (unsigned long long)ni_size);
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index bb3a02ca9da4..d2f6f996e65a 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -124,6 +124,8 @@ struct cachefiles_xattr {
uint8_t data[];
};
+#include <trace/events/cachefiles.h>
+
/*
* note change of state for daemon
*/
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
index 711f13d8c2de..f54d3f5b2e40 100644
--- a/fs/cachefiles/main.c
+++ b/fs/cachefiles/main.c
@@ -22,6 +22,7 @@
#include <linux/statfs.h>
#include <linux/sysctl.h>
#include <linux/miscdevice.h>
+#define CREATE_TRACE_POINTS
#include "internal.h"
unsigned cachefiles_debug;
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 3978b324cbca..ab0bbe93b398 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -30,11 +30,11 @@
*/
static noinline
void __cachefiles_printk_object(struct cachefiles_object *object,
- const char *prefix,
- u8 *keybuf)
+ const char *prefix)
{
struct fscache_cookie *cookie;
- unsigned keylen, loop;
+ const u8 *k;
+ unsigned loop;
pr_err("%sobject: OBJ%x\n", prefix, object->fscache.debug_id);
pr_err("%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
@@ -56,23 +56,16 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
object->fscache.cookie->parent,
object->fscache.cookie->netfs_data,
object->fscache.cookie->flags);
- if (keybuf && cookie->def)
- keylen = cookie->def->get_key(cookie->netfs_data, keybuf,
- CACHEFILES_KEYBUF_SIZE);
- else
- keylen = 0;
+ pr_err("%skey=[%u] '", prefix, cookie->key_len);
+ k = (cookie->key_len <= sizeof(cookie->inline_key)) ?
+ cookie->inline_key : cookie->key;
+ for (loop = 0; loop < cookie->key_len; loop++)
+ pr_cont("%02x", k[loop]);
+ pr_cont("'\n");
} else {
pr_err("%scookie=NULL\n", prefix);
- keylen = 0;
}
spin_unlock(&object->fscache.lock);
-
- if (keylen) {
- pr_err("%skey=[%u] '", prefix, keylen);
- for (loop = 0; loop < keylen; loop++)
- pr_cont("%02x", keybuf[loop]);
- pr_cont("'\n");
- }
}
/*
@@ -81,14 +74,10 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
static noinline void cachefiles_printk_object(struct cachefiles_object *object,
struct cachefiles_object *xobject)
{
- u8 *keybuf;
-
- keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO);
if (object)
- __cachefiles_printk_object(object, "", keybuf);
+ __cachefiles_printk_object(object, "");
if (xobject)
- __cachefiles_printk_object(xobject, "x", keybuf);
- kfree(keybuf);
+ __cachefiles_printk_object(xobject, "x");
}
/*
@@ -120,6 +109,7 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
}
write_unlock(&cache->active_lock);
+ trace_cachefiles_mark_buried(NULL, dentry, why);
_leave(" [no owner]");
return;
@@ -130,6 +120,8 @@ found_dentry:
object->fscache.state->name,
dentry);
+ trace_cachefiles_mark_buried(object, dentry, why);
+
if (fscache_object_is_live(&object->fscache)) {
pr_err("\n");
pr_err("Error: Can't preemptively bury live object\n");
@@ -158,13 +150,15 @@ static int cachefiles_mark_object_active(struct cachefiles_cache *cache,
try_again:
write_lock(&cache->active_lock);
+ dentry = object->dentry;
+ trace_cachefiles_mark_active(object, dentry);
+
if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
pr_err("Error: Object already active\n");
cachefiles_printk_object(object, NULL);
BUG();
}
- dentry = object->dentry;
_p = &cache->active_nodes.rb_node;
while (*_p) {
_parent = *_p;
@@ -191,6 +185,8 @@ try_again:
/* an old object from a previous incarnation is hogging the slot - we
* need to wait for it to be destroyed */
wait_for_old_object:
+ trace_cachefiles_wait_active(object, dentry, xobject);
+
if (fscache_object_is_live(&xobject->fscache)) {
pr_err("\n");
pr_err("Error: Unexpected object collision\n");
@@ -248,12 +244,12 @@ wait_for_old_object:
ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags));
- cache->cache.ops->put_object(&xobject->fscache);
+ cache->cache.ops->put_object(&xobject->fscache, cachefiles_obj_put_wait_retry);
goto try_again;
requeue:
clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
- cache->cache.ops->put_object(&xobject->fscache);
+ cache->cache.ops->put_object(&xobject->fscache, cachefiles_obj_put_wait_timeo);
_leave(" = -ETIMEDOUT");
return -ETIMEDOUT;
}
@@ -265,6 +261,11 @@ void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
struct cachefiles_object *object,
blkcnt_t i_blocks)
{
+ struct dentry *dentry = object->dentry;
+ struct inode *inode = d_backing_inode(dentry);
+
+ trace_cachefiles_mark_inactive(object, dentry, inode);
+
write_lock(&cache->active_lock);
rb_erase(&object->active_node, &cache->active_nodes);
clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
@@ -288,6 +289,7 @@ void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
* - unlocks the directory mutex
*/
static int cachefiles_bury_object(struct cachefiles_cache *cache,
+ struct cachefiles_object *object,
struct dentry *dir,
struct dentry *rep,
bool preemptive,
@@ -312,6 +314,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
if (ret < 0) {
cachefiles_io_error(cache, "Unlink security error");
} else {
+ trace_cachefiles_unlink(object, rep, why);
ret = vfs_unlink(d_inode(dir), rep, NULL);
if (preemptive)
@@ -413,6 +416,7 @@ try_again:
if (ret < 0) {
cachefiles_io_error(cache, "Rename security error %d", ret);
} else {
+ trace_cachefiles_rename(object, rep, grave, why);
ret = vfs_rename(d_inode(dir), rep,
d_inode(cache->graveyard), grave, NULL, 0);
if (ret != 0 && ret != -ENOMEM)
@@ -458,7 +462,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
/* we need to check that our parent is _still_ our parent - it
* may have been renamed */
if (dir == object->dentry->d_parent) {
- ret = cachefiles_bury_object(cache, dir,
+ ret = cachefiles_bury_object(cache, object, dir,
object->dentry, false,
FSCACHE_OBJECT_WAS_RETIRED);
} else {
@@ -486,6 +490,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
{
struct cachefiles_cache *cache;
struct dentry *dir, *next = NULL;
+ struct inode *inode;
struct path path;
unsigned long start;
const char *name;
@@ -529,13 +534,17 @@ lookup_again:
start = jiffies;
next = lookup_one_len(name, dir, nlen);
cachefiles_hist(cachefiles_lookup_histogram, start);
- if (IS_ERR(next))
+ if (IS_ERR(next)) {
+ trace_cachefiles_lookup(object, next, NULL);
goto lookup_error;
+ }
- _debug("next -> %p %s", next, d_backing_inode(next) ? "positive" : "negative");
+ inode = d_backing_inode(next);
+ trace_cachefiles_lookup(object, next, inode);
+ _debug("next -> %p %s", next, inode ? "positive" : "negative");
if (!key)
- object->new = !d_backing_inode(next);
+ object->new = !inode;
/* if this element of the path doesn't exist, then the lookup phase
* failed, and we can release any readers in the certain knowledge that
@@ -558,9 +567,16 @@ lookup_again:
start = jiffies;
ret = vfs_mkdir(d_inode(dir), next, 0);
cachefiles_hist(cachefiles_mkdir_histogram, start);
+ if (!key)
+ trace_cachefiles_mkdir(object, next, ret);
if (ret < 0)
goto create_error;
+ if (unlikely(d_unhashed(next))) {
+ dput(next);
+ inode_unlock(d_inode(dir));
+ goto lookup_again;
+ }
ASSERT(d_backing_inode(next));
_debug("mkdir -> %p{%p{ino=%lu}}",
@@ -587,6 +603,7 @@ lookup_again:
start = jiffies;
ret = vfs_create(d_inode(dir), next, S_IFREG, true);
cachefiles_hist(cachefiles_create_histogram, start);
+ trace_cachefiles_create(object, next, ret);
if (ret < 0)
goto create_error;
@@ -629,7 +646,8 @@ lookup_again:
* mutex) */
object->dentry = NULL;
- ret = cachefiles_bury_object(cache, dir, next, true,
+ ret = cachefiles_bury_object(cache, object, dir, next,
+ true,
FSCACHE_OBJECT_IS_STALE);
dput(next);
next = NULL;
@@ -751,6 +769,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
/* search the current directory for the element name */
inode_lock(d_inode(dir));
+retry:
start = jiffies;
subdir = lookup_one_len(dirname, dir, strlen(dirname));
cachefiles_hist(cachefiles_lookup_histogram, start);
@@ -780,6 +799,10 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
if (ret < 0)
goto mkdir_error;
+ if (unlikely(d_unhashed(subdir))) {
+ dput(subdir);
+ goto retry;
+ }
ASSERT(d_backing_inode(subdir));
_debug("mkdir -> %p{%p{ino=%lu}}",
@@ -955,7 +978,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
/* actually remove the victim (drops the dir mutex) */
_debug("bury");
- ret = cachefiles_bury_object(cache, dir, victim, false,
+ ret = cachefiles_bury_object(cache, NULL, dir, victim, false,
FSCACHE_OBJECT_WAS_CULLED);
if (ret < 0)
goto error;
diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c
index 125b90f6c796..0ce1aa56b67f 100644
--- a/fs/cachefiles/proc.c
+++ b/fs/cachefiles/proc.c
@@ -85,21 +85,6 @@ static const struct seq_operations cachefiles_histogram_ops = {
};
/*
- * open "/proc/fs/cachefiles/XXX" which provide statistics summaries
- */
-static int cachefiles_histogram_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &cachefiles_histogram_ops);
-}
-
-static const struct file_operations cachefiles_histogram_fops = {
- .open = cachefiles_histogram_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-/*
* initialise the /proc/fs/cachefiles/ directory
*/
int __init cachefiles_proc_init(void)
@@ -109,8 +94,8 @@ int __init cachefiles_proc_init(void)
if (!proc_mkdir("fs/cachefiles", NULL))
goto error_dir;
- if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL,
- &cachefiles_histogram_fops))
+ if (!proc_create_seq("fs/cachefiles/histogram", S_IFREG | 0444, NULL,
+ &cachefiles_histogram_ops))
goto error_histogram;
_leave(" = 0");
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 883bc7bb12c5..5082c8a49686 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -952,6 +952,7 @@ error:
* - cache withdrawal is prevented by the caller
*/
void cachefiles_uncache_page(struct fscache_object *_object, struct page *page)
+ __releases(&object->fscache.cookie->lock)
{
struct cachefiles_object *object;
struct cachefiles_cache *cache;
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index d31c1a72d8a5..0a29a00aed2e 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -113,6 +113,7 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object,
/* attempt to install the cache metadata directly */
_debug("SET #%u", auxdata->len);
+ clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags);
ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
&auxdata->type, auxdata->len,
XATTR_CREATE);
@@ -141,6 +142,7 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
/* attempt to install the cache metadata directly */
_debug("SET #%u", auxdata->len);
+ clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags);
ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
&auxdata->type, auxdata->len,
XATTR_REPLACE);
@@ -180,7 +182,8 @@ int cachefiles_check_auxdata(struct cachefiles_object *object)
goto error;
xlen--;
- validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen);
+ validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen,
+ i_size_read(d_backing_inode(dentry)));
if (validity != FSCACHE_CHECKAUX_OKAY)
goto error;
@@ -249,7 +252,8 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object,
object->fscache.cookie->def->name, dlen);
result = fscache_check_aux(&object->fscache,
- &auxbuf->data, dlen);
+ &auxbuf->data, dlen,
+ i_size_read(d_backing_inode(dentry)));
switch (result) {
/* entry okay as is */
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 174f5709e508..a699e320393f 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -6,7 +6,7 @@
obj-$(CONFIG_CEPH_FS) += ceph.o
ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
- export.o caps.o snap.o xattr.o \
+ export.o caps.o snap.o xattr.o quota.o \
mds_client.o mdsmap.o strings.o ceph_frag.o \
debugfs.o
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index b4336b42ce3b..5f7ad3d0df2e 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -15,6 +15,7 @@
#include "mds_client.h"
#include "cache.h"
#include <linux/ceph/osd_client.h>
+#include <linux/ceph/striper.h>
/*
* Ceph address space ops.
@@ -438,7 +439,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
{
struct inode *inode = file_inode(file);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_file_info *ci = file->private_data;
+ struct ceph_file_info *fi = file->private_data;
struct ceph_rw_context *rw_ctx;
int rc = 0;
int max = 0;
@@ -452,7 +453,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
if (rc == 0)
goto out;
- rw_ctx = ceph_find_rw_context(ci);
+ rw_ctx = ceph_find_rw_context(fi);
max = fsc->mount_options->rsize >> PAGE_SHIFT;
dout("readpages %p file %p ctx %p nr_pages %d max %d\n",
inode, file, rw_ctx, nr_pages, max);
@@ -800,7 +801,7 @@ static int ceph_writepages_start(struct address_space *mapping,
struct ceph_osd_request *req = NULL;
struct ceph_writeback_ctl ceph_wbc;
bool should_loop, range_whole = false;
- bool stop, done = false;
+ bool done = false;
dout("writepages_start %p (mode=%s)\n", inode,
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
@@ -856,7 +857,7 @@ retry:
* in that range can be associated with newer snapc.
* They are not writeable until we write all dirty pages
* associated with 'snapc' get written */
- if (index > 0 || wbc->sync_mode != WB_SYNC_NONE)
+ if (index > 0)
should_loop = true;
dout(" non-head snapc, range whole\n");
}
@@ -864,8 +865,7 @@ retry:
ceph_put_snap_context(last_snapc);
last_snapc = snapc;
- stop = false;
- while (!stop && index <= end) {
+ while (!done && index <= end) {
int num_ops = 0, op_idx;
unsigned i, pvec_pages, max_pages, locked_pages = 0;
struct page **pages = NULL, **data_pages;
@@ -898,16 +898,30 @@ get_more_pages:
unlock_page(page);
continue;
}
- if (strip_unit_end && (page->index > strip_unit_end)) {
- dout("end of strip unit %p\n", page);
+ /* only if matching snap context */
+ pgsnapc = page_snap_context(page);
+ if (pgsnapc != snapc) {
+ dout("page snapc %p %lld != oldest %p %lld\n",
+ pgsnapc, pgsnapc->seq, snapc, snapc->seq);
+ if (!should_loop &&
+ !ceph_wbc.head_snapc &&
+ wbc->sync_mode != WB_SYNC_NONE)
+ should_loop = true;
unlock_page(page);
- break;
+ continue;
}
if (page_offset(page) >= ceph_wbc.i_size) {
dout("%p page eof %llu\n",
page, ceph_wbc.i_size);
- /* not done if range_cyclic */
- stop = true;
+ if (ceph_wbc.size_stable ||
+ page_offset(page) >= i_size_read(inode))
+ mapping->a_ops->invalidatepage(page,
+ 0, PAGE_SIZE);
+ unlock_page(page);
+ continue;
+ }
+ if (strip_unit_end && (page->index > strip_unit_end)) {
+ dout("end of strip unit %p\n", page);
unlock_page(page);
break;
}
@@ -921,15 +935,6 @@ get_more_pages:
wait_on_page_writeback(page);
}
- /* only if matching snap context */
- pgsnapc = page_snap_context(page);
- if (pgsnapc != snapc) {
- dout("page snapc %p %lld != oldest %p %lld\n",
- pgsnapc, pgsnapc->seq, snapc, snapc->seq);
- unlock_page(page);
- continue;
- }
-
if (!clear_page_dirty_for_io(page)) {
dout("%p !clear_page_dirty_for_io\n", page);
unlock_page(page);
@@ -945,19 +950,15 @@ get_more_pages:
if (locked_pages == 0) {
u64 objnum;
u64 objoff;
+ u32 xlen;
/* prepare async write request */
offset = (u64)page_offset(page);
- len = wsize;
-
- rc = ceph_calc_file_object_mapping(&ci->i_layout,
- offset, len,
- &objnum, &objoff,
- &len);
- if (rc < 0) {
- unlock_page(page);
- break;
- }
+ ceph_calc_file_object_mapping(&ci->i_layout,
+ offset, wsize,
+ &objnum, &objoff,
+ &xlen);
+ len = xlen;
num_ops = 1;
strip_unit_end = page->index +
@@ -1146,7 +1147,7 @@ new_request:
* we tagged for writeback prior to entering this loop.
*/
if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE)
- done = stop = true;
+ done = true;
release_pvec_pages:
dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index a3ab265d3215..bb524c880b1e 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -27,7 +27,6 @@
struct ceph_aux_inode {
u64 version;
struct timespec mtime;
- loff_t size;
};
struct fscache_netfs ceph_cache_netfs = {
@@ -41,37 +40,18 @@ static LIST_HEAD(ceph_fscache_list);
struct ceph_fscache_entry {
struct list_head list;
struct fscache_cookie *fscache;
- struct ceph_fsid fsid;
size_t uniq_len;
+ /* The following members must be last */
+ struct ceph_fsid fsid;
char uniquifier[0];
};
-static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t maxbuf)
-{
- const struct ceph_fs_client* fsc = cookie_netfs_data;
- const char *fscache_uniq = fsc->mount_options->fscache_uniq;
- uint16_t fsid_len, uniq_len;
-
- fsid_len = sizeof(fsc->client->fsid);
- uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0;
- if (fsid_len + uniq_len > maxbuf)
- return 0;
-
- memcpy(buffer, &fsc->client->fsid, fsid_len);
- if (uniq_len)
- memcpy(buffer + fsid_len, fscache_uniq, uniq_len);
-
- return fsid_len + uniq_len;
-}
-
static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
.name = "CEPH.fsid",
.type = FSCACHE_COOKIE_TYPE_INDEX,
- .get_key = ceph_fscache_session_get_key,
};
-int ceph_fscache_register(void)
+int __init ceph_fscache_register(void)
{
return fscache_register_netfs(&ceph_cache_netfs);
}
@@ -110,16 +90,19 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
goto out_unlock;
}
+ memcpy(&ent->fsid, fsid, sizeof(*fsid));
+ if (uniq_len > 0) {
+ memcpy(&ent->uniquifier, fscache_uniq, uniq_len);
+ ent->uniq_len = uniq_len;
+ }
+
fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
&ceph_fscache_fsid_object_def,
- fsc, true);
+ &ent->fsid, sizeof(ent->fsid) + uniq_len,
+ NULL, 0,
+ fsc, 0, true);
if (fsc->fscache) {
- memcpy(&ent->fsid, fsid, sizeof(*fsid));
- if (uniq_len > 0) {
- memcpy(&ent->uniquifier, fscache_uniq, uniq_len);
- ent->uniq_len = uniq_len;
- }
ent->fscache = fsc->fscache;
list_add_tail(&ent->list, &ceph_fscache_list);
} else {
@@ -133,73 +116,32 @@ out_unlock:
return err;
}
-static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t maxbuf)
-{
- const struct ceph_inode_info* ci = cookie_netfs_data;
- uint16_t klen;
-
- /* use ceph virtual inode (id + snapshot) */
- klen = sizeof(ci->i_vino);
- if (klen > maxbuf)
- return 0;
-
- memcpy(buffer, &ci->i_vino, klen);
- return klen;
-}
-
-static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- struct ceph_aux_inode aux;
- const struct ceph_inode_info* ci = cookie_netfs_data;
- const struct inode* inode = &ci->vfs_inode;
-
- memset(&aux, 0, sizeof(aux));
- aux.version = ci->i_version;
- aux.mtime = inode->i_mtime;
- aux.size = i_size_read(inode);
-
- memcpy(buffer, &aux, sizeof(aux));
-
- return sizeof(aux);
-}
-
-static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data,
- uint64_t *size)
-{
- const struct ceph_inode_info* ci = cookie_netfs_data;
- *size = i_size_read(&ci->vfs_inode);
-}
-
static enum fscache_checkaux ceph_fscache_inode_check_aux(
- void *cookie_netfs_data, const void *data, uint16_t dlen)
+ void *cookie_netfs_data, const void *data, uint16_t dlen,
+ loff_t object_size)
{
struct ceph_aux_inode aux;
struct ceph_inode_info* ci = cookie_netfs_data;
struct inode* inode = &ci->vfs_inode;
- if (dlen != sizeof(aux))
+ if (dlen != sizeof(aux) ||
+ i_size_read(inode) != object_size)
return FSCACHE_CHECKAUX_OBSOLETE;
memset(&aux, 0, sizeof(aux));
aux.version = ci->i_version;
aux.mtime = inode->i_mtime;
- aux.size = i_size_read(inode);
if (memcmp(data, &aux, sizeof(aux)) != 0)
return FSCACHE_CHECKAUX_OBSOLETE;
- dout("ceph inode 0x%p cached okay", ci);
+ dout("ceph inode 0x%p cached okay\n", ci);
return FSCACHE_CHECKAUX_OKAY;
}
static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
.name = "CEPH.inode",
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
- .get_key = ceph_fscache_inode_get_key,
- .get_attr = ceph_fscache_inode_get_attr,
- .get_aux = ceph_fscache_inode_get_aux,
.check_aux = ceph_fscache_inode_check_aux,
};
@@ -207,6 +149,7 @@ void ceph_fscache_register_inode_cookie(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_aux_inode aux;
/* No caching for filesystem */
if (!fsc->fscache)
@@ -218,9 +161,14 @@ void ceph_fscache_register_inode_cookie(struct inode *inode)
inode_lock_nested(inode, I_MUTEX_CHILD);
if (!ci->fscache) {
+ memset(&aux, 0, sizeof(aux));
+ aux.version = ci->i_version;
+ aux.mtime = inode->i_mtime;
ci->fscache = fscache_acquire_cookie(fsc->fscache,
- &ceph_fscache_inode_object_def,
- ci, false);
+ &ceph_fscache_inode_object_def,
+ &ci->i_vino, sizeof(ci->i_vino),
+ &aux, sizeof(aux),
+ ci, i_size_read(inode), false);
}
inode_unlock(inode);
}
@@ -235,7 +183,7 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
ci->fscache = NULL;
fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode);
- fscache_relinquish_cookie(cookie, 0);
+ fscache_relinquish_cookie(cookie, &ci->i_vino, false);
}
static bool ceph_fscache_can_enable(void *data)
@@ -254,11 +202,11 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
if (inode_is_open_for_write(inode)) {
dout("fscache_file_set_cookie %p %p disabling cache\n",
inode, filp);
- fscache_disable_cookie(ci->fscache, false);
+ fscache_disable_cookie(ci->fscache, &ci->i_vino, false);
fscache_uncache_all_inode_pages(ci->fscache, inode);
} else {
- fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable,
- inode);
+ fscache_enable_cookie(ci->fscache, &ci->i_vino, i_size_read(inode),
+ ceph_fscache_can_enable, inode);
if (fscache_cookie_enabled(ci->fscache)) {
dout("fscache_file_set_cookie %p %p enabling cache\n",
inode, filp);
@@ -351,7 +299,8 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page)
if (!cache_valid(ci))
return;
- ret = fscache_write_page(ci->fscache, page, GFP_KERNEL);
+ ret = fscache_write_page(ci->fscache, page, i_size_read(inode),
+ GFP_KERNEL);
if (ret)
fscache_uncache_page(ci->fscache, page);
}
@@ -385,7 +334,7 @@ void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
WARN_ON_ONCE(!found);
mutex_unlock(&ceph_fscache_lock);
- __fscache_relinquish_cookie(fsc->fscache, 0);
+ __fscache_relinquish_cookie(fsc->fscache, NULL, false);
}
fsc->fscache = NULL;
}
@@ -402,7 +351,7 @@ void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci)
* truncate while the caller holds CEPH_CAP_FILE_RD */
mutex_lock(&ci->i_truncate_mutex);
if (!cache_valid(ci)) {
- if (fscache_check_consistency(ci->fscache))
+ if (fscache_check_consistency(ci->fscache, &ci->i_vino))
fscache_invalidate(ci->fscache);
spin_lock(&ci->i_ceph_lock);
ci->i_fscache_gen = ci->i_rdcache_gen;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 0e5bd3e3344e..23dbfae16156 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -184,36 +184,54 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
mdsc->caps_avail_count);
spin_unlock(&mdsc->caps_list_lock);
- for (i = have; i < need; i++) {
-retry:
+ for (i = have; i < need; ) {
cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
- if (!cap) {
- if (!trimmed) {
- for (j = 0; j < mdsc->max_sessions; j++) {
- s = __ceph_lookup_mds_session(mdsc, j);
- if (!s)
- continue;
- mutex_unlock(&mdsc->mutex);
+ if (cap) {
+ list_add(&cap->caps_item, &newcaps);
+ alloc++;
+ i++;
+ continue;
+ }
- mutex_lock(&s->s_mutex);
- max_caps = s->s_nr_caps - (need - i);
- ceph_trim_caps(mdsc, s, max_caps);
- mutex_unlock(&s->s_mutex);
+ if (!trimmed) {
+ for (j = 0; j < mdsc->max_sessions; j++) {
+ s = __ceph_lookup_mds_session(mdsc, j);
+ if (!s)
+ continue;
+ mutex_unlock(&mdsc->mutex);
- ceph_put_mds_session(s);
- mutex_lock(&mdsc->mutex);
- }
- trimmed = true;
- goto retry;
- } else {
- pr_warn("reserve caps ctx=%p ENOMEM "
- "need=%d got=%d\n",
- ctx, need, have + alloc);
- goto out_nomem;
+ mutex_lock(&s->s_mutex);
+ max_caps = s->s_nr_caps - (need - i);
+ ceph_trim_caps(mdsc, s, max_caps);
+ mutex_unlock(&s->s_mutex);
+
+ ceph_put_mds_session(s);
+ mutex_lock(&mdsc->mutex);
}
+ trimmed = true;
+
+ spin_lock(&mdsc->caps_list_lock);
+ if (mdsc->caps_avail_count) {
+ int more_have;
+ if (mdsc->caps_avail_count >= need - i)
+ more_have = need - i;
+ else
+ more_have = mdsc->caps_avail_count;
+
+ i += more_have;
+ have += more_have;
+ mdsc->caps_avail_count -= more_have;
+ mdsc->caps_reserve_count += more_have;
+
+ }
+ spin_unlock(&mdsc->caps_list_lock);
+
+ continue;
}
- list_add(&cap->caps_item, &newcaps);
- alloc++;
+
+ pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
+ ctx, need, have + alloc);
+ goto out_nomem;
}
BUG_ON(have + alloc != need);
@@ -234,16 +252,28 @@ retry:
return 0;
out_nomem:
+
+ spin_lock(&mdsc->caps_list_lock);
+ mdsc->caps_avail_count += have;
+ mdsc->caps_reserve_count -= have;
+
while (!list_empty(&newcaps)) {
cap = list_first_entry(&newcaps,
struct ceph_cap, caps_item);
list_del(&cap->caps_item);
- kmem_cache_free(ceph_cap_cachep, cap);
+
+ /* Keep some preallocated caps around (ceph_min_count), to
+ * avoid lots of free/alloc churn. */
+ if (mdsc->caps_avail_count >=
+ mdsc->caps_reserve_count + mdsc->caps_min_count) {
+ kmem_cache_free(ceph_cap_cachep, cap);
+ } else {
+ mdsc->caps_avail_count++;
+ mdsc->caps_total_count++;
+ list_add(&cap->caps_item, &mdsc->caps_list);
+ }
}
- spin_lock(&mdsc->caps_list_lock);
- mdsc->caps_avail_count += have;
- mdsc->caps_reserve_count -= have;
BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
mdsc->caps_reserve_count +
mdsc->caps_avail_count);
@@ -254,12 +284,26 @@ out_nomem:
int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx)
{
+ int i;
+ struct ceph_cap *cap;
+
dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
if (ctx->count) {
spin_lock(&mdsc->caps_list_lock);
BUG_ON(mdsc->caps_reserve_count < ctx->count);
mdsc->caps_reserve_count -= ctx->count;
- mdsc->caps_avail_count += ctx->count;
+ if (mdsc->caps_avail_count >=
+ mdsc->caps_reserve_count + mdsc->caps_min_count) {
+ mdsc->caps_total_count -= ctx->count;
+ for (i = 0; i < ctx->count; i++) {
+ cap = list_first_entry(&mdsc->caps_list,
+ struct ceph_cap, caps_item);
+ list_del(&cap->caps_item);
+ kmem_cache_free(ceph_cap_cachep, cap);
+ }
+ } else {
+ mdsc->caps_avail_count += ctx->count;
+ }
ctx->count = 0;
dout("unreserve caps %d = %d used + %d resv + %d avail\n",
mdsc->caps_total_count, mdsc->caps_use_count,
@@ -285,7 +329,23 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
mdsc->caps_use_count++;
mdsc->caps_total_count++;
spin_unlock(&mdsc->caps_list_lock);
+ } else {
+ spin_lock(&mdsc->caps_list_lock);
+ if (mdsc->caps_avail_count) {
+ BUG_ON(list_empty(&mdsc->caps_list));
+
+ mdsc->caps_avail_count--;
+ mdsc->caps_use_count++;
+ cap = list_first_entry(&mdsc->caps_list,
+ struct ceph_cap, caps_item);
+ list_del(&cap->caps_item);
+
+ BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+ mdsc->caps_reserve_count + mdsc->caps_avail_count);
+ }
+ spin_unlock(&mdsc->caps_list_lock);
}
+
return cap;
}
@@ -341,6 +401,8 @@ void ceph_reservation_status(struct ceph_fs_client *fsc,
{
struct ceph_mds_client *mdsc = fsc->mdsc;
+ spin_lock(&mdsc->caps_list_lock);
+
if (total)
*total = mdsc->caps_total_count;
if (avail)
@@ -351,6 +413,8 @@ void ceph_reservation_status(struct ceph_fs_client *fsc,
*reserved = mdsc->caps_reserve_count;
if (min)
*min = mdsc->caps_min_count;
+
+ spin_unlock(&mdsc->caps_list_lock);
}
/*
@@ -639,9 +703,11 @@ void ceph_add_cap(struct inode *inode,
}
spin_lock(&realm->inodes_with_caps_lock);
- ci->i_snap_realm = realm;
list_add(&ci->i_snap_realm_item,
&realm->inodes_with_caps);
+ ci->i_snap_realm = realm;
+ if (realm->ino == ci->i_vino.ino)
+ realm->inode = inode;
spin_unlock(&realm->inodes_with_caps_lock);
if (oldrealm)
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 644def813754..abdf98deeec4 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -260,7 +260,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
goto out;
fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
- 0600,
+ 0400,
fsc->client->debugfs_dir,
fsc,
&mdsmap_show_fops);
@@ -268,7 +268,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
goto out;
fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions",
- 0600,
+ 0400,
fsc->client->debugfs_dir,
fsc,
&mds_sessions_show_fops);
@@ -276,7 +276,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
goto out;
fsc->debugfs_mdsc = debugfs_create_file("mdsc",
- 0600,
+ 0400,
fsc->client->debugfs_dir,
fsc,
&mdsc_show_fops);
@@ -292,7 +292,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
goto out;
fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
- 0600,
+ 0400,
fsc->client->debugfs_dir,
fsc,
&dentry_lru_show_fops);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f1d9c6cc0491..1a78dd6f8bf2 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -2,7 +2,6 @@
#include <linux/ceph/ceph_debug.h>
#include <linux/spinlock.h>
-#include <linux/fs_struct.h>
#include <linux/namei.h>
#include <linux/slab.h>
#include <linux/sched.h>
@@ -102,18 +101,18 @@ static int fpos_cmp(loff_t l, loff_t r)
* regardless of what dir changes take place on the
* server.
*/
-static int note_last_dentry(struct ceph_file_info *fi, const char *name,
+static int note_last_dentry(struct ceph_dir_file_info *dfi, const char *name,
int len, unsigned next_offset)
{
char *buf = kmalloc(len+1, GFP_KERNEL);
if (!buf)
return -ENOMEM;
- kfree(fi->last_name);
- fi->last_name = buf;
- memcpy(fi->last_name, name, len);
- fi->last_name[len] = 0;
- fi->next_offset = next_offset;
- dout("note_last_dentry '%s'\n", fi->last_name);
+ kfree(dfi->last_name);
+ dfi->last_name = buf;
+ memcpy(dfi->last_name, name, len);
+ dfi->last_name[len] = 0;
+ dfi->next_offset = next_offset;
+ dout("note_last_dentry '%s'\n", dfi->last_name);
return 0;
}
@@ -175,7 +174,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
static int __dcache_readdir(struct file *file, struct dir_context *ctx,
int shared_gen)
{
- struct ceph_file_info *fi = file->private_data;
+ struct ceph_dir_file_info *dfi = file->private_data;
struct dentry *parent = file->f_path.dentry;
struct inode *dir = d_inode(parent);
struct dentry *dentry, *last = NULL;
@@ -222,7 +221,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
bool emit_dentry = false;
dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
if (!dentry) {
- fi->flags |= CEPH_F_ATEND;
+ dfi->file_info.flags |= CEPH_F_ATEND;
err = 0;
break;
}
@@ -273,33 +272,33 @@ out:
if (last) {
int ret;
di = ceph_dentry(last);
- ret = note_last_dentry(fi, last->d_name.name, last->d_name.len,
+ ret = note_last_dentry(dfi, last->d_name.name, last->d_name.len,
fpos_off(di->offset) + 1);
if (ret < 0)
err = ret;
dput(last);
/* last_name no longer match cache index */
- if (fi->readdir_cache_idx >= 0) {
- fi->readdir_cache_idx = -1;
- fi->dir_release_count = 0;
+ if (dfi->readdir_cache_idx >= 0) {
+ dfi->readdir_cache_idx = -1;
+ dfi->dir_release_count = 0;
}
}
return err;
}
-static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos)
+static bool need_send_readdir(struct ceph_dir_file_info *dfi, loff_t pos)
{
- if (!fi->last_readdir)
+ if (!dfi->last_readdir)
return true;
if (is_hash_order(pos))
- return !ceph_frag_contains_value(fi->frag, fpos_hash(pos));
+ return !ceph_frag_contains_value(dfi->frag, fpos_hash(pos));
else
- return fi->frag != fpos_frag(pos);
+ return dfi->frag != fpos_frag(pos);
}
static int ceph_readdir(struct file *file, struct dir_context *ctx)
{
- struct ceph_file_info *fi = file->private_data;
+ struct ceph_dir_file_info *dfi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
@@ -310,7 +309,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
struct ceph_mds_reply_info_parsed *rinfo;
dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
- if (fi->flags & CEPH_F_ATEND)
+ if (dfi->file_info.flags & CEPH_F_ATEND)
return 0;
/* always start with . and .. */
@@ -351,15 +350,15 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
/* proceed with a normal readdir */
more:
/* do we have the correct frag content buffered? */
- if (need_send_readdir(fi, ctx->pos)) {
+ if (need_send_readdir(dfi, ctx->pos)) {
struct ceph_mds_request *req;
int op = ceph_snap(inode) == CEPH_SNAPDIR ?
CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
/* discard old result, if any */
- if (fi->last_readdir) {
- ceph_mdsc_put_request(fi->last_readdir);
- fi->last_readdir = NULL;
+ if (dfi->last_readdir) {
+ ceph_mdsc_put_request(dfi->last_readdir);
+ dfi->last_readdir = NULL;
}
if (is_hash_order(ctx->pos)) {
@@ -373,7 +372,7 @@ more:
}
dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
- ceph_vinop(inode), frag, fi->last_name);
+ ceph_vinop(inode), frag, dfi->last_name);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -389,8 +388,8 @@ more:
__set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
req->r_inode_drop = CEPH_CAP_FILE_EXCL;
}
- if (fi->last_name) {
- req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
+ if (dfi->last_name) {
+ req->r_path2 = kstrdup(dfi->last_name, GFP_KERNEL);
if (!req->r_path2) {
ceph_mdsc_put_request(req);
return -ENOMEM;
@@ -400,10 +399,10 @@ more:
cpu_to_le32(fpos_hash(ctx->pos));
}
- req->r_dir_release_cnt = fi->dir_release_count;
- req->r_dir_ordered_cnt = fi->dir_ordered_count;
- req->r_readdir_cache_idx = fi->readdir_cache_idx;
- req->r_readdir_offset = fi->next_offset;
+ req->r_dir_release_cnt = dfi->dir_release_count;
+ req->r_dir_ordered_cnt = dfi->dir_ordered_count;
+ req->r_readdir_cache_idx = dfi->readdir_cache_idx;
+ req->r_readdir_offset = dfi->next_offset;
req->r_args.readdir.frag = cpu_to_le32(frag);
req->r_args.readdir.flags =
cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
@@ -427,35 +426,35 @@ more:
if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
frag = le32_to_cpu(rinfo->dir_dir->frag);
if (!rinfo->hash_order) {
- fi->next_offset = req->r_readdir_offset;
+ dfi->next_offset = req->r_readdir_offset;
/* adjust ctx->pos to beginning of frag */
ctx->pos = ceph_make_fpos(frag,
- fi->next_offset,
+ dfi->next_offset,
false);
}
}
- fi->frag = frag;
- fi->last_readdir = req;
+ dfi->frag = frag;
+ dfi->last_readdir = req;
if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) {
- fi->readdir_cache_idx = req->r_readdir_cache_idx;
- if (fi->readdir_cache_idx < 0) {
+ dfi->readdir_cache_idx = req->r_readdir_cache_idx;
+ if (dfi->readdir_cache_idx < 0) {
/* preclude from marking dir ordered */
- fi->dir_ordered_count = 0;
+ dfi->dir_ordered_count = 0;
} else if (ceph_frag_is_leftmost(frag) &&
- fi->next_offset == 2) {
+ dfi->next_offset == 2) {
/* note dir version at start of readdir so
* we can tell if any dentries get dropped */
- fi->dir_release_count = req->r_dir_release_cnt;
- fi->dir_ordered_count = req->r_dir_ordered_cnt;
+ dfi->dir_release_count = req->r_dir_release_cnt;
+ dfi->dir_ordered_count = req->r_dir_ordered_cnt;
}
} else {
- dout("readdir !did_prepopulate");
+ dout("readdir !did_prepopulate\n");
/* disable readdir cache */
- fi->readdir_cache_idx = -1;
+ dfi->readdir_cache_idx = -1;
/* preclude from marking dir complete */
- fi->dir_release_count = 0;
+ dfi->dir_release_count = 0;
}
/* note next offset and last dentry name */
@@ -464,19 +463,19 @@ more:
rinfo->dir_entries + (rinfo->dir_nr-1);
unsigned next_offset = req->r_reply_info.dir_end ?
2 : (fpos_off(rde->offset) + 1);
- err = note_last_dentry(fi, rde->name, rde->name_len,
+ err = note_last_dentry(dfi, rde->name, rde->name_len,
next_offset);
if (err)
return err;
} else if (req->r_reply_info.dir_end) {
- fi->next_offset = 2;
+ dfi->next_offset = 2;
/* keep last name */
}
}
- rinfo = &fi->last_readdir->r_reply_info;
+ rinfo = &dfi->last_readdir->r_reply_info;
dout("readdir frag %x num %d pos %llx chunk first %llx\n",
- fi->frag, rinfo->dir_nr, ctx->pos,
+ dfi->frag, rinfo->dir_nr, ctx->pos,
rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
i = 0;
@@ -520,52 +519,55 @@ more:
ctx->pos++;
}
- ceph_mdsc_put_request(fi->last_readdir);
- fi->last_readdir = NULL;
+ ceph_mdsc_put_request(dfi->last_readdir);
+ dfi->last_readdir = NULL;
- if (fi->next_offset > 2) {
- frag = fi->frag;
+ if (dfi->next_offset > 2) {
+ frag = dfi->frag;
goto more;
}
/* more frags? */
- if (!ceph_frag_is_rightmost(fi->frag)) {
- frag = ceph_frag_next(fi->frag);
+ if (!ceph_frag_is_rightmost(dfi->frag)) {
+ frag = ceph_frag_next(dfi->frag);
if (is_hash_order(ctx->pos)) {
loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
- fi->next_offset, true);
+ dfi->next_offset, true);
if (new_pos > ctx->pos)
ctx->pos = new_pos;
/* keep last_name */
} else {
- ctx->pos = ceph_make_fpos(frag, fi->next_offset, false);
- kfree(fi->last_name);
- fi->last_name = NULL;
+ ctx->pos = ceph_make_fpos(frag, dfi->next_offset,
+ false);
+ kfree(dfi->last_name);
+ dfi->last_name = NULL;
}
dout("readdir next frag is %x\n", frag);
goto more;
}
- fi->flags |= CEPH_F_ATEND;
+ dfi->file_info.flags |= CEPH_F_ATEND;
/*
* if dir_release_count still matches the dir, no dentries
* were released during the whole readdir, and we should have
* the complete dir contents in our cache.
*/
- if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) {
+ if (atomic64_read(&ci->i_release_count) ==
+ dfi->dir_release_count) {
spin_lock(&ci->i_ceph_lock);
- if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) {
+ if (dfi->dir_ordered_count ==
+ atomic64_read(&ci->i_ordered_count)) {
dout(" marking %p complete and ordered\n", inode);
/* use i_size to track number of entries in
* readdir cache */
- BUG_ON(fi->readdir_cache_idx < 0);
- i_size_write(inode, fi->readdir_cache_idx *
+ BUG_ON(dfi->readdir_cache_idx < 0);
+ i_size_write(inode, dfi->readdir_cache_idx *
sizeof(struct dentry*));
} else {
dout(" marking %p complete\n", inode);
}
- __ceph_dir_set_complete(ci, fi->dir_release_count,
- fi->dir_ordered_count);
+ __ceph_dir_set_complete(ci, dfi->dir_release_count,
+ dfi->dir_ordered_count);
spin_unlock(&ci->i_ceph_lock);
}
@@ -573,25 +575,25 @@ more:
return 0;
}
-static void reset_readdir(struct ceph_file_info *fi)
+static void reset_readdir(struct ceph_dir_file_info *dfi)
{
- if (fi->last_readdir) {
- ceph_mdsc_put_request(fi->last_readdir);
- fi->last_readdir = NULL;
+ if (dfi->last_readdir) {
+ ceph_mdsc_put_request(dfi->last_readdir);
+ dfi->last_readdir = NULL;
}
- kfree(fi->last_name);
- fi->last_name = NULL;
- fi->dir_release_count = 0;
- fi->readdir_cache_idx = -1;
- fi->next_offset = 2; /* compensate for . and .. */
- fi->flags &= ~CEPH_F_ATEND;
+ kfree(dfi->last_name);
+ dfi->last_name = NULL;
+ dfi->dir_release_count = 0;
+ dfi->readdir_cache_idx = -1;
+ dfi->next_offset = 2; /* compensate for . and .. */
+ dfi->file_info.flags &= ~CEPH_F_ATEND;
}
/*
* discard buffered readdir content on seekdir(0), or seek to new frag,
* or seek prior to current chunk
*/
-static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
+static bool need_reset_readdir(struct ceph_dir_file_info *dfi, loff_t new_pos)
{
struct ceph_mds_reply_info_parsed *rinfo;
loff_t chunk_offset;
@@ -600,10 +602,10 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
if (is_hash_order(new_pos)) {
/* no need to reset last_name for a forward seek when
* dentries are sotred in hash order */
- } else if (fi->frag != fpos_frag(new_pos)) {
+ } else if (dfi->frag != fpos_frag(new_pos)) {
return true;
}
- rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
+ rinfo = dfi->last_readdir ? &dfi->last_readdir->r_reply_info : NULL;
if (!rinfo || !rinfo->dir_nr)
return true;
chunk_offset = rinfo->dir_entries[0].offset;
@@ -613,7 +615,7 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
{
- struct ceph_file_info *fi = file->private_data;
+ struct ceph_dir_file_info *dfi = file->private_data;
struct inode *inode = file->f_mapping->host;
loff_t retval;
@@ -631,20 +633,20 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
}
if (offset >= 0) {
- if (need_reset_readdir(fi, offset)) {
+ if (need_reset_readdir(dfi, offset)) {
dout("dir_llseek dropping %p content\n", file);
- reset_readdir(fi);
+ reset_readdir(dfi);
} else if (is_hash_order(offset) && offset > file->f_pos) {
/* for hash offset, we don't know if a forward seek
* is within same frag */
- fi->dir_release_count = 0;
- fi->readdir_cache_idx = -1;
+ dfi->dir_release_count = 0;
+ dfi->readdir_cache_idx = -1;
}
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
- fi->flags &= ~CEPH_F_ATEND;
+ dfi->file_info.flags &= ~CEPH_F_ATEND;
}
retval = offset;
}
@@ -825,6 +827,9 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
+ if (ceph_quota_is_max_files_exceeded(dir))
+ return -EDQUOT;
+
err = ceph_pre_init_acls(dir, &mode, &acls);
if (err < 0)
return err;
@@ -878,6 +883,9 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
+ if (ceph_quota_is_max_files_exceeded(dir))
+ return -EDQUOT;
+
dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
if (IS_ERR(req)) {
@@ -927,6 +935,12 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
goto out;
}
+ if (op == CEPH_MDS_OP_MKDIR &&
+ ceph_quota_is_max_files_exceeded(dir)) {
+ err = -EDQUOT;
+ goto out;
+ }
+
mode |= S_IFDIR;
err = ceph_pre_init_acls(dir, &mode, &acls);
if (err < 0)
@@ -1066,6 +1080,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
else
return -EROFS;
}
+ /* don't allow cross-quota renames */
+ if ((old_dir != new_dir) &&
+ (!ceph_quota_is_same_realm(old_dir, new_dir)))
+ return -EXDEV;
+
dout("rename dir %p dentry %p to dir %p dentry %p\n",
old_dir, old_dentry, new_dir, new_dentry);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -1352,7 +1371,7 @@ static void ceph_d_prune(struct dentry *dentry)
static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
loff_t *ppos)
{
- struct ceph_file_info *cf = file->private_data;
+ struct ceph_dir_file_info *dfi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
int left;
@@ -1361,12 +1380,12 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
return -EISDIR;
- if (!cf->dir_info) {
- cf->dir_info = kmalloc(bufsize, GFP_KERNEL);
- if (!cf->dir_info)
+ if (!dfi->dir_info) {
+ dfi->dir_info = kmalloc(bufsize, GFP_KERNEL);
+ if (!dfi->dir_info)
return -ENOMEM;
- cf->dir_info_len =
- snprintf(cf->dir_info, bufsize,
+ dfi->dir_info_len =
+ snprintf(dfi->dir_info, bufsize,
"entries: %20lld\n"
" files: %20lld\n"
" subdirs: %20lld\n"
@@ -1386,10 +1405,10 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
(long)ci->i_rctime.tv_nsec);
}
- if (*ppos >= cf->dir_info_len)
+ if (*ppos >= dfi->dir_info_len)
return 0;
- size = min_t(unsigned, size, cf->dir_info_len-*ppos);
- left = copy_to_user(buf, cf->dir_info + *ppos, size);
+ size = min_t(unsigned, size, dfi->dir_info_len-*ppos);
+ left = copy_to_user(buf, dfi->dir_info + *ppos, size);
if (left == size)
return -EFAULT;
*ppos += (size - left);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index b67eec3532a1..cf0e45b10121 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -30,6 +30,8 @@ static __le32 ceph_flags_sys2wire(u32 flags)
break;
}
+ flags &= ~O_ACCMODE;
+
#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; }
ceph_sys2wire(O_CREAT);
@@ -41,7 +43,7 @@ static __le32 ceph_flags_sys2wire(u32 flags)
#undef ceph_sys2wire
if (flags)
- dout("unused open flags: %x", flags);
+ dout("unused open flags: %x\n", flags);
return cpu_to_le32(wire_flags);
}
@@ -68,69 +70,104 @@ static __le32 ceph_flags_sys2wire(u32 flags)
*/
/*
- * Calculate the length sum of direct io vectors that can
- * be combined into one page vector.
+ * How many pages to get in one call to iov_iter_get_pages(). This
+ * determines the size of the on-stack array used as a buffer.
*/
-static size_t dio_get_pagev_size(const struct iov_iter *it)
+#define ITER_GET_BVECS_PAGES 64
+
+static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize,
+ struct bio_vec *bvecs)
{
- const struct iovec *iov = it->iov;
- const struct iovec *iovend = iov + it->nr_segs;
- size_t size;
-
- size = iov->iov_len - it->iov_offset;
- /*
- * An iov can be page vectored when both the current tail
- * and the next base are page aligned.
- */
- while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
- (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
- size += iov->iov_len;
- }
- dout("dio_get_pagevlen len = %zu\n", size);
- return size;
+ size_t size = 0;
+ int bvec_idx = 0;
+
+ if (maxsize > iov_iter_count(iter))
+ maxsize = iov_iter_count(iter);
+
+ while (size < maxsize) {
+ struct page *pages[ITER_GET_BVECS_PAGES];
+ ssize_t bytes;
+ size_t start;
+ int idx = 0;
+
+ bytes = iov_iter_get_pages(iter, pages, maxsize - size,
+ ITER_GET_BVECS_PAGES, &start);
+ if (bytes < 0)
+ return size ?: bytes;
+
+ iov_iter_advance(iter, bytes);
+ size += bytes;
+
+ for ( ; bytes; idx++, bvec_idx++) {
+ struct bio_vec bv = {
+ .bv_page = pages[idx],
+ .bv_len = min_t(int, bytes, PAGE_SIZE - start),
+ .bv_offset = start,
+ };
+
+ bvecs[bvec_idx] = bv;
+ bytes -= bv.bv_len;
+ start = 0;
+ }
+ }
+
+ return size;
}
/*
- * Allocate a page vector based on (@it, @nbytes).
- * The return value is the tuple describing a page vector,
- * that is (@pages, @page_align, @num_pages).
+ * iov_iter_get_pages() only considers one iov_iter segment, no matter
+ * what maxsize or maxpages are given. For ITER_BVEC that is a single
+ * page.
+ *
+ * Attempt to get up to @maxsize bytes worth of pages from @iter.
+ * Return the number of bytes in the created bio_vec array, or an error.
*/
-static struct page **
-dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
- size_t *page_align, int *num_pages)
+static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize,
+ struct bio_vec **bvecs, int *num_bvecs)
{
- struct iov_iter tmp_it = *it;
- size_t align;
- struct page **pages;
- int ret = 0, idx, npages;
+ struct bio_vec *bv;
+ size_t orig_count = iov_iter_count(iter);
+ ssize_t bytes;
+ int npages;
- align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
- (PAGE_SIZE - 1);
- npages = calc_pages_for(align, nbytes);
- pages = kvmalloc(sizeof(*pages) * npages, GFP_KERNEL);
- if (!pages)
- return ERR_PTR(-ENOMEM);
+ iov_iter_truncate(iter, maxsize);
+ npages = iov_iter_npages(iter, INT_MAX);
+ iov_iter_reexpand(iter, orig_count);
- for (idx = 0; idx < npages; ) {
- size_t start;
- ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes,
- npages - idx, &start);
- if (ret < 0)
- goto fail;
+ /*
+ * __iter_get_bvecs() may populate only part of the array -- zero it
+ * out.
+ */
+ bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL | __GFP_ZERO);
+ if (!bv)
+ return -ENOMEM;
- iov_iter_advance(&tmp_it, ret);
- nbytes -= ret;
- idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE;
+ bytes = __iter_get_bvecs(iter, maxsize, bv);
+ if (bytes < 0) {
+ /*
+ * No pages were pinned -- just free the array.
+ */
+ kvfree(bv);
+ return bytes;
}
- BUG_ON(nbytes != 0);
- *num_pages = npages;
- *page_align = align;
- dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align);
- return pages;
-fail:
- ceph_put_page_vector(pages, idx, false);
- return ERR_PTR(ret);
+ *bvecs = bv;
+ *num_bvecs = npages;
+ return bytes;
+}
+
+static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty)
+{
+ int i;
+
+ for (i = 0; i < num_bvecs; i++) {
+ if (bvecs[i].bv_page) {
+ if (should_dirty)
+ set_page_dirty_lock(bvecs[i].bv_page);
+ put_page(bvecs[i].bv_page);
+ }
+ }
+ kvfree(bvecs);
}
/*
@@ -159,13 +196,50 @@ out:
return req;
}
+static int ceph_init_file_info(struct inode *inode, struct file *file,
+ int fmode, bool isdir)
+{
+ struct ceph_file_info *fi;
+
+ dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
+ inode->i_mode, isdir ? "dir" : "regular");
+ BUG_ON(inode->i_fop->release != ceph_release);
+
+ if (isdir) {
+ struct ceph_dir_file_info *dfi =
+ kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
+ if (!dfi) {
+ ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+ return -ENOMEM;
+ }
+
+ file->private_data = dfi;
+ fi = &dfi->file_info;
+ dfi->next_offset = 2;
+ dfi->readdir_cache_idx = -1;
+ } else {
+ fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
+ if (!fi) {
+ ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+ return -ENOMEM;
+ }
+
+ file->private_data = fi;
+ }
+
+ fi->fmode = fmode;
+ spin_lock_init(&fi->rw_contexts_lock);
+ INIT_LIST_HEAD(&fi->rw_contexts);
+
+ return 0;
+}
+
/*
* initialize private struct file data.
* if we fail, clean up by dropping fmode reference on the ceph_inode
*/
static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
{
- struct ceph_file_info *cf;
int ret = 0;
switch (inode->i_mode & S_IFMT) {
@@ -173,22 +247,10 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
ceph_fscache_register_inode_cookie(inode);
ceph_fscache_file_set_cookie(inode, file);
case S_IFDIR:
- dout("init_file %p %p 0%o (regular)\n", inode, file,
- inode->i_mode);
- cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
- if (!cf) {
- ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
- return -ENOMEM;
- }
- cf->fmode = fmode;
-
- spin_lock_init(&cf->rw_contexts_lock);
- INIT_LIST_HEAD(&cf->rw_contexts);
-
- cf->next_offset = 2;
- cf->readdir_cache_idx = -1;
- file->private_data = cf;
- BUG_ON(inode->i_fop->release != ceph_release);
+ ret = ceph_init_file_info(inode, file, fmode,
+ S_ISDIR(inode->i_mode));
+ if (ret)
+ return ret;
break;
case S_IFLNK:
@@ -278,11 +340,11 @@ int ceph_open(struct inode *inode, struct file *file)
struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
- struct ceph_file_info *cf = file->private_data;
+ struct ceph_file_info *fi = file->private_data;
int err;
int flags, fmode, wanted;
- if (cf) {
+ if (fi) {
dout("open file %p is already opened\n", file);
return 0;
}
@@ -375,7 +437,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct ceph_mds_request *req;
struct dentry *dn;
struct ceph_acls_info acls = {};
- int mask;
+ int mask;
int err;
dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
@@ -386,6 +448,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
return -ENAMETOOLONG;
if (flags & O_CREAT) {
+ if (ceph_quota_is_max_files_exceeded(dir))
+ return -EDQUOT;
err = ceph_pre_init_acls(dir, &mode, &acls);
if (err < 0)
return err;
@@ -460,16 +524,27 @@ out_acl:
int ceph_release(struct inode *inode, struct file *file)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_file_info *cf = file->private_data;
- dout("release inode %p file %p\n", inode, file);
- ceph_put_fmode(ci, cf->fmode);
- if (cf->last_readdir)
- ceph_mdsc_put_request(cf->last_readdir);
- kfree(cf->last_name);
- kfree(cf->dir_info);
- WARN_ON(!list_empty(&cf->rw_contexts));
- kmem_cache_free(ceph_file_cachep, cf);
+ if (S_ISDIR(inode->i_mode)) {
+ struct ceph_dir_file_info *dfi = file->private_data;
+ dout("release inode %p dir file %p\n", inode, file);
+ WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
+
+ ceph_put_fmode(ci, dfi->file_info.fmode);
+
+ if (dfi->last_readdir)
+ ceph_mdsc_put_request(dfi->last_readdir);
+ kfree(dfi->last_name);
+ kfree(dfi->dir_info);
+ kmem_cache_free(ceph_dir_file_cachep, dfi);
+ } else {
+ struct ceph_file_info *fi = file->private_data;
+ dout("release inode %p regular file %p\n", inode, file);
+ WARN_ON(!list_empty(&fi->rw_contexts));
+
+ ceph_put_fmode(ci, fi->fmode);
+ kmem_cache_free(ceph_file_cachep, fi);
+ }
/* wake up anyone waiting for caps on this inode */
wake_up_all(&ci->i_cap_wq);
@@ -706,11 +781,12 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
struct inode *inode = req->r_inode;
struct ceph_aio_request *aio_req = req->r_priv;
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
- int num_pages = calc_pages_for((u64)osd_data->alignment,
- osd_data->length);
- dout("ceph_aio_complete_req %p rc %d bytes %llu\n",
- inode, rc, osd_data->length);
+ BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
+ BUG_ON(!osd_data->num_bvecs);
+
+ dout("ceph_aio_complete_req %p rc %d bytes %u\n",
+ inode, rc, osd_data->bvec_pos.iter.bi_size);
if (rc == -EOLDSNAPC) {
struct ceph_aio_work *aio_work;
@@ -728,9 +804,10 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
} else if (!aio_req->write) {
if (rc == -ENOENT)
rc = 0;
- if (rc >= 0 && osd_data->length > rc) {
- int zoff = osd_data->alignment + rc;
- int zlen = osd_data->length - rc;
+ if (rc >= 0 && osd_data->bvec_pos.iter.bi_size > rc) {
+ struct iov_iter i;
+ int zlen = osd_data->bvec_pos.iter.bi_size - rc;
+
/*
* If read is satisfied by single OSD request,
* it can pass EOF. Otherwise read is within
@@ -745,13 +822,16 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
aio_req->total_len = rc + zlen;
}
- if (zlen > 0)
- ceph_zero_page_vector_range(zoff, zlen,
- osd_data->pages);
+ iov_iter_bvec(&i, ITER_BVEC, osd_data->bvec_pos.bvecs,
+ osd_data->num_bvecs,
+ osd_data->bvec_pos.iter.bi_size);
+ iov_iter_advance(&i, rc);
+ iov_iter_zero(zlen, &i);
}
}
- ceph_put_page_vector(osd_data->pages, num_pages, aio_req->should_dirty);
+ put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs,
+ aio_req->should_dirty);
ceph_osdc_put_request(req);
if (rc < 0)
@@ -839,7 +919,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_vino vino;
struct ceph_osd_request *req;
- struct page **pages;
+ struct bio_vec *bvecs;
struct ceph_aio_request *aio_req = NULL;
int num_pages = 0;
int flags;
@@ -874,10 +954,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
}
while (iov_iter_count(iter) > 0) {
- u64 size = dio_get_pagev_size(iter);
- size_t start = 0;
+ u64 size = iov_iter_count(iter);
ssize_t len;
+ if (write)
+ size = min_t(u64, size, fsc->mount_options->wsize);
+ else
+ size = min_t(u64, size, fsc->mount_options->rsize);
+
vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &size, 0,
@@ -893,18 +977,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
break;
}
- if (write)
- size = min_t(u64, size, fsc->mount_options->wsize);
- else
- size = min_t(u64, size, fsc->mount_options->rsize);
-
- len = size;
- pages = dio_get_pages_alloc(iter, len, &start, &num_pages);
- if (IS_ERR(pages)) {
+ len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages);
+ if (len < 0) {
ceph_osdc_put_request(req);
- ret = PTR_ERR(pages);
+ ret = len;
break;
}
+ if (len != size)
+ osd_req_op_extent_update(req, 0, len);
/*
* To simplify error handling, allow AIO when IO within i_size
@@ -937,8 +1017,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
req->r_mtime = mtime;
}
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
- false, false);
+ osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
if (aio_req) {
aio_req->total_len += len;
@@ -951,7 +1030,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);
pos += len;
- iov_iter_advance(iter, len);
continue;
}
@@ -964,25 +1042,26 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
if (ret == -ENOENT)
ret = 0;
if (ret >= 0 && ret < len && pos + ret < size) {
+ struct iov_iter i;
int zlen = min_t(size_t, len - ret,
size - pos - ret);
- ceph_zero_page_vector_range(start + ret, zlen,
- pages);
+
+ iov_iter_bvec(&i, ITER_BVEC, bvecs, num_pages,
+ len);
+ iov_iter_advance(&i, ret);
+ iov_iter_zero(zlen, &i);
ret += zlen;
}
if (ret >= 0)
len = ret;
}
- ceph_put_page_vector(pages, num_pages, should_dirty);
-
+ put_bvecs(bvecs, num_pages, should_dirty);
ceph_osdc_put_request(req);
if (ret < 0)
break;
pos += len;
- iov_iter_advance(iter, len);
-
if (!write && pos >= size)
break;
@@ -1338,6 +1417,11 @@ retry_snap:
pos = iocb->ki_pos;
count = iov_iter_count(from);
+ if (ceph_quota_is_max_bytes_exceeded(inode, pos + count)) {
+ err = -EDQUOT;
+ goto out;
+ }
+
err = file_remove_privs(file);
if (err)
goto out;
@@ -1419,6 +1503,7 @@ retry_snap:
if (written >= 0) {
int dirty;
+
spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
@@ -1426,6 +1511,8 @@ retry_snap:
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
+ if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
+ ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
}
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
@@ -1668,6 +1755,12 @@ static long ceph_fallocate(struct file *file, int mode,
goto unlock;
}
+ if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
+ ceph_quota_is_max_bytes_exceeded(inode, offset + length)) {
+ ret = -EDQUOT;
+ goto unlock;
+ }
+
if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) &&
!(mode & FALLOC_FL_PUNCH_HOLE)) {
ret = -ENOSPC;
@@ -1716,6 +1809,9 @@ static long ceph_fallocate(struct file *file, int mode,
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
+ if ((endoff > size) &&
+ ceph_quota_is_max_bytes_approaching(inode, endoff))
+ ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
}
ceph_put_cap_refs(ci, got);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index c6ec5aa46100..ae056927080d 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -441,6 +441,9 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
atomic64_set(&ci->i_complete_seq[1], 0);
ci->i_symlink = NULL;
+ ci->i_max_bytes = 0;
+ ci->i_max_files = 0;
+
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
@@ -536,6 +539,9 @@ void ceph_destroy_inode(struct inode *inode)
ceph_queue_caps_release(inode);
+ if (__ceph_has_any_quota(ci))
+ ceph_adjust_quota_realms_count(inode, false);
+
/*
* we may still have a snap_realm reference if there are stray
* caps in i_snap_caps.
@@ -548,6 +554,9 @@ void ceph_destroy_inode(struct inode *inode)
dout(" dropping residual ref to snap realm %p\n", realm);
spin_lock(&realm->inodes_with_caps_lock);
list_del_init(&ci->i_snap_realm_item);
+ ci->i_snap_realm = NULL;
+ if (realm->ino == ci->i_vino.ino)
+ realm->inode = NULL;
spin_unlock(&realm->inodes_with_caps_lock);
ceph_put_snap_realm(mdsc, realm);
}
@@ -660,13 +669,15 @@ void ceph_fill_file_time(struct inode *inode, int issued,
CEPH_CAP_FILE_BUFFER|
CEPH_CAP_AUTH_EXCL|
CEPH_CAP_XATTR_EXCL)) {
- if (timespec_compare(ctime, &inode->i_ctime) > 0) {
+ if (ci->i_version == 0 ||
+ timespec_compare(ctime, &inode->i_ctime) > 0) {
dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
ctime->tv_sec, ctime->tv_nsec);
inode->i_ctime = *ctime;
}
- if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
+ if (ci->i_version == 0 ||
+ ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
/* the MDS did a utimes() */
dout("mtime %ld.%09ld -> %ld.%09ld "
"tw %d -> %d\n",
@@ -786,10 +797,11 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
new_issued = ~issued & le32_to_cpu(info->cap.caps);
/* update inode */
- ci->i_version = le64_to_cpu(info->version);
inode->i_rdev = le32_to_cpu(info->rdev);
inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+ __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
+
if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
(issued & CEPH_CAP_AUTH_EXCL) == 0) {
inode->i_mode = le32_to_cpu(info->mode);
@@ -857,6 +869,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
xattr_blob = NULL;
}
+ /* finally update i_version */
+ ci->i_version = le64_to_cpu(info->version);
+
inode->i_mapping->a_ops = &ceph_aops;
switch (inode->i_mode & S_IFMT) {
@@ -1867,20 +1882,9 @@ retry:
* possibly truncate them.. so write AND block!
*/
if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
- struct ceph_cap_snap *capsnap;
- to = ci->i_truncate_size;
- list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- // MDS should have revoked Frw caps
- WARN_ON_ONCE(capsnap->writing);
- if (capsnap->dirty_pages && capsnap->size > to)
- to = capsnap->size;
- }
spin_unlock(&ci->i_ceph_lock);
dout("__do_pending_vmtruncate %p flushing snaps first\n",
inode);
-
- truncate_pagecache(inode, to);
-
filemap_write_and_wait_range(&inode->i_data, 0,
inode->i_sb->s_maxbytes);
goto retry;
@@ -2152,6 +2156,10 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (err != 0)
return err;
+ if ((attr->ia_valid & ATTR_SIZE) &&
+ ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size))
+ return -EDQUOT;
+
err = __ceph_setattr(inode, attr);
if (err >= 0 && (attr->ia_valid & ATTR_MODE))
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 851aa69ec8f0..c90f03beb15d 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -5,7 +5,7 @@
#include "super.h"
#include "mds_client.h"
#include "ioctl.h"
-
+#include <linux/ceph/striper.h>
/*
* ioctls
@@ -185,7 +185,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
&ceph_sb_to_client(inode->i_sb)->client->osdc;
struct ceph_object_locator oloc;
CEPH_DEFINE_OID_ONSTACK(oid);
- u64 len = 1, olen;
+ u32 xlen;
u64 tmp;
struct ceph_pg pgid;
int r;
@@ -195,13 +195,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
return -EFAULT;
down_read(&osdc->lock);
- r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
- &dl.object_no, &dl.object_offset,
- &olen);
- if (r < 0) {
- up_read(&osdc->lock);
- return -EIO;
- }
+ ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, 1,
+ &dl.object_no, &dl.object_offset, &xlen);
dl.file_offset -= dl.object_offset;
dl.object_size = ci->i_layout.object_size;
dl.block_size = ci->i_layout.stripe_unit;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 9e66f69ee8a5..9dae2ec7e1fa 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -95,7 +95,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
owner = secure_addr(fl->fl_owner);
dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
- "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type,
+ "start: %llu, length: %llu, wait: %d, type: %d\n", (int)lock_type,
(int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
wait, fl->fl_type);
@@ -132,7 +132,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
}
ceph_mdsc_put_request(req);
dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
- "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type,
+ "length: %llu, wait: %d, type: %d, err code %d\n", (int)lock_type,
(int)operation, (u64)fl->fl_pid, fl->fl_start,
length, wait, fl->fl_type, err);
return err;
@@ -226,7 +226,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
return -ENOLCK;
- dout("ceph_lock, fl_owner: %p", fl->fl_owner);
+ dout("ceph_lock, fl_owner: %p\n", fl->fl_owner);
/* set wait bit as appropriate, then make command as Ceph expects it*/
if (IS_GETLK(cmd))
@@ -264,7 +264,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
if (!err) {
if (op == CEPH_MDS_OP_SETFILELOCK) {
- dout("mds locked, locking locally");
+ dout("mds locked, locking locally\n");
err = posix_lock_file(file, fl, NULL);
if (err) {
/* undo! This should only happen if
@@ -272,7 +272,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
* deadlock. */
ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
CEPH_LOCK_UNLOCK, 0, fl);
- dout("got %d on posix_lock_file, undid lock",
+ dout("got %d on posix_lock_file, undid lock\n",
err);
}
}
@@ -294,7 +294,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
if (fl->fl_type & LOCK_MAND)
return -EOPNOTSUPP;
- dout("ceph_flock, fl_file: %p", fl->fl_file);
+ dout("ceph_flock, fl_file: %p\n", fl->fl_file);
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
@@ -329,7 +329,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
ceph_lock_message(CEPH_LOCK_FLOCK,
CEPH_MDS_OP_SETFILELOCK,
inode, CEPH_LOCK_UNLOCK, 0, fl);
- dout("got %d on locks_lock_file_wait, undid lock", err);
+ dout("got %d on locks_lock_file_wait, undid lock\n", err);
}
}
return err;
@@ -356,7 +356,7 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
++(*flock_count);
spin_unlock(&ctx->flc_lock);
}
- dout("counted %d flock locks and %d fcntl locks",
+ dout("counted %d flock locks and %d fcntl locks\n",
*flock_count, *fcntl_count);
}
@@ -384,7 +384,7 @@ static int lock_to_ceph_filelock(struct file_lock *lock,
cephlock->type = CEPH_LOCK_UNLOCK;
break;
default:
- dout("Have unknown lock type %d", lock->fl_type);
+ dout("Have unknown lock type %d\n", lock->fl_type);
err = -EINVAL;
}
@@ -407,7 +407,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
int seen_flock = 0;
int l = 0;
- dout("encoding %d flock and %d fcntl locks", num_flock_locks,
+ dout("encoding %d flock and %d fcntl locks\n", num_flock_locks,
num_fcntl_locks);
if (!ctx)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 2e8f90f96540..5ece2e6ad154 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -100,6 +100,26 @@ static int parse_reply_info_in(void **p, void *end,
} else
info->inline_version = CEPH_INLINE_NONE;
+ if (features & CEPH_FEATURE_MDS_QUOTA) {
+ u8 struct_v, struct_compat;
+ u32 struct_len;
+
+ /*
+ * both struct_v and struct_compat are expected to be >= 1
+ */
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ if (!struct_v || !struct_compat)
+ goto bad;
+ ceph_decode_32_safe(p, end, struct_len, bad);
+ ceph_decode_need(p, end, struct_len, bad);
+ ceph_decode_64_safe(p, end, info->max_bytes, bad);
+ ceph_decode_64_safe(p, end, info->max_files, bad);
+ } else {
+ info->max_bytes = 0;
+ info->max_files = 0;
+ }
+
info->pool_ns_len = 0;
info->pool_ns_data = NULL;
if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
@@ -384,7 +404,7 @@ static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref));
return s;
} else {
- dout("mdsc get_session %p 0 -- FAIL", s);
+ dout("mdsc get_session %p 0 -- FAIL\n", s);
return NULL;
}
}
@@ -419,9 +439,10 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
static bool __have_session(struct ceph_mds_client *mdsc, int mds)
{
- if (mds >= mdsc->max_sessions)
+ if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
return false;
- return mdsc->sessions[mds];
+ else
+ return true;
}
static int __verify_registered_session(struct ceph_mds_client *mdsc,
@@ -448,6 +469,25 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s = kzalloc(sizeof(*s), GFP_NOFS);
if (!s)
return ERR_PTR(-ENOMEM);
+
+ if (mds >= mdsc->max_sessions) {
+ int newmax = 1 << get_count_order(mds + 1);
+ struct ceph_mds_session **sa;
+
+ dout("%s: realloc to %d\n", __func__, newmax);
+ sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
+ if (!sa)
+ goto fail_realloc;
+ if (mdsc->sessions) {
+ memcpy(sa, mdsc->sessions,
+ mdsc->max_sessions * sizeof(void *));
+ kfree(mdsc->sessions);
+ }
+ mdsc->sessions = sa;
+ mdsc->max_sessions = newmax;
+ }
+
+ dout("%s: mds%d\n", __func__, mds);
s->s_mdsc = mdsc;
s->s_mds = mds;
s->s_state = CEPH_MDS_SESSION_NEW;
@@ -476,23 +516,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
INIT_LIST_HEAD(&s->s_cap_releases);
INIT_LIST_HEAD(&s->s_cap_flushing);
- dout("register_session mds%d\n", mds);
- if (mds >= mdsc->max_sessions) {
- int newmax = 1 << get_count_order(mds+1);
- struct ceph_mds_session **sa;
-
- dout("register_session realloc to %d\n", newmax);
- sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
- if (!sa)
- goto fail_realloc;
- if (mdsc->sessions) {
- memcpy(sa, mdsc->sessions,
- mdsc->max_sessions * sizeof(void *));
- kfree(mdsc->sessions);
- }
- mdsc->sessions = sa;
- mdsc->max_sessions = newmax;
- }
mdsc->sessions[mds] = s;
atomic_inc(&mdsc->num_sessions);
refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */
@@ -2531,10 +2554,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
* Otherwise we just have to return an ESTALE
*/
if (result == -ESTALE) {
- dout("got ESTALE on request %llu", req->r_tid);
+ dout("got ESTALE on request %llu\n", req->r_tid);
req->r_resend_mds = -1;
if (req->r_direct_mode != USE_AUTH_MDS) {
- dout("not using auth, setting for that now");
+ dout("not using auth, setting for that now\n");
req->r_direct_mode = USE_AUTH_MDS;
__do_request(mdsc, req);
mutex_unlock(&mdsc->mutex);
@@ -2542,13 +2565,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
} else {
int mds = __choose_mds(mdsc, req);
if (mds >= 0 && mds != req->r_session->s_mds) {
- dout("but auth changed, so resending");
+ dout("but auth changed, so resending\n");
__do_request(mdsc, req);
mutex_unlock(&mdsc->mutex);
goto out;
}
}
- dout("have to return ESTALE on request %llu", req->r_tid);
+ dout("have to return ESTALE on request %llu\n", req->r_tid);
}
@@ -3470,13 +3493,12 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
}
/*
- * drop all leases (and dentry refs) in preparation for umount
+ * lock unlock sessions, to wait ongoing session activities
*/
-static void drop_leases(struct ceph_mds_client *mdsc)
+static void lock_unlock_sessions(struct ceph_mds_client *mdsc)
{
int i;
- dout("drop_leases\n");
mutex_lock(&mdsc->mutex);
for (i = 0; i < mdsc->max_sessions; i++) {
struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
@@ -3572,7 +3594,6 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
if (!mdsc)
return -ENOMEM;
mdsc->fsc = fsc;
- fsc->mdsc = mdsc;
mutex_init(&mdsc->mutex);
mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
if (!mdsc->mdsmap) {
@@ -3580,6 +3601,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
return -ENOMEM;
}
+ fsc->mdsc = mdsc;
init_completion(&mdsc->safe_umount_waiters);
init_waitqueue_head(&mdsc->session_close_wq);
INIT_LIST_HEAD(&mdsc->waiting_for_map);
@@ -3587,6 +3609,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
atomic_set(&mdsc->num_sessions, 0);
mdsc->max_sessions = 0;
mdsc->stopping = 0;
+ atomic64_set(&mdsc->quotarealms_count, 0);
mdsc->last_snap_seq = 0;
init_rwsem(&mdsc->snap_rwsem);
mdsc->snap_realms = RB_ROOT;
@@ -3660,7 +3683,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
dout("pre_umount\n");
mdsc->stopping = 1;
- drop_leases(mdsc);
+ lock_unlock_sessions(mdsc);
ceph_flush_dirty_caps(mdsc);
wait_requests(mdsc);
@@ -3858,6 +3881,9 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
struct ceph_mds_client *mdsc = fsc->mdsc;
dout("mdsc_destroy %p\n", mdsc);
+ if (!mdsc)
+ return;
+
/* flush out any connection work with references to us */
ceph_msgr_flush();
@@ -4077,6 +4103,9 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
case CEPH_MSG_CLIENT_LEASE:
handle_lease(mdsc, s, msg);
break;
+ case CEPH_MSG_CLIENT_QUOTA:
+ ceph_handle_quota(mdsc, s, msg);
+ break;
default:
pr_err("received unknown message type %d %s\n", type,
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 71e3b783ee6f..2ec3b5b35067 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -49,6 +49,8 @@ struct ceph_mds_reply_info_in {
char *inline_data;
u32 pool_ns_len;
char *pool_ns_data;
+ u64 max_bytes;
+ u64 max_files;
};
struct ceph_mds_reply_dir_entry {
@@ -312,6 +314,8 @@ struct ceph_mds_client {
int max_sessions; /* len of s_mds_sessions */
int stopping; /* true if shutting down */
+ atomic64_t quotarealms_count; /* # realms with quota */
+
/*
* snap_rwsem will cover cap linkage into snaprealms, and
* realm snap contexts. (later, we can do per-realm snap
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
new file mode 100644
index 000000000000..242bfa5c0539
--- /dev/null
+++ b/fs/ceph/quota.c
@@ -0,0 +1,361 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * quota.c - CephFS quota
+ *
+ * Copyright (C) 2017-2018 SUSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/statfs.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
+{
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ if (inc)
+ atomic64_inc(&mdsc->quotarealms_count);
+ else
+ atomic64_dec(&mdsc->quotarealms_count);
+}
+
+static inline bool ceph_has_realms_with_quotas(struct inode *inode)
+{
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ return atomic64_read(&mdsc->quotarealms_count) > 0;
+}
+
+void ceph_handle_quota(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_msg *msg)
+{
+ struct super_block *sb = mdsc->fsc->sb;
+ struct ceph_mds_quota *h = msg->front.iov_base;
+ struct ceph_vino vino;
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+
+ if (msg->front.iov_len != sizeof(*h)) {
+ pr_err("%s corrupt message mds%d len %d\n", __func__,
+ session->s_mds, (int)msg->front.iov_len);
+ ceph_msg_dump(msg);
+ return;
+ }
+
+ /* increment msg sequence number */
+ mutex_lock(&session->s_mutex);
+ session->s_seq++;
+ mutex_unlock(&session->s_mutex);
+
+ /* lookup inode */
+ vino.ino = le64_to_cpu(h->ino);
+ vino.snap = CEPH_NOSNAP;
+ inode = ceph_find_inode(sb, vino);
+ if (!inode) {
+ pr_warn("Failed to find inode %llu\n", vino.ino);
+ return;
+ }
+ ci = ceph_inode(inode);
+
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_rbytes = le64_to_cpu(h->rbytes);
+ ci->i_rfiles = le64_to_cpu(h->rfiles);
+ ci->i_rsubdirs = le64_to_cpu(h->rsubdirs);
+ __ceph_update_quota(ci, le64_to_cpu(h->max_bytes),
+ le64_to_cpu(h->max_files));
+ spin_unlock(&ci->i_ceph_lock);
+
+ iput(inode);
+}
+
+/*
+ * This function walks through the snaprealm for an inode and returns the
+ * ceph_snap_realm for the first snaprealm that has quotas set (either max_files
+ * or max_bytes). If the root is reached, return the root ceph_snap_realm
+ * instead.
+ *
+ * Note that the caller is responsible for calling ceph_put_snap_realm() on the
+ * returned realm.
+ */
+static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
+ struct inode *inode)
+{
+ struct ceph_inode_info *ci = NULL;
+ struct ceph_snap_realm *realm, *next;
+ struct inode *in;
+ bool has_quota;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return NULL;
+
+ realm = ceph_inode(inode)->i_snap_realm;
+ if (realm)
+ ceph_get_snap_realm(mdsc, realm);
+ else
+ pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) "
+ "null i_snap_realm\n", ceph_vinop(inode));
+ while (realm) {
+ spin_lock(&realm->inodes_with_caps_lock);
+ in = realm->inode ? igrab(realm->inode) : NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ if (!in)
+ break;
+
+ ci = ceph_inode(in);
+ has_quota = __ceph_has_any_quota(ci);
+ iput(in);
+
+ next = realm->parent;
+ if (has_quota || !next)
+ return realm;
+
+ ceph_get_snap_realm(mdsc, next);
+ ceph_put_snap_realm(mdsc, realm);
+ realm = next;
+ }
+ if (realm)
+ ceph_put_snap_realm(mdsc, realm);
+
+ return NULL;
+}
+
+bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
+{
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(old)->mdsc;
+ struct ceph_snap_realm *old_realm, *new_realm;
+ bool is_same;
+
+ down_read(&mdsc->snap_rwsem);
+ old_realm = get_quota_realm(mdsc, old);
+ new_realm = get_quota_realm(mdsc, new);
+ is_same = (old_realm == new_realm);
+ up_read(&mdsc->snap_rwsem);
+
+ if (old_realm)
+ ceph_put_snap_realm(mdsc, old_realm);
+ if (new_realm)
+ ceph_put_snap_realm(mdsc, new_realm);
+
+ return is_same;
+}
+
+enum quota_check_op {
+ QUOTA_CHECK_MAX_FILES_OP, /* check quota max_files limit */
+ QUOTA_CHECK_MAX_BYTES_OP, /* check quota max_files limit */
+ QUOTA_CHECK_MAX_BYTES_APPROACHING_OP /* check if quota max_files
+ limit is approaching */
+};
+
+/*
+ * check_quota_exceeded() will walk up the snaprealm hierarchy and, for each
+ * realm, it will execute quota check operation defined by the 'op' parameter.
+ * The snaprealm walk is interrupted if the quota check detects that the quota
+ * is exceeded or if the root inode is reached.
+ */
+static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
+ loff_t delta)
+{
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_inode_info *ci;
+ struct ceph_snap_realm *realm, *next;
+ struct inode *in;
+ u64 max, rvalue;
+ bool exceeded = false;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return false;
+
+ down_read(&mdsc->snap_rwsem);
+ realm = ceph_inode(inode)->i_snap_realm;
+ if (realm)
+ ceph_get_snap_realm(mdsc, realm);
+ else
+ pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) "
+ "null i_snap_realm\n", ceph_vinop(inode));
+ while (realm) {
+ spin_lock(&realm->inodes_with_caps_lock);
+ in = realm->inode ? igrab(realm->inode) : NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ if (!in)
+ break;
+
+ ci = ceph_inode(in);
+ spin_lock(&ci->i_ceph_lock);
+ if (op == QUOTA_CHECK_MAX_FILES_OP) {
+ max = ci->i_max_files;
+ rvalue = ci->i_rfiles + ci->i_rsubdirs;
+ } else {
+ max = ci->i_max_bytes;
+ rvalue = ci->i_rbytes;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ switch (op) {
+ case QUOTA_CHECK_MAX_FILES_OP:
+ exceeded = (max && (rvalue >= max));
+ break;
+ case QUOTA_CHECK_MAX_BYTES_OP:
+ exceeded = (max && (rvalue + delta > max));
+ break;
+ case QUOTA_CHECK_MAX_BYTES_APPROACHING_OP:
+ if (max) {
+ if (rvalue >= max)
+ exceeded = true;
+ else {
+ /*
+ * when we're writing more that 1/16th
+ * of the available space
+ */
+ exceeded =
+ (((max - rvalue) >> 4) < delta);
+ }
+ }
+ break;
+ default:
+ /* Shouldn't happen */
+ pr_warn("Invalid quota check op (%d)\n", op);
+ exceeded = true; /* Just break the loop */
+ }
+ iput(in);
+
+ next = realm->parent;
+ if (exceeded || !next)
+ break;
+ ceph_get_snap_realm(mdsc, next);
+ ceph_put_snap_realm(mdsc, realm);
+ realm = next;
+ }
+ ceph_put_snap_realm(mdsc, realm);
+ up_read(&mdsc->snap_rwsem);
+
+ return exceeded;
+}
+
+/*
+ * ceph_quota_is_max_files_exceeded - check if we can create a new file
+ * @inode: directory where a new file is being created
+ *
+ * This functions returns true is max_files quota allows a new file to be
+ * created. It is necessary to walk through the snaprealm hierarchy (until the
+ * FS root) to check all realms with quotas set.
+ */
+bool ceph_quota_is_max_files_exceeded(struct inode *inode)
+{
+ if (!ceph_has_realms_with_quotas(inode))
+ return false;
+
+ WARN_ON(!S_ISDIR(inode->i_mode));
+
+ return check_quota_exceeded(inode, QUOTA_CHECK_MAX_FILES_OP, 0);
+}
+
+/*
+ * ceph_quota_is_max_bytes_exceeded - check if we can write to a file
+ * @inode: inode being written
+ * @newsize: new size if write succeeds
+ *
+ * This functions returns true is max_bytes quota allows a file size to reach
+ * @newsize; it returns false otherwise.
+ */
+bool ceph_quota_is_max_bytes_exceeded(struct inode *inode, loff_t newsize)
+{
+ loff_t size = i_size_read(inode);
+
+ if (!ceph_has_realms_with_quotas(inode))
+ return false;
+
+ /* return immediately if we're decreasing file size */
+ if (newsize <= size)
+ return false;
+
+ return check_quota_exceeded(inode, QUOTA_CHECK_MAX_BYTES_OP, (newsize - size));
+}
+
+/*
+ * ceph_quota_is_max_bytes_approaching - check if we're reaching max_bytes
+ * @inode: inode being written
+ * @newsize: new size if write succeeds
+ *
+ * This function returns true if the new file size @newsize will be consuming
+ * more than 1/16th of the available quota space; it returns false otherwise.
+ */
+bool ceph_quota_is_max_bytes_approaching(struct inode *inode, loff_t newsize)
+{
+ loff_t size = ceph_inode(inode)->i_reported_size;
+
+ if (!ceph_has_realms_with_quotas(inode))
+ return false;
+
+ /* return immediately if we're decreasing file size */
+ if (newsize <= size)
+ return false;
+
+ return check_quota_exceeded(inode, QUOTA_CHECK_MAX_BYTES_APPROACHING_OP,
+ (newsize - size));
+}
+
+/*
+ * ceph_quota_update_statfs - if root has quota update statfs with quota status
+ * @fsc: filesystem client instance
+ * @buf: statfs to update
+ *
+ * If the mounted filesystem root has max_bytes quota set, update the filesystem
+ * statistics with the quota status.
+ *
+ * This function returns true if the stats have been updated, false otherwise.
+ */
+bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
+{
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_inode_info *ci;
+ struct ceph_snap_realm *realm;
+ struct inode *in;
+ u64 total = 0, used, free;
+ bool is_updated = false;
+
+ down_read(&mdsc->snap_rwsem);
+ realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root));
+ up_read(&mdsc->snap_rwsem);
+ if (!realm)
+ return false;
+
+ spin_lock(&realm->inodes_with_caps_lock);
+ in = realm->inode ? igrab(realm->inode) : NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ if (in) {
+ ci = ceph_inode(in);
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_max_bytes) {
+ total = ci->i_max_bytes >> CEPH_BLOCK_SHIFT;
+ used = ci->i_rbytes >> CEPH_BLOCK_SHIFT;
+ /* It is possible for a quota to be exceeded.
+ * Report 'zero' in that case
+ */
+ free = total > used ? total - used : 0;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ if (total) {
+ buf->f_blocks = total;
+ buf->f_bfree = free;
+ buf->f_bavail = free;
+ is_updated = true;
+ }
+ iput(in);
+ }
+ ceph_put_snap_realm(mdsc, realm);
+
+ return is_updated;
+}
+
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 07cf95e6413d..041c27ea8de1 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -931,6 +931,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
list_add(&ci->i_snap_realm_item,
&realm->inodes_with_caps);
ci->i_snap_realm = realm;
+ if (realm->ino == ci->i_vino.ino)
+ realm->inode = inode;
spin_unlock(&realm->inodes_with_caps_lock);
spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index fb2bc9c15a23..b33082e6878f 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -76,9 +76,18 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
*/
buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
- buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
- buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
- buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
+
+ /*
+ * By default use root quota for stats; fallback to overall filesystem
+ * usage if using 'noquotadf' mount option or if the root dir doesn't
+ * have max_bytes quota set.
+ */
+ if (ceph_test_mount_opt(fsc, NOQUOTADF) ||
+ !ceph_quota_update_statfs(fsc, buf)) {
+ buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
+ buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
+ buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
+ }
buf->f_files = le64_to_cpu(st.num_objects);
buf->f_ffree = -1;
@@ -151,6 +160,8 @@ enum {
Opt_acl,
#endif
Opt_noacl,
+ Opt_quotadf,
+ Opt_noquotadf,
};
static match_table_t fsopt_tokens = {
@@ -187,6 +198,8 @@ static match_table_t fsopt_tokens = {
{Opt_acl, "acl"},
#endif
{Opt_noacl, "noacl"},
+ {Opt_quotadf, "quotadf"},
+ {Opt_noquotadf, "noquotadf"},
{-1, NULL}
};
@@ -314,13 +327,16 @@ static int parse_fsopt_token(char *c, void *private)
break;
case Opt_fscache:
fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
+ kfree(fsopt->fscache_uniq);
+ fsopt->fscache_uniq = NULL;
break;
case Opt_nofscache:
fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
+ kfree(fsopt->fscache_uniq);
+ fsopt->fscache_uniq = NULL;
break;
case Opt_poolperm:
fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM;
- printk ("pool perm");
break;
case Opt_nopoolperm:
fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM;
@@ -331,6 +347,12 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_norequire_active_mds:
fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT;
break;
+ case Opt_quotadf:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_NOQUOTADF;
+ break;
+ case Opt_noquotadf:
+ fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF;
+ break;
#ifdef CONFIG_CEPH_FS_POSIX_ACL
case Opt_acl:
fsopt->sb_flags |= SB_POSIXACL;
@@ -513,13 +535,12 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
seq_puts(m, ",nodcache");
if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) {
- if (fsopt->fscache_uniq)
- seq_printf(m, ",fsc=%s", fsopt->fscache_uniq);
- else
- seq_puts(m, ",fsc");
+ seq_show_option(m, "fsc", fsopt->fscache_uniq);
}
if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
seq_puts(m, ",nopoolperm");
+ if (fsopt->flags & CEPH_MOUNT_OPT_NOQUOTADF)
+ seq_puts(m, ",noquotadf");
#ifdef CONFIG_CEPH_FS_POSIX_ACL
if (fsopt->sb_flags & SB_POSIXACL)
@@ -529,7 +550,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
#endif
if (fsopt->mds_namespace)
- seq_printf(m, ",mds_namespace=%s", fsopt->mds_namespace);
+ seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
if (fsopt->wsize)
seq_printf(m, ",wsize=%d", fsopt->wsize);
if (fsopt->rsize != CEPH_MAX_READ_SIZE)
@@ -679,6 +700,7 @@ struct kmem_cache *ceph_cap_cachep;
struct kmem_cache *ceph_cap_flush_cachep;
struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;
+struct kmem_cache *ceph_dir_file_cachep;
static void ceph_inode_init_once(void *foo)
{
@@ -698,8 +720,7 @@ static int __init init_caches(void)
if (!ceph_inode_cachep)
return -ENOMEM;
- ceph_cap_cachep = KMEM_CACHE(ceph_cap,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD);
if (!ceph_cap_cachep)
goto bad_cap;
ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
@@ -716,6 +737,10 @@ static int __init init_caches(void)
if (!ceph_file_cachep)
goto bad_file;
+ ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, SLAB_MEM_SPREAD);
+ if (!ceph_dir_file_cachep)
+ goto bad_dir_file;
+
error = ceph_fscache_register();
if (error)
goto bad_fscache;
@@ -723,6 +748,8 @@ static int __init init_caches(void)
return 0;
bad_fscache:
+ kmem_cache_destroy(ceph_dir_file_cachep);
+bad_dir_file:
kmem_cache_destroy(ceph_file_cachep);
bad_file:
kmem_cache_destroy(ceph_dentry_cachep);
@@ -748,6 +775,7 @@ static void destroy_caches(void)
kmem_cache_destroy(ceph_cap_flush_cachep);
kmem_cache_destroy(ceph_dentry_cachep);
kmem_cache_destroy(ceph_file_cachep);
+ kmem_cache_destroy(ceph_dir_file_cachep);
ceph_fscache_unregister();
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 1c2086e0fec2..a7077a0c989f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -39,6 +39,7 @@
#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */
#define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */
+#define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */
#define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE
@@ -310,6 +311,9 @@ struct ceph_inode_info {
u64 i_rbytes, i_rfiles, i_rsubdirs;
u64 i_files, i_subdirs;
+ /* quotas */
+ u64 i_max_bytes, i_max_files;
+
struct rb_root i_fragtree;
int i_fragtree_nsplits;
struct mutex i_fragtree_mutex;
@@ -671,6 +675,10 @@ struct ceph_file_info {
spinlock_t rw_contexts_lock;
struct list_head rw_contexts;
+};
+
+struct ceph_dir_file_info {
+ struct ceph_file_info file_info;
/* readdir: position within the dir */
u32 frag;
@@ -748,6 +756,7 @@ struct ceph_readdir_cache_control {
*/
struct ceph_snap_realm {
u64 ino;
+ struct inode *inode;
atomic_t nref;
struct rb_node node;
@@ -1066,4 +1075,37 @@ extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
+/* quota.c */
+static inline bool __ceph_has_any_quota(struct ceph_inode_info *ci)
+{
+ return ci->i_max_files || ci->i_max_bytes;
+}
+
+extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc);
+
+static inline void __ceph_update_quota(struct ceph_inode_info *ci,
+ u64 max_bytes, u64 max_files)
+{
+ bool had_quota, has_quota;
+ had_quota = __ceph_has_any_quota(ci);
+ ci->i_max_bytes = max_bytes;
+ ci->i_max_files = max_files;
+ has_quota = __ceph_has_any_quota(ci);
+
+ if (had_quota != has_quota)
+ ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota);
+}
+
+extern void ceph_handle_quota(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_msg *msg);
+extern bool ceph_quota_is_max_files_exceeded(struct inode *inode);
+extern bool ceph_quota_is_same_realm(struct inode *old, struct inode *new);
+extern bool ceph_quota_is_max_bytes_exceeded(struct inode *inode,
+ loff_t newlen);
+extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode,
+ loff_t newlen);
+extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc,
+ struct kstatfs *buf);
+
#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index e1c4e0b12b4c..315f7e63e7cc 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -224,6 +224,39 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
(long)ci->i_rctime.tv_nsec);
}
+/* quotas */
+
+static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci)
+{
+ bool ret = false;
+ spin_lock(&ci->i_ceph_lock);
+ if ((ci->i_max_files || ci->i_max_bytes) &&
+ ci->i_vino.snap == CEPH_NOSNAP &&
+ ci->i_snap_realm &&
+ ci->i_snap_realm->ino == ci->i_vino.ino)
+ ret = true;
+ spin_unlock(&ci->i_ceph_lock);
+ return ret;
+}
+
+static size_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "max_bytes=%llu max_files=%llu",
+ ci->i_max_bytes, ci->i_max_files);
+}
+
+static size_t ceph_vxattrcb_quota_max_bytes(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ return snprintf(val, size, "%llu", ci->i_max_bytes);
+}
+
+static size_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ return snprintf(val, size, "%llu", ci->i_max_files);
+}
#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) \
@@ -247,6 +280,15 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
.hidden = true, \
.exists_cb = ceph_vxattrcb_layout_exists, \
}
+#define XATTR_QUOTA_FIELD(_type, _name) \
+ { \
+ .name = CEPH_XATTR_NAME(_type, _name), \
+ .name_size = sizeof(CEPH_XATTR_NAME(_type, _name)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+ .readonly = false, \
+ .hidden = true, \
+ .exists_cb = ceph_vxattrcb_quota_exists, \
+ }
static struct ceph_vxattr ceph_dir_vxattrs[] = {
{
@@ -270,6 +312,16 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_NAME_CEPH(dir, rsubdirs),
XATTR_NAME_CEPH(dir, rbytes),
XATTR_NAME_CEPH(dir, rctime),
+ {
+ .name = "ceph.quota",
+ .name_size = sizeof("ceph.quota"),
+ .getxattr_cb = ceph_vxattrcb_quota,
+ .readonly = false,
+ .hidden = true,
+ .exists_cb = ceph_vxattrcb_quota_exists,
+ },
+ XATTR_QUOTA_FIELD(quota, max_bytes),
+ XATTR_QUOTA_FIELD(quota, max_files),
{ .name = NULL, 0 } /* Required table terminator */
};
static size_t ceph_dir_vxattrs_name_size; /* total size of all names */
@@ -964,14 +1016,19 @@ int __ceph_setxattr(struct inode *inode, const char *name,
char *newval = NULL;
struct ceph_inode_xattr *xattr = NULL;
int required_blob_size;
+ bool check_realm = false;
bool lock_snap_rwsem = false;
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
vxattr = ceph_match_vxattr(inode, name);
- if (vxattr && vxattr->readonly)
- return -EOPNOTSUPP;
+ if (vxattr) {
+ if (vxattr->readonly)
+ return -EOPNOTSUPP;
+ if (value && !strncmp(vxattr->name, "ceph.quota", 10))
+ check_realm = true;
+ }
/* pass any unhandled ceph.* xattrs through to the MDS */
if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
@@ -1065,6 +1122,15 @@ do_sync_unlocked:
err = -EBUSY;
} else {
err = ceph_sync_setxattr(inode, name, value, size, flags);
+ if (err >= 0 && check_realm) {
+ /* check if snaprealm was created for quota inode */
+ spin_lock(&ci->i_ceph_lock);
+ if ((ci->i_max_files || ci->i_max_bytes) &&
+ !(ci->i_snap_realm &&
+ ci->i_snap_realm->ino == ci->i_vino.ino))
+ err = -EOPNOTSUPP;
+ spin_unlock(&ci->i_ceph_lock);
+ }
}
out:
ceph_free_cap_flush(prealloc_cf);
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 741749a98614..5f132d59dfc2 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -197,7 +197,7 @@ config CIFS_SMB311
config CIFS_SMB_DIRECT
bool "SMB Direct support (Experimental)"
- depends on CIFS=m && INFINIBAND || CIFS=y && INFINIBAND=y
+ depends on CIFS=m && INFINIBAND && INFINIBAND_ADDR_TRANS || CIFS=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y
help
Enables SMB Direct experimental support for SMB 3.0, 3.02 and 3.1.1.
SMB Direct allows transferring SMB packets over RDMA. If unsure,
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 7e4a1e2f0696..85817991ee68 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -1,11 +1,12 @@
# SPDX-License-Identifier: GPL-2.0
#
-# Makefile for Linux CIFS VFS client
+# Makefile for Linux CIFS/SMB2/SMB3 VFS client
#
+ccflags-y += -I$(src) # needed for trace events
obj-$(CONFIG_CIFS) += cifs.o
-cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
- link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \
+cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \
+ inode.o link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \
cifs_unicode.o nterr.o cifsencrypt.o \
readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o \
smb2ops.o smb2maperror.o smb2transport.o \
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 2c14020e5e1d..edf5f40898bf 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -46,67 +46,11 @@ void cifs_fscache_unregister(void)
}
/*
- * Key layout of CIFS server cache index object
- */
-struct cifs_server_key {
- uint16_t family; /* address family */
- __be16 port; /* IP port */
- union {
- struct in_addr ipv4_addr;
- struct in6_addr ipv6_addr;
- } addr[0];
-};
-
-/*
- * Server object keyed by {IPaddress,port,family} tuple
- */
-static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t maxbuf)
-{
- const struct TCP_Server_Info *server = cookie_netfs_data;
- const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr;
- const struct sockaddr_in *addr = (struct sockaddr_in *) sa;
- const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa;
- struct cifs_server_key *key = buffer;
- uint16_t key_len = sizeof(struct cifs_server_key);
-
- memset(key, 0, key_len);
-
- /*
- * Should not be a problem as sin_family/sin6_family overlays
- * sa_family field
- */
- switch (sa->sa_family) {
- case AF_INET:
- key->family = sa->sa_family;
- key->port = addr->sin_port;
- key->addr[0].ipv4_addr = addr->sin_addr;
- key_len += sizeof(key->addr[0].ipv4_addr);
- break;
-
- case AF_INET6:
- key->family = sa->sa_family;
- key->port = addr6->sin6_port;
- key->addr[0].ipv6_addr = addr6->sin6_addr;
- key_len += sizeof(key->addr[0].ipv6_addr);
- break;
-
- default:
- cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family);
- key_len = 0;
- break;
- }
-
- return key_len;
-}
-
-/*
* Server object for FS-Cache
*/
const struct fscache_cookie_def cifs_fscache_server_index_def = {
.name = "CIFS.server",
.type = FSCACHE_COOKIE_TYPE_INDEX,
- .get_key = cifs_server_get_key,
};
/*
@@ -116,7 +60,7 @@ struct cifs_fscache_super_auxdata {
u64 resource_id; /* unique server resource id */
};
-static char *extract_sharename(const char *treename)
+char *extract_sharename(const char *treename)
{
const char *src;
char *delim, *dst;
@@ -140,56 +84,11 @@ static char *extract_sharename(const char *treename)
return dst;
}
-/*
- * Superblock object currently keyed by share name
- */
-static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer,
- uint16_t maxbuf)
-{
- const struct cifs_tcon *tcon = cookie_netfs_data;
- char *sharename;
- uint16_t len;
-
- sharename = extract_sharename(tcon->treeName);
- if (IS_ERR(sharename)) {
- cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__);
- sharename = NULL;
- return 0;
- }
-
- len = strlen(sharename);
- if (len > maxbuf)
- return 0;
-
- memcpy(buffer, sharename, len);
-
- kfree(sharename);
-
- return len;
-}
-
-static uint16_t
-cifs_fscache_super_get_aux(const void *cookie_netfs_data, void *buffer,
- uint16_t maxbuf)
-{
- struct cifs_fscache_super_auxdata auxdata;
- const struct cifs_tcon *tcon = cookie_netfs_data;
-
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.resource_id = tcon->resource_id;
-
- if (maxbuf > sizeof(auxdata))
- maxbuf = sizeof(auxdata);
-
- memcpy(buffer, &auxdata, maxbuf);
-
- return maxbuf;
-}
-
static enum
fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data,
const void *data,
- uint16_t datalen)
+ uint16_t datalen,
+ loff_t object_size)
{
struct cifs_fscache_super_auxdata auxdata;
const struct cifs_tcon *tcon = cookie_netfs_data;
@@ -212,68 +111,14 @@ fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data,
const struct fscache_cookie_def cifs_fscache_super_index_def = {
.name = "CIFS.super",
.type = FSCACHE_COOKIE_TYPE_INDEX,
- .get_key = cifs_super_get_key,
- .get_aux = cifs_fscache_super_get_aux,
.check_aux = cifs_fscache_super_check_aux,
};
-/*
- * Auxiliary data attached to CIFS inode within the cache
- */
-struct cifs_fscache_inode_auxdata {
- struct timespec last_write_time;
- struct timespec last_change_time;
- u64 eof;
-};
-
-static uint16_t cifs_fscache_inode_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t maxbuf)
-{
- const struct cifsInodeInfo *cifsi = cookie_netfs_data;
- uint16_t keylen;
-
- /* use the UniqueId as the key */
- keylen = sizeof(cifsi->uniqueid);
- if (keylen > maxbuf)
- keylen = 0;
- else
- memcpy(buffer, &cifsi->uniqueid, keylen);
-
- return keylen;
-}
-
-static void
-cifs_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size)
-{
- const struct cifsInodeInfo *cifsi = cookie_netfs_data;
-
- *size = cifsi->vfs_inode.i_size;
-}
-
-static uint16_t
-cifs_fscache_inode_get_aux(const void *cookie_netfs_data, void *buffer,
- uint16_t maxbuf)
-{
- struct cifs_fscache_inode_auxdata auxdata;
- const struct cifsInodeInfo *cifsi = cookie_netfs_data;
-
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.eof = cifsi->server_eof;
- auxdata.last_write_time = cifsi->vfs_inode.i_mtime;
- auxdata.last_change_time = cifsi->vfs_inode.i_ctime;
-
- if (maxbuf > sizeof(auxdata))
- maxbuf = sizeof(auxdata);
-
- memcpy(buffer, &auxdata, maxbuf);
-
- return maxbuf;
-}
-
static enum
fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data,
const void *data,
- uint16_t datalen)
+ uint16_t datalen,
+ loff_t object_size)
{
struct cifs_fscache_inode_auxdata auxdata;
struct cifsInodeInfo *cifsi = cookie_netfs_data;
@@ -295,8 +140,5 @@ fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data,
const struct fscache_cookie_def cifs_fscache_inode_object_def = {
.name = "CIFS.uniqueid",
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
- .get_key = cifs_fscache_inode_get_key,
- .get_attr = cifs_fscache_inode_get_attr,
- .get_aux = cifs_fscache_inode_get_aux,
.check_aux = cifs_fscache_inode_check_aux,
};
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index e35e711db68e..116146022aa1 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -42,24 +42,7 @@ cifs_dump_mem(char *label, void *data, int length)
data, length, true);
}
-#ifdef CONFIG_CIFS_DEBUG
-void cifs_vfs_err(const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- pr_err_ratelimited("CIFS VFS: %pV", &vaf);
-
- va_end(args);
-}
-#endif
-
-void cifs_dump_detail(void *buf)
+void cifs_dump_detail(void *buf, struct TCP_Server_Info *server)
{
#ifdef CONFIG_CIFS_DEBUG2
struct smb_hdr *smb = (struct smb_hdr *)buf;
@@ -67,7 +50,8 @@ void cifs_dump_detail(void *buf)
cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d\n",
smb->Command, smb->Status.CifsError,
smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
- cifs_dbg(VFS, "smb buf %p len %u\n", smb, smbCalcSize(smb));
+ cifs_dbg(VFS, "smb buf %p len %u\n", smb,
+ server->ops->calc_smb_size(smb, server));
#endif /* CONFIG_CIFS_DEBUG2 */
}
@@ -100,7 +84,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
cifs_dbg(VFS, "IsMult: %d IsEnd: %d\n",
mid_entry->multiRsp, mid_entry->multiEnd);
if (mid_entry->resp_buf) {
- cifs_dump_detail(mid_entry->resp_buf);
+ cifs_dump_detail(mid_entry->resp_buf, server);
cifs_dump_mem("existing buf: ",
mid_entry->resp_buf, 62);
}
@@ -130,6 +114,8 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon)
seq_printf(m, " type: %d ", dev_type);
if (tcon->seal)
seq_printf(m, " Encrypted");
+ if (tcon->nocase)
+ seq_printf(m, " nocase");
if (tcon->unix_ext)
seq_printf(m, " POSIX Extensions");
if (tcon->ses->server->ops->dump_share_caps)
@@ -254,6 +240,10 @@ skip_rdma:
server->credits, server->dialect);
if (server->sign)
seq_printf(m, " signed");
+#ifdef CONFIG_CIFS_SMB311
+ if (server->posix_ext_supported)
+ seq_printf(m, " posix");
+#endif /* 3.1.1 */
i++;
list_for_each(tmp2, &server->smb_ses_list) {
ses = list_entry(tmp2, struct cifs_ses,
@@ -331,18 +321,6 @@ skip_rdma:
return 0;
}
-static int cifs_debug_data_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, cifs_debug_data_proc_show, NULL);
-}
-
-static const struct file_operations cifs_debug_data_proc_fops = {
- .open = cifs_debug_data_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
#ifdef CONFIG_CIFS_STATS
static ssize_t cifs_stats_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
@@ -514,35 +492,36 @@ cifs_proc_init(void)
if (proc_fs_cifs == NULL)
return;
- proc_create("DebugData", 0, proc_fs_cifs, &cifs_debug_data_proc_fops);
+ proc_create_single("DebugData", 0, proc_fs_cifs,
+ cifs_debug_data_proc_show);
#ifdef CONFIG_CIFS_STATS
- proc_create("Stats", 0, proc_fs_cifs, &cifs_stats_proc_fops);
+ proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_fops);
#endif /* STATS */
- proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops);
- proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops);
- proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs,
+ proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_fops);
+ proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_fops);
+ proc_create("LinuxExtensionsEnabled", 0644, proc_fs_cifs,
&cifs_linux_ext_proc_fops);
- proc_create("SecurityFlags", 0, proc_fs_cifs,
+ proc_create("SecurityFlags", 0644, proc_fs_cifs,
&cifs_security_flags_proc_fops);
- proc_create("LookupCacheEnabled", 0, proc_fs_cifs,
+ proc_create("LookupCacheEnabled", 0644, proc_fs_cifs,
&cifs_lookup_cache_proc_fops);
#ifdef CONFIG_CIFS_SMB_DIRECT
- proc_create("rdma_readwrite_threshold", 0, proc_fs_cifs,
+ proc_create("rdma_readwrite_threshold", 0644, proc_fs_cifs,
&cifs_rdma_readwrite_threshold_proc_fops);
- proc_create("smbd_max_frmr_depth", 0, proc_fs_cifs,
+ proc_create("smbd_max_frmr_depth", 0644, proc_fs_cifs,
&cifs_smbd_max_frmr_depth_proc_fops);
- proc_create("smbd_keep_alive_interval", 0, proc_fs_cifs,
+ proc_create("smbd_keep_alive_interval", 0644, proc_fs_cifs,
&cifs_smbd_keep_alive_interval_proc_fops);
- proc_create("smbd_max_receive_size", 0, proc_fs_cifs,
+ proc_create("smbd_max_receive_size", 0644, proc_fs_cifs,
&cifs_smbd_max_receive_size_proc_fops);
- proc_create("smbd_max_fragmented_recv_size", 0, proc_fs_cifs,
+ proc_create("smbd_max_fragmented_recv_size", 0644, proc_fs_cifs,
&cifs_smbd_max_fragmented_recv_size_proc_fops);
- proc_create("smbd_max_send_size", 0, proc_fs_cifs,
+ proc_create("smbd_max_send_size", 0644, proc_fs_cifs,
&cifs_smbd_max_send_size_proc_fops);
- proc_create("smbd_send_credit_target", 0, proc_fs_cifs,
+ proc_create("smbd_send_credit_target", 0644, proc_fs_cifs,
&cifs_smbd_send_credit_target_proc_fops);
- proc_create("smbd_receive_credit_max", 0, proc_fs_cifs,
+ proc_create("smbd_receive_credit_max", 0644, proc_fs_cifs,
&cifs_smbd_receive_credit_max_proc_fops);
#endif
}
@@ -600,6 +579,8 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer,
cifsFYI = bv;
else if ((c[0] > '1') && (c[0] <= '9'))
cifsFYI = (int) (c[0] - '0'); /* see cifs_debug.h for meanings */
+ else
+ return -EINVAL;
return count;
}
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index c611ca2339d7..f4f3f0853c6e 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -23,7 +23,7 @@
#define _H_CIFS_DEBUG
void cifs_dump_mem(char *label, void *data, int length);
-void cifs_dump_detail(void *);
+void cifs_dump_detail(void *buf, struct TCP_Server_Info *ptcp_info);
void cifs_dump_mids(struct TCP_Server_Info *);
extern bool traceSMB; /* flag which enables the function below */
void dump_smb(void *, int);
@@ -39,6 +39,7 @@ extern int cifsFYI;
#else
#define NOISY 0
#endif
+#define ONCE 8
/*
* debug ON
@@ -46,19 +47,28 @@ extern int cifsFYI;
*/
#ifdef CONFIG_CIFS_DEBUG
-__printf(1, 2) void cifs_vfs_err(const char *fmt, ...);
-
/* information message: e.g., configuration, major event */
-#define cifs_dbg(type, fmt, ...) \
-do { \
- if (type == FYI && cifsFYI & CIFS_INFO) { \
- pr_debug_ratelimited("%s: " \
- fmt, __FILE__, ##__VA_ARGS__); \
- } else if (type == VFS) { \
- cifs_vfs_err(fmt, ##__VA_ARGS__); \
- } else if (type == NOISY && type != 0) { \
- pr_debug_ratelimited(fmt, ##__VA_ARGS__); \
- } \
+#define cifs_dbg_func(ratefunc, type, fmt, ...) \
+do { \
+ if ((type) & FYI && cifsFYI & CIFS_INFO) { \
+ pr_debug_ ## ratefunc("%s: " \
+ fmt, __FILE__, ##__VA_ARGS__); \
+ } else if ((type) & VFS) { \
+ pr_err_ ## ratefunc("CIFS VFS: " \
+ fmt, ##__VA_ARGS__); \
+ } else if ((type) & NOISY && (NOISY != 0)) { \
+ pr_debug_ ## ratefunc(fmt, ##__VA_ARGS__); \
+ } \
+} while (0)
+
+#define cifs_dbg(type, fmt, ...) \
+do { \
+ if ((type) & ONCE) \
+ cifs_dbg_func(once, \
+ type, fmt, ##__VA_ARGS__); \
+ else \
+ cifs_dbg_func(ratelimited, \
+ type, fmt, ##__VA_ARGS__); \
} while (0)
/*
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 350fa55a1bf7..9731d0d891e7 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -50,6 +50,7 @@
* root mountable
*/
#define CIFS_MOUNT_UID_FROM_ACL 0x2000000 /* try to get UID via special SID */
+#define CIFS_MOUNT_NO_HANDLE_CACHE 0x4000000 /* disable caching dir handles */
struct cifs_sb_info {
struct rb_root tlink_tree;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index f715609b13f3..eb7b6573f322 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -58,13 +58,15 @@ bool traceSMB;
bool enable_oplocks = true;
bool linuxExtEnabled = true;
bool lookupCacheEnabled = true;
+bool disable_legacy_dialects; /* false by default */
unsigned int global_secflags = CIFSSEC_DEF;
/* unsigned int ntlmv2_support = 0; */
unsigned int sign_CIFS_PDUs = 1;
static const struct super_operations cifs_super_ops;
unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
module_param(CIFSMaxBufSize, uint, 0444);
-MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). "
+MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header) "
+ "for CIFS requests. "
"Default: 16384 Range: 8192 to 130048");
unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL;
module_param(cifs_min_rcv, uint, 0444);
@@ -76,11 +78,21 @@ MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
"Range: 2 to 256");
unsigned int cifs_max_pending = CIFS_MAX_REQ;
module_param(cifs_max_pending, uint, 0444);
-MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
+MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server for "
+ "CIFS/SMB1 dialect (N/A for SMB3) "
"Default: 32767 Range: 2 to 32767.");
module_param(enable_oplocks, bool, 0644);
MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
+module_param(disable_legacy_dialects, bool, 0644);
+MODULE_PARM_DESC(disable_legacy_dialects, "To improve security it may be "
+ "helpful to restrict the ability to "
+ "override the default dialects (SMB2.1, "
+ "SMB3 and SMB3.02) on mount with old "
+ "dialects (CIFS/SMB1 and SMB2) since "
+ "vers=1.0 (CIFS/SMB1) and vers=2.0 are weaker"
+ " and less secure. Default: n/N/0");
+
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
extern mempool_t *cifs_mid_poolp;
@@ -469,10 +481,20 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_puts(s, ",persistenthandles");
else if (tcon->use_resilient)
seq_puts(s, ",resilienthandles");
+
+#ifdef CONFIG_CIFS_SMB311
+ if (tcon->posix_extensions)
+ seq_puts(s, ",posix");
+ else if (tcon->unix_ext)
+ seq_puts(s, ",unix");
+ else
+ seq_puts(s, ",nounix");
+#else
if (tcon->unix_ext)
seq_puts(s, ",unix");
else
seq_puts(s, ",nounix");
+#endif /* SMB311 */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
seq_puts(s, ",posixpaths");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
@@ -495,6 +517,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_puts(s, ",sfu");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
seq_puts(s, ",nobrl");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_HANDLE_CACHE)
+ seq_puts(s, ",nohandlecache");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
seq_puts(s, ",cifsacl");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
@@ -897,6 +921,17 @@ struct file_system_type cifs_fs_type = {
/* .fs_flags */
};
MODULE_ALIAS_FS("cifs");
+
+static struct file_system_type smb3_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "smb3",
+ .mount = cifs_do_mount,
+ .kill_sb = cifs_kill_sb,
+ /* .fs_flags */
+};
+MODULE_ALIAS_FS("smb3");
+MODULE_ALIAS("smb3");
+
const struct inode_operations cifs_dir_inode_ops = {
.create = cifs_create,
.atomic_open = cifs_atomic_open,
@@ -1047,6 +1082,18 @@ out:
return rc;
}
+/*
+ * Directory operations under CIFS/SMB2/SMB3 are synchronous, so fsync()
+ * is a dummy operation.
+ */
+static int cifs_dir_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ cifs_dbg(FYI, "Sync directory - name: %pD datasync: 0x%x\n",
+ file, datasync);
+
+ return 0;
+}
+
static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff,
size_t len, unsigned int flags)
@@ -1181,6 +1228,7 @@ const struct file_operations cifs_dir_ops = {
.copy_file_range = cifs_copy_file_range,
.clone_file_range = cifs_clone_file_range,
.llseek = generic_file_llseek,
+ .fsync = cifs_dir_fsync,
};
static void
@@ -1422,6 +1470,12 @@ init_cifs(void)
if (rc)
goto out_init_cifs_idmap;
+ rc = register_filesystem(&smb3_fs_type);
+ if (rc) {
+ unregister_filesystem(&cifs_fs_type);
+ goto out_init_cifs_idmap;
+ }
+
return 0;
out_init_cifs_idmap:
@@ -1452,8 +1506,9 @@ out_clean_proc:
static void __exit
exit_cifs(void)
{
- cifs_dbg(NOISY, "exit_cifs\n");
+ cifs_dbg(NOISY, "exit_smb3\n");
unregister_filesystem(&cifs_fs_type);
+ unregister_filesystem(&smb3_fs_type);
cifs_dfs_release_automount_timer();
#ifdef CONFIG_CIFS_ACL
exit_cifs_idmap();
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 013ba2aed8d9..5f0231803431 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -149,5 +149,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.11"
+#define CIFS_VERSION "2.12"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 2282562e78a1..08d1cdd96701 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -176,6 +176,7 @@ struct smb_rqst {
struct kvec *rq_iov; /* array of kvecs */
unsigned int rq_nvec; /* number of kvecs in array */
struct page **rq_pages; /* pointer to array of page ptrs */
+ unsigned int rq_offset; /* the offset to the 1st page */
unsigned int rq_npages; /* number pages in array */
unsigned int rq_pagesz; /* page size to use */
unsigned int rq_tailsz; /* length of last page */
@@ -244,7 +245,7 @@ struct smb_version_operations {
int (*map_error)(char *, bool);
/* find mid corresponding to the response message */
struct mid_q_entry * (*find_mid)(struct TCP_Server_Info *, char *);
- void (*dump_detail)(void *);
+ void (*dump_detail)(void *buf, struct TCP_Server_Info *ptcp_info);
void (*clear_stats)(struct cifs_tcon *);
void (*print_stats)(struct seq_file *m, struct cifs_tcon *);
void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *);
@@ -372,7 +373,7 @@ struct smb_version_operations {
int (*close_dir)(const unsigned int, struct cifs_tcon *,
struct cifs_fid *);
/* calculate a size of SMB message */
- unsigned int (*calc_smb_size)(void *);
+ unsigned int (*calc_smb_size)(void *buf, struct TCP_Server_Info *ptcpi);
/* check for STATUS_PENDING and process it in a positive case */
bool (*is_status_pending)(char *, struct TCP_Server_Info *, int);
/* check for STATUS_NETWORK_SESSION_EXPIRED */
@@ -417,7 +418,7 @@ struct smb_version_operations {
/* create lease context buffer for CREATE request */
char * (*create_lease_buf)(u8 *, u8);
/* parse lease context buffer and return oplock/epoch info */
- __u8 (*parse_lease_buf)(void *, unsigned int *);
+ __u8 (*parse_lease_buf)(void *buf, unsigned int *epoch, char *lkey);
ssize_t (*copychunk_range)(const unsigned int,
struct cifsFileInfo *src_file,
struct cifsFileInfo *target_file,
@@ -457,7 +458,7 @@ struct smb_version_operations {
struct mid_q_entry **);
enum securityEnum (*select_sectype)(struct TCP_Server_Info *,
enum securityEnum);
-
+ int (*next_header)(char *);
};
struct smb_version_values {
@@ -521,10 +522,12 @@ struct smb_vol {
bool sfu_remap:1; /* remap seven reserved chars ala SFU */
bool posix_paths:1; /* unset to not ask for posix pathnames. */
bool no_linux_ext:1;
+ bool linux_ext:1;
bool sfu_emul:1;
bool nullauth:1; /* attempt to authenticate with null user */
bool nocase:1; /* request case insensitive filenames */
bool nobrl:1; /* disable sending byte range locks to srv */
+ bool nohandlecache:1; /* disable caching dir handles if srvr probs */
bool mand_lock:1; /* send mandatory not posix byte range lock reqs */
bool seal:1; /* request transport encryption on share */
bool nodfs:1; /* Do not request DFS, even if available */
@@ -630,7 +633,7 @@ struct TCP_Server_Info {
bool oplocks:1; /* enable oplocks */
unsigned int maxReq; /* Clients should submit no more */
/* than maxReq distinct unanswered SMBs to the server when using */
- /* multiplexed reads or writes */
+ /* multiplexed reads or writes (for SMB1/CIFS only, not SMB2/SMB3) */
unsigned int maxBuf; /* maxBuf specifies the maximum */
/* message size the server can send or receive for non-raw SMBs */
/* maxBuf is returned by SMB NegotiateProtocol so maxBuf is only 0 */
@@ -665,6 +668,8 @@ struct TCP_Server_Info {
struct delayed_work echo; /* echo ping workqueue job */
char *smallbuf; /* pointer to current "small" buffer */
char *bigbuf; /* pointer to current "big" buffer */
+ /* Total size of this PDU. Only valid from cifs_demultiplex_thread */
+ unsigned int pdu_size;
unsigned int total_read; /* total amount of data read in this pass */
#ifdef CONFIG_CIFS_FSCACHE
struct fscache_cookie *fscache; /* client index cache cookie */
@@ -676,8 +681,10 @@ struct TCP_Server_Info {
unsigned int max_read;
unsigned int max_write;
#ifdef CONFIG_CIFS_SMB311
+ __le16 cipher_type;
/* save initital negprot hash */
__u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE];
+ bool posix_ext_supported;
#endif /* 3.1.1 */
struct delayed_work reconnect; /* reconnect workqueue job */
struct mutex reconnect_mutex; /* prevent simultaneous reconnects */
@@ -950,9 +957,13 @@ struct cifs_tcon {
bool print:1; /* set if connection to printer share */
bool retry:1;
bool nocase:1;
+ bool nohandlecache:1; /* if strange server resource prob can turn off */
bool seal:1; /* transport encryption for this mounted share */
bool unix_ext:1; /* if false disable Linux extensions to CIFS protocol
for this mount even if server would support */
+#ifdef CONFIG_CIFS_SMB311
+ bool posix_extensions; /* if true SMB3.11 posix extensions enabled */
+#endif /* CIFS_311 */
bool local_lease:1; /* check leases (only) on local system not remote */
bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
bool broken_sparse_sup; /* if server or share does not support sparse */
@@ -976,6 +987,9 @@ struct cifs_tcon {
struct fscache_cookie *fscache; /* cookie for share */
#endif
struct list_head pending_opens; /* list of incomplete opens */
+ bool valid_root_fid:1; /* Do we have a useable root fid */
+ struct mutex prfid_mutex; /* prevents reopen race after dead ses*/
+ struct cifs_fid *prfid; /* handle to the directory at top of share */
/* BB add field for back pointer to sb struct(s)? */
};
@@ -1068,6 +1082,7 @@ struct cifs_open_parms {
int create_options;
const char *path;
struct cifs_fid *fid;
+ umode_t mode;
bool reconnect:1;
};
@@ -1166,10 +1181,11 @@ struct cifs_readdata {
struct smbd_mr *mr;
#endif
unsigned int pagesz;
+ unsigned int page_offset;
unsigned int tailsz;
unsigned int credits;
unsigned int nr_pages;
- struct page *pages[];
+ struct page **pages;
};
struct cifs_writedata;
@@ -1191,10 +1207,11 @@ struct cifs_writedata {
struct smbd_mr *mr;
#endif
unsigned int pagesz;
+ unsigned int page_offset;
unsigned int tailsz;
unsigned int credits;
unsigned int nr_pages;
- struct page *pages[];
+ struct page **pages;
};
/*
@@ -1373,6 +1390,7 @@ struct mid_q_entry {
mid_handle_t *handle; /* call handle mid callback */
void *callback_data; /* general purpose pointer for callback */
void *resp_buf; /* pointer to received SMB header */
+ unsigned int resp_buf_size;
int mid_state; /* wish this were enum but can not pass to wait_event */
unsigned int mid_flags;
__le16 command; /* smb command code */
@@ -1688,16 +1706,17 @@ GLOBAL_EXTERN atomic_t smBufAllocCount;
GLOBAL_EXTERN atomic_t midCount;
/* Misc globals */
-GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */
-GLOBAL_EXTERN bool lookupCacheEnabled;
-GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
+extern bool enable_oplocks; /* enable or disable oplocks */
+extern bool lookupCacheEnabled;
+extern unsigned int global_secflags; /* if on, session setup sent
with more secure ntlmssp2 challenge/resp */
-GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */
-GLOBAL_EXTERN bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
-GLOBAL_EXTERN unsigned int CIFSMaxBufSize; /* max size not including hdr */
-GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
-GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
-GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
+extern unsigned int sign_CIFS_PDUs; /* enable smb packet signing */
+extern bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
+extern unsigned int CIFSMaxBufSize; /* max size not including hdr */
+extern unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
+extern unsigned int cifs_min_small; /* min size of small buf pool */
+extern unsigned int cifs_max_pending; /* MAX requests at once to server*/
+extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */
#ifdef CONFIG_CIFS_ACL
GLOBAL_EXTERN struct rb_root uidtree;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 365a414a75e9..7933c5f9c076 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -21,6 +21,7 @@
#ifndef _CIFSPROTO_H
#define _CIFSPROTO_H
#include <linux/nls.h>
+#include "trace.h"
struct statfs;
struct smb_vol;
@@ -47,6 +48,7 @@ extern void _free_xid(unsigned int);
cifs_dbg(FYI, "CIFS VFS: in %s as Xid: %u with uid: %d\n", \
__func__, __xid, \
from_kuid(&init_user_ns, current_fsuid())); \
+ trace_smb3_enter(__xid, __func__); \
__xid; \
})
@@ -54,7 +56,11 @@ extern void _free_xid(unsigned int);
do { \
_free_xid(curr_xid); \
cifs_dbg(FYI, "CIFS VFS: leaving %s (xid = %u) rc = %d\n", \
- __func__, curr_xid, (int)rc); \
+ __func__, curr_xid, (int)rc); \
+ if (rc) \
+ trace_smb3_exit_err(curr_xid, __func__, (int)rc); \
+ else \
+ trace_smb3_exit_done(curr_xid, __func__); \
} while (0)
extern int init_cifs_idmap(void);
extern void exit_cifs_idmap(void);
@@ -124,7 +130,7 @@ extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
unsigned int bytes_written);
extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
-extern unsigned int smbCalcSize(void *buf);
+extern unsigned int smbCalcSize(void *buf, struct TCP_Server_Info *server);
extern int decode_negTokenInit(unsigned char *security_blob, int length,
struct TCP_Server_Info *server);
extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
@@ -197,7 +203,9 @@ extern void dequeue_mid(struct mid_q_entry *mid, bool malformed);
extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
unsigned int to_read);
extern int cifs_read_page_from_socket(struct TCP_Server_Info *server,
- struct page *page, unsigned int to_read);
+ struct page *page,
+ unsigned int page_offset,
+ unsigned int to_read);
extern int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
struct cifs_sb_info *cifs_sb);
extern int cifs_match_super(struct super_block *, void *);
@@ -525,6 +533,8 @@ int cifs_async_writev(struct cifs_writedata *wdata,
void cifs_writev_complete(struct work_struct *work);
struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages,
work_func_t complete);
+struct cifs_writedata *cifs_writedata_direct_alloc(struct page **pages,
+ work_func_t complete);
void cifs_writedata_release(struct kref *refcount);
int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 59c09a596c0a..5aca336642c0 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -106,6 +106,12 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
open_file->oplock_break_cancelled = true;
}
spin_unlock(&tcon->open_file_lock);
+
+ mutex_lock(&tcon->prfid_mutex);
+ tcon->valid_root_fid = false;
+ memset(tcon->prfid, 0, sizeof(struct cifs_fid));
+ mutex_unlock(&tcon->prfid_mutex);
+
/*
* BB Add call to invalidate_inodes(sb) for all superblocks mounted
* to this tcon.
@@ -206,8 +212,10 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
mutex_unlock(&ses->session_mutex);
cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
- if (rc)
+ if (rc) {
+ printk_once(KERN_WARNING "reconnect tcon failed rc = %d\n", rc);
goto out;
+ }
atomic_inc(&tconInfoReconnectCount);
@@ -453,6 +461,9 @@ cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required)
server->sign = true;
}
+ if (cifs_rdma_enabled(server) && server->sign)
+ cifs_dbg(VFS, "Signing is enabled, and RDMA read/write will be disabled");
+
return 0;
}
@@ -1416,8 +1427,9 @@ openRetry:
int
cifs_discard_remaining_data(struct TCP_Server_Info *server)
{
- unsigned int rfclen = get_rfc1002_length(server->smallbuf);
- int remaining = rfclen + 4 - server->total_read;
+ unsigned int rfclen = server->pdu_size;
+ int remaining = rfclen + server->vals->header_preamble_size -
+ server->total_read;
while (remaining > 0) {
int length;
@@ -1454,7 +1466,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
unsigned int data_offset, data_len;
struct cifs_readdata *rdata = mid->callback_data;
char *buf = server->smallbuf;
- unsigned int buflen = get_rfc1002_length(buf) +
+ unsigned int buflen = server->pdu_size +
server->vals->header_preamble_size;
bool use_rdma_mr = false;
@@ -1940,6 +1952,7 @@ cifs_writedata_release(struct kref *refcount)
if (wdata->cfile)
cifsFileInfo_put(wdata->cfile);
+ kvfree(wdata->pages);
kfree(wdata);
}
@@ -2063,12 +2076,22 @@ cifs_writev_complete(struct work_struct *work)
struct cifs_writedata *
cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete)
{
+ struct page **pages =
+ kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+ if (pages)
+ return cifs_writedata_direct_alloc(pages, complete);
+
+ return NULL;
+}
+
+struct cifs_writedata *
+cifs_writedata_direct_alloc(struct page **pages, work_func_t complete)
+{
struct cifs_writedata *wdata;
- /* writedata + number of page pointers */
- wdata = kzalloc(sizeof(*wdata) +
- sizeof(struct page *) * nr_pages, GFP_NOFS);
+ wdata = kzalloc(sizeof(*wdata), GFP_NOFS);
if (wdata != NULL) {
+ wdata->pages = pages;
kref_init(&wdata->refcount);
INIT_LIST_HEAD(&wdata->list);
init_completion(&wdata->done);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4e0808f40195..e5a2fe7f0dd4 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -61,6 +61,7 @@
#define RFC1001_PORT 139
extern mempool_t *cifs_req_poolp;
+extern bool disable_legacy_dialects;
/* FIXME: should these be tunable? */
#define TLINK_ERROR_EXPIRE (1 * HZ)
@@ -76,9 +77,10 @@ enum {
Opt_mapposix, Opt_nomapposix,
Opt_mapchars, Opt_nomapchars, Opt_sfu,
Opt_nosfu, Opt_nodfs, Opt_posixpaths,
- Opt_noposixpaths, Opt_nounix,
+ Opt_noposixpaths, Opt_nounix, Opt_unix,
Opt_nocase,
Opt_brl, Opt_nobrl,
+ Opt_handlecache, Opt_nohandlecache,
Opt_forcemandatorylock, Opt_setuidfromacl, Opt_setuids,
Opt_nosetuids, Opt_dynperm, Opt_nodynperm,
Opt_nohard, Opt_nosoft,
@@ -144,10 +146,16 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_noposixpaths, "noposixpaths" },
{ Opt_nounix, "nounix" },
{ Opt_nounix, "nolinux" },
+ { Opt_nounix, "noposix" },
+ { Opt_unix, "unix" },
+ { Opt_unix, "linux" },
+ { Opt_unix, "posix" },
{ Opt_nocase, "nocase" },
{ Opt_nocase, "ignorecase" },
{ Opt_brl, "brl" },
{ Opt_nobrl, "nobrl" },
+ { Opt_handlecache, "handlecache" },
+ { Opt_nohandlecache, "nohandlecache" },
{ Opt_nobrl, "nolock" },
{ Opt_forcemandatorylock, "forcemandatorylock" },
{ Opt_forcemandatorylock, "forcemand" },
@@ -591,10 +599,11 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
int
cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page,
- unsigned int to_read)
+ unsigned int page_offset, unsigned int to_read)
{
struct msghdr smb_msg;
- struct bio_vec bv = {.bv_page = page, .bv_len = to_read};
+ struct bio_vec bv = {
+ .bv_page = page, .bv_len = to_read, .bv_offset = page_offset};
iov_iter_bvec(&smb_msg.msg_iter, READ | ITER_BVEC, &bv, 1, to_read);
return cifs_readv_from_socket(server, &smb_msg);
}
@@ -772,7 +781,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
int length;
char *buf = server->smallbuf;
- unsigned int pdu_length = get_rfc1002_length(buf);
+ unsigned int pdu_length = server->pdu_size;
/* make sure this will fit in a large buffer */
if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) -
@@ -848,6 +857,7 @@ cifs_demultiplex_thread(void *p)
int length;
struct TCP_Server_Info *server = p;
unsigned int pdu_length;
+ unsigned int next_offset;
char *buf = NULL;
struct task_struct *task_to_wake = NULL;
struct mid_q_entry *mid_entry;
@@ -874,7 +884,11 @@ cifs_demultiplex_thread(void *p)
length = cifs_read_from_socket(server, buf, pdu_length);
if (length < 0)
continue;
- server->total_read = length;
+
+ if (server->vals->header_preamble_size == 0)
+ server->total_read = 0;
+ else
+ server->total_read = length;
/*
* The right amount was read from socket - 4 bytes,
@@ -885,12 +899,14 @@ cifs_demultiplex_thread(void *p)
cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length);
if (!is_smb_response(server, buf[0]))
continue;
+next_pdu:
+ server->pdu_size = pdu_length;
/* make sure we have enough to get to the MID */
- if (pdu_length < HEADER_SIZE(server) - 1 -
+ if (server->pdu_size < HEADER_SIZE(server) - 1 -
server->vals->header_preamble_size) {
cifs_dbg(VFS, "SMB response too short (%u bytes)\n",
- pdu_length);
+ server->pdu_size);
cifs_reconnect(server);
wake_up(&server->response_q);
continue;
@@ -905,6 +921,12 @@ cifs_demultiplex_thread(void *p)
continue;
server->total_read += length;
+ if (server->ops->next_header) {
+ next_offset = server->ops->next_header(buf);
+ if (next_offset)
+ server->pdu_size = next_offset;
+ }
+
if (server->ops->is_transform_hdr &&
server->ops->receive_transform &&
server->ops->is_transform_hdr(buf)) {
@@ -927,6 +949,7 @@ cifs_demultiplex_thread(void *p)
server->lstrp = jiffies;
if (mid_entry != NULL) {
+ mid_entry->resp_buf_size = server->pdu_size;
if ((mid_entry->mid_flags & MID_WAIT_CANCELLED) &&
mid_entry->mid_state == MID_RESPONSE_RECEIVED &&
server->ops->handle_cancelled_mid)
@@ -946,10 +969,18 @@ cifs_demultiplex_thread(void *p)
HEADER_SIZE(server));
#ifdef CONFIG_CIFS_DEBUG2
if (server->ops->dump_detail)
- server->ops->dump_detail(buf);
+ server->ops->dump_detail(buf, server);
cifs_dump_mids(server);
#endif /* CIFS_DEBUG2 */
-
+ }
+ if (pdu_length > server->pdu_size) {
+ if (!allocate_buffers(server))
+ continue;
+ pdu_length -= server->pdu_size;
+ server->total_read = 0;
+ server->large_buf = false;
+ buf = server->smallbuf;
+ goto next_pdu;
}
} /* end while !EXITING */
@@ -1141,10 +1172,18 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
switch (match_token(value, cifs_smb_version_tokens, args)) {
case Smb_1:
+ if (disable_legacy_dialects) {
+ cifs_dbg(VFS, "mount with legacy dialect disabled\n");
+ return 1;
+ }
vol->ops = &smb1_operations;
vol->vals = &smb1_values;
break;
case Smb_20:
+ if (disable_legacy_dialects) {
+ cifs_dbg(VFS, "mount with legacy dialect disabled\n");
+ return 1;
+ }
vol->ops = &smb20_operations;
vol->vals = &smb20_values;
break;
@@ -1424,8 +1463,17 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->posix_paths = 0;
break;
case Opt_nounix:
+ if (vol->linux_ext)
+ cifs_dbg(VFS,
+ "conflicting unix mount options\n");
vol->no_linux_ext = 1;
break;
+ case Opt_unix:
+ if (vol->no_linux_ext)
+ cifs_dbg(VFS,
+ "conflicting unix mount options\n");
+ vol->linux_ext = 1;
+ break;
case Opt_nocase:
vol->nocase = 1;
break;
@@ -1443,6 +1491,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
(S_IALLUGO & ~(S_ISUID | S_IXGRP)))
vol->file_mode = S_IALLUGO;
break;
+ case Opt_nohandlecache:
+ vol->nohandlecache = 1;
+ break;
+ case Opt_handlecache:
+ vol->nohandlecache = 0;
+ break;
case Opt_forcemandatorylock:
vol->mand_lock = 1;
break;
@@ -1975,14 +2029,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
goto cifs_parse_mount_err;
}
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (vol->rdma && vol->sign) {
- cifs_dbg(VFS, "Currently SMB direct doesn't support signing."
- " This is being fixed\n");
- goto cifs_parse_mount_err;
- }
-#endif
-
#ifndef CONFIG_KEYS
/* Muliuser mounts require CONFIG_KEYS support */
if (vol->multiuser) {
@@ -2957,6 +3003,29 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
}
}
+ if (volume_info->seal) {
+ if (ses->server->vals->protocol_id == 0) {
+ cifs_dbg(VFS,
+ "SMB3 or later required for encryption\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
+ } else if (tcon->ses->server->capabilities &
+ SMB2_GLOBAL_CAP_ENCRYPTION)
+ tcon->seal = true;
+ else {
+ cifs_dbg(VFS, "Encryption is not supported on share\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
+ }
+ }
+
+#ifdef CONFIG_CIFS_SMB311
+ if ((volume_info->linux_ext) && (ses->server->posix_ext_supported)) {
+ if (ses->server->vals->protocol_id == SMB311_PROT_ID)
+ tcon->posix_extensions = true;
+ }
+#endif /* 311 */
+
/*
* BB Do we need to wrap session_mutex around this TCon call and Unix
* SetFS as we do on SessSetup and reconnect?
@@ -3005,22 +3074,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
tcon->use_resilient = true;
}
- if (volume_info->seal) {
- if (ses->server->vals->protocol_id == 0) {
- cifs_dbg(VFS,
- "SMB3 or later required for encryption\n");
- rc = -EOPNOTSUPP;
- goto out_fail;
- } else if (tcon->ses->server->capabilities &
- SMB2_GLOBAL_CAP_ENCRYPTION)
- tcon->seal = true;
- else {
- cifs_dbg(VFS, "Encryption is not supported on share\n");
- rc = -EOPNOTSUPP;
- goto out_fail;
- }
- }
-
/*
* We can have only one retry value for a connection to a share so for
* resources mounted more than once to the same server share the last
@@ -3028,6 +3081,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
*/
tcon->retry = volume_info->retry;
tcon->nocase = volume_info->nocase;
+ tcon->nohandlecache = volume_info->nohandlecache;
tcon->local_lease = volume_info->local_lease;
INIT_LIST_HEAD(&tcon->pending_opens);
@@ -3586,6 +3640,8 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
if (pvolume_info->nobrl)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
+ if (pvolume_info->nohandlecache)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_HANDLE_CACHE;
if (pvolume_info->nostrictsync)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC;
if (pvolume_info->mand_lock)
@@ -3928,6 +3984,12 @@ try_mount_again:
goto remote_path_check;
}
+#ifdef CONFIG_CIFS_SMB311
+ /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */
+ if (tcon->posix_extensions)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
+#endif /* SMB3.11 */
+
/* tell server which Unix caps we support */
if (cap_unix(tcon->ses)) {
/* reset of caps checks mount to see if unix extensions
@@ -4359,6 +4421,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
vol_info->UNC = master_tcon->treeName;
vol_info->retry = master_tcon->retry;
vol_info->nocase = master_tcon->nocase;
+ vol_info->nohandlecache = master_tcon->nohandlecache;
vol_info->local_lease = master_tcon->local_lease;
vol_info->no_linux_ext = !master_tcon->unix_ext;
vol_info->sectype = master_tcon->ses->sectype;
@@ -4388,8 +4451,14 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
goto out;
}
+#ifdef CONFIG_CIFS_SMB311
+ /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */
+ if (tcon->posix_extensions)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
+#endif /* SMB3.11 */
if (cap_unix(ses))
reset_cifs_unix_caps(0, tcon, NULL, vol_info);
+
out:
kfree(vol_info->username);
kzfree(vol_info->password);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 81ba6e0d88d8..ddae52bd1993 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -369,7 +369,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
oparms.path = full_path;
oparms.fid = fid;
oparms.reconnect = false;
-
+ oparms.mode = mode;
rc = server->ops->open(xid, &oparms, oplock, buf);
if (rc) {
cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc);
@@ -684,6 +684,9 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
goto mknod_out;
}
+ if (!S_ISCHR(mode) && !S_ISBLK(mode))
+ goto mknod_out;
+
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
goto mknod_out;
@@ -692,10 +695,8 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
if (buf == NULL) {
- kfree(full_path);
rc = -ENOMEM;
- free_xid(xid);
- return rc;
+ goto mknod_out;
}
if (backup_cred(cifs_sb))
@@ -742,7 +743,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
pdev->minor = cpu_to_le64(MINOR(device_number));
rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
&bytes_written, iov, 1);
- } /* else if (S_ISFIFO) */
+ }
tcon->ses->server->ops->close(xid, tcon, &fid);
d_drop(direntry);
@@ -779,21 +780,25 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink)) {
free_xid(xid);
- return (struct dentry *)tlink;
+ return ERR_CAST(tlink);
}
pTcon = tlink_tcon(tlink);
rc = check_name(direntry, pTcon);
- if (rc)
- goto lookup_out;
+ if (unlikely(rc)) {
+ cifs_put_tlink(tlink);
+ free_xid(xid);
+ return ERR_PTR(rc);
+ }
/* can not grab the rename sem here since it would
deadlock in the cases (beginning of sys_rename itself)
in which we already have the sb rename sem */
full_path = build_path_from_dentry(direntry);
if (full_path == NULL) {
- rc = -ENOMEM;
- goto lookup_out;
+ cifs_put_tlink(tlink);
+ free_xid(xid);
+ return ERR_PTR(-ENOMEM);
}
if (d_really_is_positive(direntry)) {
@@ -812,29 +817,25 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
parent_dir_inode->i_sb, xid, NULL);
}
- if ((rc == 0) && (newInode != NULL)) {
- d_add(direntry, newInode);
+ if (rc == 0) {
/* since paths are not looked up by component - the parent
directories are presumed to be good here */
renew_parental_timestamps(direntry);
-
} else if (rc == -ENOENT) {
- rc = 0;
cifs_set_time(direntry, jiffies);
- d_add(direntry, NULL);
- /* if it was once a directory (but how can we tell?) we could do
- shrink_dcache_parent(direntry); */
- } else if (rc != -EACCES) {
- cifs_dbg(FYI, "Unexpected lookup error %d\n", rc);
- /* We special case check for Access Denied - since that
- is a common return code */
+ newInode = NULL;
+ } else {
+ if (rc != -EACCES) {
+ cifs_dbg(FYI, "Unexpected lookup error %d\n", rc);
+ /* We special case check for Access Denied - since that
+ is a common return code */
+ }
+ newInode = ERR_PTR(rc);
}
-
-lookup_out:
kfree(full_path);
cifs_put_tlink(tlink);
free_xid(xid);
- return ERR_PTR(rc);
+ return d_splice_alias(newInode, direntry);
}
static int
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 7cee97b93a61..87eece6fbd48 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1987,11 +1987,10 @@ wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages,
for (i = 0; i < found_pages; i++) {
page = wdata->pages[i];
/*
- * At this point we hold neither mapping->tree_lock nor
- * lock on the page itself: the page may be truncated or
- * invalidated (changing page->mapping to NULL), or even
- * swizzled back from swapper_space to tmpfs file
- * mapping
+ * At this point we hold neither the i_pages lock nor the
+ * page lock: the page may be truncated or invalidated
+ * (changing page->mapping to NULL), or even swizzled
+ * back from swapper_space to tmpfs file mapping
*/
if (nr_pages == 0)
@@ -2881,13 +2880,13 @@ out:
}
static struct cifs_readdata *
-cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete)
+cifs_readdata_direct_alloc(struct page **pages, work_func_t complete)
{
struct cifs_readdata *rdata;
- rdata = kzalloc(sizeof(*rdata) + (sizeof(struct page *) * nr_pages),
- GFP_KERNEL);
+ rdata = kzalloc(sizeof(*rdata), GFP_KERNEL);
if (rdata != NULL) {
+ rdata->pages = pages;
kref_init(&rdata->refcount);
INIT_LIST_HEAD(&rdata->list);
init_completion(&rdata->done);
@@ -2897,6 +2896,22 @@ cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete)
return rdata;
}
+static struct cifs_readdata *
+cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete)
+{
+ struct page **pages =
+ kzalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL);
+ struct cifs_readdata *ret = NULL;
+
+ if (pages) {
+ ret = cifs_readdata_direct_alloc(pages, complete);
+ if (!ret)
+ kfree(pages);
+ }
+
+ return ret;
+}
+
void
cifs_readdata_release(struct kref *refcount)
{
@@ -2911,6 +2926,7 @@ cifs_readdata_release(struct kref *refcount)
if (rdata->cfile)
cifsFileInfo_put(rdata->cfile);
+ kvfree(rdata->pages);
kfree(rdata);
}
@@ -3010,12 +3026,20 @@ uncached_fill_pages(struct TCP_Server_Info *server,
int result = 0;
unsigned int i;
unsigned int nr_pages = rdata->nr_pages;
+ unsigned int page_offset = rdata->page_offset;
rdata->got_bytes = 0;
rdata->tailsz = PAGE_SIZE;
for (i = 0; i < nr_pages; i++) {
struct page *page = rdata->pages[i];
size_t n;
+ unsigned int segment_size = rdata->pagesz;
+
+ if (i == 0)
+ segment_size -= page_offset;
+ else
+ page_offset = 0;
+
if (len <= 0) {
/* no need to hold page hostage */
@@ -3024,24 +3048,25 @@ uncached_fill_pages(struct TCP_Server_Info *server,
put_page(page);
continue;
}
+
n = len;
- if (len >= PAGE_SIZE) {
+ if (len >= segment_size)
/* enough data to fill the page */
- n = PAGE_SIZE;
- len -= n;
- } else {
- zero_user(page, len, PAGE_SIZE - len);
+ n = segment_size;
+ else
rdata->tailsz = len;
- len = 0;
- }
+ len -= n;
+
if (iter)
- result = copy_page_from_iter(page, 0, n, iter);
+ result = copy_page_from_iter(
+ page, page_offset, n, iter);
#ifdef CONFIG_CIFS_SMB_DIRECT
else if (rdata->mr)
result = n;
#endif
else
- result = cifs_read_page_from_socket(server, page, n);
+ result = cifs_read_page_from_socket(
+ server, page, page_offset, n);
if (result < 0)
break;
@@ -3114,6 +3139,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
rdata->bytes = cur_len;
rdata->pid = pid;
rdata->pagesz = PAGE_SIZE;
+ rdata->tailsz = PAGE_SIZE;
rdata->read_into_pages = cifs_uncached_read_into_pages;
rdata->copy_into_pages = cifs_uncached_copy_into_pages;
rdata->credits = credits;
@@ -3463,7 +3489,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
* If the page is mmap'ed into a process' page tables, then we need to make
* sure that it doesn't change while being written back.
*/
-static int
+static vm_fault_t
cifs_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
@@ -3558,6 +3584,7 @@ readpages_fill_pages(struct TCP_Server_Info *server,
u64 eof;
pgoff_t eof_index;
unsigned int nr_pages = rdata->nr_pages;
+ unsigned int page_offset = rdata->page_offset;
/* determine the eof that the server (probably) has */
eof = CIFS_I(rdata->mapping->host)->server_eof;
@@ -3568,13 +3595,21 @@ readpages_fill_pages(struct TCP_Server_Info *server,
rdata->tailsz = PAGE_SIZE;
for (i = 0; i < nr_pages; i++) {
struct page *page = rdata->pages[i];
- size_t n = PAGE_SIZE;
+ unsigned int to_read = rdata->pagesz;
+ size_t n;
+
+ if (i == 0)
+ to_read -= page_offset;
+ else
+ page_offset = 0;
+
+ n = to_read;
- if (len >= PAGE_SIZE) {
- len -= PAGE_SIZE;
+ if (len >= to_read) {
+ len -= to_read;
} else if (len > 0) {
/* enough for partial page, fill and zero the rest */
- zero_user(page, len, PAGE_SIZE - len);
+ zero_user(page, len + page_offset, to_read - len);
n = rdata->tailsz = len;
len = 0;
} else if (page->index > eof_index) {
@@ -3606,13 +3641,15 @@ readpages_fill_pages(struct TCP_Server_Info *server,
}
if (iter)
- result = copy_page_from_iter(page, 0, n, iter);
+ result = copy_page_from_iter(
+ page, page_offset, n, iter);
#ifdef CONFIG_CIFS_SMB_DIRECT
else if (rdata->mr)
result = n;
#endif
else
- result = cifs_read_page_from_socket(server, page, n);
+ result = cifs_read_page_from_socket(
+ server, page, page_offset, n);
if (result < 0)
break;
@@ -3791,6 +3828,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
rdata->bytes = bytes;
rdata->pid = pid;
rdata->pagesz = PAGE_SIZE;
+ rdata->tailsz = PAGE_SIZE;
rdata->read_into_pages = cifs_readpages_read_into_pages;
rdata->copy_into_pages = cifs_readpages_copy_into_pages;
rdata->credits = credits;
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 8d4b7bc8ae91..25d3f66b2d50 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -23,11 +23,63 @@
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
+/*
+ * Key layout of CIFS server cache index object
+ */
+struct cifs_server_key {
+ struct {
+ uint16_t family; /* address family */
+ __be16 port; /* IP port */
+ } hdr;
+ union {
+ struct in_addr ipv4_addr;
+ struct in6_addr ipv6_addr;
+ };
+} __packed;
+
+/*
+ * Get a cookie for a server object keyed by {IPaddress,port,family} tuple
+ */
void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server)
{
+ const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr;
+ const struct sockaddr_in *addr = (struct sockaddr_in *) sa;
+ const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa;
+ struct cifs_server_key key;
+ uint16_t key_len = sizeof(key.hdr);
+
+ memset(&key, 0, sizeof(key));
+
+ /*
+ * Should not be a problem as sin_family/sin6_family overlays
+ * sa_family field
+ */
+ key.hdr.family = sa->sa_family;
+ switch (sa->sa_family) {
+ case AF_INET:
+ key.hdr.port = addr->sin_port;
+ key.ipv4_addr = addr->sin_addr;
+ key_len += sizeof(key.ipv4_addr);
+ break;
+
+ case AF_INET6:
+ key.hdr.port = addr6->sin6_port;
+ key.ipv6_addr = addr6->sin6_addr;
+ key_len += sizeof(key.ipv6_addr);
+ break;
+
+ default:
+ cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family);
+ server->fscache = NULL;
+ return;
+ }
+
server->fscache =
fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
- &cifs_fscache_server_index_def, server, true);
+ &cifs_fscache_server_index_def,
+ &key, key_len,
+ NULL, 0,
+ server, 0, true);
cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
__func__, server, server->fscache);
}
@@ -36,17 +88,29 @@ void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server)
{
cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
__func__, server, server->fscache);
- fscache_relinquish_cookie(server->fscache, 0);
+ fscache_relinquish_cookie(server->fscache, NULL, false);
server->fscache = NULL;
}
void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
{
struct TCP_Server_Info *server = tcon->ses->server;
+ char *sharename;
+
+ sharename = extract_sharename(tcon->treeName);
+ if (IS_ERR(sharename)) {
+ cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__);
+ tcon->fscache = NULL;
+ return;
+ }
tcon->fscache =
fscache_acquire_cookie(server->fscache,
- &cifs_fscache_super_index_def, tcon, true);
+ &cifs_fscache_super_index_def,
+ sharename, strlen(sharename),
+ &tcon->resource_id, sizeof(tcon->resource_id),
+ tcon, 0, true);
+ kfree(sharename);
cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
__func__, server->fscache, tcon->fscache);
}
@@ -54,10 +118,28 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon)
{
cifs_dbg(FYI, "%s: (0x%p)\n", __func__, tcon->fscache);
- fscache_relinquish_cookie(tcon->fscache, 0);
+ fscache_relinquish_cookie(tcon->fscache, &tcon->resource_id, false);
tcon->fscache = NULL;
}
+static void cifs_fscache_acquire_inode_cookie(struct cifsInodeInfo *cifsi,
+ struct cifs_tcon *tcon)
+{
+ struct cifs_fscache_inode_auxdata auxdata;
+
+ memset(&auxdata, 0, sizeof(auxdata));
+ auxdata.eof = cifsi->server_eof;
+ auxdata.last_write_time = cifsi->vfs_inode.i_mtime;
+ auxdata.last_change_time = cifsi->vfs_inode.i_ctime;
+
+ cifsi->fscache =
+ fscache_acquire_cookie(tcon->fscache,
+ &cifs_fscache_inode_object_def,
+ &cifsi->uniqueid, sizeof(cifsi->uniqueid),
+ &auxdata, sizeof(auxdata),
+ cifsi, cifsi->vfs_inode.i_size, true);
+}
+
static void cifs_fscache_enable_inode_cookie(struct inode *inode)
{
struct cifsInodeInfo *cifsi = CIFS_I(inode);
@@ -67,21 +149,28 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
if (cifsi->fscache)
return;
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) {
- cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
- &cifs_fscache_inode_object_def, cifsi, true);
- cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n",
- __func__, tcon->fscache, cifsi->fscache);
- }
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE))
+ return;
+
+ cifs_fscache_acquire_inode_cookie(cifsi, tcon);
+
+ cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n",
+ __func__, tcon->fscache, cifsi->fscache);
}
void cifs_fscache_release_inode_cookie(struct inode *inode)
{
+ struct cifs_fscache_inode_auxdata auxdata;
struct cifsInodeInfo *cifsi = CIFS_I(inode);
if (cifsi->fscache) {
+ memset(&auxdata, 0, sizeof(auxdata));
+ auxdata.eof = cifsi->server_eof;
+ auxdata.last_write_time = cifsi->vfs_inode.i_mtime;
+ auxdata.last_change_time = cifsi->vfs_inode.i_ctime;
+
cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache);
- fscache_relinquish_cookie(cifsi->fscache, 0);
+ fscache_relinquish_cookie(cifsi->fscache, &auxdata, false);
cifsi->fscache = NULL;
}
}
@@ -93,7 +182,7 @@ static void cifs_fscache_disable_inode_cookie(struct inode *inode)
if (cifsi->fscache) {
cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache);
fscache_uncache_all_inode_pages(cifsi->fscache, inode);
- fscache_relinquish_cookie(cifsi->fscache, 1);
+ fscache_relinquish_cookie(cifsi->fscache, NULL, true);
cifsi->fscache = NULL;
}
}
@@ -110,16 +199,14 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode)
{
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
struct fscache_cookie *old = cifsi->fscache;
if (cifsi->fscache) {
/* retire the current fscache cache and get a new one */
- fscache_relinquish_cookie(cifsi->fscache, 1);
+ fscache_relinquish_cookie(cifsi->fscache, NULL, true);
- cifsi->fscache = fscache_acquire_cookie(
- cifs_sb_master_tcon(cifs_sb)->fscache,
- &cifs_fscache_inode_object_def,
- cifsi, true);
+ cifs_fscache_acquire_inode_cookie(cifsi, tcon);
cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n",
__func__, cifsi->fscache, old);
}
@@ -214,13 +301,15 @@ int __cifs_readpages_from_fscache(struct inode *inode,
void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
{
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
int ret;
cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n",
- __func__, CIFS_I(inode)->fscache, page, inode);
- ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL);
+ __func__, cifsi->fscache, page, inode);
+ ret = fscache_write_page(cifsi->fscache, page,
+ cifsi->vfs_inode.i_size, GFP_KERNEL);
if (ret != 0)
- fscache_uncache_page(CIFS_I(inode)->fscache, page);
+ fscache_uncache_page(cifsi->fscache, page);
}
void __cifs_fscache_readpages_cancel(struct inode *inode, struct list_head *pages)
@@ -239,4 +328,3 @@ void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode)
fscache_wait_on_page_write(cookie, page);
fscache_uncache_page(cookie, page);
}
-
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index 24794b6cd8ec..c7e3ac251e16 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -27,6 +27,18 @@
#ifdef CONFIG_CIFS_FSCACHE
+/*
+ * Auxiliary data attached to CIFS inode within the cache
+ */
+struct cifs_fscache_inode_auxdata {
+ struct timespec last_write_time;
+ struct timespec last_change_time;
+ u64 eof;
+};
+
+/*
+ * cache.c
+ */
extern struct fscache_netfs cifs_fscache_netfs;
extern const struct fscache_cookie_def cifs_fscache_server_index_def;
extern const struct fscache_cookie_def cifs_fscache_super_index_def;
@@ -34,6 +46,7 @@ extern const struct fscache_cookie_def cifs_fscache_inode_object_def;
extern int cifs_fscache_register(void);
extern void cifs_fscache_unregister(void);
+extern char *extract_sharename(const char *);
/*
* fscache.c
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f856df4adae3..745fd7fe8d0e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -710,7 +710,7 @@ cgfi_exit:
/* Simple function to return a 64 bit hash of string. Rarely called */
static __u64 simple_hashstr(const char *str)
{
- const __u64 hash_mult = 1125899906842597L; /* a big enough prime */
+ const __u64 hash_mult = 1125899906842597ULL; /* a big enough prime */
__u64 hash = 0;
while (*str)
@@ -746,7 +746,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
cifs_dbg(FYI, "Getting info on %s\n", full_path);
if ((data == NULL) && (*inode != NULL)) {
- if (CIFS_CACHE_READ(CIFS_I(*inode))) {
+ if (CIFS_CACHE_READ(CIFS_I(*inode)) &&
+ CIFS_I(*inode)->time != 0) {
cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
goto cgii_exit;
}
@@ -1857,15 +1858,15 @@ cifs_inode_needs_reval(struct inode *inode)
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ if (cifs_i->time == 0)
+ return true;
+
if (CIFS_CACHE_READ(cifs_i))
return false;
if (!lookupCacheEnabled)
return true;
- if (cifs_i->time == 0)
- return true;
-
if (!cifs_sb->actimeo)
return true;
@@ -2104,10 +2105,14 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
static void cifs_setsize(struct inode *inode, loff_t offset)
{
+ struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+
spin_lock(&inode->i_lock);
i_size_write(inode, offset);
spin_unlock(&inode->i_lock);
+ /* Cached inode must be refreshed on truncate */
+ cifs_i->time = 0;
truncate_pagecache(inode, offset);
}
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 460084a8eac5..aba3fc3058da 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -117,6 +117,8 @@ tconInfoAlloc(void)
INIT_LIST_HEAD(&ret_buf->openFileList);
INIT_LIST_HEAD(&ret_buf->tcon_list);
spin_lock_init(&ret_buf->open_file_lock);
+ mutex_init(&ret_buf->prfid_mutex);
+ ret_buf->prfid = kzalloc(sizeof(struct cifs_fid), GFP_KERNEL);
#ifdef CONFIG_CIFS_STATS
spin_lock_init(&ret_buf->stat_lock);
#endif
@@ -134,6 +136,7 @@ tconInfoFree(struct cifs_tcon *buf_to_free)
atomic_dec(&tconInfoAllocCount);
kfree(buf_to_free->nativeFileSystem);
kzfree(buf_to_free->password);
+ kfree(buf_to_free->prfid);
kfree(buf_to_free);
}
@@ -145,7 +148,7 @@ cifs_buf_get(void)
* SMB2 header is bigger than CIFS one - no problems to clean some
* more bytes for CIFS.
*/
- size_t buf_size = sizeof(struct smb2_hdr);
+ size_t buf_size = sizeof(struct smb2_sync_hdr);
/*
* We could use negotiated size instead of max_msgsize -
@@ -339,7 +342,7 @@ checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server)
/* otherwise, there is enough to get to the BCC */
if (check_smb_hdr(smb))
return -EIO;
- clc_len = smbCalcSize(smb);
+ clc_len = smbCalcSize(smb, server);
if (4 + rfclen != total_read) {
cifs_dbg(VFS, "Length read does not match RFC1001 length %d\n",
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index cc88f4f0325e..d7ad0dfe4e68 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -903,7 +903,7 @@ map_smb_to_linux_error(char *buf, bool logErr)
* portion, the number of word parameters and the data portion of the message
*/
unsigned int
-smbCalcSize(void *buf)
+smbCalcSize(void *buf, struct TCP_Server_Info *server)
{
struct smb_hdr *ptr = (struct smb_hdr *)buf;
return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index a27fc8791551..eeab81c9452f 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -650,7 +650,8 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
char *cur_ent;
char *end_of_smb = cfile->srch_inf.ntwrk_buf_start +
server->ops->calc_smb_size(
- cfile->srch_inf.ntwrk_buf_start);
+ cfile->srch_inf.ntwrk_buf_start,
+ server);
cur_ent = cfile->srch_inf.srch_entries_start;
first_entry_in_buffer = cfile->srch_inf.index_of_last_entry
@@ -831,7 +832,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n",
num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
max_len = tcon->ses->server->ops->calc_smb_size(
- cifsFile->srch_inf.ntwrk_buf_start);
+ cifsFile->srch_inf.ntwrk_buf_start,
+ tcon->ses->server);
end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index 401a5d856636..0ffa18094335 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -61,9 +61,4 @@
/* Maximum buffer size value we can send with 1 credit */
#define SMB2_MAX_BUFFER_SIZE 65536
-static inline struct smb2_sync_hdr *get_sync_hdr(void *buf)
-{
- return &(((struct smb2_hdr *)buf)->sync_hdr);
-}
-
#endif /* _SMB2_GLOB_H */
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 1238cd3552f9..a6e786e39248 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -44,26 +44,38 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
__u32 create_options, void *data, int command)
{
int rc, tmprc = 0;
- __le16 *utf16_path;
+ __le16 *utf16_path = NULL;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct cifs_open_parms oparms;
struct cifs_fid fid;
+ bool use_cached_root_handle = false;
- utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
- if (!utf16_path)
- return -ENOMEM;
+ if ((strcmp(full_path, "") == 0) && (create_options == 0) &&
+ (desired_access == FILE_READ_ATTRIBUTES) &&
+ (create_disposition == FILE_OPEN) &&
+ (tcon->nohandlecache == false)) {
+ rc = open_shroot(xid, tcon, &fid);
+ if (rc == 0)
+ use_cached_root_handle = true;
+ }
+
+ if (use_cached_root_handle == false) {
+ utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
- oparms.tcon = tcon;
- oparms.desired_access = desired_access;
- oparms.disposition = create_disposition;
- oparms.create_options = create_options;
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms.tcon = tcon;
+ oparms.desired_access = desired_access;
+ oparms.disposition = create_disposition;
+ oparms.create_options = create_options;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
- if (rc) {
- kfree(utf16_path);
- return rc;
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
+ if (rc) {
+ kfree(utf16_path);
+ return rc;
+ }
}
switch (command) {
@@ -107,7 +119,8 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
break;
}
- rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ if (use_cached_root_handle == false)
+ rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
if (tmprc)
rc = tmprc;
kfree(utf16_path);
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 3bfc9c990724..20a2d304c603 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -27,6 +27,7 @@
#include "smb2proto.h"
#include "smb2status.h"
#include "smb2glob.h"
+#include "trace.h"
struct status_to_posix_error {
__le32 smb2_status;
@@ -2450,13 +2451,16 @@ smb2_print_status(__le32 status)
int
map_smb2_to_linux_error(char *buf, bool log_err)
{
- struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
unsigned int i;
int rc = -EIO;
__le32 smb2err = shdr->Status;
- if (smb2err == 0)
+ if (smb2err == 0) {
+ trace_smb3_cmd_done(shdr->TreeId, shdr->SessionId,
+ le16_to_cpu(shdr->Command), le64_to_cpu(shdr->MessageId));
return 0;
+ }
/* mask facility */
if (log_err && (smb2err != STATUS_MORE_PROCESSING_REQUIRED) &&
@@ -2478,5 +2482,8 @@ map_smb2_to_linux_error(char *buf, bool log_err)
cifs_dbg(FYI, "Mapping SMB2 status code 0x%08x to POSIX err %d\n",
__le32_to_cpu(smb2err), rc);
+ trace_smb3_cmd_err(shdr->TreeId, shdr->SessionId,
+ le16_to_cpu(shdr->Command),
+ le64_to_cpu(shdr->MessageId), le32_to_cpu(smb2err), rc);
return rc;
}
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 5406e95f5d92..cb5728e3d87d 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -93,26 +93,57 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
/* SMB2_OPLOCK_BREAK */ cpu_to_le16(24)
};
+#ifdef CONFIG_CIFS_SMB311
+static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len,
+ __u32 non_ctxlen)
+{
+ __u16 neg_count;
+ __u32 nc_offset, size_of_pad_before_neg_ctxts;
+ struct smb2_negotiate_rsp *pneg_rsp = (struct smb2_negotiate_rsp *)hdr;
+
+ /* Negotiate contexts are only valid for latest dialect SMB3.11 */
+ neg_count = le16_to_cpu(pneg_rsp->NegotiateContextCount);
+ if ((neg_count == 0) ||
+ (pneg_rsp->DialectRevision != cpu_to_le16(SMB311_PROT_ID)))
+ return 0;
+
+ /* Make sure that negotiate contexts start after gss security blob */
+ nc_offset = le32_to_cpu(pneg_rsp->NegotiateContextOffset);
+ if (nc_offset < non_ctxlen) {
+ printk_once(KERN_WARNING "invalid negotiate context offset\n");
+ return 0;
+ }
+ size_of_pad_before_neg_ctxts = nc_offset - non_ctxlen;
+
+ /* Verify that at least minimal negotiate contexts fit within frame */
+ if (len < nc_offset + (neg_count * sizeof(struct smb2_neg_context))) {
+ printk_once(KERN_WARNING "negotiate context goes beyond end\n");
+ return 0;
+ }
+
+ cifs_dbg(FYI, "length of negcontexts %d pad %d\n",
+ len - nc_offset, size_of_pad_before_neg_ctxts);
+
+ /* length of negcontexts including pad from end of sec blob to them */
+ return (len - nc_offset) + size_of_pad_before_neg_ctxts;
+}
+#endif /* CIFS_SMB311 */
+
int
-smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
+smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
{
- struct smb2_pdu *pdu = (struct smb2_pdu *)buf;
- struct smb2_hdr *hdr = &pdu->hdr;
- struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)shdr;
__u64 mid;
- __u32 len = get_rfc1002_length(buf);
__u32 clc_len; /* calculated length */
int command;
-
- /* BB disable following printk later */
- cifs_dbg(FYI, "%s length: 0x%x, smb_buf_length: 0x%x\n",
- __func__, length, len);
+ int pdu_size = sizeof(struct smb2_sync_pdu);
+ int hdr_size = sizeof(struct smb2_sync_hdr);
/*
* Add function to do table lookup of StructureSize by command
* ie Validate the wct via smb2_struct_sizes table above
*/
-
if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
struct smb2_transform_hdr *thdr =
(struct smb2_transform_hdr *)buf;
@@ -136,8 +167,8 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
}
mid = le64_to_cpu(shdr->MessageId);
- if (length < sizeof(struct smb2_pdu)) {
- if ((length >= sizeof(struct smb2_hdr))
+ if (len < pdu_size) {
+ if ((len >= hdr_size)
&& (shdr->Status != 0)) {
pdu->StructureSize2 = 0;
/*
@@ -150,8 +181,7 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
}
return 1;
}
- if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE -
- srvr->vals->header_preamble_size) {
+ if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE) {
cifs_dbg(VFS, "SMB length greater than maximum, mid=%llu\n",
mid);
return 1;
@@ -190,39 +220,38 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
}
}
- if (srvr->vals->header_preamble_size + len != length) {
- cifs_dbg(VFS, "Total length %u RFC1002 length %zu mismatch mid %llu\n",
- length, srvr->vals->header_preamble_size + len, mid);
- return 1;
- }
-
- clc_len = smb2_calc_size(hdr);
+ clc_len = smb2_calc_size(buf, srvr);
- if (srvr->vals->header_preamble_size + len != clc_len) {
- cifs_dbg(FYI, "Calculated size %u length %zu mismatch mid %llu\n",
- clc_len, srvr->vals->header_preamble_size + len, mid);
+#ifdef CONFIG_CIFS_SMB311
+ if (shdr->Command == SMB2_NEGOTIATE)
+ clc_len += get_neg_ctxt_len(shdr, len, clc_len);
+#endif /* SMB311 */
+ if (len != clc_len) {
+ cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n",
+ clc_len, len, mid);
/* create failed on symlink */
if (command == SMB2_CREATE_HE &&
shdr->Status == STATUS_STOPPED_ON_SYMLINK)
return 0;
/* Windows 7 server returns 24 bytes more */
- if (clc_len + 24 - srvr->vals->header_preamble_size == len && command == SMB2_OPLOCK_BREAK_HE)
+ if (clc_len + 24 == len && command == SMB2_OPLOCK_BREAK_HE)
return 0;
/* server can return one byte more due to implied bcc[0] */
- if (clc_len == srvr->vals->header_preamble_size + len + 1)
+ if (clc_len == len + 1)
return 0;
/*
* MacOS server pads after SMB2.1 write response with 3 bytes
* of junk. Other servers match RFC1001 len to actual
* SMB2/SMB3 frame length (header + smb2 response specific data)
+ * Some windows servers do too when compounding is used.
* Log the server error (once), but allow it and continue
* since the frame is parseable.
*/
- if (clc_len < srvr->vals->header_preamble_size /* RFC1001 header size */ + len) {
+ if (clc_len < len) {
printk_once(KERN_WARNING
- "SMB2 server sent bad RFC1001 len %d not %zu\n",
- len, clc_len - srvr->vals->header_preamble_size);
+ "SMB2 server sent bad RFC1001 len %d not %d\n",
+ len, clc_len);
return 0;
}
@@ -263,15 +292,14 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = {
* area and the offset to it (from the beginning of the smb are also returned.
*/
char *
-smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
+smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr)
{
- struct smb2_sync_hdr *shdr = get_sync_hdr(hdr);
*off = 0;
*len = 0;
/* error responses do not have data area */
if (shdr->Status && shdr->Status != STATUS_MORE_PROCESSING_REQUIRED &&
- (((struct smb2_err_rsp *)hdr)->StructureSize) ==
+ (((struct smb2_err_rsp *)shdr)->StructureSize) ==
SMB2_ERROR_STRUCTURE_SIZE2)
return NULL;
@@ -283,42 +311,44 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
switch (shdr->Command) {
case SMB2_NEGOTIATE:
*off = le16_to_cpu(
- ((struct smb2_negotiate_rsp *)hdr)->SecurityBufferOffset);
+ ((struct smb2_negotiate_rsp *)shdr)->SecurityBufferOffset);
*len = le16_to_cpu(
- ((struct smb2_negotiate_rsp *)hdr)->SecurityBufferLength);
+ ((struct smb2_negotiate_rsp *)shdr)->SecurityBufferLength);
break;
case SMB2_SESSION_SETUP:
*off = le16_to_cpu(
- ((struct smb2_sess_setup_rsp *)hdr)->SecurityBufferOffset);
+ ((struct smb2_sess_setup_rsp *)shdr)->SecurityBufferOffset);
*len = le16_to_cpu(
- ((struct smb2_sess_setup_rsp *)hdr)->SecurityBufferLength);
+ ((struct smb2_sess_setup_rsp *)shdr)->SecurityBufferLength);
break;
case SMB2_CREATE:
*off = le32_to_cpu(
- ((struct smb2_create_rsp *)hdr)->CreateContextsOffset);
+ ((struct smb2_create_rsp *)shdr)->CreateContextsOffset);
*len = le32_to_cpu(
- ((struct smb2_create_rsp *)hdr)->CreateContextsLength);
+ ((struct smb2_create_rsp *)shdr)->CreateContextsLength);
break;
case SMB2_QUERY_INFO:
*off = le16_to_cpu(
- ((struct smb2_query_info_rsp *)hdr)->OutputBufferOffset);
+ ((struct smb2_query_info_rsp *)shdr)->OutputBufferOffset);
*len = le32_to_cpu(
- ((struct smb2_query_info_rsp *)hdr)->OutputBufferLength);
+ ((struct smb2_query_info_rsp *)shdr)->OutputBufferLength);
break;
case SMB2_READ:
- *off = ((struct smb2_read_rsp *)hdr)->DataOffset;
- *len = le32_to_cpu(((struct smb2_read_rsp *)hdr)->DataLength);
+ /* TODO: is this a bug ? */
+ *off = ((struct smb2_read_rsp *)shdr)->DataOffset;
+ *len = le32_to_cpu(((struct smb2_read_rsp *)shdr)->DataLength);
break;
case SMB2_QUERY_DIRECTORY:
*off = le16_to_cpu(
- ((struct smb2_query_directory_rsp *)hdr)->OutputBufferOffset);
+ ((struct smb2_query_directory_rsp *)shdr)->OutputBufferOffset);
*len = le32_to_cpu(
- ((struct smb2_query_directory_rsp *)hdr)->OutputBufferLength);
+ ((struct smb2_query_directory_rsp *)shdr)->OutputBufferLength);
break;
case SMB2_IOCTL:
*off = le32_to_cpu(
- ((struct smb2_ioctl_rsp *)hdr)->OutputOffset);
- *len = le32_to_cpu(((struct smb2_ioctl_rsp *)hdr)->OutputCount);
+ ((struct smb2_ioctl_rsp *)shdr)->OutputOffset);
+ *len = le32_to_cpu(
+ ((struct smb2_ioctl_rsp *)shdr)->OutputCount);
break;
case SMB2_CHANGE_NOTIFY:
default:
@@ -361,15 +391,14 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
* portion, the number of word parameters and the data portion of the message.
*/
unsigned int
-smb2_calc_size(void *buf)
+smb2_calc_size(void *buf, struct TCP_Server_Info *srvr)
{
- struct smb2_pdu *pdu = (struct smb2_pdu *)buf;
- struct smb2_hdr *hdr = &pdu->hdr;
- struct smb2_sync_hdr *shdr = get_sync_hdr(hdr);
+ struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)buf;
+ struct smb2_sync_hdr *shdr = &pdu->sync_hdr;
int offset; /* the offset from the beginning of SMB to data area */
int data_length; /* the length of the variable length data area */
/* Structure Size has already been checked to make sure it is 64 */
- int len = 4 + le16_to_cpu(shdr->StructureSize);
+ int len = le16_to_cpu(shdr->StructureSize);
/*
* StructureSize2, ie length of fixed parameter area has already
@@ -380,7 +409,7 @@ smb2_calc_size(void *buf)
if (has_smb2_data_area[le16_to_cpu(shdr->Command)] == false)
goto calc_size_exit;
- smb2_get_data_area_len(&offset, &data_length, hdr);
+ smb2_get_data_area_len(&offset, &data_length, shdr);
cifs_dbg(FYI, "SMB2 data length %d offset %d\n", data_length, offset);
if (data_length > 0) {
@@ -388,15 +417,14 @@ smb2_calc_size(void *buf)
* Check to make sure that data area begins after fixed area,
* Note that last byte of the fixed area is part of data area
* for some commands, typically those with odd StructureSize,
- * so we must add one to the calculation (and 4 to account for
- * the size of the RFC1001 hdr.
+ * so we must add one to the calculation.
*/
- if (offset + 4 + 1 < len) {
+ if (offset + 1 < len) {
cifs_dbg(VFS, "data area offset %d overlaps SMB2 header %d\n",
- offset + 4 + 1, len);
+ offset + 1, len);
data_length = 0;
} else {
- len = 4 + offset + data_length;
+ len = offset + data_length;
}
}
calc_size_exit:
@@ -423,8 +451,14 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb)
/* Windows doesn't allow paths beginning with \ */
if (from[0] == '\\')
start_of_path = from + 1;
+#ifdef CONFIG_CIFS_SMB311
+ /* SMB311 POSIX extensions paths do not include leading slash */
+ else if (cifs_sb_master_tcon(cifs_sb)->posix_extensions)
+ start_of_path = from + 1;
+#endif /* 311 */
else
start_of_path = from;
+
to = cifs_strndup_to_utf16(start_of_path, PATH_MAX, &len,
cifs_sb->local_nls, map_type);
return to;
@@ -579,7 +613,7 @@ smb2_is_valid_lease_break(char *buffer)
bool
smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
{
- struct smb2_oplock_break_rsp *rsp = (struct smb2_oplock_break_rsp *)buffer;
+ struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer;
struct list_head *tmp, *tmp1, *tmp2;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
@@ -588,7 +622,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
cifs_dbg(FYI, "Checking for oplock break\n");
- if (rsp->hdr.sync_hdr.Command != SMB2_OPLOCK_BREAK)
+ if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK)
return false;
if (rsp->StructureSize !=
@@ -679,7 +713,7 @@ smb2_cancelled_close_fid(struct work_struct *work)
int
smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server)
{
- struct smb2_sync_hdr *sync_hdr = get_sync_hdr(buffer);
+ struct smb2_sync_hdr *sync_hdr = (struct smb2_sync_hdr *)buffer;
struct smb2_create_rsp *rsp = (struct smb2_create_rsp *)buffer;
struct cifs_tcon *tcon;
struct close_cancelled_open *cancelled;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 968b1d43a1ea..950d0ab2cc61 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -123,7 +123,7 @@ smb2_get_credits_field(struct TCP_Server_Info *server, const int optype)
static unsigned int
smb2_get_credits(struct mid_q_entry *mid)
{
- struct smb2_sync_hdr *shdr = get_sync_hdr(mid->resp_buf);
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)mid->resp_buf;
return le16_to_cpu(shdr->CreditRequest);
}
@@ -190,7 +190,7 @@ static struct mid_q_entry *
smb2_find_mid(struct TCP_Server_Info *server, char *buf)
{
struct mid_q_entry *mid;
- struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
__u64 wire_mid = le64_to_cpu(shdr->MessageId);
if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
@@ -212,15 +212,16 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf)
}
static void
-smb2_dump_detail(void *buf)
+smb2_dump_detail(void *buf, struct TCP_Server_Info *server)
{
#ifdef CONFIG_CIFS_DEBUG2
- struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n",
shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId,
shdr->ProcessId);
- cifs_dbg(VFS, "smb buf %p len %u\n", buf, smb2_calc_size(buf));
+ cifs_dbg(VFS, "smb buf %p len %u\n", buf,
+ server->ops->calc_smb_size(buf, server));
#endif
}
@@ -252,9 +253,14 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE;
wsize = min_t(unsigned int, wsize, server->max_write);
#ifdef CONFIG_CIFS_SMB_DIRECT
- if (server->rdma)
- wsize = min_t(unsigned int,
+ if (server->rdma) {
+ if (server->sign)
+ wsize = min_t(unsigned int,
+ wsize, server->smbd_conn->max_fragmented_send_size);
+ else
+ wsize = min_t(unsigned int,
wsize, server->smbd_conn->max_readwrite_size);
+ }
#endif
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
@@ -272,9 +278,14 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE;
rsize = min_t(unsigned int, rsize, server->max_read);
#ifdef CONFIG_CIFS_SMB_DIRECT
- if (server->rdma)
- rsize = min_t(unsigned int,
+ if (server->rdma) {
+ if (server->sign)
+ rsize = min_t(unsigned int,
+ rsize, server->smbd_conn->max_fragmented_recv_size);
+ else
+ rsize = min_t(unsigned int,
rsize, server->smbd_conn->max_readwrite_size);
+ }
#endif
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
@@ -312,6 +323,40 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
}
#endif /* STATS2 */
+/*
+ * Open the directory at the root of a share
+ */
+int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid)
+{
+ struct cifs_open_parms oparams;
+ int rc;
+ __le16 srch_path = 0; /* Null - since an open of top of share */
+ u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+
+ mutex_lock(&tcon->prfid_mutex);
+ if (tcon->valid_root_fid) {
+ cifs_dbg(FYI, "found a cached root file handle\n");
+ memcpy(pfid, tcon->prfid, sizeof(struct cifs_fid));
+ mutex_unlock(&tcon->prfid_mutex);
+ return 0;
+ }
+
+ oparams.tcon = tcon;
+ oparams.create_options = 0;
+ oparams.desired_access = FILE_READ_ATTRIBUTES;
+ oparams.disposition = FILE_OPEN;
+ oparams.fid = pfid;
+ oparams.reconnect = false;
+
+ rc = SMB2_open(xid, &oparams, &srch_path, &oplock, NULL, NULL);
+ if (rc == 0) {
+ memcpy(tcon->prfid, pfid, sizeof(struct cifs_fid));
+ tcon->valid_root_fid = true;
+ }
+ mutex_unlock(&tcon->prfid_mutex);
+ return rc;
+}
+
static void
smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
{
@@ -320,6 +365,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct cifs_open_parms oparms;
struct cifs_fid fid;
+ bool no_cached_open = tcon->nohandlecache;
oparms.tcon = tcon;
oparms.desired_access = FILE_READ_ATTRIBUTES;
@@ -328,7 +374,11 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
oparms.fid = &fid;
oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
+ if (no_cached_open)
+ rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
+ else
+ rc = open_shroot(xid, tcon, &fid);
+
if (rc)
return;
@@ -342,7 +392,8 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
FS_DEVICE_INFORMATION);
SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
FS_SECTOR_SIZE_INFORMATION); /* SMB3 specific */
- SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ if (no_cached_open)
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
return;
}
@@ -384,6 +435,9 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_open_parms oparms;
struct cifs_fid fid;
+ if ((*full_path == 0) && tcon->valid_root_fid)
+ return 0;
+
utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
if (!utf16_path)
return -ENOMEM;
@@ -579,9 +633,15 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ /*
+ * If ea_name is NULL (listxattr) and there are no EAs, return 0 as it's
+ * not an error. Otherwise, the specified ea_name was not found.
+ */
if (!rc)
rc = move_smb2_ea_to_cifs(ea_data, buf_size, smb2_data,
SMB2_MAX_EA_BUF, ea_name);
+ else if (!ea_name && rc == -ENODATA)
+ rc = 0;
kfree(smb2_data);
return rc;
@@ -688,9 +748,11 @@ smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon)
seq_puts(m, " TRIM-support,");
seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags);
+ seq_printf(m, "\n\ttid: 0x%x", tcon->tid);
if (tcon->perf_sector_size)
seq_printf(m, "\tOptimal sector size: 0x%x",
tcon->perf_sector_size);
+ seq_printf(m, "\tMaximal Access: 0x%x", tcon->maximal_access);
}
static void
@@ -1241,7 +1303,7 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon,
static bool
smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length)
{
- struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
if (shdr->Status != STATUS_PENDING)
return false;
@@ -1259,12 +1321,13 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length)
static bool
smb2_is_session_expired(char *buf)
{
- struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
- if (shdr->Status != STATUS_NETWORK_SESSION_EXPIRED)
+ if (shdr->Status != STATUS_NETWORK_SESSION_EXPIRED &&
+ shdr->Status != STATUS_USER_SESSION_DELETED)
return false;
- cifs_dbg(FYI, "Session expired\n");
+ cifs_dbg(FYI, "Session expired or deleted\n");
return true;
}
@@ -1451,14 +1514,13 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct cifs_open_parms oparms;
struct cifs_fid fid;
- struct smb2_err_rsp *err_buf = NULL;
+ struct kvec err_iov = {NULL, 0};
+ struct smb2_err_rsp *err_buf;
struct smb2_symlink_err_rsp *symlink;
unsigned int sub_len;
unsigned int sub_offset;
unsigned int print_len;
unsigned int print_offset;
- struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = ses->server;
cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
@@ -1473,15 +1535,16 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_buf);
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_iov);
- if (!rc || !err_buf) {
+ if (!rc || !err_iov.iov_base) {
kfree(utf16_path);
return -ENOENT;
}
+ err_buf = err_iov.iov_base;
if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) ||
- get_rfc1002_length(err_buf) + server->vals->header_preamble_size < SMB2_SYMLINK_STRUCT_SIZE) {
+ err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE) {
kfree(utf16_path);
return -ENOENT;
}
@@ -1494,14 +1557,13 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
print_len = le16_to_cpu(symlink->PrintNameLength);
print_offset = le16_to_cpu(symlink->PrintNameOffset);
- if (get_rfc1002_length(err_buf) + server->vals->header_preamble_size <
- SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) {
+ if (err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) {
kfree(utf16_path);
return -ENOENT;
}
- if (get_rfc1002_length(err_buf) + server->vals->header_preamble_size <
- SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) {
+ if (err_iov.iov_len <
+ SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) {
kfree(utf16_path);
return -ENOENT;
}
@@ -1575,8 +1637,11 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb,
oparms.create_options = 0;
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
- if (!utf16_path)
- return ERR_PTR(-ENOMEM);
+ if (!utf16_path) {
+ rc = -ENOMEM;
+ free_xid(xid);
+ return ERR_PTR(rc);
+ }
oparms.tcon = tcon;
oparms.desired_access = READ_CONTROL;
@@ -1634,8 +1699,11 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
access_flags = WRITE_DAC;
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
- if (!utf16_path)
- return -ENOMEM;
+ if (!utf16_path) {
+ rc = -ENOMEM;
+ free_xid(xid);
+ return rc;
+ }
oparms.tcon = tcon;
oparms.desired_access = access_flags;
@@ -1695,15 +1763,21 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
/* if file not oplocked can't be sure whether asking to extend size */
if (!CIFS_CACHE_READ(cifsi))
- if (keep_size == false)
- return -EOPNOTSUPP;
+ if (keep_size == false) {
+ rc = -EOPNOTSUPP;
+ free_xid(xid);
+ return rc;
+ }
/*
* Must check if file sparse since fallocate -z (zero range) assumes
* non-sparse allocation
*/
- if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE))
- return -EOPNOTSUPP;
+ if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) {
+ rc = -EOPNOTSUPP;
+ free_xid(xid);
+ return rc;
+ }
/*
* need to make sure we are not asked to extend the file since the SMB3
@@ -1712,8 +1786,11 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
* which for a non sparse file would zero the newly extended range
*/
if (keep_size == false)
- if (i_size_read(inode) < offset + len)
- return -EOPNOTSUPP;
+ if (i_size_read(inode) < offset + len) {
+ rc = -EOPNOTSUPP;
+ free_xid(xid);
+ return rc;
+ }
cifs_dbg(FYI, "offset %lld len %lld", offset, len);
@@ -1746,8 +1823,11 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
/* Need to make file sparse, if not already, before freeing range. */
/* Consider adding equivalent for compressed since it could also work */
- if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse))
- return -EOPNOTSUPP;
+ if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse)) {
+ rc = -EOPNOTSUPP;
+ free_xid(xid);
+ return rc;
+ }
cifs_dbg(FYI, "offset %lld len %lld", offset, len);
@@ -1778,8 +1858,10 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
/* if file not oplocked can't be sure whether asking to extend size */
if (!CIFS_CACHE_READ(cifsi))
- if (keep_size == false)
- return -EOPNOTSUPP;
+ if (keep_size == false) {
+ free_xid(xid);
+ return rc;
+ }
/*
* Files are non-sparse by default so falloc may be a no-op
@@ -1788,14 +1870,16 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
*/
if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0) {
if (keep_size == true)
- return 0;
+ rc = 0;
/* check if extending file */
else if (i_size_read(inode) >= off + len)
/* not extending file and already not sparse */
- return 0;
+ rc = 0;
/* BB: in future add else clause to extend file */
else
- return -EOPNOTSUPP;
+ rc = -EOPNOTSUPP;
+ free_xid(xid);
+ return rc;
}
if ((keep_size == true) || (i_size_read(inode) >= off + len)) {
@@ -1807,8 +1891,11 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
* ie potentially making a few extra pages at the beginning
* or end of the file non-sparse via set_sparse is harmless.
*/
- if ((off > 8192) || (off + len + 8192 < i_size_read(inode)))
- return -EOPNOTSUPP;
+ if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) {
+ rc = -EOPNOTSUPP;
+ free_xid(xid);
+ return rc;
+ }
rc = smb2_set_sparse(xid, tcon, cfile, inode, false);
}
@@ -2017,7 +2104,7 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock)
}
static __u8
-smb2_parse_lease_buf(void *buf, unsigned int *epoch)
+smb2_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key)
{
struct create_lease *lc = (struct create_lease *)buf;
@@ -2028,13 +2115,16 @@ smb2_parse_lease_buf(void *buf, unsigned int *epoch)
}
static __u8
-smb3_parse_lease_buf(void *buf, unsigned int *epoch)
+smb3_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key)
{
struct create_lease_v2 *lc = (struct create_lease_v2 *)buf;
*epoch = le16_to_cpu(lc->lcontext.Epoch);
if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
return SMB2_OPLOCK_LEVEL_NOCHANGE;
+ if (lease_key)
+ memcpy(lease_key, &lc->lcontext.LeaseKeyLow,
+ SMB2_LEASE_KEY_SIZE);
return le32_to_cpu(lc->lcontext.LeaseState);
}
@@ -2052,12 +2142,11 @@ smb2_dir_needs_close(struct cifsFileInfo *cfile)
}
static void
-fill_transform_hdr(struct TCP_Server_Info *server,
- struct smb2_transform_hdr *tr_hdr, struct smb_rqst *old_rq)
+fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len,
+ struct smb_rqst *old_rq)
{
struct smb2_sync_hdr *shdr =
(struct smb2_sync_hdr *)old_rq->rq_iov[1].iov_base;
- unsigned int orig_len = get_rfc1002_length(old_rq->rq_iov[0].iov_base);
memset(tr_hdr, 0, sizeof(struct smb2_transform_hdr));
tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM;
@@ -2065,8 +2154,6 @@ fill_transform_hdr(struct TCP_Server_Info *server,
tr_hdr->Flags = cpu_to_le16(0x01);
get_random_bytes(&tr_hdr->Nonce, SMB3_AES128CMM_NONCE);
memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8);
- inc_rfc1001_len(tr_hdr, sizeof(struct smb2_transform_hdr) - server->vals->header_preamble_size);
- inc_rfc1001_len(tr_hdr, orig_len);
}
/* We can not use the normal sg_set_buf() as we will sometimes pass a
@@ -2078,11 +2165,16 @@ static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf,
sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
}
+/* Assumes:
+ * rqst->rq_iov[0] is rfc1002 length
+ * rqst->rq_iov[1] is tranform header
+ * rqst->rq_iov[2+] data to be encrypted/decrypted
+ */
static struct scatterlist *
init_sg(struct smb_rqst *rqst, u8 *sign)
{
- unsigned int sg_len = rqst->rq_nvec + rqst->rq_npages + 1;
- unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 24;
+ unsigned int sg_len = rqst->rq_nvec + rqst->rq_npages;
+ unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20;
struct scatterlist *sg;
unsigned int i;
unsigned int j;
@@ -2092,10 +2184,10 @@ init_sg(struct smb_rqst *rqst, u8 *sign)
return NULL;
sg_init_table(sg, sg_len);
- smb2_sg_set_buf(&sg[0], rqst->rq_iov[0].iov_base + 24, assoc_data_len);
- for (i = 1; i < rqst->rq_nvec; i++)
- smb2_sg_set_buf(&sg[i], rqst->rq_iov[i].iov_base,
- rqst->rq_iov[i].iov_len);
+ smb2_sg_set_buf(&sg[0], rqst->rq_iov[1].iov_base + 20, assoc_data_len);
+ for (i = 1; i < rqst->rq_nvec - 1; i++)
+ smb2_sg_set_buf(&sg[i], rqst->rq_iov[i+1].iov_base,
+ rqst->rq_iov[i+1].iov_len);
for (j = 0; i < sg_len - 1; i++, j++) {
unsigned int len = (j < rqst->rq_npages - 1) ? rqst->rq_pagesz
: rqst->rq_tailsz;
@@ -2127,9 +2219,10 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key)
}
/*
* Encrypt or decrypt @rqst message. @rqst has the following format:
- * iov[0] - transform header (associate data),
- * iov[1-N] and pages - data to encrypt.
- * On success return encrypted data in iov[1-N] and pages, leave iov[0]
+ * iov[0] - rfc1002 length
+ * iov[1] - transform header (associate data),
+ * iov[2-N] and pages - data to encrypt.
+ * On success return encrypted data in iov[2-N] and pages, leave iov[0-1]
* untouched.
*/
static int
@@ -2137,7 +2230,7 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc)
{
struct smb2_transform_hdr *tr_hdr =
(struct smb2_transform_hdr *)rqst->rq_iov[0].iov_base;
- unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20 - server->vals->header_preamble_size;
+ unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20;
int rc = 0;
struct scatterlist *sg;
u8 sign[SMB2_SIGNATURE_SIZE] = {};
@@ -2224,6 +2317,10 @@ free_req:
return rc;
}
+/*
+ * This is called from smb_send_rqst. At this point we have the rfc1002
+ * header as the first element in the vector.
+ */
static int
smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq,
struct smb_rqst *old_rq)
@@ -2232,6 +2329,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq,
struct page **pages;
struct smb2_transform_hdr *tr_hdr;
unsigned int npages = old_rq->rq_npages;
+ unsigned int orig_len = get_rfc1002_length(old_rq->rq_iov[0].iov_base);
int i;
int rc = -ENOMEM;
@@ -2250,24 +2348,34 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq,
goto err_free_pages;
}
- iov = kmalloc_array(old_rq->rq_nvec, sizeof(struct kvec), GFP_KERNEL);
+ /* Make space for one extra iov to hold the transform header */
+ iov = kmalloc_array(old_rq->rq_nvec + 1, sizeof(struct kvec),
+ GFP_KERNEL);
if (!iov)
goto err_free_pages;
/* copy all iovs from the old except the 1st one (rfc1002 length) */
- memcpy(&iov[1], &old_rq->rq_iov[1],
+ memcpy(&iov[2], &old_rq->rq_iov[1],
sizeof(struct kvec) * (old_rq->rq_nvec - 1));
+ /* copy the rfc1002 iov */
+ iov[0].iov_base = old_rq->rq_iov[0].iov_base;
+ iov[0].iov_len = old_rq->rq_iov[0].iov_len;
+
new_rq->rq_iov = iov;
- new_rq->rq_nvec = old_rq->rq_nvec;
+ new_rq->rq_nvec = old_rq->rq_nvec + 1;
tr_hdr = kmalloc(sizeof(struct smb2_transform_hdr), GFP_KERNEL);
if (!tr_hdr)
goto err_free_iov;
- /* fill the 1st iov with a transform header */
- fill_transform_hdr(server, tr_hdr, old_rq);
- new_rq->rq_iov[0].iov_base = tr_hdr;
- new_rq->rq_iov[0].iov_len = sizeof(struct smb2_transform_hdr);
+ /* fill the 2nd iov with a transform header */
+ fill_transform_hdr(tr_hdr, orig_len, old_rq);
+ new_rq->rq_iov[1].iov_base = tr_hdr;
+ new_rq->rq_iov[1].iov_len = sizeof(struct smb2_transform_hdr);
+
+ /* Update rfc1002 header */
+ inc_rfc1001_len(new_rq->rq_iov[0].iov_base,
+ sizeof(struct smb2_transform_hdr));
/* copy pages form the old */
for (i = 0; i < npages; i++) {
@@ -2307,7 +2415,7 @@ smb3_free_transform_rq(struct smb_rqst *rqst)
put_page(rqst->rq_pages[i]);
kfree(rqst->rq_pages);
/* free transform header */
- kfree(rqst->rq_iov[0].iov_base);
+ kfree(rqst->rq_iov[1].iov_base);
kfree(rqst->rq_iov);
}
@@ -2324,18 +2432,19 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
unsigned int buf_data_size, struct page **pages,
unsigned int npages, unsigned int page_data_size)
{
- struct kvec iov[2];
+ struct kvec iov[3];
struct smb_rqst rqst = {NULL};
- struct smb2_hdr *hdr;
int rc;
- iov[0].iov_base = buf;
- iov[0].iov_len = sizeof(struct smb2_transform_hdr);
- iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr);
- iov[1].iov_len = buf_data_size;
+ iov[0].iov_base = NULL;
+ iov[0].iov_len = 0;
+ iov[1].iov_base = buf;
+ iov[1].iov_len = sizeof(struct smb2_transform_hdr);
+ iov[2].iov_base = buf + sizeof(struct smb2_transform_hdr);
+ iov[2].iov_len = buf_data_size;
rqst.rq_iov = iov;
- rqst.rq_nvec = 2;
+ rqst.rq_nvec = 3;
rqst.rq_pages = pages;
rqst.rq_npages = npages;
rqst.rq_pagesz = PAGE_SIZE;
@@ -2347,10 +2456,9 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
if (rc)
return rc;
- memmove(buf + server->vals->header_preamble_size, iov[1].iov_base, buf_data_size);
- hdr = (struct smb2_hdr *)buf;
- hdr->smb2_buf_length = cpu_to_be32(buf_data_size + page_data_size);
- server->total_read = buf_data_size + page_data_size + server->vals->header_preamble_size;
+ memmove(buf, iov[2].iov_base, buf_data_size);
+
+ server->total_read = buf_data_size + page_data_size;
return rc;
}
@@ -2375,7 +2483,7 @@ read_data_into_pages(struct TCP_Server_Info *server, struct page **pages,
zero_user(page, len, PAGE_SIZE - len);
len = 0;
}
- length = cifs_read_page_from_socket(server, page, n);
+ length = cifs_read_page_from_socket(server, page, 0, n);
if (length < 0)
return length;
server->total_read += length;
@@ -2423,7 +2531,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
unsigned int cur_page_idx;
unsigned int pad_len;
struct cifs_readdata *rdata = mid->callback_data;
- struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
struct bio_vec *bvec = NULL;
struct iov_iter iter;
struct kvec iov;
@@ -2454,7 +2562,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
return 0;
}
- data_offset = server->ops->read_data_offset(buf) + server->vals->header_preamble_size;
+ data_offset = server->ops->read_data_offset(buf);
#ifdef CONFIG_CIFS_SMB_DIRECT
use_rdma_mr = rdata->mr;
#endif
@@ -2550,12 +2658,11 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid)
unsigned int npages;
struct page **pages;
unsigned int len;
- unsigned int buflen = get_rfc1002_length(buf) + server->vals->header_preamble_size;
+ unsigned int buflen = server->pdu_size;
int rc;
int i = 0;
- len = min_t(unsigned int, buflen, server->vals->read_rsp_size -
- server->vals->header_preamble_size +
+ len = min_t(unsigned int, buflen, server->vals->read_rsp_size +
sizeof(struct smb2_transform_hdr)) - HEADER_SIZE(server) + 1;
rc = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, len);
@@ -2563,8 +2670,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid)
return rc;
server->total_read += rc;
- len = le32_to_cpu(tr_hdr->OriginalMessageSize) +
- server->vals->header_preamble_size -
+ len = le32_to_cpu(tr_hdr->OriginalMessageSize) -
server->vals->read_rsp_size;
npages = DIV_ROUND_UP(len, PAGE_SIZE);
@@ -2591,8 +2697,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid)
if (rc)
goto free_pages;
- rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size -
- server->vals->header_preamble_size,
+ rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size,
pages, npages, len);
if (rc)
goto free_pages;
@@ -2624,12 +2729,12 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
{
int length;
char *buf = server->smallbuf;
- unsigned int pdu_length = get_rfc1002_length(buf);
+ unsigned int pdu_length = server->pdu_size;
unsigned int buf_size;
struct mid_q_entry *mid_entry;
/* switch to large buffer if too big for a small one */
- if (pdu_length + server->vals->header_preamble_size > MAX_CIFS_SMALL_BUFFER_SIZE) {
+ if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE) {
server->large_buf = true;
memcpy(server->bigbuf, buf, server->total_read);
buf = server->bigbuf;
@@ -2637,13 +2742,12 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
/* now read the rest */
length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1,
- pdu_length - HEADER_SIZE(server) + 1 +
- server->vals->header_preamble_size);
+ pdu_length - HEADER_SIZE(server) + 1);
if (length < 0)
return length;
server->total_read += length;
- buf_size = pdu_length + server->vals->header_preamble_size - sizeof(struct smb2_transform_hdr);
+ buf_size = pdu_length - sizeof(struct smb2_transform_hdr);
length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0);
if (length)
return length;
@@ -2668,11 +2772,11 @@ static int
smb3_receive_transform(struct TCP_Server_Info *server, struct mid_q_entry **mid)
{
char *buf = server->smallbuf;
- unsigned int pdu_length = get_rfc1002_length(buf);
+ unsigned int pdu_length = server->pdu_size;
struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf;
unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
- if (pdu_length + server->vals->header_preamble_size < sizeof(struct smb2_transform_hdr) +
+ if (pdu_length < sizeof(struct smb2_transform_hdr) +
sizeof(struct smb2_sync_hdr)) {
cifs_dbg(VFS, "Transform message is too small (%u)\n",
pdu_length);
@@ -2681,14 +2785,14 @@ smb3_receive_transform(struct TCP_Server_Info *server, struct mid_q_entry **mid)
return -ECONNABORTED;
}
- if (pdu_length + server->vals->header_preamble_size < orig_len + sizeof(struct smb2_transform_hdr)) {
+ if (pdu_length < orig_len + sizeof(struct smb2_transform_hdr)) {
cifs_dbg(VFS, "Transform message is broken\n");
cifs_reconnect(server);
wake_up(&server->response_q);
return -ECONNABORTED;
}
- if (pdu_length + server->vals->header_preamble_size > CIFSMaxBufSize + MAX_HEADER_SIZE(server))
+ if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server))
return receive_encrypted_read(server, mid);
return receive_encrypted_standard(server, mid);
@@ -2699,11 +2803,23 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
char *buf = server->large_buf ? server->bigbuf : server->smallbuf;
- return handle_read_data(server, mid, buf, get_rfc1002_length(buf) +
- server->vals->header_preamble_size,
+ return handle_read_data(server, mid, buf, server->pdu_size,
NULL, 0, 0);
}
+static int
+smb2_next_header(char *buf)
+{
+ struct smb2_sync_hdr *hdr = (struct smb2_sync_hdr *)buf;
+ struct smb2_transform_hdr *t_hdr = (struct smb2_transform_hdr *)buf;
+
+ if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM)
+ return sizeof(struct smb2_transform_hdr) +
+ le32_to_cpu(t_hdr->OriginalMessageSize);
+
+ return le32_to_cpu(hdr->NextCommand);
+}
+
struct smb_version_operations smb20_operations = {
.compare_fids = smb2_compare_fids,
.setup_request = smb2_setup_request,
@@ -2795,6 +2911,7 @@ struct smb_version_operations smb20_operations = {
.get_acl_by_fid = get_smb2_acl_by_fid,
.set_acl = set_smb2_acl,
#endif /* CIFS_ACL */
+ .next_header = smb2_next_header,
};
struct smb_version_operations smb21_operations = {
@@ -2889,6 +3006,7 @@ struct smb_version_operations smb21_operations = {
.get_acl_by_fid = get_smb2_acl_by_fid,
.set_acl = set_smb2_acl,
#endif /* CIFS_ACL */
+ .next_header = smb2_next_header,
};
struct smb_version_operations smb30_operations = {
@@ -2993,6 +3111,7 @@ struct smb_version_operations smb30_operations = {
.get_acl_by_fid = get_smb2_acl_by_fid,
.set_acl = set_smb2_acl,
#endif /* CIFS_ACL */
+ .next_header = smb2_next_header,
};
#ifdef CONFIG_CIFS_SMB311
@@ -3093,6 +3212,7 @@ struct smb_version_operations smb311_operations = {
.query_all_EAs = smb2_query_eas,
.set_EA = smb2_set_ea,
#endif /* CIFS_XATTR */
+ .next_header = smb2_next_header,
};
#endif /* CIFS_SMB311 */
@@ -3104,8 +3224,8 @@ struct smb_version_values smb20_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_hdr),
- .header_preamble_size = 4,
+ .header_size = sizeof(struct smb2_sync_hdr),
+ .header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
@@ -3125,8 +3245,8 @@ struct smb_version_values smb21_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_hdr),
- .header_preamble_size = 4,
+ .header_size = sizeof(struct smb2_sync_hdr),
+ .header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
@@ -3146,8 +3266,8 @@ struct smb_version_values smb3any_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_hdr),
- .header_preamble_size = 4,
+ .header_size = sizeof(struct smb2_sync_hdr),
+ .header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
@@ -3167,8 +3287,8 @@ struct smb_version_values smbdefault_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_hdr),
- .header_preamble_size = 4,
+ .header_size = sizeof(struct smb2_sync_hdr),
+ .header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
@@ -3188,8 +3308,8 @@ struct smb_version_values smb30_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_hdr),
- .header_preamble_size = 4,
+ .header_size = sizeof(struct smb2_sync_hdr),
+ .header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
@@ -3209,8 +3329,8 @@ struct smb_version_values smb302_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_hdr),
- .header_preamble_size = 4,
+ .header_size = sizeof(struct smb2_sync_hdr),
+ .header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
@@ -3231,8 +3351,8 @@ struct smb_version_values smb311_values = {
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
- .header_size = sizeof(struct smb2_hdr),
- .header_preamble_size = 4,
+ .header_size = sizeof(struct smb2_sync_hdr),
+ .header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
.lock_cmd = SMB2_LOCK,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index f7741cee2a4c..281fbc1dc720 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -49,6 +49,7 @@
#include "cifspdu.h"
#include "cifs_spnego.h"
#include "smbdirect.h"
+#include "trace.h"
/*
* The following table defines the expected "StructureSize" of SMB2 requests
@@ -79,7 +80,7 @@ static const int smb2_req_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
/* SMB2_OPLOCK_BREAK */ 24 /* BB this is 36 for LEASE_BREAK variant */
};
-static int encryption_required(const struct cifs_tcon *tcon)
+static int smb3_encryption_required(const struct cifs_tcon *tcon)
{
if (!tcon)
return 0;
@@ -145,7 +146,7 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
shdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */
if (tcon->ses && tcon->ses->server && tcon->ses->server->sign &&
- !encryption_required(tcon))
+ !smb3_encryption_required(tcon))
shdr->Flags |= SMB2_FLAGS_SIGNED;
out:
return;
@@ -268,8 +269,11 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
mutex_unlock(&tcon->ses->session_mutex);
cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
- if (rc)
+ if (rc) {
+ /* If sess reconnected but tcon didn't, something strange ... */
+ printk_once(KERN_WARNING "reconnect tcon failed rc = %d\n", rc);
goto out;
+ }
if (smb2_command != SMB2_INTERNAL_CMD)
queue_delayed_work(cifsiod_wq, &server->reconnect, 0);
@@ -364,6 +368,7 @@ smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1)
#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2)
+#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100)
static void
build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt)
@@ -380,10 +385,17 @@ static void
build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt)
{
pneg_ctxt->ContextType = SMB2_ENCRYPTION_CAPABILITIES;
- pneg_ctxt->DataLength = cpu_to_le16(6);
- pneg_ctxt->CipherCount = cpu_to_le16(2);
- pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM;
- pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM;
+ pneg_ctxt->DataLength = cpu_to_le16(4); /* Cipher Count + le16 cipher */
+ pneg_ctxt->CipherCount = cpu_to_le16(1);
+/* pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM;*/ /* not supported yet */
+ pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_CCM;
+}
+
+static void
+build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt)
+{
+ pneg_ctxt->ContextType = SMB2_POSIX_EXTENSIONS_AVAILABLE;
+ pneg_ctxt->DataLength = cpu_to_le16(POSIX_CTXT_DATA_LEN);
}
static void
@@ -391,18 +403,179 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
unsigned int *total_len)
{
char *pneg_ctxt = (char *)req + OFFSET_OF_NEG_CONTEXT;
+ unsigned int ctxt_len;
+ *total_len += 2; /* Add 2 due to round to 8 byte boundary for 1st ctxt */
build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt);
- /* Add 2 to size to round to 8 byte boundary */
+ ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_preauth_neg_context), 8) * 8;
+ *total_len += ctxt_len;
+ pneg_ctxt += ctxt_len;
- pneg_ctxt += 2 + sizeof(struct smb2_preauth_neg_context);
build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt);
+ ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_encryption_neg_context), 8) * 8;
+ *total_len += ctxt_len;
+ pneg_ctxt += ctxt_len;
+
+ build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt);
+ *total_len += sizeof(struct smb2_posix_neg_context);
+
req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT);
- req->NegotiateContextCount = cpu_to_le16(2);
+ req->NegotiateContextCount = cpu_to_le16(3);
+}
+
+static void decode_preauth_context(struct smb2_preauth_neg_context *ctxt)
+{
+ unsigned int len = le16_to_cpu(ctxt->DataLength);
+
+ /* If invalid preauth context warn but use what we requested, SHA-512 */
+ if (len < MIN_PREAUTH_CTXT_DATA_LEN) {
+ printk_once(KERN_WARNING "server sent bad preauth context\n");
+ return;
+ }
+ if (le16_to_cpu(ctxt->HashAlgorithmCount) != 1)
+ printk_once(KERN_WARNING "illegal SMB3 hash algorithm count\n");
+ if (ctxt->HashAlgorithms != SMB2_PREAUTH_INTEGRITY_SHA512)
+ printk_once(KERN_WARNING "unknown SMB3 hash algorithm\n");
+}
+
+static int decode_encrypt_ctx(struct TCP_Server_Info *server,
+ struct smb2_encryption_neg_context *ctxt)
+{
+ unsigned int len = le16_to_cpu(ctxt->DataLength);
+
+ cifs_dbg(FYI, "decode SMB3.11 encryption neg context of len %d\n", len);
+ if (len < MIN_ENCRYPT_CTXT_DATA_LEN) {
+ printk_once(KERN_WARNING "server sent bad crypto ctxt len\n");
+ return -EINVAL;
+ }
+
+ if (le16_to_cpu(ctxt->CipherCount) != 1) {
+ printk_once(KERN_WARNING "illegal SMB3.11 cipher count\n");
+ return -EINVAL;
+ }
+ cifs_dbg(FYI, "SMB311 cipher type:%d\n", le16_to_cpu(ctxt->Ciphers[0]));
+ if ((ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_CCM) &&
+ (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_GCM)) {
+ printk_once(KERN_WARNING "invalid SMB3.11 cipher returned\n");
+ return -EINVAL;
+ }
+ server->cipher_type = ctxt->Ciphers[0];
+ server->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION;
+ return 0;
+}
+
+static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,
+ struct TCP_Server_Info *server,
+ unsigned int len_of_smb)
+{
+ struct smb2_neg_context *pctx;
+ unsigned int offset = le32_to_cpu(rsp->NegotiateContextOffset);
+ unsigned int ctxt_cnt = le16_to_cpu(rsp->NegotiateContextCount);
+ unsigned int len_of_ctxts, i;
+ int rc = 0;
+
+ cifs_dbg(FYI, "decoding %d negotiate contexts\n", ctxt_cnt);
+ if (len_of_smb <= offset) {
+ cifs_dbg(VFS, "Invalid response: negotiate context offset\n");
+ return -EINVAL;
+ }
+
+ len_of_ctxts = len_of_smb - offset;
+
+ for (i = 0; i < ctxt_cnt; i++) {
+ int clen;
+ /* check that offset is not beyond end of SMB */
+ if (len_of_ctxts == 0)
+ break;
+
+ if (len_of_ctxts < sizeof(struct smb2_neg_context))
+ break;
+
+ pctx = (struct smb2_neg_context *)(offset + (char *)rsp);
+ clen = le16_to_cpu(pctx->DataLength);
+ if (clen > len_of_ctxts)
+ break;
+
+ if (pctx->ContextType == SMB2_PREAUTH_INTEGRITY_CAPABILITIES)
+ decode_preauth_context(
+ (struct smb2_preauth_neg_context *)pctx);
+ else if (pctx->ContextType == SMB2_ENCRYPTION_CAPABILITIES)
+ rc = decode_encrypt_ctx(server,
+ (struct smb2_encryption_neg_context *)pctx);
+ else if (pctx->ContextType == SMB2_POSIX_EXTENSIONS_AVAILABLE)
+ server->posix_ext_supported = true;
+ else
+ cifs_dbg(VFS, "unknown negcontext of type %d ignored\n",
+ le16_to_cpu(pctx->ContextType));
+
+ if (rc)
+ break;
+ /* offsets must be 8 byte aligned */
+ clen = (clen + 7) & ~0x7;
+ offset += clen + sizeof(struct smb2_neg_context);
+ len_of_ctxts -= clen;
+ }
+ return rc;
+}
+
+static struct create_posix *
+create_posix_buf(umode_t mode)
+{
+ struct create_posix *buf;
+
+ buf = kzalloc(sizeof(struct create_posix),
+ GFP_KERNEL);
+ if (!buf)
+ return NULL;
- *total_len += 4 + sizeof(struct smb2_preauth_neg_context)
- + sizeof(struct smb2_encryption_neg_context);
+ buf->ccontext.DataOffset =
+ cpu_to_le16(offsetof(struct create_posix, Mode));
+ buf->ccontext.DataLength = cpu_to_le32(4);
+ buf->ccontext.NameOffset =
+ cpu_to_le16(offsetof(struct create_posix, Name));
+ buf->ccontext.NameLength = cpu_to_le16(16);
+
+ /* SMB2_CREATE_TAG_POSIX is "0x93AD25509CB411E7B42383DE968BCD7C" */
+ buf->Name[0] = 0x93;
+ buf->Name[1] = 0xAD;
+ buf->Name[2] = 0x25;
+ buf->Name[3] = 0x50;
+ buf->Name[4] = 0x9C;
+ buf->Name[5] = 0xB4;
+ buf->Name[6] = 0x11;
+ buf->Name[7] = 0xE7;
+ buf->Name[8] = 0xB4;
+ buf->Name[9] = 0x23;
+ buf->Name[10] = 0x83;
+ buf->Name[11] = 0xDE;
+ buf->Name[12] = 0x96;
+ buf->Name[13] = 0x8B;
+ buf->Name[14] = 0xCD;
+ buf->Name[15] = 0x7C;
+ buf->Mode = cpu_to_le32(mode);
+ cifs_dbg(FYI, "mode on posix create 0%o", mode);
+ return buf;
+}
+
+static int
+add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode)
+{
+ struct smb2_create_req *req = iov[0].iov_base;
+ unsigned int num = *num_iovec;
+
+ iov[num].iov_base = create_posix_buf(mode);
+ if (iov[num].iov_base == NULL)
+ return -ENOMEM;
+ iov[num].iov_len = sizeof(struct create_posix);
+ if (!req->CreateContextsOffset)
+ req->CreateContextsOffset = cpu_to_le32(
+ sizeof(struct smb2_create_req) +
+ iov[num - 1].iov_len);
+ le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_posix));
+ *num_iovec = num + 1;
+ return 0;
}
+
#else
static void assemble_neg_contexts(struct smb2_negotiate_req *req,
unsigned int *total_len)
@@ -593,7 +766,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
server->capabilities |= SMB2_NT_FIND | SMB2_LARGE_FILES;
security_blob = smb2_get_data_area_len(&blob_offset, &blob_length,
- &rsp->hdr);
+ (struct smb2_sync_hdr *)rsp);
/*
* See MS-SMB2 section 2.2.4: if no blob, client picks default which
* for us will be
@@ -616,6 +789,16 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
else if (rc == 0)
rc = -EIO;
}
+
+#ifdef CONFIG_CIFS_SMB311
+ if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) {
+ if (rsp->NegotiateContextCount)
+ rc = smb311_decode_neg_context(rsp, server,
+ rsp_iov.iov_len);
+ else
+ cifs_dbg(VFS, "Missing expected negotiate contexts\n");
+ }
+#endif /* CONFIG_CIFS_SMB311 */
neg_exit:
free_rsp_buf(resp_buftype, rsp);
return rc;
@@ -623,19 +806,14 @@ neg_exit:
int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
{
- int rc = 0;
- struct validate_negotiate_info_req vneg_inbuf;
+ int rc;
+ struct validate_negotiate_info_req *pneg_inbuf;
struct validate_negotiate_info_rsp *pneg_rsp = NULL;
u32 rsplen;
u32 inbuflen; /* max of 4 dialects */
cifs_dbg(FYI, "validate negotiate\n");
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (tcon->ses->server->rdma)
- return 0;
-#endif
-
/* In SMB3.11 preauth integrity supersedes validate negotiate */
if (tcon->ses->server->dialect == SMB311_PROT_ID)
return 0;
@@ -658,63 +836,69 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
if (tcon->ses->session_flags & SMB2_SESSION_FLAG_IS_NULL)
cifs_dbg(VFS, "Unexpected null user (anonymous) auth flag sent by server\n");
- vneg_inbuf.Capabilities =
+ pneg_inbuf = kmalloc(sizeof(*pneg_inbuf), GFP_NOFS);
+ if (!pneg_inbuf)
+ return -ENOMEM;
+
+ pneg_inbuf->Capabilities =
cpu_to_le32(tcon->ses->server->vals->req_capabilities);
- memcpy(vneg_inbuf.Guid, tcon->ses->server->client_guid,
+ memcpy(pneg_inbuf->Guid, tcon->ses->server->client_guid,
SMB2_CLIENT_GUID_SIZE);
if (tcon->ses->sign)
- vneg_inbuf.SecurityMode =
+ pneg_inbuf->SecurityMode =
cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED);
else if (global_secflags & CIFSSEC_MAY_SIGN)
- vneg_inbuf.SecurityMode =
+ pneg_inbuf->SecurityMode =
cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED);
else
- vneg_inbuf.SecurityMode = 0;
+ pneg_inbuf->SecurityMode = 0;
if (strcmp(tcon->ses->server->vals->version_string,
SMB3ANY_VERSION_STRING) == 0) {
- vneg_inbuf.Dialects[0] = cpu_to_le16(SMB30_PROT_ID);
- vneg_inbuf.Dialects[1] = cpu_to_le16(SMB302_PROT_ID);
- vneg_inbuf.DialectCount = cpu_to_le16(2);
+ pneg_inbuf->Dialects[0] = cpu_to_le16(SMB30_PROT_ID);
+ pneg_inbuf->Dialects[1] = cpu_to_le16(SMB302_PROT_ID);
+ pneg_inbuf->DialectCount = cpu_to_le16(2);
/* structure is big enough for 3 dialects, sending only 2 */
- inbuflen = sizeof(struct validate_negotiate_info_req) - 2;
+ inbuflen = sizeof(*pneg_inbuf) -
+ sizeof(pneg_inbuf->Dialects[0]);
} else if (strcmp(tcon->ses->server->vals->version_string,
SMBDEFAULT_VERSION_STRING) == 0) {
- vneg_inbuf.Dialects[0] = cpu_to_le16(SMB21_PROT_ID);
- vneg_inbuf.Dialects[1] = cpu_to_le16(SMB30_PROT_ID);
- vneg_inbuf.Dialects[2] = cpu_to_le16(SMB302_PROT_ID);
- vneg_inbuf.DialectCount = cpu_to_le16(3);
+ pneg_inbuf->Dialects[0] = cpu_to_le16(SMB21_PROT_ID);
+ pneg_inbuf->Dialects[1] = cpu_to_le16(SMB30_PROT_ID);
+ pneg_inbuf->Dialects[2] = cpu_to_le16(SMB302_PROT_ID);
+ pneg_inbuf->DialectCount = cpu_to_le16(3);
/* structure is big enough for 3 dialects */
- inbuflen = sizeof(struct validate_negotiate_info_req);
+ inbuflen = sizeof(*pneg_inbuf);
} else {
/* otherwise specific dialect was requested */
- vneg_inbuf.Dialects[0] =
+ pneg_inbuf->Dialects[0] =
cpu_to_le16(tcon->ses->server->vals->protocol_id);
- vneg_inbuf.DialectCount = cpu_to_le16(1);
+ pneg_inbuf->DialectCount = cpu_to_le16(1);
/* structure is big enough for 3 dialects, sending only 1 */
- inbuflen = sizeof(struct validate_negotiate_info_req) - 4;
+ inbuflen = sizeof(*pneg_inbuf) -
+ sizeof(pneg_inbuf->Dialects[0]) * 2;
}
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */,
- (char *)&vneg_inbuf, sizeof(struct validate_negotiate_info_req),
- (char **)&pneg_rsp, &rsplen);
+ (char *)pneg_inbuf, inbuflen, (char **)&pneg_rsp, &rsplen);
if (rc != 0) {
cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc);
- return -EIO;
+ rc = -EIO;
+ goto out_free_inbuf;
}
- if (rsplen != sizeof(struct validate_negotiate_info_rsp)) {
+ rc = -EIO;
+ if (rsplen != sizeof(*pneg_rsp)) {
cifs_dbg(VFS, "invalid protocol negotiate response size: %d\n",
rsplen);
/* relax check since Mac returns max bufsize allowed on ioctl */
- if ((rsplen > CIFSMaxBufSize)
- || (rsplen < sizeof(struct validate_negotiate_info_rsp)))
- goto err_rsp_free;
+ if (rsplen > CIFSMaxBufSize || rsplen < sizeof(*pneg_rsp))
+ goto out_free_rsp;
}
/* check validate negotiate info response matches what we got earlier */
@@ -731,15 +915,17 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
goto vneg_out;
/* validate negotiate successful */
+ rc = 0;
cifs_dbg(FYI, "validate negotiate info successful\n");
- kfree(pneg_rsp);
- return 0;
+ goto out_free_rsp;
vneg_out:
cifs_dbg(VFS, "protocol revalidation - security settings mismatch\n");
-err_rsp_free:
+out_free_rsp:
kfree(pneg_rsp);
- return -EIO;
+out_free_inbuf:
+ kfree(pneg_inbuf);
+ return rc;
}
enum securityEnum
@@ -944,7 +1130,7 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
goto out_put_spnego_key;
rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
- ses->Suid = rsp->hdr.sync_hdr.SessionId;
+ ses->Suid = rsp->sync_hdr.SessionId;
ses->session_flags = le16_to_cpu(rsp->SessionFlags);
@@ -1020,13 +1206,13 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
/* If true, rc here is expected and not an error */
if (sess_data->buf0_type != CIFS_NO_BUFFER &&
- rsp->hdr.sync_hdr.Status == STATUS_MORE_PROCESSING_REQUIRED)
+ rsp->sync_hdr.Status == STATUS_MORE_PROCESSING_REQUIRED)
rc = 0;
if (rc)
goto out;
- if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 !=
+ if (offsetof(struct smb2_sess_setup_rsp, Buffer) !=
le16_to_cpu(rsp->SecurityBufferOffset)) {
cifs_dbg(VFS, "Invalid security buffer offset %d\n",
le16_to_cpu(rsp->SecurityBufferOffset));
@@ -1041,7 +1227,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n");
- ses->Suid = rsp->hdr.sync_hdr.SessionId;
+ ses->Suid = rsp->sync_hdr.SessionId;
ses->session_flags = le16_to_cpu(rsp->SessionFlags);
out:
@@ -1099,7 +1285,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
- ses->Suid = rsp->hdr.sync_hdr.SessionId;
+ ses->Suid = rsp->sync_hdr.SessionId;
ses->session_flags = le16_to_cpu(rsp->SessionFlags);
rc = SMB2_sess_establish_session(sess_data);
@@ -1166,6 +1352,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
sess_data->ses = ses;
sess_data->buf0_type = CIFS_NO_BUFFER;
sess_data->nls_cp = (struct nls_table *) nls_cp;
+ sess_data->previous_session = ses->Suid;
#ifdef CONFIG_CIFS_SMB311
/*
@@ -1293,7 +1480,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
return rc;
}
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
iov[0].iov_base = (char *)req;
@@ -1309,7 +1496,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
/* 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 */
if ((ses->server->dialect == SMB311_PROT_ID) &&
- !encryption_required(tcon))
+ !smb3_encryption_required(tcon))
req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
rc = smb2_send_recv(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov);
@@ -1347,7 +1534,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
tcon->tidStatus = CifsGood;
tcon->need_reconnect = false;
- tcon->tid = rsp->hdr.sync_hdr.TreeId;
+ tcon->tid = rsp->sync_hdr.TreeId;
strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
@@ -1367,7 +1554,7 @@ tcon_exit:
return rc;
tcon_error_exit:
- if (rsp && rsp->hdr.sync_hdr.Status == STATUS_BAD_NETWORK_NAME) {
+ if (rsp && rsp->sync_hdr.Status == STATUS_BAD_NETWORK_NAME) {
cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
}
goto tcon_exit;
@@ -1398,7 +1585,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
flags |= CIFS_NO_RESP;
@@ -1465,7 +1652,7 @@ create_reconnect_durable_buf(struct cifs_fid *fid)
static __u8
parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
- unsigned int *epoch)
+ unsigned int *epoch, char *lease_key)
{
char *data_offset;
struct create_context *cc;
@@ -1473,14 +1660,15 @@ parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
unsigned int remaining;
char *name;
- data_offset = (char *)rsp + server->vals->header_preamble_size + le32_to_cpu(rsp->CreateContextsOffset);
+ data_offset = (char *)rsp + le32_to_cpu(rsp->CreateContextsOffset);
remaining = le32_to_cpu(rsp->CreateContextsLength);
cc = (struct create_context *)data_offset;
while (remaining >= sizeof(struct create_context)) {
name = le16_to_cpu(cc->NameOffset) + (char *)cc;
if (le16_to_cpu(cc->NameLength) == 4 &&
strncmp(name, "RqLs", 4) == 0)
- return server->ops->parse_lease_buf(cc, epoch);
+ return server->ops->parse_lease_buf(cc, epoch,
+ lease_key);
next = le32_to_cpu(cc->Next);
if (!next)
@@ -1701,14 +1889,14 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len,
int
SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
__u8 *oplock, struct smb2_file_all_info *buf,
- struct smb2_err_rsp **err_buf)
+ struct kvec *err_iov)
{
struct smb2_create_req *req;
struct smb2_create_rsp *rsp;
struct TCP_Server_Info *server;
struct cifs_tcon *tcon = oparms->tcon;
struct cifs_ses *ses = tcon->ses;
- struct kvec iov[4];
+ struct kvec iov[5]; /* make sure at least one for each open context */
struct kvec rsp_iov = {NULL, 0};
int resp_buftype;
int uni_path_len;
@@ -1717,7 +1905,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
int rc = 0;
unsigned int n_iov = 2;
__u32 file_attributes = 0;
- char *dhc_buf = NULL, *lc_buf = NULL;
+ char *dhc_buf = NULL, *lc_buf = NULL, *pc_buf = NULL;
int flags = 0;
unsigned int total_len;
@@ -1733,7 +1921,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
if (oparms->create_options & CREATE_OPTION_READONLY)
@@ -1834,6 +2022,27 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
dhc_buf = iov[n_iov-1].iov_base;
}
+#ifdef CONFIG_CIFS_SMB311
+ if (tcon->posix_extensions) {
+ if (n_iov > 2) {
+ struct create_context *ccontext =
+ (struct create_context *)iov[n_iov-1].iov_base;
+ ccontext->Next =
+ cpu_to_le32(iov[n_iov-1].iov_len);
+ }
+
+ rc = add_posix_context(iov, &n_iov, oparms->mode);
+ if (rc) {
+ cifs_small_buf_release(req);
+ kfree(copy_path);
+ kfree(lc_buf);
+ kfree(dhc_buf);
+ return rc;
+ }
+ pc_buf = iov[n_iov-1].iov_base;
+ }
+#endif /* SMB311 */
+
rc = smb2_send_recv(xid, ses, iov, n_iov, &resp_buftype, flags,
&rsp_iov);
cifs_small_buf_release(req);
@@ -1841,11 +2050,18 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
if (rc != 0) {
cifs_stats_fail_inc(tcon, SMB2_CREATE_HE);
- if (err_buf && rsp)
- *err_buf = kmemdup(rsp, get_rfc1002_length(rsp) + 4,
- GFP_KERNEL);
+ if (err_iov && rsp) {
+ *err_iov = rsp_iov;
+ resp_buftype = CIFS_NO_BUFFER;
+ rsp = NULL;
+ }
+ trace_smb3_open_err(xid, tcon->tid, ses->Suid,
+ oparms->create_options, oparms->desired_access, rc);
goto creat_exit;
- }
+ } else
+ trace_smb3_open_done(xid, rsp->PersistentFileId, tcon->tid,
+ ses->Suid, oparms->create_options,
+ oparms->desired_access);
oparms->fid->persistent_fid = rsp->PersistentFileId;
oparms->fid->volatile_fid = rsp->VolatileFileId;
@@ -1860,13 +2076,15 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
}
if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE)
- *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch);
+ *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch,
+ oparms->fid->lease_key);
else
*oplock = rsp->OplockLevel;
creat_exit:
kfree(copy_path);
kfree(lc_buf);
kfree(dhc_buf);
+ kfree(pc_buf);
free_rsp_buf(resp_buftype, rsp);
return rc;
}
@@ -1882,7 +2100,6 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
{
struct smb2_ioctl_req *req;
struct smb2_ioctl_rsp *rsp;
- struct smb2_sync_hdr *shdr;
struct cifs_ses *ses;
struct kvec iov[2];
struct kvec rsp_iov;
@@ -1913,7 +2130,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
req->CtlCode = cpu_to_le32(opcode);
@@ -1976,6 +2193,10 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
cifs_small_buf_release(req);
rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base;
+ if (rc != 0)
+ trace_smb3_fsctl_err(xid, persistent_fid, tcon->tid,
+ ses->Suid, 0, opcode, rc);
+
if ((rc != 0) && (rc != -EINVAL)) {
cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
goto ioctl_exit;
@@ -2003,7 +2224,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
goto ioctl_exit;
}
- if (get_rfc1002_length(rsp) < le32_to_cpu(rsp->OutputOffset) + *plen) {
+ if (rsp_iov.iov_len < le32_to_cpu(rsp->OutputOffset) + *plen) {
cifs_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen,
le32_to_cpu(rsp->OutputOffset));
*plen = 0;
@@ -2017,8 +2238,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
goto ioctl_exit;
}
- shdr = get_sync_hdr(rsp);
- memcpy(*out_data, (char *)shdr + le32_to_cpu(rsp->OutputOffset), *plen);
+ memcpy(*out_data, (char *)rsp + le32_to_cpu(rsp->OutputOffset), *plen);
ioctl_exit:
free_rsp_buf(resp_buftype, rsp);
return rc;
@@ -2050,8 +2270,8 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
}
int
-SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid)
+SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, int flags)
{
struct smb2_close_req *req;
struct smb2_close_rsp *rsp;
@@ -2060,7 +2280,6 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
struct kvec rsp_iov;
int resp_buftype;
int rc = 0;
- int flags = 0;
unsigned int total_len;
cifs_dbg(FYI, "Close\n");
@@ -2072,7 +2291,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
req->PersistentFileId = persistent_fid;
@@ -2087,6 +2306,8 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
if (rc != 0) {
cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE);
+ trace_smb3_close_err(xid, persistent_fid, tcon->tid, ses->Suid,
+ rc);
goto close_exit;
}
@@ -2097,14 +2318,20 @@ close_exit:
return rc;
}
-static int
-validate_buf(unsigned int offset, unsigned int buffer_length,
- struct smb2_hdr *hdr, unsigned int min_buf_size)
+int
+SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid)
+{
+ return SMB2_close_flags(xid, tcon, persistent_fid, volatile_fid, 0);
+}
+static int
+validate_iov(unsigned int offset, unsigned int buffer_length,
+ struct kvec *iov, unsigned int min_buf_size)
{
- unsigned int smb_len = be32_to_cpu(hdr->smb2_buf_length);
- char *end_of_smb = smb_len + 4 /* RFC1001 length field */ + (char *)hdr;
- char *begin_of_buf = 4 /* RFC1001 len field */ + offset + (char *)hdr;
+ unsigned int smb_len = iov->iov_len;
+ char *end_of_smb = smb_len + (char *)iov->iov_base;
+ char *begin_of_buf = offset + (char *)iov->iov_base;
char *end_of_buf = begin_of_buf + buffer_length;
@@ -2134,18 +2361,17 @@ validate_buf(unsigned int offset, unsigned int buffer_length,
* Caller must free buffer.
*/
static int
-validate_and_copy_buf(unsigned int offset, unsigned int buffer_length,
- struct smb2_hdr *hdr, unsigned int minbufsize,
+validate_and_copy_iov(unsigned int offset, unsigned int buffer_length,
+ struct kvec *iov, unsigned int minbufsize,
char *data)
-
{
- char *begin_of_buf = 4 /* RFC1001 len field */ + offset + (char *)hdr;
+ char *begin_of_buf = offset + (char *)iov->iov_base;
int rc;
if (!data)
return -EINVAL;
- rc = validate_buf(offset, buffer_length, hdr, minbufsize);
+ rc = validate_iov(offset, buffer_length, iov, minbufsize);
if (rc)
return rc;
@@ -2180,7 +2406,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
req->InfoType = info_type;
@@ -2206,6 +2432,8 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
+ trace_smb3_query_info_err(xid, persistent_fid, tcon->tid,
+ ses->Suid, info_class, (__u32)info_type, rc);
goto qinf_exit;
}
@@ -2223,9 +2451,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
}
}
- rc = validate_and_copy_buf(le16_to_cpu(rsp->OutputBufferOffset),
+ rc = validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset),
le32_to_cpu(rsp->OutputBufferLength),
- &rsp->hdr, min_len, *data);
+ &rsp_iov, min_len, *data);
qinf_exit:
free_rsp_buf(resp_buftype, rsp);
@@ -2294,7 +2522,7 @@ smb2_echo_callback(struct mid_q_entry *mid)
unsigned int credits_received = 1;
if (mid->mid_state == MID_RESPONSE_RECEIVED)
- credits_received = le16_to_cpu(rsp->hdr.sync_hdr.CreditRequest);
+ credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
DeleteMidQEntry(mid);
add_credits(server, credits_received, CIFS_ECHO_OP);
@@ -2423,7 +2651,7 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
req->PersistentFileId = persistent_fid;
@@ -2435,8 +2663,11 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov);
cifs_small_buf_release(req);
- if (rc != 0)
+ if (rc != 0) {
cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE);
+ trace_smb3_flush_err(xid, persistent_fid, tcon->tid, ses->Suid,
+ rc);
+ }
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
return rc;
@@ -2481,7 +2712,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
* If we want to do a RDMA write, fill in and append
* smbd_buffer_descriptor_v1 to the end of read request
*/
- if (server->rdma && rdata &&
+ if (server->rdma && rdata && !server->sign &&
rdata->bytes >= server->smbd_conn->rdma_readwrite_threshold) {
struct smbd_buffer_descriptor_v1 *v1;
@@ -2545,11 +2776,12 @@ smb2_readv_callback(struct mid_q_entry *mid)
struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)rdata->iov[1].iov_base;
+ (struct smb2_sync_hdr *)rdata->iov[0].iov_base;
unsigned int credits_received = 1;
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 2,
.rq_pages = rdata->pages,
+ .rq_offset = rdata->page_offset,
.rq_npages = rdata->nr_pages,
.rq_pagesz = rdata->pagesz,
.rq_tailsz = rdata->tailsz };
@@ -2647,7 +2879,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
return rc;
}
- if (encryption_required(io_parms.tcon))
+ if (smb3_encryption_required(io_parms.tcon))
flags |= CIFS_TRANSFORM_REQ;
req_len = cpu_to_be32(total_len);
@@ -2678,7 +2910,13 @@ smb2_async_readv(struct cifs_readdata *rdata)
if (rc) {
kref_put(&rdata->refcount, cifs_readdata_release);
cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
- }
+ trace_smb3_read_err(rc, 0 /* xid */, io_parms.persistent_fid,
+ io_parms.tcon->tid, io_parms.tcon->ses->Suid,
+ io_parms.offset, io_parms.length);
+ } else
+ trace_smb3_read_done(0 /* xid */, io_parms.persistent_fid,
+ io_parms.tcon->tid, io_parms.tcon->ses->Suid,
+ io_parms.offset, io_parms.length);
cifs_small_buf_release(buf);
return rc;
@@ -2691,7 +2929,6 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
int resp_buftype, rc = -EACCES;
struct smb2_read_plain_req *req = NULL;
struct smb2_read_rsp *rsp = NULL;
- struct smb2_sync_hdr *shdr;
struct kvec iov[1];
struct kvec rsp_iov;
unsigned int total_len;
@@ -2703,7 +2940,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
if (rc)
return rc;
- if (encryption_required(io_parms->tcon))
+ if (smb3_encryption_required(io_parms->tcon))
flags |= CIFS_TRANSFORM_REQ;
iov[0].iov_base = (char *)req;
@@ -2719,9 +2956,15 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
cifs_dbg(VFS, "Send error in read = %d\n", rc);
}
+ trace_smb3_read_err(rc, xid, req->PersistentFileId,
+ io_parms->tcon->tid, ses->Suid,
+ io_parms->offset, io_parms->length);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
return rc == -ENODATA ? 0 : rc;
- }
+ } else
+ trace_smb3_read_done(xid, req->PersistentFileId,
+ io_parms->tcon->tid, ses->Suid,
+ io_parms->offset, io_parms->length);
*nbytes = le32_to_cpu(rsp->DataLength);
if ((*nbytes > CIFS_MAX_MSGSIZE) ||
@@ -2732,10 +2975,8 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
*nbytes = 0;
}
- shdr = get_sync_hdr(rsp);
-
if (*buf) {
- memcpy(*buf, (char *)shdr + rsp->DataOffset, *nbytes);
+ memcpy(*buf, (char *)rsp + rsp->DataOffset, *nbytes);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
} else if (resp_buftype != CIFS_NO_BUFFER) {
*buf = rsp_iov.iov_base;
@@ -2762,7 +3003,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
- credits_received = le16_to_cpu(rsp->hdr.sync_hdr.CreditRequest);
+ credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
wdata->result = smb2_check_receive(mid, tcon->ses->server, 0);
if (wdata->result != 0)
break;
@@ -2839,7 +3080,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
goto async_writev_out;
}
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
shdr = (struct smb2_sync_hdr *)req;
@@ -2859,7 +3100,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
* If we want to do a server RDMA read, fill in and append
* smbd_buffer_descriptor_v1 to the end of write request
*/
- if (server->rdma && wdata->bytes >=
+ if (server->rdma && !server->sign && wdata->bytes >=
server->smbd_conn->rdma_readwrite_threshold) {
struct smbd_buffer_descriptor_v1 *v1;
@@ -2900,6 +3141,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
rqst.rq_pages = wdata->pages;
+ rqst.rq_offset = wdata->page_offset;
rqst.rq_npages = wdata->nr_pages;
rqst.rq_pagesz = wdata->pagesz;
rqst.rq_tailsz = wdata->tailsz;
@@ -2937,9 +3179,15 @@ smb2_async_writev(struct cifs_writedata *wdata,
wdata, flags);
if (rc) {
+ trace_smb3_write_err(0 /* no xid */, req->PersistentFileId,
+ tcon->tid, tcon->ses->Suid, wdata->offset,
+ wdata->bytes, rc);
kref_put(&wdata->refcount, release);
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
- }
+ } else
+ trace_smb3_write_done(0 /* no xid */, req->PersistentFileId,
+ tcon->tid, tcon->ses->Suid, wdata->offset,
+ wdata->bytes);
async_writev_out:
cifs_small_buf_release(req);
@@ -2977,7 +3225,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
if (io_parms->tcon->ses->server == NULL)
return -ECONNABORTED;
- if (encryption_required(io_parms->tcon))
+ if (smb3_encryption_required(io_parms->tcon))
flags |= CIFS_TRANSFORM_REQ;
req->sync_hdr.ProcessId = cpu_to_le32(io_parms->pid);
@@ -3003,10 +3251,19 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
rsp = (struct smb2_write_rsp *)rsp_iov.iov_base;
if (rc) {
+ trace_smb3_write_err(xid, req->PersistentFileId,
+ io_parms->tcon->tid,
+ io_parms->tcon->ses->Suid,
+ io_parms->offset, io_parms->length, rc);
cifs_stats_fail_inc(io_parms->tcon, SMB2_WRITE_HE);
cifs_dbg(VFS, "Send error in write = %d\n", rc);
- } else
+ } else {
*nbytes = le32_to_cpu(rsp->DataLength);
+ trace_smb3_write_done(xid, req->PersistentFileId,
+ io_parms->tcon->tid,
+ io_parms->tcon->ses->Suid,
+ io_parms->offset, *nbytes);
+ }
free_rsp_buf(resp_buftype, rsp);
return rc;
@@ -3087,7 +3344,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
switch (srch_inf->info_level) {
@@ -3138,7 +3395,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
if (rc) {
if (rc == -ENODATA &&
- rsp->hdr.sync_hdr.Status == STATUS_NO_MORE_FILES) {
+ rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
srch_inf->endOfSearch = true;
rc = 0;
}
@@ -3146,8 +3403,8 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
goto qdir_exit;
}
- rc = validate_buf(le16_to_cpu(rsp->OutputBufferOffset),
- le32_to_cpu(rsp->OutputBufferLength), &rsp->hdr,
+ rc = validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
+ le32_to_cpu(rsp->OutputBufferLength), &rsp_iov,
info_buf_size);
if (rc)
goto qdir_exit;
@@ -3161,10 +3418,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
cifs_buf_release(srch_inf->ntwrk_buf_start);
}
srch_inf->ntwrk_buf_start = (char *)rsp;
- srch_inf->srch_entries_start = srch_inf->last_entry = 4 /* rfclen */ +
- (char *)&rsp->hdr + le16_to_cpu(rsp->OutputBufferOffset);
- /* 4 for rfc1002 length field */
- end_of_smb = get_rfc1002_length(rsp) + 4 + (char *)&rsp->hdr;
+ srch_inf->srch_entries_start = srch_inf->last_entry =
+ (char *)rsp + le16_to_cpu(rsp->OutputBufferOffset);
+ end_of_smb = rsp_iov.iov_len + (char *)rsp;
srch_inf->entries_in_buffer =
num_entries(srch_inf->srch_entries_start, end_of_smb,
&srch_inf->last_entry, info_buf_size);
@@ -3219,7 +3475,7 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
req->sync_hdr.ProcessId = cpu_to_le32(pid);
@@ -3252,8 +3508,11 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
cifs_small_buf_release(req);
rsp = (struct smb2_set_info_rsp *)rsp_iov.iov_base;
- if (rc != 0)
+ if (rc != 0) {
cifs_stats_fail_inc(tcon, SMB2_SET_INFO_HE);
+ trace_smb3_set_info_err(xid, persistent_fid, tcon->tid,
+ ses->Suid, info_class, (__u32)info_type, rc);
+ }
free_rsp_buf(resp_buftype, rsp);
kfree(iov);
@@ -3400,7 +3659,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
__u8 oplock_level)
{
int rc;
- struct smb2_oplock_break_req *req = NULL;
+ struct smb2_oplock_break *req = NULL;
struct cifs_ses *ses = tcon->ses;
int flags = CIFS_OBREAK_OP;
unsigned int total_len;
@@ -3414,7 +3673,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
req->VolatileFid = volatile_fid;
@@ -3454,7 +3713,7 @@ static int
build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level,
int outbuf_len, u64 persistent_fid, u64 volatile_fid)
{
- struct TCP_Server_Info *server = tcon->ses->server;
+ struct TCP_Server_Info *server;
int rc;
struct smb2_query_info_req *req;
unsigned int total_len;
@@ -3464,6 +3723,8 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level,
if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
return -EIO;
+ server = tcon->ses->server;
+
rc = smb2_plain_req_init(SMB2_QUERY_INFO, tcon, (void **) &req,
&total_len);
if (rc)
@@ -3477,7 +3738,7 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level,
req->InputBufferOffset =
cpu_to_le16(sizeof(struct smb2_query_info_req) - 1);
req->OutputBufferLength = cpu_to_le32(
- outbuf_len + sizeof(struct smb2_query_info_rsp) - 1 - server->vals->header_preamble_size);
+ outbuf_len + sizeof(struct smb2_query_info_rsp) - 1);
iov->iov_base = (char *)req;
iov->iov_len = total_len;
@@ -3494,7 +3755,6 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int resp_buftype;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = ses->server;
struct smb2_fs_full_size_info *info = NULL;
int flags = 0;
@@ -3504,7 +3764,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov);
@@ -3515,10 +3775,10 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
}
rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
- info = (struct smb2_fs_full_size_info *)(server->vals->header_preamble_size +
- le16_to_cpu(rsp->OutputBufferOffset) + (char *)&rsp->hdr);
- rc = validate_buf(le16_to_cpu(rsp->OutputBufferOffset),
- le32_to_cpu(rsp->OutputBufferLength), &rsp->hdr,
+ info = (struct smb2_fs_full_size_info *)(
+ le16_to_cpu(rsp->OutputBufferOffset) + (char *)rsp);
+ rc = validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
+ le32_to_cpu(rsp->OutputBufferLength), &rsp_iov,
sizeof(struct smb2_fs_full_size_info));
if (!rc)
copy_fs_info_to_kstatfs(info, fsdata);
@@ -3538,7 +3798,6 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int resp_buftype, max_len, min_len;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = ses->server;
unsigned int rsp_len, offset;
int flags = 0;
@@ -3561,7 +3820,7 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov);
@@ -3574,20 +3833,20 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
rsp_len = le32_to_cpu(rsp->OutputBufferLength);
offset = le16_to_cpu(rsp->OutputBufferOffset);
- rc = validate_buf(offset, rsp_len, &rsp->hdr, min_len);
+ rc = validate_iov(offset, rsp_len, &rsp_iov, min_len);
if (rc)
goto qfsattr_exit;
if (level == FS_ATTRIBUTE_INFORMATION)
- memcpy(&tcon->fsAttrInfo, server->vals->header_preamble_size + offset
- + (char *)&rsp->hdr, min_t(unsigned int,
+ memcpy(&tcon->fsAttrInfo, offset
+ + (char *)rsp, min_t(unsigned int,
rsp_len, max_len));
else if (level == FS_DEVICE_INFORMATION)
- memcpy(&tcon->fsDevInfo, server->vals->header_preamble_size + offset
- + (char *)&rsp->hdr, sizeof(FILE_SYSTEM_DEVICE_INFO));
+ memcpy(&tcon->fsDevInfo, offset
+ + (char *)rsp, sizeof(FILE_SYSTEM_DEVICE_INFO));
else if (level == FS_SECTOR_SIZE_INFORMATION) {
struct smb3_fs_ss_info *ss_info = (struct smb3_fs_ss_info *)
- (server->vals->header_preamble_size + offset + (char *)&rsp->hdr);
+ (offset + (char *)rsp);
tcon->ss_flags = le32_to_cpu(ss_info->Flags);
tcon->perf_sector_size =
le32_to_cpu(ss_info->PhysicalBytesPerSectorForPerf);
@@ -3618,7 +3877,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
req->sync_hdr.ProcessId = cpu_to_le32(pid);
@@ -3641,6 +3900,8 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
if (rc) {
cifs_dbg(FYI, "Send error in smb2_lockv = %d\n", rc);
cifs_stats_fail_inc(tcon, SMB2_LOCK_HE);
+ trace_smb3_lock_err(xid, persist_fid, tcon->tid,
+ tcon->ses->Suid, rc);
}
return rc;
@@ -3682,7 +3943,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
return rc;
- if (encryption_required(tcon))
+ if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
req->sync_hdr.CreditRequest = cpu_to_le16(1);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 253e2c7c952f..a345560001ce 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -122,25 +122,10 @@ struct smb2_sync_pdu {
__le16 StructureSize2; /* size of wct area (varies, request specific) */
} __packed;
-struct smb2_hdr {
- __be32 smb2_buf_length; /* big endian on wire */
- /* length is only two or three bytes - with */
- /* one or two byte type preceding it that MBZ */
- struct smb2_sync_hdr sync_hdr;
-} __packed;
-
-struct smb2_pdu {
- struct smb2_hdr hdr;
- __le16 StructureSize2; /* size of wct area (varies, request specific) */
-} __packed;
-
#define SMB3_AES128CMM_NONCE 11
#define SMB3_AES128GCM_NONCE 12
struct smb2_transform_hdr {
- __be32 smb2_buf_length; /* big endian on wire */
- /* length is only two or three bytes - with
- one or two byte type preceding it that MBZ */
__le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */
__u8 Signature[16];
__u8 Nonce[16];
@@ -171,7 +156,7 @@ struct smb2_transform_hdr {
#define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9)
struct smb2_err_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize;
__le16 Reserved; /* MBZ */
__le32 ByteCount; /* even if zero, at least one byte follows */
@@ -263,11 +248,19 @@ struct smb2_negotiate_req {
#define SMB2_NT_FIND 0x00100000
#define SMB2_LARGE_FILES 0x00200000
+struct smb2_neg_context {
+ __le16 ContextType;
+ __le16 DataLength;
+ __le32 Reserved;
+ /* Followed by array of data */
+} __packed;
+
#define SMB311_SALT_SIZE 32
/* Hash Algorithm Types */
#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001)
#define SMB2_PREAUTH_HASH_SIZE 64
+#define MIN_PREAUTH_CTXT_DATA_LEN (SMB311_SALT_SIZE + 6)
struct smb2_preauth_neg_context {
__le16 ContextType; /* 1 */
__le16 DataLength;
@@ -282,16 +275,26 @@ struct smb2_preauth_neg_context {
#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001)
#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002)
+/* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */
+#define MIN_ENCRYPT_CTXT_DATA_LEN 4
struct smb2_encryption_neg_context {
__le16 ContextType; /* 2 */
__le16 DataLength;
__le32 Reserved;
__le16 CipherCount; /* AES-128-GCM and AES-128-CCM */
- __le16 Ciphers[2]; /* Ciphers[0] since only one used now */
+ __le16 Ciphers[1]; /* Ciphers[0] since only one used now */
+} __packed;
+
+#define POSIX_CTXT_DATA_LEN 8
+struct smb2_posix_neg_context {
+ __le16 ContextType; /* 0x100 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le64 Reserved1; /* In case needed for future (eg version or caps) */
} __packed;
struct smb2_negotiate_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 65 */
__le16 SecurityMode;
__le16 DialectRevision;
@@ -331,7 +334,7 @@ struct smb2_sess_setup_req {
#define SMB2_SESSION_FLAG_IS_NULL 0x0002
#define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004
struct smb2_sess_setup_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 9 */
__le16 SessionFlags;
__le16 SecurityBufferOffset;
@@ -346,7 +349,7 @@ struct smb2_logoff_req {
} __packed;
struct smb2_logoff_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 4 */
__le16 Reserved;
} __packed;
@@ -442,7 +445,7 @@ struct smb2_tree_connect_req_extension {
} __packed;
struct smb2_tree_connect_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 16 */
__u8 ShareType; /* see below */
__u8 Reserved;
@@ -493,7 +496,7 @@ struct smb2_tree_disconnect_req {
} __packed;
struct smb2_tree_disconnect_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 4 */
__le16 Reserved;
} __packed;
@@ -605,7 +608,9 @@ struct smb2_tree_disconnect_rsp {
#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q"
#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C"
#define SMB2_CREATE_APP_INSTANCE_ID 0x45BCA66AEFA7F74A9008FA462E144D74
-#define SVHDX_OPEN_DEVICE_CONTEXT 0x83CE6F1AD851E0986E34401CC9BCFCE9
+#define SVHDX_OPEN_DEVICE_CONTEX 0x9CCBCF9E04C1E643980E158DA1F6EC83
+#define SMB2_CREATE_TAG_POSIX 0x93AD25509CB411E7B42383DE968BCD7C
+
struct smb2_create_req {
struct smb2_sync_hdr sync_hdr;
@@ -628,7 +633,7 @@ struct smb2_create_req {
} __packed;
struct smb2_create_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 89 */
__u8 OplockLevel;
__u8 Reserved;
@@ -717,6 +722,13 @@ struct create_durable {
} Data;
} __packed;
+struct create_posix {
+ struct create_context ccontext;
+ __u8 Name[16];
+ __le32 Mode;
+ __u32 Reserved;
+} __packed;
+
/* See MS-SMB2 2.2.13.2.11 */
/* Flags */
#define SMB2_DHANDLE_FLAG_PERSISTENT 0x00000002
@@ -884,7 +896,7 @@ struct smb2_ioctl_req {
} __packed;
struct smb2_ioctl_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 57 */
__u16 Reserved;
__le32 CtlCode;
@@ -911,7 +923,7 @@ struct smb2_close_req {
} __packed;
struct smb2_close_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* 60 */
__le16 Flags;
__le32 Reserved;
@@ -934,7 +946,7 @@ struct smb2_flush_req {
} __packed;
struct smb2_flush_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize;
__le16 Reserved;
} __packed;
@@ -966,7 +978,7 @@ struct smb2_read_plain_req {
} __packed;
struct smb2_read_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 17 */
__u8 DataOffset;
__u8 Reserved;
@@ -997,7 +1009,7 @@ struct smb2_write_req {
} __packed;
struct smb2_write_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 17 */
__u8 DataOffset;
__u8 Reserved;
@@ -1031,7 +1043,7 @@ struct smb2_lock_req {
} __packed;
struct smb2_lock_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 4 */
__le16 Reserved;
} __packed;
@@ -1043,7 +1055,7 @@ struct smb2_echo_req {
} __packed;
struct smb2_echo_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 4 */
__u16 Reserved;
} __packed;
@@ -1069,7 +1081,7 @@ struct smb2_query_directory_req {
} __packed;
struct smb2_query_directory_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 9 */
__le16 OutputBufferOffset;
__le32 OutputBufferLength;
@@ -1118,7 +1130,7 @@ struct smb2_query_info_req {
} __packed;
struct smb2_query_info_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 9 */
__le16 OutputBufferOffset;
__le32 OutputBufferLength;
@@ -1140,12 +1152,11 @@ struct smb2_set_info_req {
} __packed;
struct smb2_set_info_rsp {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 2 */
} __packed;
-/* oplock break without an rfc1002 header */
-struct smb2_oplock_break_req {
+struct smb2_oplock_break {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 24 */
__u8 OplockLevel;
@@ -1155,21 +1166,10 @@ struct smb2_oplock_break_req {
__u64 VolatileFid;
} __packed;
-/* oplock break with an rfc1002 header */
-struct smb2_oplock_break_rsp {
- struct smb2_hdr hdr;
- __le16 StructureSize; /* Must be 24 */
- __u8 OplockLevel;
- __u8 Reserved;
- __le32 Reserved2;
- __u64 PersistentFid;
- __u64 VolatileFid;
-} __packed;
-
#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01)
struct smb2_lease_break {
- struct smb2_hdr hdr;
+ struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 44 */
__le16 Reserved;
__le32 Flags;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index cbcce3f7e86f..908555b1c6b5 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -36,8 +36,9 @@ struct smb_rqst;
extern int map_smb2_to_linux_error(char *buf, bool log_err);
extern int smb2_check_message(char *buf, unsigned int length,
struct TCP_Server_Info *server);
-extern unsigned int smb2_calc_size(void *buf);
-extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr);
+extern unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *server);
+extern char *smb2_get_data_area_len(int *off, int *len,
+ struct smb2_sync_hdr *shdr);
extern __le16 *cifs_convert_path_to_utf16(const char *from,
struct cifs_sb_info *cifs_sb);
@@ -65,6 +66,8 @@ extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server,
extern int smb3_handle_read_data(struct TCP_Server_Info *server,
struct mid_q_entry *mid);
+extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_fid *pfid);
extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst,
struct smb2_file_all_info *src);
extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
@@ -122,13 +125,15 @@ extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon);
extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms,
__le16 *path, __u8 *oplock,
struct smb2_file_all_info *buf,
- struct smb2_err_rsp **err_buf);
+ struct kvec *err_iov);
extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, u32 opcode,
bool is_fsctl, char *in_data, u32 indatalen,
char **out_data, u32 *plen /* returned data len */);
extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id);
+extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, int flags);
extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id);
extern int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index bf49cb73b9e6..2c671123a6bf 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -480,7 +480,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
unsigned int rc;
char server_response_sig[16];
struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base;
+ (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
if ((shdr->Command == SMB2_NEGOTIATE) ||
(shdr->Command == SMB2_SESSION_SETUP) ||
@@ -604,15 +604,13 @@ int
smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
bool log_error)
{
- unsigned int len = get_rfc1002_length(mid->resp_buf);
- struct kvec iov[2];
+ unsigned int len = mid->resp_buf_size;
+ struct kvec iov[1];
struct smb_rqst rqst = { .rq_iov = iov,
- .rq_nvec = 2 };
+ .rq_nvec = 1 };
iov[0].iov_base = (char *)mid->resp_buf;
- iov[0].iov_len = 4;
- iov[1].iov_base = (char *)mid->resp_buf + 4;
- iov[1].iov_len = len;
+ iov[0].iov_len = len;
dump_smb(mid->resp_buf, min_t(u32, 80, len));
/* convert the length into a more usable form */
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 5008af546dd1..c62f7c95683c 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -1028,7 +1028,7 @@ static int smbd_post_send(struct smbd_connection *info,
for (i = 0; i < request->num_sge; i++) {
log_rdma_send(INFO,
"rdma_request sge[%d] addr=%llu length=%u\n",
- i, request->sge[0].addr, request->sge[0].length);
+ i, request->sge[i].addr, request->sge[i].length);
ib_dma_sync_single_for_device(
info->id->device,
request->sge[i].addr,
@@ -2086,7 +2086,7 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst)
int start, i, j;
int max_iov_size =
info->max_send_size - sizeof(struct smbd_data_transfer);
- struct kvec iov[SMBDIRECT_MAX_SGE];
+ struct kvec *iov;
int rc;
info->smbd_send_pending++;
@@ -2096,32 +2096,20 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst)
}
/*
- * This usually means a configuration error
- * We use RDMA read/write for packet size > rdma_readwrite_threshold
- * as long as it's properly configured we should never get into this
- * situation
- */
- if (rqst->rq_nvec + rqst->rq_npages > SMBDIRECT_MAX_SGE) {
- log_write(ERR, "maximum send segment %x exceeding %x\n",
- rqst->rq_nvec + rqst->rq_npages, SMBDIRECT_MAX_SGE);
- rc = -EINVAL;
- goto done;
- }
-
- /*
- * Remove the RFC1002 length defined in MS-SMB2 section 2.1
- * It is used only for TCP transport
+ * Skip the RFC1002 length defined in MS-SMB2 section 2.1
+ * It is used only for TCP transport in the iov[0]
* In future we may want to add a transport layer under protocol
* layer so this will only be issued to TCP transport
*/
- iov[0].iov_base = (char *)rqst->rq_iov[0].iov_base + 4;
- iov[0].iov_len = rqst->rq_iov[0].iov_len - 4;
- buflen += iov[0].iov_len;
+
+ if (rqst->rq_iov[0].iov_len != 4) {
+ log_write(ERR, "expected the pdu length in 1st iov, but got %zu\n", rqst->rq_iov[0].iov_len);
+ return -EINVAL;
+ }
+ iov = &rqst->rq_iov[1];
/* total up iov array first */
- for (i = 1; i < rqst->rq_nvec; i++) {
- iov[i].iov_base = rqst->rq_iov[i].iov_base;
- iov[i].iov_len = rqst->rq_iov[i].iov_len;
+ for (i = 0; i < rqst->rq_nvec-1; i++) {
buflen += iov[i].iov_len;
}
@@ -2139,6 +2127,10 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst)
goto done;
}
+ cifs_dbg(FYI, "Sending smb (RDMA): smb_len=%u\n", buflen);
+ for (i = 0; i < rqst->rq_nvec-1; i++)
+ dump_smb(iov[i].iov_base, iov[i].iov_len);
+
remaining_data_length = buflen;
log_write(INFO, "rqst->rq_nvec=%d rqst->rq_npages=%d rq_pagesz=%d "
@@ -2194,12 +2186,14 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst)
goto done;
}
i++;
+ if (i == rqst->rq_nvec-1)
+ break;
}
start = i;
buflen = 0;
} else {
i++;
- if (i == rqst->rq_nvec) {
+ if (i == rqst->rq_nvec-1) {
/* send out all remaining vecs */
remaining_data_length -= buflen;
log_write(INFO,
diff --git a/fs/cifs/trace.c b/fs/cifs/trace.c
new file mode 100644
index 000000000000..bd4a546feec1
--- /dev/null
+++ b/fs/cifs/trace.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018, Microsoft Corporation.
+ *
+ * Author(s): Steve French <stfrench@microsoft.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
new file mode 100644
index 000000000000..61e74d455d90
--- /dev/null
+++ b/fs/cifs/trace.h
@@ -0,0 +1,429 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018, Microsoft Corporation.
+ *
+ * Author(s): Steve French <stfrench@microsoft.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cifs
+
+#if !defined(_CIFS_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _CIFS_TRACE_H
+
+#include <linux/tracepoint.h>
+
+/* For logging errors in read or write */
+DECLARE_EVENT_CLASS(smb3_rw_err_class,
+ TP_PROTO(unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ __u64 offset,
+ __u32 len,
+ int rc),
+ TP_ARGS(xid, fid, tid, sesid, offset, len, rc),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, offset)
+ __field(__u32, len)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->offset = offset;
+ __entry->len = len;
+ __entry->rc = rc;
+ ),
+ TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d",
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->offset, __entry->len, __entry->rc)
+)
+
+#define DEFINE_SMB3_RW_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 offset, \
+ __u32 len, \
+ int rc), \
+ TP_ARGS(xid, fid, tid, sesid, offset, len, rc))
+
+DEFINE_SMB3_RW_ERR_EVENT(write_err);
+DEFINE_SMB3_RW_ERR_EVENT(read_err);
+
+
+/* For logging successful read or write */
+DECLARE_EVENT_CLASS(smb3_rw_done_class,
+ TP_PROTO(unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ __u64 offset,
+ __u32 len),
+ TP_ARGS(xid, fid, tid, sesid, offset, len),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, offset)
+ __field(__u32, len)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->offset = offset;
+ __entry->len = len;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x",
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->offset, __entry->len)
+)
+
+#define DEFINE_SMB3_RW_DONE_EVENT(name) \
+DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 offset, \
+ __u32 len), \
+ TP_ARGS(xid, fid, tid, sesid, offset, len))
+
+DEFINE_SMB3_RW_DONE_EVENT(write_done);
+DEFINE_SMB3_RW_DONE_EVENT(read_done);
+
+/*
+ * For handle based calls other than read and write, and get/set info
+ */
+DECLARE_EVENT_CLASS(smb3_fd_err_class,
+ TP_PROTO(unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ int rc),
+ TP_ARGS(xid, fid, tid, sesid, rc),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->rc = rc;
+ ),
+ TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx rc=%d",
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->rc)
+)
+
+#define DEFINE_SMB3_FD_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_fd_err_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ int rc), \
+ TP_ARGS(xid, fid, tid, sesid, rc))
+
+DEFINE_SMB3_FD_ERR_EVENT(flush_err);
+DEFINE_SMB3_FD_ERR_EVENT(lock_err);
+DEFINE_SMB3_FD_ERR_EVENT(close_err);
+
+/*
+ * For handle based query/set info calls
+ */
+DECLARE_EVENT_CLASS(smb3_inf_err_class,
+ TP_PROTO(unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ __u8 infclass,
+ __u32 type,
+ int rc),
+ TP_ARGS(xid, fid, tid, sesid, infclass, type, rc),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u8, infclass)
+ __field(__u32, type)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->infclass = infclass;
+ __entry->type = type;
+ __entry->rc = rc;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx class=%u type=0x%x rc=%d",
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->infclass, __entry->type, __entry->rc)
+)
+
+#define DEFINE_SMB3_INF_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_inf_err_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u8 infclass, \
+ __u32 type, \
+ int rc), \
+ TP_ARGS(xid, fid, tid, sesid, infclass, type, rc))
+
+DEFINE_SMB3_INF_ERR_EVENT(query_info_err);
+DEFINE_SMB3_INF_ERR_EVENT(set_info_err);
+DEFINE_SMB3_INF_ERR_EVENT(fsctl_err);
+
+/*
+ * For logging SMB3 Status code and Command for responses which return errors
+ */
+DECLARE_EVENT_CLASS(smb3_cmd_err_class,
+ TP_PROTO(__u32 tid,
+ __u64 sesid,
+ __u16 cmd,
+ __u64 mid,
+ __u32 status,
+ int rc),
+ TP_ARGS(tid, sesid, cmd, mid, status, rc),
+ TP_STRUCT__entry(
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u16, cmd)
+ __field(__u64, mid)
+ __field(__u32, status)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->cmd = cmd;
+ __entry->mid = mid;
+ __entry->status = status;
+ __entry->rc = rc;
+ ),
+ TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu status=0x%x rc=%d",
+ __entry->sesid, __entry->tid, __entry->cmd, __entry->mid,
+ __entry->status, __entry->rc)
+)
+
+#define DEFINE_SMB3_CMD_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_cmd_err_class, smb3_##name, \
+ TP_PROTO(__u32 tid, \
+ __u64 sesid, \
+ __u16 cmd, \
+ __u64 mid, \
+ __u32 status, \
+ int rc), \
+ TP_ARGS(tid, sesid, cmd, mid, status, rc))
+
+DEFINE_SMB3_CMD_ERR_EVENT(cmd_err);
+
+DECLARE_EVENT_CLASS(smb3_cmd_done_class,
+ TP_PROTO(__u32 tid,
+ __u64 sesid,
+ __u16 cmd,
+ __u64 mid),
+ TP_ARGS(tid, sesid, cmd, mid),
+ TP_STRUCT__entry(
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u16, cmd)
+ __field(__u64, mid)
+ ),
+ TP_fast_assign(
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->cmd = cmd;
+ __entry->mid = mid;
+ ),
+ TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu",
+ __entry->sesid, __entry->tid,
+ __entry->cmd, __entry->mid)
+)
+
+#define DEFINE_SMB3_CMD_DONE_EVENT(name) \
+DEFINE_EVENT(smb3_cmd_done_class, smb3_##name, \
+ TP_PROTO(__u32 tid, \
+ __u64 sesid, \
+ __u16 cmd, \
+ __u64 mid), \
+ TP_ARGS(tid, sesid, cmd, mid))
+
+DEFINE_SMB3_CMD_DONE_EVENT(cmd_done);
+
+DECLARE_EVENT_CLASS(smb3_exit_err_class,
+ TP_PROTO(unsigned int xid,
+ const char *func_name,
+ int rc),
+ TP_ARGS(xid, func_name, rc),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(const char *, func_name)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->func_name = func_name;
+ __entry->rc = rc;
+ ),
+ TP_printk("\t%s: xid=%u rc=%d",
+ __entry->func_name, __entry->xid, __entry->rc)
+)
+
+#define DEFINE_SMB3_EXIT_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_exit_err_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ const char *func_name, \
+ int rc), \
+ TP_ARGS(xid, func_name, rc))
+
+DEFINE_SMB3_EXIT_ERR_EVENT(exit_err);
+
+DECLARE_EVENT_CLASS(smb3_enter_exit_class,
+ TP_PROTO(unsigned int xid,
+ const char *func_name),
+ TP_ARGS(xid, func_name),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(const char *, func_name)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->func_name = func_name;
+ ),
+ TP_printk("\t%s: xid=%u",
+ __entry->func_name, __entry->xid)
+)
+
+#define DEFINE_SMB3_ENTER_EXIT_EVENT(name) \
+DEFINE_EVENT(smb3_enter_exit_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ const char *func_name), \
+ TP_ARGS(xid, func_name))
+
+DEFINE_SMB3_ENTER_EXIT_EVENT(enter);
+DEFINE_SMB3_ENTER_EXIT_EVENT(exit_done);
+
+/*
+ * For smb2/smb3 open call
+ */
+DECLARE_EVENT_CLASS(smb3_open_err_class,
+ TP_PROTO(unsigned int xid,
+ __u32 tid,
+ __u64 sesid,
+ int create_options,
+ int desired_access,
+ int rc),
+ TP_ARGS(xid, tid, sesid, create_options, desired_access, rc),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(int, create_options)
+ __field(int, desired_access)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->create_options = create_options;
+ __entry->desired_access = desired_access;
+ __entry->rc = rc;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x cr_opts=0x%x des_access=0x%x rc=%d",
+ __entry->xid, __entry->sesid, __entry->tid,
+ __entry->create_options, __entry->desired_access, __entry->rc)
+)
+
+#define DEFINE_SMB3_OPEN_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_open_err_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u32 tid, \
+ __u64 sesid, \
+ int create_options, \
+ int desired_access, \
+ int rc), \
+ TP_ARGS(xid, tid, sesid, create_options, desired_access, rc))
+
+DEFINE_SMB3_OPEN_ERR_EVENT(open_err);
+
+
+DECLARE_EVENT_CLASS(smb3_open_done_class,
+ TP_PROTO(unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ int create_options,
+ int desired_access),
+ TP_ARGS(xid, fid, tid, sesid, create_options, desired_access),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(int, create_options)
+ __field(int, desired_access)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->create_options = create_options;
+ __entry->desired_access = desired_access;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx cr_opts=0x%x des_access=0x%x",
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->create_options, __entry->desired_access)
+)
+
+#define DEFINE_SMB3_OPEN_DONE_EVENT(name) \
+DEFINE_EVENT(smb3_open_done_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ int create_options, \
+ int desired_access), \
+ TP_ARGS(xid, fid, tid, sesid, create_options, desired_access))
+
+DEFINE_SMB3_OPEN_DONE_EVENT(open_done);
+
+#endif /* _CIFS_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 279718dcb2ed..e7254e386b79 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -753,7 +753,7 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses,
goto out;
#ifdef CONFIG_CIFS_SMB311
- if (ses->status == CifsNew)
+ if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP))
smb311_update_preauth_hash(ses, rqst->rq_iov+1,
rqst->rq_nvec-1);
#endif
@@ -790,7 +790,7 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses,
buf = (char *)midQ->resp_buf;
resp_iov->iov_base = buf;
- resp_iov->iov_len = get_rfc1002_length(buf) +
+ resp_iov->iov_len = midQ->resp_buf_size +
ses->server->vals->header_preamble_size;
if (midQ->large_buf)
*resp_buf_type = CIFS_LARGE_BUFFER;
@@ -798,10 +798,10 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses,
*resp_buf_type = CIFS_SMALL_BUFFER;
#ifdef CONFIG_CIFS_SMB311
- if (ses->status == CifsNew) {
+ if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) {
struct kvec iov = {
- .iov_base = buf + 4,
- .iov_len = get_rfc1002_length(buf)
+ .iov_base = buf,
+ .iov_len = midQ->resp_buf_size
};
smb311_update_preauth_hash(ses, &iov, 1);
}
@@ -834,8 +834,11 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
if (n_vec + 1 > CIFS_MAX_IOV_SIZE) {
new_iov = kmalloc(sizeof(struct kvec) * (n_vec + 1),
GFP_KERNEL);
- if (!new_iov)
+ if (!new_iov) {
+ /* otherwise cifs_send_recv below sets resp_buf_type */
+ *resp_buf_type = CIFS_NO_BUFFER;
return -ENOMEM;
+ }
} else
new_iov = s_iov;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 017b0ab19bc4..c4fb9ad7c808 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -492,7 +492,7 @@ static void cramfs_kill_sb(struct super_block *sb)
{
struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
- if (IS_ENABLED(CCONFIG_CRAMFS_MTD) && sb->s_mtd) {
+ if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sb->s_mtd) {
if (sbi && sbi->mtd_point_size)
mtd_unpoint(sb->s_mtd, 0, sbi->mtd_point_size);
kill_mtd_super(sb);
@@ -808,10 +808,7 @@ static struct dentry *cramfs_lookup(struct inode *dir, struct dentry *dentry, un
}
out:
mutex_unlock(&read_mutex);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
}
static int cramfs_readpage(struct file *file, struct page *page)
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index ce654526c0fb..243a269e6c5f 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -156,12 +156,8 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
}
req = skcipher_request_alloc(tfm, gfp_flags);
- if (!req) {
- printk_ratelimited(KERN_ERR
- "%s: crypto_request_alloc() failed\n",
- __func__);
+ if (!req)
return -ENOMEM;
- }
skcipher_request_set_callback(
req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
@@ -178,9 +174,10 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
skcipher_request_free(req);
if (res) {
- printk_ratelimited(KERN_ERR
- "%s: crypto_skcipher_encrypt() returned %d\n",
- __func__, res);
+ fscrypt_err(inode->i_sb,
+ "%scryption failed for inode %lu, block %llu: %d",
+ (rw == FS_DECRYPT ? "de" : "en"),
+ inode->i_ino, lblk_num, res);
return res;
}
return 0;
@@ -326,7 +323,6 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
return 0;
}
- /* this should eventually be an flag in d_flags */
spin_lock(&dentry->d_lock);
cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY;
spin_unlock(&dentry->d_lock);
@@ -353,7 +349,6 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
const struct dentry_operations fscrypt_d_ops = {
.d_revalidate = fscrypt_d_revalidate,
};
-EXPORT_SYMBOL(fscrypt_d_ops);
void fscrypt_restore_control_page(struct page *page)
{
@@ -422,13 +417,43 @@ fail:
return res;
}
+void fscrypt_msg(struct super_block *sb, const char *level,
+ const char *fmt, ...)
+{
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+ struct va_format vaf;
+ va_list args;
+
+ if (!__ratelimit(&rs))
+ return;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ if (sb)
+ printk("%sfscrypt (%s): %pV\n", level, sb->s_id, &vaf);
+ else
+ printk("%sfscrypt: %pV\n", level, &vaf);
+ va_end(args);
+}
+
/**
* fscrypt_init() - Set up for fs encryption.
*/
static int __init fscrypt_init(void)
{
+ /*
+ * Use an unbound workqueue to allow bios to be decrypted in parallel
+ * even when they happen to complete on the same CPU. This sacrifices
+ * locality, but it's worthwhile since decryption is CPU-intensive.
+ *
+ * Also use a high-priority workqueue to prioritize decryption work,
+ * which blocks reads from completing, over regular application tasks.
+ */
fscrypt_read_workqueue = alloc_workqueue("fscrypt_read_queue",
- WQ_HIGHPRI, 0);
+ WQ_UNBOUND | WQ_HIGHPRI,
+ num_online_cpus());
if (!fscrypt_read_workqueue)
goto fail;
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index e33f3d3c5ade..d7a0f682ca12 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -59,11 +59,8 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname,
/* Set up the encryption request */
req = skcipher_request_alloc(tfm, GFP_NOFS);
- if (!req) {
- printk_ratelimited(KERN_ERR
- "%s: skcipher_request_alloc() failed\n", __func__);
+ if (!req)
return -ENOMEM;
- }
skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
crypto_req_done, &wait);
@@ -74,8 +71,9 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname,
res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
skcipher_request_free(req);
if (res < 0) {
- printk_ratelimited(KERN_ERR
- "%s: Error (error code %d)\n", __func__, res);
+ fscrypt_err(inode->i_sb,
+ "Filename encryption failed for inode %lu: %d",
+ inode->i_ino, res);
return res;
}
@@ -96,23 +94,14 @@ static int fname_decrypt(struct inode *inode,
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
struct scatterlist src_sg, dst_sg;
- struct fscrypt_info *ci = inode->i_crypt_info;
- struct crypto_skcipher *tfm = ci->ci_ctfm;
+ struct crypto_skcipher *tfm = inode->i_crypt_info->ci_ctfm;
int res = 0;
char iv[FS_CRYPTO_BLOCK_SIZE];
- unsigned lim;
-
- lim = inode->i_sb->s_cop->max_namelen(inode);
- if (iname->len <= 0 || iname->len > lim)
- return -EIO;
/* Allocate request */
req = skcipher_request_alloc(tfm, GFP_NOFS);
- if (!req) {
- printk_ratelimited(KERN_ERR
- "%s: crypto_request_alloc() failed\n", __func__);
+ if (!req)
return -ENOMEM;
- }
skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
crypto_req_done, &wait);
@@ -127,8 +116,9 @@ static int fname_decrypt(struct inode *inode,
res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
skcipher_request_free(req);
if (res < 0) {
- printk_ratelimited(KERN_ERR
- "%s: Error (error code %d)\n", __func__, res);
+ fscrypt_err(inode->i_sb,
+ "Filename decryption failed for inode %lu: %d",
+ inode->i_ino, res);
return res;
}
@@ -341,12 +331,12 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
return 0;
}
ret = fscrypt_get_encryption_info(dir);
- if (ret && ret != -EOPNOTSUPP)
+ if (ret)
return ret;
if (dir->i_crypt_info) {
if (!fscrypt_fname_encrypted_size(dir, iname->len,
- dir->i_sb->s_cop->max_namelen(dir),
+ dir->i_sb->s_cop->max_namelen,
&fname->crypto_buf.len))
return -ENAMETOOLONG;
fname->crypto_buf.name = kmalloc(fname->crypto_buf.len,
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index ad6722bae8b7..37562394c5de 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -18,15 +18,7 @@
/* Encryption parameters */
#define FS_IV_SIZE 16
-#define FS_AES_128_ECB_KEY_SIZE 16
-#define FS_AES_128_CBC_KEY_SIZE 16
-#define FS_AES_128_CTS_KEY_SIZE 16
-#define FS_AES_256_GCM_KEY_SIZE 32
-#define FS_AES_256_CBC_KEY_SIZE 32
-#define FS_AES_256_CTS_KEY_SIZE 32
-#define FS_AES_256_XTS_KEY_SIZE 64
-
-#define FS_KEY_DERIVATION_NONCE_SIZE 16
+#define FS_KEY_DERIVATION_NONCE_SIZE 16
/**
* Encryption context for inode
@@ -91,6 +83,10 @@ static inline bool fscrypt_valid_enc_modes(u32 contents_mode,
filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS)
return true;
+ if (contents_mode == FS_ENCRYPTION_MODE_SPECK128_256_XTS &&
+ filenames_mode == FS_ENCRYPTION_MODE_SPECK128_256_CTS)
+ return true;
+
return false;
}
@@ -106,6 +102,15 @@ extern int fscrypt_do_page_crypto(const struct inode *inode,
gfp_t gfp_flags);
extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx,
gfp_t gfp_flags);
+extern const struct dentry_operations fscrypt_d_ops;
+
+extern void __printf(3, 4) __cold
+fscrypt_msg(struct super_block *sb, const char *level, const char *fmt, ...);
+
+#define fscrypt_warn(sb, fmt, ...) \
+ fscrypt_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__)
+#define fscrypt_err(sb, fmt, ...) \
+ fscrypt_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__)
/* fname.c */
extern int fname_encrypt(struct inode *inode, const struct qstr *iname,
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index bec06490fb13..926e5df20ec3 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -39,8 +39,9 @@ int fscrypt_file_open(struct inode *inode, struct file *filp)
dir = dget_parent(file_dentry(filp));
if (IS_ENCRYPTED(d_inode(dir)) &&
!fscrypt_has_permitted_context(d_inode(dir), inode)) {
- pr_warn_ratelimited("fscrypt: inconsistent encryption contexts: %lu/%lu",
- d_inode(dir)->i_ino, inode->i_ino);
+ fscrypt_warn(inode->i_sb,
+ "inconsistent encryption contexts: %lu/%lu",
+ d_inode(dir)->i_ino, inode->i_ino);
err = -EPERM;
}
dput(dir);
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 05f5ee1f0705..e997ca51192f 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -19,17 +19,16 @@
static struct crypto_shash *essiv_hash_tfm;
-/**
- * derive_key_aes() - Derive a key using AES-128-ECB
- * @deriving_key: Encryption key used for derivation.
- * @source_key: Source key to which to apply derivation.
- * @derived_raw_key: Derived raw key.
+/*
+ * Key derivation function. This generates the derived key by encrypting the
+ * master key with AES-128-ECB using the inode's nonce as the AES key.
*
- * Return: Zero on success; non-zero otherwise.
+ * The master key must be at least as long as the derived key. If the master
+ * key is longer, then only the first 'derived_keysize' bytes are used.
*/
-static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
- const struct fscrypt_key *source_key,
- u8 derived_raw_key[FS_MAX_KEY_SIZE])
+static int derive_key_aes(const u8 *master_key,
+ const struct fscrypt_context *ctx,
+ u8 *derived_key, unsigned int derived_keysize)
{
int res = 0;
struct skcipher_request *req = NULL;
@@ -51,14 +50,13 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
crypto_req_done, &wait);
- res = crypto_skcipher_setkey(tfm, deriving_key,
- FS_AES_128_ECB_KEY_SIZE);
+ res = crypto_skcipher_setkey(tfm, ctx->nonce, sizeof(ctx->nonce));
if (res < 0)
goto out;
- sg_init_one(&src_sg, source_key->raw, source_key->size);
- sg_init_one(&dst_sg, derived_raw_key, source_key->size);
- skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size,
+ sg_init_one(&src_sg, master_key, derived_keysize);
+ sg_init_one(&dst_sg, derived_key, derived_keysize);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize,
NULL);
res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
out:
@@ -67,101 +65,147 @@ out:
return res;
}
-static int validate_user_key(struct fscrypt_info *crypt_info,
- struct fscrypt_context *ctx, u8 *raw_key,
- const char *prefix, int min_keysize)
+/*
+ * Search the current task's subscribed keyrings for a "logon" key with
+ * description prefix:descriptor, and if found acquire a read lock on it and
+ * return a pointer to its validated payload in *payload_ret.
+ */
+static struct key *
+find_and_lock_process_key(const char *prefix,
+ const u8 descriptor[FS_KEY_DESCRIPTOR_SIZE],
+ unsigned int min_keysize,
+ const struct fscrypt_key **payload_ret)
{
char *description;
- struct key *keyring_key;
- struct fscrypt_key *master_key;
+ struct key *key;
const struct user_key_payload *ukp;
- int res;
+ const struct fscrypt_key *payload;
description = kasprintf(GFP_NOFS, "%s%*phN", prefix,
- FS_KEY_DESCRIPTOR_SIZE,
- ctx->master_key_descriptor);
+ FS_KEY_DESCRIPTOR_SIZE, descriptor);
if (!description)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
- keyring_key = request_key(&key_type_logon, description, NULL);
+ key = request_key(&key_type_logon, description, NULL);
kfree(description);
- if (IS_ERR(keyring_key))
- return PTR_ERR(keyring_key);
- down_read(&keyring_key->sem);
-
- if (keyring_key->type != &key_type_logon) {
- printk_once(KERN_WARNING
- "%s: key type must be logon\n", __func__);
- res = -ENOKEY;
- goto out;
- }
- ukp = user_key_payload_locked(keyring_key);
- if (!ukp) {
- /* key was revoked before we acquired its semaphore */
- res = -EKEYREVOKED;
- goto out;
+ if (IS_ERR(key))
+ return key;
+
+ down_read(&key->sem);
+ ukp = user_key_payload_locked(key);
+
+ if (!ukp) /* was the key revoked before we acquired its semaphore? */
+ goto invalid;
+
+ payload = (const struct fscrypt_key *)ukp->data;
+
+ if (ukp->datalen != sizeof(struct fscrypt_key) ||
+ payload->size < 1 || payload->size > FS_MAX_KEY_SIZE) {
+ fscrypt_warn(NULL,
+ "key with description '%s' has invalid payload",
+ key->description);
+ goto invalid;
}
- if (ukp->datalen != sizeof(struct fscrypt_key)) {
- res = -EINVAL;
- goto out;
+
+ if (payload->size < min_keysize) {
+ fscrypt_warn(NULL,
+ "key with description '%s' is too short (got %u bytes, need %u+ bytes)",
+ key->description, payload->size, min_keysize);
+ goto invalid;
}
- master_key = (struct fscrypt_key *)ukp->data;
- BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE);
-
- if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE
- || master_key->size % AES_BLOCK_SIZE != 0) {
- printk_once(KERN_WARNING
- "%s: key size incorrect: %d\n",
- __func__, master_key->size);
- res = -ENOKEY;
- goto out;
+
+ *payload_ret = payload;
+ return key;
+
+invalid:
+ up_read(&key->sem);
+ key_put(key);
+ return ERR_PTR(-ENOKEY);
+}
+
+/* Find the master key, then derive the inode's actual encryption key */
+static int find_and_derive_key(const struct inode *inode,
+ const struct fscrypt_context *ctx,
+ u8 *derived_key, unsigned int derived_keysize)
+{
+ struct key *key;
+ const struct fscrypt_key *payload;
+ int err;
+
+ key = find_and_lock_process_key(FS_KEY_DESC_PREFIX,
+ ctx->master_key_descriptor,
+ derived_keysize, &payload);
+ if (key == ERR_PTR(-ENOKEY) && inode->i_sb->s_cop->key_prefix) {
+ key = find_and_lock_process_key(inode->i_sb->s_cop->key_prefix,
+ ctx->master_key_descriptor,
+ derived_keysize, &payload);
}
- res = derive_key_aes(ctx->nonce, master_key, raw_key);
-out:
- up_read(&keyring_key->sem);
- key_put(keyring_key);
- return res;
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+ err = derive_key_aes(payload->raw, ctx, derived_key, derived_keysize);
+ up_read(&key->sem);
+ key_put(key);
+ return err;
}
-static const struct {
+static struct fscrypt_mode {
+ const char *friendly_name;
const char *cipher_str;
int keysize;
+ bool logged_impl_name;
} available_modes[] = {
- [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)",
- FS_AES_256_XTS_KEY_SIZE },
- [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))",
- FS_AES_256_CTS_KEY_SIZE },
- [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)",
- FS_AES_128_CBC_KEY_SIZE },
- [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))",
- FS_AES_128_CTS_KEY_SIZE },
+ [FS_ENCRYPTION_MODE_AES_256_XTS] = {
+ .friendly_name = "AES-256-XTS",
+ .cipher_str = "xts(aes)",
+ .keysize = 64,
+ },
+ [FS_ENCRYPTION_MODE_AES_256_CTS] = {
+ .friendly_name = "AES-256-CTS-CBC",
+ .cipher_str = "cts(cbc(aes))",
+ .keysize = 32,
+ },
+ [FS_ENCRYPTION_MODE_AES_128_CBC] = {
+ .friendly_name = "AES-128-CBC",
+ .cipher_str = "cbc(aes)",
+ .keysize = 16,
+ },
+ [FS_ENCRYPTION_MODE_AES_128_CTS] = {
+ .friendly_name = "AES-128-CTS-CBC",
+ .cipher_str = "cts(cbc(aes))",
+ .keysize = 16,
+ },
+ [FS_ENCRYPTION_MODE_SPECK128_256_XTS] = {
+ .friendly_name = "Speck128/256-XTS",
+ .cipher_str = "xts(speck128)",
+ .keysize = 64,
+ },
+ [FS_ENCRYPTION_MODE_SPECK128_256_CTS] = {
+ .friendly_name = "Speck128/256-CTS-CBC",
+ .cipher_str = "cts(cbc(speck128))",
+ .keysize = 32,
+ },
};
-static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode,
- const char **cipher_str_ret, int *keysize_ret)
+static struct fscrypt_mode *
+select_encryption_mode(const struct fscrypt_info *ci, const struct inode *inode)
{
- u32 mode;
-
if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) {
- pr_warn_ratelimited("fscrypt: inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)\n",
- inode->i_ino,
- ci->ci_data_mode, ci->ci_filename_mode);
- return -EINVAL;
+ fscrypt_warn(inode->i_sb,
+ "inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)",
+ inode->i_ino, ci->ci_data_mode,
+ ci->ci_filename_mode);
+ return ERR_PTR(-EINVAL);
}
- if (S_ISREG(inode->i_mode)) {
- mode = ci->ci_data_mode;
- } else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) {
- mode = ci->ci_filename_mode;
- } else {
- WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n",
- inode->i_ino, (inode->i_mode & S_IFMT));
- return -EINVAL;
- }
+ if (S_ISREG(inode->i_mode))
+ return &available_modes[ci->ci_data_mode];
+
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ return &available_modes[ci->ci_filename_mode];
- *cipher_str_ret = available_modes[mode].cipher_str;
- *keysize_ret = available_modes[mode].keysize;
- return 0;
+ WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n",
+ inode->i_ino, (inode->i_mode & S_IFMT));
+ return ERR_PTR(-EINVAL);
}
static void put_crypt_info(struct fscrypt_info *ci)
@@ -184,8 +228,9 @@ static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt)
tfm = crypto_alloc_shash("sha256", 0, 0);
if (IS_ERR(tfm)) {
- pr_warn_ratelimited("fscrypt: error allocating SHA-256 transform: %ld\n",
- PTR_ERR(tfm));
+ fscrypt_warn(NULL,
+ "error allocating SHA-256 transform: %ld",
+ PTR_ERR(tfm));
return PTR_ERR(tfm);
}
prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm);
@@ -245,8 +290,7 @@ int fscrypt_get_encryption_info(struct inode *inode)
struct fscrypt_info *crypt_info;
struct fscrypt_context ctx;
struct crypto_skcipher *ctfm;
- const char *cipher_str;
- int keysize;
+ struct fscrypt_mode *mode;
u8 *raw_key = NULL;
int res;
@@ -290,57 +334,59 @@ int fscrypt_get_encryption_info(struct inode *inode)
memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
sizeof(crypt_info->ci_master_key));
- res = determine_cipher_type(crypt_info, inode, &cipher_str, &keysize);
- if (res)
+ mode = select_encryption_mode(crypt_info, inode);
+ if (IS_ERR(mode)) {
+ res = PTR_ERR(mode);
goto out;
+ }
/*
* This cannot be a stack buffer because it is passed to the scatterlist
* crypto API as part of key derivation.
*/
res = -ENOMEM;
- raw_key = kmalloc(FS_MAX_KEY_SIZE, GFP_NOFS);
+ raw_key = kmalloc(mode->keysize, GFP_NOFS);
if (!raw_key)
goto out;
- res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX,
- keysize);
- if (res && inode->i_sb->s_cop->key_prefix) {
- int res2 = validate_user_key(crypt_info, &ctx, raw_key,
- inode->i_sb->s_cop->key_prefix,
- keysize);
- if (res2) {
- if (res2 == -ENOKEY)
- res = -ENOKEY;
- goto out;
- }
- } else if (res) {
+ res = find_and_derive_key(inode, &ctx, raw_key, mode->keysize);
+ if (res)
goto out;
- }
- ctfm = crypto_alloc_skcipher(cipher_str, 0, 0);
- if (!ctfm || IS_ERR(ctfm)) {
- res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
- pr_debug("%s: error %d (inode %lu) allocating crypto tfm\n",
- __func__, res, inode->i_ino);
+
+ ctfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0);
+ if (IS_ERR(ctfm)) {
+ res = PTR_ERR(ctfm);
+ fscrypt_warn(inode->i_sb,
+ "error allocating '%s' transform for inode %lu: %d",
+ mode->cipher_str, inode->i_ino, res);
goto out;
}
+ if (unlikely(!mode->logged_impl_name)) {
+ /*
+ * fscrypt performance can vary greatly depending on which
+ * crypto algorithm implementation is used. Help people debug
+ * performance problems by logging the ->cra_driver_name the
+ * first time a mode is used. Note that multiple threads can
+ * race here, but it doesn't really matter.
+ */
+ mode->logged_impl_name = true;
+ pr_info("fscrypt: %s using implementation \"%s\"\n",
+ mode->friendly_name,
+ crypto_skcipher_alg(ctfm)->base.cra_driver_name);
+ }
crypt_info->ci_ctfm = ctfm;
- crypto_skcipher_clear_flags(ctfm, ~0);
crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY);
- /*
- * if the provided key is longer than keysize, we use the first
- * keysize bytes of the derived key only
- */
- res = crypto_skcipher_setkey(ctfm, raw_key, keysize);
+ res = crypto_skcipher_setkey(ctfm, raw_key, mode->keysize);
if (res)
goto out;
if (S_ISREG(inode->i_mode) &&
crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) {
- res = init_essiv_generator(crypt_info, raw_key, keysize);
+ res = init_essiv_generator(crypt_info, raw_key, mode->keysize);
if (res) {
- pr_debug("%s: error %d (inode %lu) allocating essiv tfm\n",
- __func__, res, inode->i_ino);
+ fscrypt_warn(inode->i_sb,
+ "error initializing ESSIV generator for inode %lu: %d",
+ inode->i_ino, res);
goto out;
}
}
diff --git a/fs/dax.c b/fs/dax.c
index 0276df90e86c..aa86d9f971a4 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -73,16 +73,15 @@ fs_initcall(init_dax_wait_table);
#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
-static unsigned long dax_radix_sector(void *entry)
+static unsigned long dax_radix_pfn(void *entry)
{
return (unsigned long)entry >> RADIX_DAX_SHIFT;
}
-static void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
+static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags)
{
return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
- ((unsigned long)sector << RADIX_DAX_SHIFT) |
- RADIX_DAX_ENTRY_LOCK);
+ (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK);
}
static unsigned int dax_radix_order(void *entry)
@@ -159,11 +158,9 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo
}
/*
- * We do not necessarily hold the mapping->tree_lock when we call this
- * function so it is possible that 'entry' is no longer a valid item in the
- * radix tree. This is okay because all we really need to do is to find the
- * correct waitqueue where tasks might be waiting for that old 'entry' and
- * wake them.
+ * @entry may no longer be the entry at the index in the mapping.
+ * The important information it's conveying is whether the entry at
+ * this index used to be a PMD entry.
*/
static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
pgoff_t index, void *entry, bool wake_all)
@@ -175,7 +172,7 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
/*
* Checking for locked entry and prepare_to_wait_exclusive() happens
- * under mapping->tree_lock, ditto for entry handling in our callers.
+ * under the i_pages lock, ditto for entry handling in our callers.
* So at this point all tasks that could have seen our entry locked
* must be in the waitqueue and the following check will see them.
*/
@@ -184,41 +181,39 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
}
/*
- * Check whether the given slot is locked. The function must be called with
- * mapping->tree_lock held
+ * Check whether the given slot is locked. Must be called with the i_pages
+ * lock held.
*/
static inline int slot_locked(struct address_space *mapping, void **slot)
{
unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+ radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
return entry & RADIX_DAX_ENTRY_LOCK;
}
/*
- * Mark the given slot is locked. The function must be called with
- * mapping->tree_lock held
+ * Mark the given slot as locked. Must be called with the i_pages lock held.
*/
static inline void *lock_slot(struct address_space *mapping, void **slot)
{
unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+ radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
entry |= RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
+ radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
return (void *)entry;
}
/*
- * Mark the given slot is unlocked. The function must be called with
- * mapping->tree_lock held
+ * Mark the given slot as unlocked. Must be called with the i_pages lock held.
*/
static inline void *unlock_slot(struct address_space *mapping, void **slot)
{
unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+ radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
+ radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
return (void *)entry;
}
@@ -229,7 +224,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot)
* put_locked_mapping_entry() when he locked the entry and now wants to
* unlock it.
*
- * The function must be called with mapping->tree_lock held.
+ * Must be called with the i_pages lock held.
*/
static void *get_unlocked_mapping_entry(struct address_space *mapping,
pgoff_t index, void ***slotp)
@@ -242,7 +237,7 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
ewait.wait.func = wake_exceptional_entry_func;
for (;;) {
- entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
+ entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
&slot);
if (!entry ||
WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
@@ -255,10 +250,10 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
prepare_to_wait_exclusive(wq, &ewait.wait,
TASK_UNINTERRUPTIBLE);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
schedule();
finish_wait(wq, &ewait.wait);
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
}
}
@@ -267,15 +262,15 @@ static void dax_unlock_mapping_entry(struct address_space *mapping,
{
void *entry, **slot;
- spin_lock_irq(&mapping->tree_lock);
- entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
+ xa_lock_irq(&mapping->i_pages);
+ entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot);
if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
!slot_locked(mapping, slot))) {
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
return;
}
unlock_slot(mapping, slot);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}
@@ -299,6 +294,63 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,
dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}
+static unsigned long dax_entry_size(void *entry)
+{
+ if (dax_is_zero_entry(entry))
+ return 0;
+ else if (dax_is_empty_entry(entry))
+ return 0;
+ else if (dax_is_pmd_entry(entry))
+ return PMD_SIZE;
+ else
+ return PAGE_SIZE;
+}
+
+static unsigned long dax_radix_end_pfn(void *entry)
+{
+ return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
+}
+
+/*
+ * Iterate through all mapped pfns represented by an entry, i.e. skip
+ * 'empty' and 'zero' entries.
+ */
+#define for_each_mapped_pfn(entry, pfn) \
+ for (pfn = dax_radix_pfn(entry); \
+ pfn < dax_radix_end_pfn(entry); pfn++)
+
+static void dax_associate_entry(void *entry, struct address_space *mapping)
+{
+ unsigned long pfn;
+
+ if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+ return;
+
+ for_each_mapped_pfn(entry, pfn) {
+ struct page *page = pfn_to_page(pfn);
+
+ WARN_ON_ONCE(page->mapping);
+ page->mapping = mapping;
+ }
+}
+
+static void dax_disassociate_entry(void *entry, struct address_space *mapping,
+ bool trunc)
+{
+ unsigned long pfn;
+
+ if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+ return;
+
+ for_each_mapped_pfn(entry, pfn) {
+ struct page *page = pfn_to_page(pfn);
+
+ WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
+ WARN_ON_ONCE(page->mapping && page->mapping != mapping);
+ page->mapping = NULL;
+ }
+}
+
/*
* Find radix tree entry at given index. If it points to an exceptional entry,
* return it with the radix tree entry locked. If the radix tree doesn't
@@ -332,7 +384,7 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
void *entry, **slot;
restart:
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
entry = get_unlocked_mapping_entry(mapping, index, &slot);
if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
@@ -364,12 +416,12 @@ restart:
if (pmd_downgrade) {
/*
* Make sure 'entry' remains valid while we drop
- * mapping->tree_lock.
+ * the i_pages lock.
*/
entry = lock_slot(mapping, slot);
}
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
/*
* Besides huge zero pages the only other thing that gets
* downgraded are empty entries which don't need to be
@@ -386,26 +438,27 @@ restart:
put_locked_mapping_entry(mapping, index);
return ERR_PTR(err);
}
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
if (!entry) {
/*
- * We needed to drop the page_tree lock while calling
+ * We needed to drop the i_pages lock while calling
* radix_tree_preload() and we didn't have an entry to
* lock. See if another thread inserted an entry at
* our index during this time.
*/
- entry = __radix_tree_lookup(&mapping->page_tree, index,
+ entry = __radix_tree_lookup(&mapping->i_pages, index,
NULL, &slot);
if (entry) {
radix_tree_preload_end();
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
goto restart;
}
}
if (pmd_downgrade) {
- radix_tree_delete(&mapping->page_tree, index);
+ dax_disassociate_entry(entry, mapping, false);
+ radix_tree_delete(&mapping->i_pages, index);
mapping->nrexceptional--;
dax_wake_mapping_entry_waiter(mapping, index, entry,
true);
@@ -413,11 +466,11 @@ restart:
entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
- err = __radix_tree_insert(&mapping->page_tree, index,
+ err = __radix_tree_insert(&mapping->i_pages, index,
dax_radix_order(entry), entry);
radix_tree_preload_end();
if (err) {
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
/*
* Our insertion of a DAX entry failed, most likely
* because we were inserting a PMD entry and it
@@ -430,12 +483,12 @@ restart:
}
/* Good, we have inserted empty locked entry into the tree. */
mapping->nrexceptional++;
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
return entry;
}
entry = lock_slot(mapping, slot);
out_unlock:
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
return entry;
}
@@ -444,22 +497,23 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
{
int ret = 0;
void *entry;
- struct radix_tree_root *page_tree = &mapping->page_tree;
+ struct radix_tree_root *pages = &mapping->i_pages;
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(pages);
entry = get_unlocked_mapping_entry(mapping, index, NULL);
if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
goto out;
if (!trunc &&
- (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
- radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
+ (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) ||
+ radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)))
goto out;
- radix_tree_delete(page_tree, index);
+ dax_disassociate_entry(entry, mapping, trunc);
+ radix_tree_delete(pages, index);
mapping->nrexceptional--;
ret = 1;
out:
put_unlocked_mapping_entry(mapping, index, entry);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(pages);
return ret;
}
/*
@@ -526,12 +580,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
*/
static void *dax_insert_mapping_entry(struct address_space *mapping,
struct vm_fault *vmf,
- void *entry, sector_t sector,
+ void *entry, pfn_t pfn_t,
unsigned long flags, bool dirty)
{
- struct radix_tree_root *page_tree = &mapping->page_tree;
- void *new_entry;
+ struct radix_tree_root *pages = &mapping->i_pages;
+ unsigned long pfn = pfn_t_to_pfn(pfn_t);
pgoff_t index = vmf->pgoff;
+ void *new_entry;
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -545,8 +600,12 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
unmap_mapping_pages(mapping, vmf->pgoff, 1, false);
}
- spin_lock_irq(&mapping->tree_lock);
- new_entry = dax_radix_locked_entry(sector, flags);
+ xa_lock_irq(pages);
+ new_entry = dax_radix_locked_entry(pfn, flags);
+ if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
+ dax_disassociate_entry(entry, mapping, false);
+ dax_associate_entry(new_entry, mapping);
+ }
if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
/*
@@ -561,17 +620,17 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
void **slot;
void *ret;
- ret = __radix_tree_lookup(page_tree, index, &node, &slot);
+ ret = __radix_tree_lookup(pages, index, &node, &slot);
WARN_ON_ONCE(ret != entry);
- __radix_tree_replace(page_tree, node, slot,
+ __radix_tree_replace(pages, node, slot,
new_entry, NULL);
entry = new_entry;
}
if (dirty)
- radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
+ radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(pages);
return entry;
}
@@ -618,7 +677,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
* downgrading page table protection not changing it to point
* to a new page.
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
if (pmdp) {
#ifdef CONFIG_FS_DAX_PMD
@@ -657,17 +716,14 @@ unlock_pte:
i_mmap_unlock_read(mapping);
}
-static int dax_writeback_one(struct block_device *bdev,
- struct dax_device *dax_dev, struct address_space *mapping,
- pgoff_t index, void *entry)
+static int dax_writeback_one(struct dax_device *dax_dev,
+ struct address_space *mapping, pgoff_t index, void *entry)
{
- struct radix_tree_root *page_tree = &mapping->page_tree;
- void *entry2, **slot, *kaddr;
- long ret = 0, id;
- sector_t sector;
- pgoff_t pgoff;
+ struct radix_tree_root *pages = &mapping->i_pages;
+ void *entry2, **slot;
+ unsigned long pfn;
+ long ret = 0;
size_t size;
- pfn_t pfn;
/*
* A page got tagged dirty in DAX mapping? Something is seriously
@@ -676,17 +732,17 @@ static int dax_writeback_one(struct block_device *bdev,
if (WARN_ON(!radix_tree_exceptional_entry(entry)))
return -EIO;
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(pages);
entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
/* Entry got punched out / reallocated? */
if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
goto put_unlocked;
/*
* Entry got reallocated elsewhere? No need to writeback. We have to
- * compare sectors as we must not bail out due to difference in lockbit
+ * compare pfns as we must not bail out due to difference in lockbit
* or entry type.
*/
- if (dax_radix_sector(entry2) != dax_radix_sector(entry))
+ if (dax_radix_pfn(entry2) != dax_radix_pfn(entry))
goto put_unlocked;
if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
dax_is_zero_entry(entry))) {
@@ -695,7 +751,7 @@ static int dax_writeback_one(struct block_device *bdev,
}
/* Another fsync thread may have already written back this entry */
- if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+ if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))
goto put_unlocked;
/* Lock the entry to serialize with page faults */
entry = lock_slot(mapping, slot);
@@ -703,60 +759,40 @@ static int dax_writeback_one(struct block_device *bdev,
* We can clear the tag now but we have to be careful so that concurrent
* dax_writeback_one() calls for the same index cannot finish before we
* actually flush the caches. This is achieved as the calls will look
- * at the entry only under tree_lock and once they do that they will
- * see the entry locked and wait for it to unlock.
+ * at the entry only under the i_pages lock and once they do that
+ * they will see the entry locked and wait for it to unlock.
*/
- radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
- spin_unlock_irq(&mapping->tree_lock);
+ radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE);
+ xa_unlock_irq(pages);
/*
* Even if dax_writeback_mapping_range() was given a wbc->range_start
* in the middle of a PMD, the 'index' we are given will be aligned to
- * the start index of the PMD, as will the sector we pull from
- * 'entry'. This allows us to flush for PMD_SIZE and not have to
- * worry about partial PMD writebacks.
+ * the start index of the PMD, as will the pfn we pull from 'entry'.
+ * This allows us to flush for PMD_SIZE and not have to worry about
+ * partial PMD writebacks.
*/
- sector = dax_radix_sector(entry);
+ pfn = dax_radix_pfn(entry);
size = PAGE_SIZE << dax_radix_order(entry);
- id = dax_read_lock();
- ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
- if (ret)
- goto dax_unlock;
-
- /*
- * dax_direct_access() may sleep, so cannot hold tree_lock over
- * its invocation.
- */
- ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn);
- if (ret < 0)
- goto dax_unlock;
-
- if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) {
- ret = -EIO;
- goto dax_unlock;
- }
-
- dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
- dax_flush(dax_dev, kaddr, size);
+ dax_mapping_entry_mkclean(mapping, index, pfn);
+ dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size);
/*
* After we have flushed the cache, we can clear the dirty tag. There
* cannot be new dirty data in the pfn after the flush has completed as
* the pfn mappings are writeprotected and fault waits for mapping
* entry lock.
*/
- spin_lock_irq(&mapping->tree_lock);
- radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_lock_irq(pages);
+ radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY);
+ xa_unlock_irq(pages);
trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
- dax_unlock:
- dax_read_unlock(id);
put_locked_mapping_entry(mapping, index);
return ret;
put_unlocked:
put_unlocked_mapping_entry(mapping, index, entry2);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(pages);
return ret;
}
@@ -808,8 +844,8 @@ int dax_writeback_mapping_range(struct address_space *mapping,
break;
}
- ret = dax_writeback_one(bdev, dax_dev, mapping,
- indices[i], pvec.pages[i]);
+ ret = dax_writeback_one(dax_dev, mapping, indices[i],
+ pvec.pages[i]);
if (ret < 0) {
mapping_set_error(mapping, ret);
goto out;
@@ -877,6 +913,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
int ret = VM_FAULT_NOPAGE;
struct page *zero_page;
void *entry2;
+ pfn_t pfn;
zero_page = ZERO_PAGE(0);
if (unlikely(!zero_page)) {
@@ -884,14 +921,15 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
goto out;
}
- entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+ pfn = page_to_pfn_t(zero_page);
+ entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
RADIX_DAX_ZERO_PAGE, false);
if (IS_ERR(entry2)) {
ret = VM_FAULT_SIGBUS;
goto out;
}
- vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));
+ vm_insert_mixed(vmf->vma, vaddr, pfn);
out:
trace_dax_load_hole(inode, vmf, ret);
return ret;
@@ -1200,8 +1238,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
if (error < 0)
goto error_finish_iomap;
- entry = dax_insert_mapping_entry(mapping, vmf, entry,
- dax_iomap_sector(&iomap, pos),
+ entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
0, write && !sync);
if (IS_ERR(entry)) {
error = PTR_ERR(entry);
@@ -1280,13 +1317,15 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
void *ret = NULL;
spinlock_t *ptl;
pmd_t pmd_entry;
+ pfn_t pfn;
zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
if (unlikely(!zero_page))
goto fallback;
- ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+ pfn = page_to_pfn_t(zero_page);
+ ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
if (IS_ERR(ret))
goto fallback;
@@ -1409,8 +1448,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
if (error < 0)
goto finish_iomap;
- entry = dax_insert_mapping_entry(mapping, vmf, entry,
- dax_iomap_sector(&iomap, pos),
+ entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
RADIX_DAX_PMD, write && !sync);
if (IS_ERR(entry))
goto finish_iomap;
@@ -1524,21 +1562,21 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
pgoff_t index = vmf->pgoff;
int vmf_ret, error;
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
entry = get_unlocked_mapping_entry(mapping, index, &slot);
/* Did we race with someone splitting entry or so? */
if (!entry ||
(pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
(pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
put_unlocked_mapping_entry(mapping, index, entry);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
- radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
+ radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY);
entry = lock_slot(mapping, slot);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
switch (pe_size) {
case PE_SIZE_PTE:
error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
diff --git a/fs/dcache.c b/fs/dcache.c
index 593079176123..0e8e5de3c48a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -257,11 +257,25 @@ static void __d_free(struct rcu_head *head)
kmem_cache_free(dentry_cache, dentry);
}
+static void __d_free_external_name(struct rcu_head *head)
+{
+ struct external_name *name = container_of(head, struct external_name,
+ u.head);
+
+ mod_node_page_state(page_pgdat(virt_to_page(name)),
+ NR_INDIRECTLY_RECLAIMABLE_BYTES,
+ -ksize(name));
+
+ kfree(name);
+}
+
static void __d_free_external(struct rcu_head *head)
{
struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
- kfree(external_name(dentry));
- kmem_cache_free(dentry_cache, dentry);
+
+ __d_free_external_name(&external_name(dentry)->u.head);
+
+ kmem_cache_free(dentry_cache, dentry);
}
static inline int dname_external(const struct dentry *dentry)
@@ -291,7 +305,7 @@ void release_dentry_name_snapshot(struct name_snapshot *name)
struct external_name *p;
p = container_of(name->name, struct external_name, name[0]);
if (unlikely(atomic_dec_and_test(&p->u.count)))
- kfree_rcu(p, u.head);
+ call_rcu(&p->u.head, __d_free_external_name);
}
}
EXPORT_SYMBOL(release_dentry_name_snapshot);
@@ -566,6 +580,7 @@ static void __dentry_kill(struct dentry *dentry)
spin_unlock(&dentry->d_lock);
if (likely(can_free))
dentry_free(dentry);
+ cond_resched();
}
static struct dentry *__lock_parent(struct dentry *dentry)
@@ -813,30 +828,24 @@ static inline bool fast_dput(struct dentry *dentry)
*/
void dput(struct dentry *dentry)
{
- if (unlikely(!dentry))
- return;
+ while (dentry) {
+ might_sleep();
-repeat:
- might_sleep();
+ rcu_read_lock();
+ if (likely(fast_dput(dentry))) {
+ rcu_read_unlock();
+ return;
+ }
- rcu_read_lock();
- if (likely(fast_dput(dentry))) {
+ /* Slow case: now with the dentry lock held */
rcu_read_unlock();
- return;
- }
- /* Slow case: now with the dentry lock held */
- rcu_read_unlock();
+ if (likely(retain_dentry(dentry))) {
+ spin_unlock(&dentry->d_lock);
+ return;
+ }
- if (likely(retain_dentry(dentry))) {
- spin_unlock(&dentry->d_lock);
- return;
- }
-
- dentry = dentry_kill(dentry);
- if (dentry) {
- cond_resched();
- goto repeat;
+ dentry = dentry_kill(dentry);
}
}
EXPORT_SYMBOL(dput);
@@ -893,6 +902,35 @@ repeat:
}
EXPORT_SYMBOL(dget_parent);
+static struct dentry * __d_find_any_alias(struct inode *inode)
+{
+ struct dentry *alias;
+
+ if (hlist_empty(&inode->i_dentry))
+ return NULL;
+ alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
+ __dget(alias);
+ return alias;
+}
+
+/**
+ * d_find_any_alias - find any alias for a given inode
+ * @inode: inode to find an alias for
+ *
+ * If any aliases exist for the given inode, take and return a
+ * reference for one of them. If no aliases exist, return %NULL.
+ */
+struct dentry *d_find_any_alias(struct inode *inode)
+{
+ struct dentry *de;
+
+ spin_lock(&inode->i_lock);
+ de = __d_find_any_alias(inode);
+ spin_unlock(&inode->i_lock);
+ return de;
+}
+EXPORT_SYMBOL(d_find_any_alias);
+
/**
* d_find_alias - grab a hashed alias of inode
* @inode: inode in question
@@ -909,34 +947,19 @@ EXPORT_SYMBOL(dget_parent);
*/
static struct dentry *__d_find_alias(struct inode *inode)
{
- struct dentry *alias, *discon_alias;
+ struct dentry *alias;
+
+ if (S_ISDIR(inode->i_mode))
+ return __d_find_any_alias(inode);
-again:
- discon_alias = NULL;
hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
spin_lock(&alias->d_lock);
- if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
- if (IS_ROOT(alias) &&
- (alias->d_flags & DCACHE_DISCONNECTED)) {
- discon_alias = alias;
- } else {
- __dget_dlock(alias);
- spin_unlock(&alias->d_lock);
- return alias;
- }
- }
- spin_unlock(&alias->d_lock);
- }
- if (discon_alias) {
- alias = discon_alias;
- spin_lock(&alias->d_lock);
- if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
+ if (!d_unhashed(alias)) {
__dget_dlock(alias);
spin_unlock(&alias->d_lock);
return alias;
}
spin_unlock(&alias->d_lock);
- goto again;
}
return NULL;
}
@@ -1191,7 +1214,6 @@ void shrink_dcache_sb(struct super_block *sb)
this_cpu_sub(nr_dentry_unused, freed);
shrink_dentry_list(&dispose);
- cond_resched();
} while (list_lru_count(&sb->s_dentry_lru) > 0);
}
EXPORT_SYMBOL(shrink_dcache_sb);
@@ -1215,13 +1237,11 @@ enum d_walk_ret {
* @parent: start of walk
* @data: data passed to @enter() and @finish()
* @enter: callback when first entering the dentry
- * @finish: callback when successfully finished the walk
*
- * The @enter() and @finish() callbacks are called with d_lock held.
+ * The @enter() callbacks are called with d_lock held.
*/
static void d_walk(struct dentry *parent, void *data,
- enum d_walk_ret (*enter)(void *, struct dentry *),
- void (*finish)(void *))
+ enum d_walk_ret (*enter)(void *, struct dentry *))
{
struct dentry *this_parent;
struct list_head *next;
@@ -1310,8 +1330,6 @@ ascend:
if (need_seqretry(&rename_lock, seq))
goto rename_retry;
rcu_read_unlock();
- if (finish)
- finish(data);
out_unlock:
spin_unlock(&this_parent->d_lock);
@@ -1360,7 +1378,7 @@ int path_has_submounts(const struct path *parent)
struct check_mount data = { .mnt = parent->mnt, .mounted = 0 };
read_seqlock_excl(&mount_lock);
- d_walk(parent->dentry, &data, path_check_mount, NULL);
+ d_walk(parent->dentry, &data, path_check_mount);
read_sequnlock_excl(&mount_lock);
return data.mounted;
@@ -1468,12 +1486,16 @@ void shrink_dcache_parent(struct dentry *parent)
data.start = parent;
data.found = 0;
- d_walk(parent, &data, select_collect, NULL);
- if (!data.found)
- break;
+ d_walk(parent, &data, select_collect);
+
+ if (!list_empty(&data.dispose)) {
+ shrink_dentry_list(&data.dispose);
+ continue;
+ }
- shrink_dentry_list(&data.dispose);
cond_resched();
+ if (!data.found)
+ break;
}
}
EXPORT_SYMBOL(shrink_dcache_parent);
@@ -1504,7 +1526,7 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
static void do_one_tree(struct dentry *dentry)
{
shrink_dcache_parent(dentry);
- d_walk(dentry, dentry, umount_check, NULL);
+ d_walk(dentry, dentry, umount_check);
d_drop(dentry);
dput(dentry);
}
@@ -1528,79 +1550,48 @@ void shrink_dcache_for_umount(struct super_block *sb)
}
}
-struct detach_data {
- struct select_data select;
- struct dentry *mountpoint;
-};
-static enum d_walk_ret detach_and_collect(void *_data, struct dentry *dentry)
+static enum d_walk_ret find_submount(void *_data, struct dentry *dentry)
{
- struct detach_data *data = _data;
-
+ struct dentry **victim = _data;
if (d_mountpoint(dentry)) {
__dget_dlock(dentry);
- data->mountpoint = dentry;
+ *victim = dentry;
return D_WALK_QUIT;
}
-
- return select_collect(&data->select, dentry);
-}
-
-static void check_and_drop(void *_data)
-{
- struct detach_data *data = _data;
-
- if (!data->mountpoint && list_empty(&data->select.dispose))
- __d_drop(data->select.start);
+ return D_WALK_CONTINUE;
}
/**
* d_invalidate - detach submounts, prune dcache, and drop
* @dentry: dentry to invalidate (aka detach, prune and drop)
- *
- * no dcache lock.
- *
- * The final d_drop is done as an atomic operation relative to
- * rename_lock ensuring there are no races with d_set_mounted. This
- * ensures there are no unhashed dentries on the path to a mountpoint.
*/
void d_invalidate(struct dentry *dentry)
{
- /*
- * If it's already been dropped, return OK.
- */
+ bool had_submounts = false;
spin_lock(&dentry->d_lock);
if (d_unhashed(dentry)) {
spin_unlock(&dentry->d_lock);
return;
}
+ __d_drop(dentry);
spin_unlock(&dentry->d_lock);
/* Negative dentries can be dropped without further checks */
- if (!dentry->d_inode) {
- d_drop(dentry);
+ if (!dentry->d_inode)
return;
- }
+ shrink_dcache_parent(dentry);
for (;;) {
- struct detach_data data;
-
- data.mountpoint = NULL;
- INIT_LIST_HEAD(&data.select.dispose);
- data.select.start = dentry;
- data.select.found = 0;
-
- d_walk(dentry, &data, detach_and_collect, check_and_drop);
-
- if (!list_empty(&data.select.dispose))
- shrink_dentry_list(&data.select.dispose);
- else if (!data.mountpoint)
+ struct dentry *victim = NULL;
+ d_walk(dentry, &victim, find_submount);
+ if (!victim) {
+ if (had_submounts)
+ shrink_dcache_parent(dentry);
return;
-
- if (data.mountpoint) {
- detach_mounts(data.mountpoint);
- dput(data.mountpoint);
}
- cond_resched();
+ had_submounts = true;
+ detach_mounts(victim);
+ dput(victim);
}
}
EXPORT_SYMBOL(d_invalidate);
@@ -1617,6 +1608,7 @@ EXPORT_SYMBOL(d_invalidate);
struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
{
+ struct external_name *ext = NULL;
struct dentry *dentry;
char *dname;
int err;
@@ -1637,14 +1629,14 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dname = dentry->d_iname;
} else if (name->len > DNAME_INLINE_LEN-1) {
size_t size = offsetof(struct external_name, name[1]);
- struct external_name *p = kmalloc(size + name->len,
- GFP_KERNEL_ACCOUNT);
- if (!p) {
+
+ ext = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT);
+ if (!ext) {
kmem_cache_free(dentry_cache, dentry);
return NULL;
}
- atomic_set(&p->u.count, 1);
- dname = p->name;
+ atomic_set(&ext->u.count, 1);
+ dname = ext->name;
} else {
dname = dentry->d_iname;
}
@@ -1683,6 +1675,12 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
}
}
+ if (unlikely(ext)) {
+ pg_data_t *pgdat = page_pgdat(virt_to_page(ext));
+ mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES,
+ ksize(ext));
+ }
+
this_cpu_inc(nr_dentry);
return dentry;
@@ -1879,6 +1877,28 @@ void d_instantiate(struct dentry *entry, struct inode * inode)
}
EXPORT_SYMBOL(d_instantiate);
+/*
+ * This should be equivalent to d_instantiate() + unlock_new_inode(),
+ * with lockdep-related part of unlock_new_inode() done before
+ * anything else. Use that instead of open-coding d_instantiate()/
+ * unlock_new_inode() combinations.
+ */
+void d_instantiate_new(struct dentry *entry, struct inode *inode)
+{
+ BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
+ BUG_ON(!inode);
+ lockdep_annotate_inode_mutex_key(inode);
+ security_d_instantiate(entry, inode);
+ spin_lock(&inode->i_lock);
+ __d_instantiate(entry, inode);
+ WARN_ON(!(inode->i_state & I_NEW));
+ inode->i_state &= ~I_NEW;
+ smp_mb();
+ wake_up_bit(&inode->i_state, __I_NEW);
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL(d_instantiate_new);
+
/**
* d_instantiate_no_diralias - instantiate a non-aliased dentry
* @entry: dentry to complete
@@ -1921,35 +1941,6 @@ struct dentry *d_make_root(struct inode *root_inode)
}
EXPORT_SYMBOL(d_make_root);
-static struct dentry * __d_find_any_alias(struct inode *inode)
-{
- struct dentry *alias;
-
- if (hlist_empty(&inode->i_dentry))
- return NULL;
- alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
- __dget(alias);
- return alias;
-}
-
-/**
- * d_find_any_alias - find any alias for a given inode
- * @inode: inode to find an alias for
- *
- * If any aliases exist for the given inode, take and return a
- * reference for one of them. If no aliases exist, return %NULL.
- */
-struct dentry *d_find_any_alias(struct inode *inode)
-{
- struct dentry *de;
-
- spin_lock(&inode->i_lock);
- de = __d_find_any_alias(inode);
- spin_unlock(&inode->i_lock);
- return de;
-}
-EXPORT_SYMBOL(d_find_any_alias);
-
static struct dentry *__d_instantiate_anon(struct dentry *dentry,
struct inode *inode,
bool disconnected)
@@ -2770,7 +2761,7 @@ static void copy_name(struct dentry *dentry, struct dentry *target)
dentry->d_name.hash_len = target->d_name.hash_len;
}
if (old_name && likely(atomic_dec_and_test(&old_name->u.count)))
- kfree_rcu(old_name, u.head);
+ call_rcu(&old_name->u.head, __d_free_external_name);
}
/*
@@ -3092,7 +3083,7 @@ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
void d_genocide(struct dentry *parent)
{
- d_walk(parent, parent, d_genocide_kill, NULL);
+ d_walk(parent, parent, d_genocide_kill);
}
EXPORT_SYMBOL(d_genocide);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 1f99678ff5d3..4fce1da7db23 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -796,19 +796,13 @@ EXPORT_SYMBOL_GPL(debugfs_read_file_bool);
ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
size_t count, loff_t *ppos)
{
- char buf[32];
- size_t buf_size;
bool bv;
int r;
bool *val = file->private_data;
struct dentry *dentry = F_DENTRY(file);
- buf_size = min(count, (sizeof(buf)-1));
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
-
- buf[buf_size] = '\0';
- if (strtobool(buf, &bv) == 0) {
+ r = kstrtobool_from_user(user_buf, count, &bv);
+ if (!r) {
r = debugfs_file_get(dentry);
if (unlikely(r))
return r;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 13b01351dd1c..a913b12fc7f8 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -512,7 +512,9 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
if (unlikely(!inode))
return failed_creating(dentry);
- inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ if (!parent)
+ parent = debugfs_mount->mnt_root;
+ inode->i_mode = S_IFDIR | ((d_inode(parent)->i_mode & 0770));
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index ba12ee659673..093fb54cd316 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -432,8 +432,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
struct bio *bio;
/*
- * bio_alloc() is guaranteed to return a bio when called with
- * __GFP_RECLAIM and we request a valid number of vectors.
+ * bio_alloc() is guaranteed to return a bio when allowed to sleep and
+ * we request a valid number of vectors.
*/
bio = bio_alloc(GFP_KERNEL, nr_vecs);
@@ -1177,9 +1177,9 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
unsigned blkbits = i_blkbits;
unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL;
- size_t count = iov_iter_count(iter);
+ const size_t count = iov_iter_count(iter);
loff_t offset = iocb->ki_pos;
- loff_t end = offset + count;
+ const loff_t end = offset + count;
struct dio *dio;
struct dio_submit sdio = { 0, };
struct buffer_head map_bh = { 0, };
@@ -1200,7 +1200,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
}
/* watch out for a 0 len io from a tricksy fs */
- if (iov_iter_rw(iter) == READ && !iov_iter_count(iter))
+ if (iov_iter_rw(iter) == READ && !count)
return 0;
dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
@@ -1315,8 +1315,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
dio->should_dirty = (iter->type == ITER_IOVEC);
sdio.iter = iter;
- sdio.final_block_in_request =
- (offset + iov_iter_count(iter)) >> blkbits;
+ sdio.final_block_in_request = end >> blkbits;
/*
* In case of non-aligned buffers, we may need 2 more
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 5243989a60cc..a5e4a221435c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1037,6 +1037,7 @@ static void sctp_connect_to_sock(struct connection *con)
int result;
int addr_len;
struct socket *sock;
+ struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
if (con->nodeid == 0) {
log_print("attempt to connect sock 0 foiled");
@@ -1080,11 +1081,22 @@ static void sctp_connect_to_sock(struct connection *con)
log_print("connecting to %d", con->nodeid);
/* Turn off Nagle's algorithm */
- kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
+ kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one,
sizeof(one));
+ /*
+ * Make sock->ops->connect() function return in specified time,
+ * since O_NONBLOCK argument in connect() function does not work here,
+ * then, we should restore the default value of this attribute.
+ */
+ kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+ sizeof(tv));
result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len,
- O_NONBLOCK);
+ 0);
+ memset(&tv, 0, sizeof(tv));
+ kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+ sizeof(tv));
+
if (result == -EINPROGRESS)
result = 0;
if (result == 0)
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 846ca150d52e..4dd842f72846 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1997,6 +1997,16 @@ out:
return rc;
}
+static bool is_dot_dotdot(const char *name, size_t name_size)
+{
+ if (name_size == 1 && name[0] == '.')
+ return true;
+ else if (name_size == 2 && name[0] == '.' && name[1] == '.')
+ return true;
+
+ return false;
+}
+
/**
* ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext
* @plaintext_name: The plaintext name
@@ -2021,13 +2031,21 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
size_t packet_size;
int rc = 0;
- if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
- && !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
- && (name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)
- && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
- ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) {
- const char *orig_name = name;
- size_t orig_name_size = name_size;
+ if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) &&
+ !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)) {
+ if (is_dot_dotdot(name, name_size)) {
+ rc = ecryptfs_copy_filename(plaintext_name,
+ plaintext_name_size,
+ name, name_size);
+ goto out;
+ }
+
+ if (name_size <= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE ||
+ strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
+ ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)) {
+ rc = -EINVAL;
+ goto out;
+ }
name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
@@ -2047,12 +2065,9 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
decoded_name,
decoded_name_size);
if (rc) {
- printk(KERN_INFO "%s: Could not parse tag 70 packet "
- "from filename; copying through filename "
- "as-is\n", __func__);
- rc = ecryptfs_copy_filename(plaintext_name,
- plaintext_name_size,
- orig_name, orig_name_size);
+ ecryptfs_printk(KERN_DEBUG,
+ "%s: Could not parse tag 70 packet from filename\n",
+ __func__);
goto out_free;
}
} else {
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index c74ed3ca3372..b76a9853325e 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -82,17 +82,28 @@ ecryptfs_filldir(struct dir_context *ctx, const char *lower_name,
buf->sb, lower_name,
lower_namelen);
if (rc) {
- printk(KERN_ERR "%s: Error attempting to decode and decrypt "
- "filename [%s]; rc = [%d]\n", __func__, lower_name,
- rc);
- goto out;
+ if (rc != -EINVAL) {
+ ecryptfs_printk(KERN_DEBUG,
+ "%s: Error attempting to decode and decrypt filename [%s]; rc = [%d]\n",
+ __func__, lower_name, rc);
+ return rc;
+ }
+
+ /* Mask -EINVAL errors as these are most likely due a plaintext
+ * filename present in the lower filesystem despite filename
+ * encryption being enabled. One unavoidable example would be
+ * the "lost+found" dentry in the root directory of an Ext4
+ * filesystem.
+ */
+ return 0;
}
+
buf->caller->pos = buf->ctx.pos;
rc = !dir_emit(buf->caller, name, name_size, ino, d_type);
kfree(name);
if (!rc)
buf->entries_written++;
-out:
+
return rc;
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 847904aa63a9..49121e5a8de2 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -283,8 +283,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
iget_failed(ecryptfs_inode);
goto out;
}
- unlock_new_inode(ecryptfs_inode);
- d_instantiate(ecryptfs_dentry, ecryptfs_inode);
+ d_instantiate_new(ecryptfs_dentry, ecryptfs_inode);
out:
return rc;
}
@@ -395,8 +394,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
mount_crypt_stat = &ecryptfs_superblock_to_private(
ecryptfs_dentry->d_sb)->mount_crypt_stat;
- if (mount_crypt_stat
- && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) {
+ if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
rc = ecryptfs_encrypt_and_encode_filename(
&encrypted_and_encoded_name, &len,
mount_crypt_stat, name, len);
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index c89a58cfc991..e74fe84d0886 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1880,7 +1880,7 @@ find_next_matching_auth_tok:
candidate_auth_tok = &auth_tok_list_item->auth_tok;
if (unlikely(ecryptfs_verbosity > 0)) {
ecryptfs_printk(KERN_DEBUG,
- "Considering cadidate auth tok:\n");
+ "Considering candidate auth tok:\n");
ecryptfs_dump_auth_tok(candidate_auth_tok);
}
rc = ecryptfs_get_auth_tok_sig(&candidate_auth_tok_sig,
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 08d3bd602f73..61c9514da5e9 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -101,14 +101,20 @@ static int eventfd_release(struct inode *inode, struct file *file)
return 0;
}
-static __poll_t eventfd_poll(struct file *file, poll_table *wait)
+static struct wait_queue_head *
+eventfd_get_poll_head(struct file *file, __poll_t events)
+{
+ struct eventfd_ctx *ctx = file->private_data;
+
+ return &ctx->wqh;
+}
+
+static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask)
{
struct eventfd_ctx *ctx = file->private_data;
__poll_t events = 0;
u64 count;
- poll_wait(file, &ctx->wqh, wait);
-
/*
* All writes to ctx->count occur within ctx->wqh.lock. This read
* can be done outside ctx->wqh.lock because we know that poll_wait
@@ -305,7 +311,8 @@ static const struct file_operations eventfd_fops = {
.show_fdinfo = eventfd_show_fdinfo,
#endif
.release = eventfd_release,
- .poll = eventfd_poll,
+ .get_poll_head = eventfd_get_poll_head,
+ .poll_mask = eventfd_poll_mask,
.read = eventfd_read,
.write = eventfd_write,
.llseek = noop_llseek,
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 602ca4285b2e..67db22fe99c5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -884,8 +884,7 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
pt->_key = epi->event.events;
if (!is_file_epoll(epi->ffd.file))
- return epi->ffd.file->f_op->poll(epi->ffd.file, pt) &
- epi->event.events;
+ return vfs_poll(epi->ffd.file, pt) & epi->event.events;
ep = epi->ffd.file->private_data;
poll_wait(epi->ffd.file, &ep->poll_wait, pt);
@@ -2025,7 +2024,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
/* The target file descriptor must support poll */
error = -EPERM;
- if (!tf.file->f_op->poll)
+ if (!file_can_poll(tf.file))
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
diff --git a/fs/exec.c b/fs/exec.c
index 7eb8d21bcab9..183059c427b9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -257,7 +257,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
* to work from.
*/
limit = _STK_LIM / 4 * 3;
- limit = min(limit, rlimit(RLIMIT_STACK) / 4);
+ limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
if (size > limit)
goto fail;
}
@@ -411,6 +411,11 @@ static int bprm_mm_init(struct linux_binprm *bprm)
if (!mm)
goto err;
+ /* Save current stack limit for all calculations made during exec. */
+ task_lock(current->group_leader);
+ bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
+ task_unlock(current->group_leader);
+
err = __bprm_mm_init(bprm);
if (err)
goto err;
@@ -697,7 +702,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
#ifdef CONFIG_STACK_GROWSUP
/* Limit stack size */
- stack_base = rlimit_max(RLIMIT_STACK);
+ stack_base = bprm->rlim_stack.rlim_max;
if (stack_base > STACK_SIZE_MAX)
stack_base = STACK_SIZE_MAX;
@@ -770,7 +775,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
* Align this down to a page boundary as expand_stack
* will align it up.
*/
- rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK;
+ rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;
#ifdef CONFIG_STACK_GROWSUP
if (stack_size + stack_expand > rlim_stack)
stack_base = vma->vm_start + rlim_stack;
@@ -895,13 +900,13 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
if (!S_ISREG(file_inode(file)->i_mode) || max_size < 0)
return -EINVAL;
- ret = security_kernel_read_file(file, id);
+ ret = deny_write_access(file);
if (ret)
return ret;
- ret = deny_write_access(file);
+ ret = security_kernel_read_file(file, id);
if (ret)
- return ret;
+ goto out;
i_size = i_size_read(file_inode(file));
if (max_size > 0 && i_size > max_size) {
@@ -1341,11 +1346,11 @@ void setup_new_exec(struct linux_binprm * bprm)
* RLIMIT_STACK, but after the point of no return to avoid
* needing to clean up the change on failure.
*/
- if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM)
- current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM;
+ if (bprm->rlim_stack.rlim_cur > _STK_LIM)
+ bprm->rlim_stack.rlim_cur = _STK_LIM;
}
- arch_pick_mmap_layout(current->mm);
+ arch_pick_mmap_layout(current->mm, &bprm->rlim_stack);
current->sas_ss_sp = current->sas_ss_size = 0;
@@ -1378,6 +1383,16 @@ void setup_new_exec(struct linux_binprm * bprm)
}
EXPORT_SYMBOL(setup_new_exec);
+/* Runs immediately before start_thread() takes over. */
+void finalize_exec(struct linux_binprm *bprm)
+{
+ /* Store any stack rlimit changes before starting thread. */
+ task_lock(current->group_leader);
+ current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
+ task_unlock(current->group_leader);
+}
+EXPORT_SYMBOL(finalize_exec);
+
/*
* Prepare credentials and lock ->cred_guard_mutex.
* install_exec_creds() commits the new creds and drops the lock.
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 3c6a9c156b7a..ddbf87246898 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -790,7 +790,7 @@ int ore_create(struct ore_io_state *ios)
for (i = 0; i < ios->oc->numdevs; i++) {
struct osd_request *or;
- or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
+ or = osd_start_request(_ios_od(ios, i));
if (unlikely(!or)) {
ORE_ERR("%s: osd_start_request failed\n", __func__);
ret = -ENOMEM;
@@ -815,7 +815,7 @@ int ore_remove(struct ore_io_state *ios)
for (i = 0; i < ios->oc->numdevs; i++) {
struct osd_request *or;
- or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
+ or = osd_start_request(_ios_od(ios, i));
if (unlikely(!or)) {
ORE_ERR("%s: osd_start_request failed\n", __func__);
ret = -ENOMEM;
@@ -847,7 +847,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
struct osd_request *or;
- or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL);
+ or = osd_start_request(_ios_od(ios, dev));
if (unlikely(!or)) {
ORE_ERR("%s: osd_start_request failed\n", __func__);
ret = -ENOMEM;
@@ -966,7 +966,7 @@ int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp)
return 0; /* Just an empty slot */
first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
- or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL);
+ or = osd_start_request(_ios_od(ios, first_dev));
if (unlikely(!or)) {
ORE_ERR("%s: osd_start_request failed\n", __func__);
return -ENOMEM;
@@ -1060,7 +1060,7 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
struct osd_request *or;
- or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL);
+ or = osd_start_request(_ios_od(ios, cur_comp));
if (unlikely(!or)) {
ORE_ERR("%s: osd_start_request failed\n", __func__);
return -ENOMEM;
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 179cd5c2f52a..719a3152da80 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -229,7 +229,7 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
u64 offset, void *p, unsigned length)
{
- struct osd_request *or = osd_start_request(od, GFP_KERNEL);
+ struct osd_request *or = osd_start_request(od);
/* struct osd_sense_info osi = {.key = 0};*/
int ret;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 329a5d103846..645158dc33f1 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -435,6 +435,15 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
if (IS_ERR_OR_NULL(result))
return ERR_PTR(-ESTALE);
+ /*
+ * If no acceptance criteria was specified by caller, a disconnected
+ * dentry is also accepatable. Callers may use this mode to query if
+ * file handle is stale or to get a reference to an inode without
+ * risking the high overhead caused by directory reconnect.
+ */
+ if (!acceptable)
+ return result;
+
if (d_is_dir(result)) {
/*
* This request is for a directory.
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 032295e1d386..cc40802ddfa8 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -814,6 +814,7 @@ extern const struct inode_operations ext2_file_inode_operations;
extern const struct file_operations ext2_file_operations;
/* inode.c */
+extern void ext2_set_file_ops(struct inode *inode);
extern const struct address_space_operations ext2_aops;
extern const struct address_space_operations ext2_nobh_aops;
extern const struct iomap_ops ext2_iomap_ops;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 09640220fda8..047c327a6b23 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -88,11 +88,11 @@ out_unlock:
* The default page_lock and i_size verification done by non-DAX fault paths
* is sufficient because ext2 doesn't support hole punching.
*/
-static int ext2_dax_fault(struct vm_fault *vmf)
+static vm_fault_t ext2_dax_fault(struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
struct ext2_inode_info *ei = EXT2_I(inode);
- int ret;
+ vm_fault_t ret;
if (vmf->flags & FAULT_FLAG_WRITE) {
sb_start_pagefault(inode->i_sb);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 9b2ac55ac34f..71635909df3b 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -940,9 +940,6 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
loff_t offset = iocb->ki_pos;
ssize_t ret;
- if (WARN_ON_ONCE(IS_DAX(inode)))
- return -EIO;
-
ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
if (ret < 0 && iov_iter_rw(iter) == WRITE)
ext2_write_failed(mapping, offset + count);
@@ -952,17 +949,16 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
static int
ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
-#ifdef CONFIG_FS_DAX
- if (dax_mapping(mapping)) {
- return dax_writeback_mapping_range(mapping,
- mapping->host->i_sb->s_bdev,
- wbc);
- }
-#endif
-
return mpage_writepages(mapping, wbc, ext2_get_block);
}
+static int
+ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+ return dax_writeback_mapping_range(mapping,
+ mapping->host->i_sb->s_bdev, wbc);
+}
+
const struct address_space_operations ext2_aops = {
.readpage = ext2_readpage,
.readpages = ext2_readpages,
@@ -990,6 +986,13 @@ const struct address_space_operations ext2_nobh_aops = {
.error_remove_page = generic_error_remove_page,
};
+static const struct address_space_operations ext2_dax_aops = {
+ .writepages = ext2_dax_writepages,
+ .direct_IO = noop_direct_IO,
+ .set_page_dirty = noop_set_page_dirty,
+ .invalidatepage = noop_invalidatepage,
+};
+
/*
* Probably it should be a library function... search for first non-zero word
* or memcmp with zero_page, whatever is better for particular architecture.
@@ -1261,21 +1264,11 @@ do_indirects:
static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
{
- /*
- * XXX: it seems like a bug here that we don't allow
- * IS_APPEND inode to have blocks-past-i_size trimmed off.
- * review and fix this.
- *
- * Also would be nice to be able to handle IO errors and such,
- * but that's probably too much to ask.
- */
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)))
return;
if (ext2_inode_is_fast_symlink(inode))
return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- return;
dax_sem_down_write(EXT2_I(inode));
__ext2_truncate_blocks(inode, offset);
@@ -1388,6 +1381,18 @@ void ext2_set_inode_flags(struct inode *inode)
inode->i_flags |= S_DAX;
}
+void ext2_set_file_ops(struct inode *inode)
+{
+ inode->i_op = &ext2_file_inode_operations;
+ inode->i_fop = &ext2_file_operations;
+ if (IS_DAX(inode))
+ inode->i_mapping->a_ops = &ext2_dax_aops;
+ else if (test_opt(inode->i_sb, NOBH))
+ inode->i_mapping->a_ops = &ext2_nobh_aops;
+ else
+ inode->i_mapping->a_ops = &ext2_aops;
+}
+
struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
{
struct ext2_inode_info *ei;
@@ -1480,14 +1485,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
ei->i_data[n] = raw_inode->i_block[n];
if (S_ISREG(inode->i_mode)) {
- inode->i_op = &ext2_file_inode_operations;
- if (test_opt(inode->i_sb, NOBH)) {
- inode->i_mapping->a_ops = &ext2_nobh_aops;
- inode->i_fop = &ext2_file_operations;
- } else {
- inode->i_mapping->a_ops = &ext2_aops;
- inode->i_fop = &ext2_file_operations;
- }
+ ext2_set_file_ops(inode);
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = &ext2_dir_inode_operations;
inode->i_fop = &ext2_dir_operations;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index e078075dc66f..152453a91877 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -41,8 +41,7 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
{
int err = ext2_add_link(dentry, inode);
if (!err) {
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
return 0;
}
inode_dec_link_count(inode);
@@ -107,14 +106,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
if (IS_ERR(inode))
return PTR_ERR(inode);
- inode->i_op = &ext2_file_inode_operations;
- if (test_opt(inode->i_sb, NOBH)) {
- inode->i_mapping->a_ops = &ext2_nobh_aops;
- inode->i_fop = &ext2_file_operations;
- } else {
- inode->i_mapping->a_ops = &ext2_aops;
- inode->i_fop = &ext2_file_operations;
- }
+ ext2_set_file_ops(inode);
mark_inode_dirty(inode);
return ext2_add_nondir(dentry, inode);
}
@@ -125,14 +117,7 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
if (IS_ERR(inode))
return PTR_ERR(inode);
- inode->i_op = &ext2_file_inode_operations;
- if (test_opt(inode->i_sb, NOBH)) {
- inode->i_mapping->a_ops = &ext2_nobh_aops;
- inode->i_fop = &ext2_file_operations;
- } else {
- inode->i_mapping->a_ops = &ext2_aops;
- inode->i_fop = &ext2_file_operations;
- }
+ ext2_set_file_ops(inode);
mark_inode_dirty(inode);
d_tmpfile(dentry, inode);
unlock_new_inode(inode);
@@ -269,8 +254,7 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
if (err)
goto out_fail;
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
out:
return err;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index de1694512f1f..c09289a42dc5 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -961,8 +961,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
- err = bdev_dax_supported(sb, blocksize);
- if (err) {
+ if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
ext2_msg(sb, KERN_ERR,
"DAX unsupported by block device. Turning off DAX.");
sbi->s_mount_opt &= ~EXT2_MOUNT_DAX;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index a33d8fb1bf2a..b00481c475cb 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -185,25 +185,15 @@ static int ext4_init_block_bitmap(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t start, tmp;
int flex_bg = 0;
- struct ext4_group_info *grp;
J_ASSERT_BH(bh, buffer_locked(bh));
/* If checksum is bad mark all blocks used to prevent allocation
* essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
- grp = ext4_get_group_info(sb, block_group);
- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
- percpu_counter_sub(&sbi->s_freeclusters_counter,
- grp->bb_free);
- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
- if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
- int count;
- count = ext4_free_inodes_count(sb, gdp);
- percpu_counter_sub(&sbi->s_freeinodes_counter,
- count);
- }
- set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT |
+ EXT4_GROUP_INFO_IBITMAP_CORRUPT);
return -EFSBADCRC;
}
memset(bh->b_data, 0, sb->s_blocksize);
@@ -321,6 +311,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_grpblk_t offset;
ext4_grpblk_t next_zero_bit;
+ ext4_grpblk_t max_bit = EXT4_CLUSTERS_PER_GROUP(sb);
ext4_fsblk_t blk;
ext4_fsblk_t group_first_block;
@@ -338,7 +329,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
/* check whether block bitmap block number is set */
blk = ext4_block_bitmap(sb, desc);
offset = blk - group_first_block;
- if (offset < 0 || EXT4_B2C(sbi, offset) >= sb->s_blocksize ||
+ if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit ||
!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
/* bad block bitmap */
return blk;
@@ -346,7 +337,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
/* check whether the inode bitmap block number is set */
blk = ext4_inode_bitmap(sb, desc);
offset = blk - group_first_block;
- if (offset < 0 || EXT4_B2C(sbi, offset) >= sb->s_blocksize ||
+ if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit ||
!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
/* bad block bitmap */
return blk;
@@ -354,8 +345,8 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
/* check whether the inode table block number is set */
blk = ext4_inode_table(sb, desc);
offset = blk - group_first_block;
- if (offset < 0 || EXT4_B2C(sbi, offset) >= sb->s_blocksize ||
- EXT4_B2C(sbi, offset + sbi->s_itb_per_group) >= sb->s_blocksize)
+ if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit ||
+ EXT4_B2C(sbi, offset + sbi->s_itb_per_group) >= max_bit)
return blk;
next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
EXT4_B2C(sbi, offset + sbi->s_itb_per_group),
@@ -374,7 +365,6 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
{
ext4_fsblk_t blk;
struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
- struct ext4_sb_info *sbi = EXT4_SB(sb);
if (buffer_verified(bh))
return 0;
@@ -386,10 +376,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
desc, bh))) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
- percpu_counter_sub(&sbi->s_freeclusters_counter,
- grp->bb_free);
- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
return -EFSBADCRC;
}
blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
@@ -397,10 +385,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
block_group, blk);
- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
- percpu_counter_sub(&sbi->s_freeclusters_counter,
- grp->bb_free);
- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
return -EFSCORRUPTED;
}
set_buffer_verified(bh);
@@ -435,6 +421,8 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
(bitmap_blk >= ext4_blocks_count(sbi->s_es))) {
ext4_error(sb, "Invalid block bitmap block %llu in "
"block_group %u", bitmap_blk, block_group);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
return ERR_PTR(-EFSCORRUPTED);
}
bh = sb_getblk(sb, bitmap_blk);
@@ -513,6 +501,8 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
ext4_error(sb, "Cannot read block bitmap - "
"block_group = %u, block_bitmap = %llu",
block_group, (unsigned long long) bh->b_blocknr);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
return -EIO;
}
clear_buffer_new(bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a42e71203e53..df95412915ea 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2390,7 +2390,7 @@ extern int ext4_init_inode_table(struct super_block *sb,
extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
/* mballoc.c */
-extern const struct file_operations ext4_seq_mb_groups_fops;
+extern const struct seq_operations ext4_mb_seq_groups_ops;
extern long ext4_mb_stats;
extern long ext4_mb_max_to_scan;
extern int ext4_mb_init(struct super_block *);
@@ -2530,6 +2530,9 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
extern const char *ext4_decode_error(struct super_block *sb, int errno,
char nbuf[16]);
+extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
+ ext4_group_t block_group,
+ unsigned int flags);
extern __printf(4, 5)
void __ext4_error(struct super_block *, const char *, unsigned int,
@@ -2857,6 +2860,10 @@ struct ext4_group_info {
#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1
#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3
+#define EXT4_GROUP_INFO_BBITMAP_CORRUPT \
+ (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
+#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \
+ (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0a7315961bac..c969275ce3ee 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5329,8 +5329,9 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
stop = le32_to_cpu(extent->ee_block);
/*
- * In case of left shift, Don't start shifting extents until we make
- * sure the hole is big enough to accommodate the shift.
+ * For left shifts, make sure the hole on the left is big enough to
+ * accommodate the shift. For right shifts, make sure the last extent
+ * won't be shifted beyond EXT_MAX_BLOCKS.
*/
if (SHIFT == SHIFT_LEFT) {
path = ext4_find_extent(inode, start - 1, &path,
@@ -5350,9 +5351,14 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
if ((start == ex_start && shift > ex_start) ||
(shift > start - ex_end)) {
- ext4_ext_drop_refs(path);
- kfree(path);
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
+ }
+ } else {
+ if (shift > EXT_MAX_BLOCKS -
+ (stop + ext4_ext_get_actual_len(extent))) {
+ ret = -EINVAL;
+ goto out;
}
}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 763ef185dd17..c4e6fb15101b 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -162,8 +162,7 @@ int __init ext4_init_es(void)
void ext4_exit_es(void)
{
- if (ext4_es_cachep)
- kmem_cache_destroy(ext4_es_cachep);
+ kmem_cache_destroy(ext4_es_cachep);
}
void ext4_es_init_tree(struct ext4_es_tree *tree)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index fb6f023622fe..7f8023340eb8 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -277,10 +277,11 @@ out:
}
#ifdef CONFIG_FS_DAX
-static int ext4_dax_huge_fault(struct vm_fault *vmf,
+static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
enum page_entry_size pe_size)
{
- int result, error = 0;
+ int error = 0;
+ vm_fault_t result;
int retries = 0;
handle_t *handle = NULL;
struct inode *inode = file_inode(vmf->vma->vm_file);
@@ -335,7 +336,7 @@ retry:
return result;
}
-static int ext4_dax_fault(struct vm_fault *vmf)
+static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
{
return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
}
@@ -380,50 +381,64 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
-static int ext4_file_open(struct inode * inode, struct file * filp)
+static int ext4_sample_last_mounted(struct super_block *sb,
+ struct vfsmount *mnt)
{
- struct super_block *sb = inode->i_sb;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- struct vfsmount *mnt = filp->f_path.mnt;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct path path;
char buf[64], *cp;
+ handle_t *handle;
+ int err;
+
+ if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED))
+ return 0;
+
+ if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
+ return 0;
+
+ sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
+ /*
+ * Sample where the filesystem has been mounted and
+ * store it in the superblock for sysadmin convenience
+ * when trying to sort through large numbers of block
+ * devices or filesystem images.
+ */
+ memset(buf, 0, sizeof(buf));
+ path.mnt = mnt;
+ path.dentry = mnt->mnt_root;
+ cp = d_path(&path, buf, sizeof(buf));
+ err = 0;
+ if (IS_ERR(cp))
+ goto out;
+
+ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
+ err = PTR_ERR(handle);
+ if (IS_ERR(handle))
+ goto out;
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ if (err)
+ goto out_journal;
+ strlcpy(sbi->s_es->s_last_mounted, cp,
+ sizeof(sbi->s_es->s_last_mounted));
+ ext4_handle_dirty_super(handle, sb);
+out_journal:
+ ext4_journal_stop(handle);
+out:
+ sb_end_intwrite(sb);
+ return err;
+}
+
+static int ext4_file_open(struct inode * inode, struct file * filp)
+{
int ret;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
- if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
- !sb_rdonly(sb))) {
- sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
- /*
- * Sample where the filesystem has been mounted and
- * store it in the superblock for sysadmin convenience
- * when trying to sort through large numbers of block
- * devices or filesystem images.
- */
- memset(buf, 0, sizeof(buf));
- path.mnt = mnt;
- path.dentry = mnt->mnt_root;
- cp = d_path(&path, buf, sizeof(buf));
- if (!IS_ERR(cp)) {
- handle_t *handle;
- int err;
-
- handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- BUFFER_TRACE(sbi->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
- if (err) {
- ext4_journal_stop(handle);
- return err;
- }
- strlcpy(sbi->s_es->s_last_mounted, cp,
- sizeof(sbi->s_es->s_last_mounted));
- ext4_handle_dirty_super(handle, sb);
- ext4_journal_stop(handle);
- }
- }
+ ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
+ if (ret)
+ return ret;
ret = fscrypt_file_open(inode, filp);
if (ret)
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index e871c4bf18e9..4b99e2db95b8 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -402,8 +402,8 @@ static void ext4_getfsmap_free_fixed_metadata(struct list_head *meta_list)
}
/* Find all the fixed metadata in the filesystem. */
-int ext4_getfsmap_find_fixed_metadata(struct super_block *sb,
- struct list_head *meta_list)
+static int ext4_getfsmap_find_fixed_metadata(struct super_block *sb,
+ struct list_head *meta_list)
{
struct ext4_group_desc *gdp;
ext4_group_t agno;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index df92e3ec9913..4d6e007f3569 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -83,7 +83,6 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
{
ext4_fsblk_t blk;
struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
- struct ext4_sb_info *sbi = EXT4_SB(sb);
if (buffer_verified(bh))
return 0;
@@ -97,14 +96,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
ext4_unlock_group(sb, block_group);
ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
"inode_bitmap = %llu", block_group, blk);
- grp = ext4_get_group_info(sb, block_group);
- if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
- int count;
- count = ext4_free_inodes_count(sb, desc);
- percpu_counter_sub(&sbi->s_freeinodes_counter,
- count);
- }
- set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_IBITMAP_CORRUPT);
return -EFSBADCRC;
}
set_buffer_verified(bh);
@@ -136,6 +129,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
(bitmap_blk >= ext4_blocks_count(sbi->s_es))) {
ext4_error(sb, "Invalid inode bitmap blk %llu in "
"block_group %u", bitmap_blk, block_group);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_IBITMAP_CORRUPT);
return ERR_PTR(-EFSCORRUPTED);
}
bh = sb_getblk(sb, bitmap_blk);
@@ -143,7 +138,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
ext4_error(sb, "Cannot read inode bitmap - "
"block_group = %u, inode_bitmap = %llu",
block_group, bitmap_blk);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-ENOMEM);
}
if (bitmap_uptodate(bh))
goto verify;
@@ -190,6 +185,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
ext4_error(sb, "Cannot read inode bitmap - "
"block_group = %u, inode_bitmap = %llu",
block_group, bitmap_blk);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_IBITMAP_CORRUPT);
return ERR_PTR(-EIO);
}
@@ -337,13 +334,8 @@ out:
fatal = err;
} else {
ext4_error(sb, "bit already cleared for inode %lu", ino);
- if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
- int count;
- count = ext4_free_inodes_count(sb, gdp);
- percpu_counter_sub(&sbi->s_freeinodes_counter,
- count);
- }
- set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_IBITMAP_CORRUPT);
}
error_return:
@@ -914,6 +906,8 @@ repeat_in_this_group:
if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) {
ext4_error(sb, "reserved inode found cleared - "
"inode=%lu", ino + 1);
+ ext4_mark_group_bitmap_corrupted(sb, group,
+ EXT4_GROUP_INFO_IBITMAP_CORRUPT);
goto next_group;
}
@@ -1105,6 +1099,8 @@ got:
err = -EIO;
ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
inode->i_ino);
+ ext4_mark_group_bitmap_corrupted(sb, group,
+ EXT4_GROUP_INFO_IBITMAP_CORRUPT);
goto out;
}
inode->i_generation = prandom_u32();
@@ -1206,11 +1202,8 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
- if (IS_ERR(bitmap_bh)) {
- ext4_error(sb, "inode bitmap error %ld for orphan %lu",
- ino, PTR_ERR(bitmap_bh));
+ if (IS_ERR(bitmap_bh))
return (struct inode *) bitmap_bh;
- }
/* Having the inode bit set should be a 100% indicator that this
* is a valid orphan (no e2fsck run on fs). Orphans also include
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index c32802c956d5..bf7fa1507e81 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -561,10 +561,16 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
unsigned epb = inode->i_sb->s_blocksize / sizeof(u32);
int i;
- /* Count number blocks in a subtree under 'partial' */
- count = 1;
- for (i = 0; partial + i != chain + depth - 1; i++)
- count *= epb;
+ /*
+ * Count number blocks in a subtree under 'partial'. At each
+ * level we count number of complete empty subtrees beyond
+ * current offset and then descend into the subtree only
+ * partially beyond current offset.
+ */
+ count = 0;
+ for (i = partial - chain + 1; i < depth; i++)
+ count = count * epb + (epb - offsets[i] - 1);
+ count++;
/* Fill in size of a hole we found */
map->m_pblk = 0;
map->m_len = min_t(unsigned int, map->m_len, count);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 70cf4c7b268a..285ed1588730 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -144,6 +144,12 @@ int ext4_find_inline_data_nolock(struct inode *inode)
goto out;
if (!is.s.not_found) {
+ if (is.s.here->e_value_inum) {
+ EXT4_ERROR_INODE(inode, "inline data xattr refers "
+ "to an external xattr inode");
+ error = -EFSCORRUPTED;
+ goto out;
+ }
EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
(void *)ext4_raw_inode(&is.iloc));
EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
@@ -1835,8 +1841,8 @@ int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap)
iomap->offset = 0;
iomap->length = min_t(loff_t, ext4_get_inline_size(inode),
i_size_read(inode));
- iomap->type = 0;
- iomap->flags = IOMAP_F_DATA_INLINE;
+ iomap->type = IOMAP_INLINE;
+ iomap->flags = 0;
out:
up_read(&EXT4_I(inode)->xattr_sem);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 18aa2ef963ad..2ea07efbe016 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2716,12 +2716,6 @@ static int ext4_writepages(struct address_space *mapping,
percpu_down_read(&sbi->s_journal_flag_rwsem);
trace_ext4_writepages(inode, wbc);
- if (dax_mapping(mapping)) {
- ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
- wbc);
- goto out_writepages;
- }
-
/*
* No pages to write? This is mainly a kludge to avoid starting
* a transaction for special inodes like journal inode on last iput()
@@ -2942,6 +2936,27 @@ out_writepages:
return ret;
}
+static int ext4_dax_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ int ret;
+ long nr_to_write = wbc->nr_to_write;
+ struct inode *inode = mapping->host;
+ struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ return -EIO;
+
+ percpu_down_read(&sbi->s_journal_flag_rwsem);
+ trace_ext4_writepages(inode, wbc);
+
+ ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc);
+ trace_ext4_writepages_result(inode, wbc, ret,
+ nr_to_write - wbc->nr_to_write);
+ percpu_up_read(&sbi->s_journal_flag_rwsem);
+ return ret;
+}
+
static int ext4_nonda_switch(struct super_block *sb)
{
s64 free_clusters, dirty_clusters;
@@ -3845,10 +3860,6 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (ext4_has_inline_data(inode))
return 0;
- /* DAX uses iomap path now */
- if (WARN_ON_ONCE(IS_DAX(inode)))
- return 0;
-
trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
if (iov_iter_rw(iter) == READ)
ret = ext4_direct_IO_read(iocb, iter);
@@ -3934,6 +3945,13 @@ static const struct address_space_operations ext4_da_aops = {
.error_remove_page = generic_error_remove_page,
};
+static const struct address_space_operations ext4_dax_aops = {
+ .writepages = ext4_dax_writepages,
+ .direct_IO = noop_direct_IO,
+ .set_page_dirty = noop_set_page_dirty,
+ .invalidatepage = noop_invalidatepage,
+};
+
void ext4_set_aops(struct inode *inode)
{
switch (ext4_inode_journal_mode(inode)) {
@@ -3946,7 +3964,9 @@ void ext4_set_aops(struct inode *inode)
default:
BUG();
}
- if (test_opt(inode->i_sb, DELALLOC))
+ if (IS_DAX(inode))
+ inode->i_mapping->a_ops = &ext4_dax_aops;
+ else if (test_opt(inode->i_sb, DELALLOC))
inode->i_mapping->a_ops = &ext4_da_aops;
else
inode->i_mapping->a_ops = &ext4_aops;
@@ -4278,28 +4298,28 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
EXT4_BLOCK_SIZE_BITS(sb);
stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
- /* If there are no blocks to remove, return now */
- if (first_block >= stop_block)
- goto out_stop;
+ /* If there are blocks to remove, do it */
+ if (stop_block > first_block) {
- down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
- ret = ext4_es_remove_extent(inode, first_block,
- stop_block - first_block);
- if (ret) {
- up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
- }
+ ret = ext4_es_remove_extent(inode, first_block,
+ stop_block - first_block);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- ret = ext4_ext_remove_space(inode, first_block,
- stop_block - 1);
- else
- ret = ext4_ind_remove_space(handle, inode, first_block,
- stop_block);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ ret = ext4_ext_remove_space(inode, first_block,
+ stop_block - 1);
+ else
+ ret = ext4_ind_remove_space(handle, inode, first_block,
+ stop_block);
- up_write(&EXT4_I(inode)->i_data_sem);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ }
if (IS_SYNC(inode))
ext4_handle_sync(handle);
@@ -4681,19 +4701,21 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
}
}
-static inline void ext4_iget_extra_inode(struct inode *inode,
+static inline int ext4_iget_extra_inode(struct inode *inode,
struct ext4_inode *raw_inode,
struct ext4_inode_info *ei)
{
__le32 *magic = (void *)raw_inode +
EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
+
if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <=
EXT4_INODE_SIZE(inode->i_sb) &&
*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
ext4_set_inode_state(inode, EXT4_STATE_XATTR);
- ext4_find_inline_data_nolock(inode);
+ return ext4_find_inline_data_nolock(inode);
} else
EXT4_I(inode)->i_inline_off = 0;
+ return 0;
}
int ext4_get_projid(struct inode *inode, kprojid_t *projid)
@@ -4704,6 +4726,26 @@ int ext4_get_projid(struct inode *inode, kprojid_t *projid)
return 0;
}
+/*
+ * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of
+ * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag
+ * set.
+ */
+static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
+{
+ if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
+ inode_set_iversion_raw(inode, val);
+ else
+ inode_set_iversion_queried(inode, val);
+}
+static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
+{
+ if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
+ return inode_peek_iversion_raw(inode);
+ else
+ return inode_peek_iversion(inode);
+}
+
struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
{
struct ext4_iloc iloc;
@@ -4873,7 +4915,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
} else {
- ext4_iget_extra_inode(inode, raw_inode, ei);
+ ret = ext4_iget_extra_inode(inode, raw_inode, ei);
+ if (ret)
+ goto bad_inode;
}
}
@@ -4890,7 +4934,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ivers |=
(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
}
- inode_set_iversion_queried(inode, ivers);
+ ext4_inode_set_iversion_queried(inode, ivers);
}
ret = 0;
@@ -4925,6 +4969,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &ext4_dir_inode_operations;
inode->i_fop = &ext4_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
+ /* VFS does not allow setting these so must be corruption */
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+ EXT4_ERROR_INODE(inode,
+ "immutable or append flags not allowed on symlinks");
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
if (ext4_encrypted_inode(inode)) {
inode->i_op = &ext4_encrypted_symlink_inode_operations;
ext4_set_aops(inode);
@@ -5024,12 +5075,12 @@ static int other_inode_match(struct inode * inode, unsigned long ino,
if ((inode->i_ino != ino) ||
(inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
- I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+ I_DIRTY_INODE)) ||
((inode->i_state & I_DIRTY_TIME) == 0))
return 0;
spin_lock(&inode->i_lock);
if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
- I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) &&
+ I_DIRTY_INODE)) == 0) &&
(inode->i_state & I_DIRTY_TIME)) {
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -5176,7 +5227,7 @@ static int ext4_do_update_inode(handle_t *handle,
}
if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
- u64 ivers = inode_peek_iversion(inode);
+ u64 ivers = ext4_inode_peek_iversion(inode);
raw_inode->i_disk_version = cpu_to_le32(ivers);
if (ei->i_extra_isize) {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 769a62708b1c..6eae2b91aafa 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -470,6 +470,8 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
"freeing block already freed "
"(bit %u)",
first + i);
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
}
mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
}
@@ -747,10 +749,8 @@ void ext4_mb_generate_buddy(struct super_block *sb,
* corrupt and update bb_free using bitmap value
*/
grp->bb_free = free;
- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
- percpu_counter_sub(&sbi->s_freeclusters_counter,
- grp->bb_free);
- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ ext4_mark_group_bitmap_corrupted(sb, group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
}
mb_set_largest_free_order(sb, grp);
@@ -1454,12 +1454,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
"freeing already freed block "
"(bit %u); block bitmap corrupt.",
block);
- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))
- percpu_counter_sub(&sbi->s_freeclusters_counter,
- e4b->bd_info->bb_free);
- /* Mark the block group as corrupt. */
- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
- &e4b->bd_info->bb_state);
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
mb_regenerate_buddy(e4b);
goto done;
}
@@ -1956,6 +1952,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
"%d free clusters as per "
"group info. But bitmap says 0",
free);
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
break;
}
@@ -1966,6 +1964,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
"%d free clusters as per "
"group info. But got %d blocks",
free, ex.fe_len);
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
/*
* The number of free blocks differs. This mostly
* indicate that the bitmap is corrupt. So exit
@@ -2254,7 +2254,7 @@ out:
static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
{
- struct super_block *sb = seq->private;
+ struct super_block *sb = PDE_DATA(file_inode(seq->file));
ext4_group_t group;
if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
@@ -2265,7 +2265,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct super_block *sb = seq->private;
+ struct super_block *sb = PDE_DATA(file_inode(seq->file));
ext4_group_t group;
++*pos;
@@ -2277,7 +2277,7 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
{
- struct super_block *sb = seq->private;
+ struct super_block *sb = PDE_DATA(file_inode(seq->file));
ext4_group_t group = (ext4_group_t) ((unsigned long) v);
int i;
int err, buddy_loaded = 0;
@@ -2330,34 +2330,13 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
{
}
-static const struct seq_operations ext4_mb_seq_groups_ops = {
+const struct seq_operations ext4_mb_seq_groups_ops = {
.start = ext4_mb_seq_groups_start,
.next = ext4_mb_seq_groups_next,
.stop = ext4_mb_seq_groups_stop,
.show = ext4_mb_seq_groups_show,
};
-static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
-{
- struct super_block *sb = PDE_DATA(inode);
- int rc;
-
- rc = seq_open(file, &ext4_mb_seq_groups_ops);
- if (rc == 0) {
- struct seq_file *m = file->private_data;
- m->private = sb;
- }
- return rc;
-
-}
-
-const struct file_operations ext4_seq_mb_groups_fops = {
- .open = ext4_mb_seq_groups_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
{
int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
@@ -2537,8 +2516,7 @@ static void ext4_groupinfo_destroy_slabs(void)
int i;
for (i = 0; i < NR_GRPINFO_CACHES; i++) {
- if (ext4_groupinfo_caches[i])
- kmem_cache_destroy(ext4_groupinfo_caches[i]);
+ kmem_cache_destroy(ext4_groupinfo_caches[i]);
ext4_groupinfo_caches[i] = NULL;
}
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index b1f21e3a0763..4a09063ce1d2 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2411,8 +2411,7 @@ static int ext4_add_nondir(handle_t *handle,
int err = ext4_add_entry(handle, dentry, inode);
if (!err) {
ext4_mark_inode_dirty(handle, inode);
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
return 0;
}
drop_nlink(inode);
@@ -2651,8 +2650,7 @@ out_clear_inode:
err = ext4_mark_inode_dirty(handle, dir);
if (err)
goto out_clear_inode;
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b6bec270a8e4..d792b7689d92 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1933,7 +1933,7 @@ retry:
return 0;
n_group = ext4_get_group_number(sb, n_blocks_count - 1);
- if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) {
+ if (n_group >= (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) {
ext4_warning(sb, "resize would cause inodes_count overflow");
return -EINVAL;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 185f7e61f4cf..00fe75a71c4b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -763,6 +763,36 @@ __acquires(bitlock)
return;
}
+void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
+ ext4_group_t group,
+ unsigned int flags)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
+
+ if ((flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) &&
+ !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) {
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
+ &grp->bb_state);
+ }
+
+ if ((flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) &&
+ !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ if (gdp) {
+ int count;
+
+ count = ext4_free_inodes_count(sb, gdp);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
+ }
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
+ &grp->bb_state);
+ }
+}
+
void ext4_update_dynamic_rev(struct super_block *sb)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -1237,19 +1267,13 @@ static bool ext4_dummy_context(struct inode *inode)
return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb));
}
-static unsigned ext4_max_namelen(struct inode *inode)
-{
- return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize :
- EXT4_NAME_LEN;
-}
-
static const struct fscrypt_operations ext4_cryptops = {
.key_prefix = "ext4:",
.get_context = ext4_get_context,
.set_context = ext4_set_context,
.dummy_context = ext4_dummy_context,
.empty_dir = ext4_empty_dir,
- .max_namelen = ext4_max_namelen,
+ .max_namelen = EXT4_NAME_LEN,
};
#endif
@@ -2116,12 +2140,12 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
int read_only)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- int res = 0;
+ int err = 0;
if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
ext4_msg(sb, KERN_ERR, "revision level too high, "
"forcing read-only mode");
- res = SB_RDONLY;
+ err = -EROFS;
}
if (read_only)
goto done;
@@ -2154,7 +2178,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
if (sbi->s_journal)
ext4_set_feature_journal_needs_recovery(sb);
- ext4_commit_super(sb, 1);
+ err = ext4_commit_super(sb, 1);
done:
if (test_opt(sb, DEBUG))
printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
@@ -2166,7 +2190,7 @@ done:
sbi->s_mount_opt, sbi->s_mount_opt2);
cleancache_init_fs(sb);
- return res;
+ return err;
}
int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
@@ -3732,8 +3756,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
" that may contain inline data");
sbi->s_mount_opt &= ~EXT4_MOUNT_DAX;
}
- err = bdev_dax_supported(sb, blocksize);
- if (err) {
+ if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
ext4_msg(sb, KERN_ERR,
"DAX unsupported by block device. Turning off DAX.");
sbi->s_mount_opt &= ~EXT4_MOUNT_DAX;
@@ -4224,8 +4247,12 @@ no_journal:
goto failed_mount4;
}
- if (ext4_setup_super(sb, es, sb_rdonly(sb)))
+ ret = ext4_setup_super(sb, es, sb_rdonly(sb));
+ if (ret == -EROFS) {
sb->s_flags |= SB_RDONLY;
+ ret = 0;
+ } else if (ret)
+ goto failed_mount4a;
/* determine the minimum size of new large inodes, if present */
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
@@ -4760,11 +4787,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
unlock_buffer(sbh);
error = __sync_dirty_buffer(sbh,
REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0));
- if (error)
- return error;
-
- error = buffer_write_io_error(sbh);
- if (error) {
+ if (buffer_write_io_error(sbh)) {
ext4_msg(sb, KERN_ERR, "I/O error while writing "
"superblock");
clear_buffer_write_io_error(sbh);
@@ -5165,8 +5188,12 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_journal)
ext4_clear_journal_err(sb, es);
sbi->s_mount_state = le16_to_cpu(es->s_state);
- if (!ext4_setup_super(sb, es, 0))
- sb->s_flags &= ~SB_RDONLY;
+
+ err = ext4_setup_super(sb, es, 0);
+ if (err)
+ goto restore_opts;
+
+ sb->s_flags &= ~SB_RDONLY;
if (ext4_has_feature_mmp(sb))
if (ext4_multi_mount_protect(sb,
le64_to_cpu(es->s_mmp_block))) {
@@ -5190,8 +5217,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
ext4_setup_system_zone(sb);
- if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY))
- ext4_commit_super(sb, 1);
+ if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) {
+ err = ext4_commit_super(sb, 1);
+ if (err)
+ goto restore_opts;
+ }
#ifdef CONFIG_QUOTA
/* Release old quota file names */
@@ -5252,7 +5282,8 @@ static int ext4_statfs_project(struct super_block *sb,
dquot->dq_dqb.dqb_bsoftlimit :
dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits;
if (limit && buf->f_blocks > limit) {
- curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits;
+ curblock = (dquot->dq_dqb.dqb_curspace +
+ dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
buf->f_blocks = limit;
buf->f_bfree = buf->f_bavail =
(buf->f_blocks > curblock) ?
@@ -5886,5 +5917,6 @@ static void __exit ext4_exit_fs(void)
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
MODULE_DESCRIPTION("Fourth Extended Filesystem");
MODULE_LICENSE("GPL");
+MODULE_SOFTDEP("pre: crc32c");
module_init(ext4_init_fs)
module_exit(ext4_exit_fs)
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 9ebd26c957c2..f34da0bb8f17 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -346,39 +346,9 @@ static struct kobject *ext4_root;
static struct kobject *ext4_feat;
-#define PROC_FILE_SHOW_DEFN(name) \
-static int name##_open(struct inode *inode, struct file *file) \
-{ \
- return single_open(file, ext4_seq_##name##_show, PDE_DATA(inode)); \
-} \
-\
-static const struct file_operations ext4_seq_##name##_fops = { \
- .open = name##_open, \
- .read = seq_read, \
- .llseek = seq_lseek, \
- .release = single_release, \
-}
-
-#define PROC_FILE_LIST(name) \
- { __stringify(name), &ext4_seq_##name##_fops }
-
-PROC_FILE_SHOW_DEFN(es_shrinker_info);
-PROC_FILE_SHOW_DEFN(options);
-
-static const struct ext4_proc_files {
- const char *name;
- const struct file_operations *fops;
-} proc_files[] = {
- PROC_FILE_LIST(options),
- PROC_FILE_LIST(es_shrinker_info),
- PROC_FILE_LIST(mb_groups),
- { NULL, NULL },
-};
-
int ext4_register_sysfs(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- const struct ext4_proc_files *p;
int err;
init_completion(&sbi->s_kobj_unregister);
@@ -392,11 +362,14 @@ int ext4_register_sysfs(struct super_block *sb)
if (ext4_proc_root)
sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
-
if (sbi->s_proc) {
- for (p = proc_files; p->name; p++)
- proc_create_data(p->name, S_IRUGO, sbi->s_proc,
- p->fops, sb);
+ proc_create_single_data("options", S_IRUGO, sbi->s_proc,
+ ext4_seq_options_show, sb);
+ proc_create_single_data("es_shrinker_info", S_IRUGO,
+ sbi->s_proc, ext4_seq_es_shrinker_info_show,
+ sb);
+ proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc,
+ &ext4_mb_seq_groups_ops, sb);
}
return 0;
}
@@ -404,13 +377,9 @@ int ext4_register_sysfs(struct super_block *sb)
void ext4_unregister_sysfs(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- const struct ext4_proc_files *p;
- if (sbi->s_proc) {
- for (p = proc_files; p->name; p++)
- remove_proc_entry(p->name, sbi->s_proc);
- remove_proc_entry(sb->s_id, ext4_proc_root);
- }
+ if (sbi->s_proc)
+ remove_proc_subtree(sb->s_id, ext4_proc_root);
kobject_del(&sbi->s_kobj);
}
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 499cb4b1fbd2..fc4ced59c565 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1688,7 +1688,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
/* No failures allowed past this point. */
- if (!s->not_found && here->e_value_offs) {
+ if (!s->not_found && here->e_value_size && here->e_value_offs) {
/* Remove the old value. */
void *first_val = s->base + min_offs;
size_t offs = le16_to_cpu(here->e_value_offs);
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 629001b28632..197a9d8a15ef 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -43,7 +43,7 @@ ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
err = ext4_xattr_set_handle(handle, inode,
EXT4_XATTR_INDEX_SECURITY,
xattr->name, xattr->value,
- xattr->value_len, 0);
+ xattr->value_len, XATTR_CREATE);
if (err < 0)
break;
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index db50686f5096..02237d4d91f5 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2424,12 +2424,12 @@ void f2fs_set_page_dirty_nobuffers(struct page *page)
SetPageDirty(page);
spin_unlock(&mapping->private_lock);
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ xa_lock_irqsave(&mapping->i_pages, flags);
WARN_ON_ONCE(!PageUptodate(page));
account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->page_tree,
+ radix_tree_tag_set(&mapping->i_pages,
page_index(page), PAGECACHE_TAG_DIRTY);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
unlock_page_memcg(page);
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index fe661274ff10..8c9c2f31b253 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -732,10 +732,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
if (bit_pos == NR_DENTRY_IN_BLOCK &&
!truncate_hole(dir, page->index, page->index + 1)) {
- spin_lock_irqsave(&mapping->tree_lock, flags);
- radix_tree_tag_clear(&mapping->page_tree, page_index(page),
+ xa_lock_irqsave(&mapping->i_pages, flags);
+ radix_tree_tag_clear(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
clear_page_dirty_for_io(page);
ClearPagePrivate(page);
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index bfb7a4a3a929..9327411fd93b 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1015,7 +1015,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
unsigned int init_segno = segno;
struct gc_inode_list gc_list = {
.ilist = LIST_HEAD_INIT(gc_list.ilist),
- .iroot = RADIX_TREE_INIT(GFP_NOFS),
+ .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
};
trace_f2fs_gc_begin(sbi->sb, sync, background,
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 3b77d6421218..265da200daa8 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -226,10 +226,10 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
kunmap_atomic(src_addr);
set_page_dirty(dn.inode_page);
- spin_lock_irqsave(&mapping->tree_lock, flags);
- radix_tree_tag_clear(&mapping->page_tree, page_index(page),
+ xa_lock_irqsave(&mapping->i_pages, flags);
+ radix_tree_tag_clear(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
set_inode_flag(inode, FI_APPEND_WRITE);
set_inode_flag(inode, FI_DATA_EXIST);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index d5098efe577c..75e37fd720b2 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -294,8 +294,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
alloc_nid_done(sbi, ino);
- d_instantiate(dentry, inode);
- unlock_new_inode(inode);
+ d_instantiate_new(dentry, inode);
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
@@ -597,8 +596,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
err = page_symlink(inode, disk_link.name, disk_link.len);
err_out:
- d_instantiate(dentry, inode);
- unlock_new_inode(inode);
+ d_instantiate_new(dentry, inode);
/*
* Let's flush symlink data in order to avoid broken symlink as much as
@@ -661,8 +659,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
alloc_nid_done(sbi, inode->i_ino);
- d_instantiate(dentry, inode);
- unlock_new_inode(inode);
+ d_instantiate_new(dentry, inode);
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
@@ -713,8 +710,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
alloc_nid_done(sbi, inode->i_ino);
- d_instantiate(dentry, inode);
- unlock_new_inode(inode);
+ d_instantiate_new(dentry, inode);
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 9a99243054ba..f202398e20ea 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -91,11 +91,11 @@ static void clear_node_page_dirty(struct page *page)
unsigned int long flags;
if (PageDirty(page)) {
- spin_lock_irqsave(&mapping->tree_lock, flags);
- radix_tree_tag_clear(&mapping->page_tree,
+ xa_lock_irqsave(&mapping->i_pages, flags);
+ radix_tree_tag_clear(&mapping->i_pages,
page_index(page),
PAGECACHE_TAG_DIRTY);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
clear_page_dirty_for_io(page);
dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
@@ -1161,7 +1161,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
f2fs_bug_on(sbi, check_nid_range(sbi, nid));
rcu_read_lock();
- apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid);
+ apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid);
rcu_read_unlock();
if (apage)
return;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 42d564c5ccd0..970ae27f401c 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1930,19 +1930,13 @@ static bool f2fs_dummy_context(struct inode *inode)
return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode));
}
-static unsigned f2fs_max_namelen(struct inode *inode)
-{
- return S_ISLNK(inode->i_mode) ?
- inode->i_sb->s_blocksize : F2FS_NAME_LEN;
-}
-
static const struct fscrypt_operations f2fs_cryptops = {
.key_prefix = "f2fs:",
.get_context = f2fs_get_context,
.set_context = f2fs_set_context,
.dummy_context = f2fs_dummy_context,
.empty_dir = f2fs_empty_dir,
- .max_namelen = f2fs_max_namelen,
+ .max_namelen = F2FS_NAME_LEN,
};
#endif
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index f33a56d6e6dd..4b47ca6296a7 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -572,23 +572,6 @@ static int iostat_info_seq_show(struct seq_file *seq, void *offset)
return 0;
}
-#define F2FS_PROC_FILE_DEF(_name) \
-static int _name##_open_fs(struct inode *inode, struct file *file) \
-{ \
- return single_open(file, _name##_seq_show, PDE_DATA(inode)); \
-} \
- \
-static const struct file_operations f2fs_seq_##_name##_fops = { \
- .open = _name##_open_fs, \
- .read = seq_read, \
- .llseek = seq_lseek, \
- .release = single_release, \
-};
-
-F2FS_PROC_FILE_DEF(segment_info);
-F2FS_PROC_FILE_DEF(segment_bits);
-F2FS_PROC_FILE_DEF(iostat_info);
-
int __init f2fs_init_sysfs(void)
{
int ret;
@@ -632,12 +615,12 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
if (sbi->s_proc) {
- proc_create_data("segment_info", S_IRUGO, sbi->s_proc,
- &f2fs_seq_segment_info_fops, sb);
- proc_create_data("segment_bits", S_IRUGO, sbi->s_proc,
- &f2fs_seq_segment_bits_fops, sb);
- proc_create_data("iostat_info", S_IRUGO, sbi->s_proc,
- &f2fs_seq_iostat_info_fops, sb);
+ proc_create_single_data("segment_info", S_IRUGO, sbi->s_proc,
+ segment_info_seq_show, sb);
+ proc_create_single_data("segment_bits", S_IRUGO, sbi->s_proc,
+ segment_bits_seq_show, sb);
+ proc_create_single_data("iostat_info", S_IRUGO, sbi->s_proc,
+ iostat_info_seq_show, sb);
}
return 0;
}
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 582ca731a6c9..484ce674e0cd 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -314,10 +314,6 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
int err;
mutex_lock(&MSDOS_SB(sb)->s_lock);
- /*
- * Check whether the directory is not in use, then check
- * whether it is empty.
- */
err = fat_dir_empty(inode);
if (err)
goto out;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 2649759c478a..4f4362d5a04c 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -697,15 +697,6 @@ static int vfat_find(struct inode *dir, const struct qstr *qname,
return fat_search_long(dir, qname->name, len, sinfo);
}
-/*
- * (nfsd's) anonymous disconnected dentry?
- * NOTE: !IS_ROOT() is not anonymous (I.e. d_splice_alias() did the job).
- */
-static int vfat_d_anon_disconn(struct dentry *dentry)
-{
- return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED);
-}
-
static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
@@ -738,8 +729,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
* Checking "alias->d_parent == dentry->d_parent" to make sure
* FS is not corrupted (especially double linked dir).
*/
- if (alias && alias->d_parent == dentry->d_parent &&
- !vfat_d_anon_disconn(alias)) {
+ if (alias && alias->d_parent == dentry->d_parent) {
/*
* This inode has non anonymous-DCACHE_DISCONNECTED
* dentry. This means, the user did ->lookup() by an
@@ -747,7 +737,6 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
*
* Switch to new one for reason of locality if possible.
*/
- BUG_ON(d_unhashed(alias));
if (!S_ISDIR(inode->i_mode))
d_move(alias, dentry);
iput(inode);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index d737ff082472..c42169459298 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -871,9 +871,9 @@ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
if (fa->fa_file != filp)
continue;
- spin_lock_irq(&fa->fa_lock);
+ write_lock_irq(&fa->fa_lock);
fa->fa_file = NULL;
- spin_unlock_irq(&fa->fa_lock);
+ write_unlock_irq(&fa->fa_lock);
*fp = fa->fa_next;
call_rcu(&fa->fa_rcu, fasync_free_rcu);
@@ -918,13 +918,13 @@ struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasy
if (fa->fa_file != filp)
continue;
- spin_lock_irq(&fa->fa_lock);
+ write_lock_irq(&fa->fa_lock);
fa->fa_fd = fd;
- spin_unlock_irq(&fa->fa_lock);
+ write_unlock_irq(&fa->fa_lock);
goto out;
}
- spin_lock_init(&new->fa_lock);
+ rwlock_init(&new->fa_lock);
new->magic = FASYNC_MAGIC;
new->fa_file = filp;
new->fa_fd = fd;
@@ -987,14 +987,13 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
{
while (fa) {
struct fown_struct *fown;
- unsigned long flags;
if (fa->magic != FASYNC_MAGIC) {
printk(KERN_ERR "kill_fasync: bad magic number in "
"fasync_struct!\n");
return;
}
- spin_lock_irqsave(&fa->fa_lock, flags);
+ read_lock(&fa->fa_lock);
if (fa->fa_file) {
fown = &fa->fa_file->f_owner;
/* Don't send SIGURG to processes which have not set a
@@ -1003,7 +1002,7 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
if (!(sig == SIGURG && fown->signum == 0))
send_sigio(fown, fa->fa_fd, band);
}
- spin_unlock_irqrestore(&fa->fa_lock, flags);
+ read_unlock(&fa->fa_lock);
fa = rcu_dereference(fa->fa_next);
}
}
diff --git a/fs/filesystems.c b/fs/filesystems.c
index f2728a4a03a1..b03f57b1105b 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -238,21 +238,9 @@ static int filesystems_proc_show(struct seq_file *m, void *v)
return 0;
}
-static int filesystems_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, filesystems_proc_show, NULL);
-}
-
-static const struct file_operations filesystems_proc_fops = {
- .open = filesystems_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int __init proc_filesystems_init(void)
{
- proc_create("filesystems", 0, NULL, &filesystems_proc_fops);
+ proc_create_single("filesystems", 0, NULL, filesystems_proc_show);
return 0;
}
module_init(proc_filesystems_init);
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index ce4785fd81c6..a51425634f65 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -193,13 +193,9 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags)
return ERR_PTR(-ENAMETOOLONG);
ino = vxfs_inode_by_name(dip, dp);
- if (ino) {
+ if (ino)
ip = vxfs_iget(dip->i_sb, ino);
- if (IS_ERR(ip))
- return ERR_CAST(ip);
- }
- d_add(dp, ip);
- return NULL;
+ return d_splice_alias(ip, dp);
}
/**
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d4d04fee568a..471d863958bc 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -347,9 +347,9 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
* By the time control reaches here, RCU grace period has passed
* since I_WB_SWITCH assertion and all wb stat update transactions
* between unlocked_inode_to_wb_begin/end() are guaranteed to be
- * synchronizing against mapping->tree_lock.
+ * synchronizing against the i_pages lock.
*
- * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
+ * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
* gives us exclusion against all wb related operations on @inode
* including IO list manipulations and stat updates.
*/
@@ -361,7 +361,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
}
spin_lock(&inode->i_lock);
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
/*
* Once I_FREEING is visible under i_lock, the eviction path owns
@@ -373,22 +373,22 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
/*
* Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points
* to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
- * pages actually under underwriteback.
+ * pages actually under writeback.
*/
- radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+ radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0,
PAGECACHE_TAG_DIRTY) {
struct page *page = radix_tree_deref_slot_protected(slot,
- &mapping->tree_lock);
+ &mapping->i_pages.xa_lock);
if (likely(page) && PageDirty(page)) {
dec_wb_stat(old_wb, WB_RECLAIMABLE);
inc_wb_stat(new_wb, WB_RECLAIMABLE);
}
}
- radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+ radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0,
PAGECACHE_TAG_WRITEBACK) {
struct page *page = radix_tree_deref_slot_protected(slot,
- &mapping->tree_lock);
+ &mapping->i_pages.xa_lock);
if (likely(page)) {
WARN_ON_ONCE(!PageWriteback(page));
dec_wb_stat(old_wb, WB_WRITEBACK);
@@ -430,7 +430,7 @@ skip_switch:
*/
smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
spin_unlock(&inode->i_lock);
spin_unlock(&new_wb->list_lock);
spin_unlock(&old_wb->list_lock);
@@ -506,8 +506,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
- * the RCU protected stat update paths to grab the mapping's
- * tree_lock so that stat transfer can synchronize against them.
+ * the RCU protected stat update paths to grab the i_page
+ * lock so that stat transfer can synchronize against them.
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
*/
call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
@@ -745,11 +745,12 @@ int inode_congested(struct inode *inode, int cong_bits)
*/
if (inode && inode_to_wb_is_valid(inode)) {
struct bdi_writeback *wb;
- bool locked, congested;
+ struct wb_lock_cookie lock_cookie = {};
+ bool congested;
- wb = unlocked_inode_to_wb_begin(inode, &locked);
+ wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
congested = wb_congested(wb, cong_bits);
- unlocked_inode_to_wb_end(inode, locked);
+ unlocked_inode_to_wb_end(inode, &lock_cookie);
return congested;
}
@@ -1343,7 +1344,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
dirty = inode->i_state & I_DIRTY;
if (inode->i_state & I_DIRTY_TIME) {
- if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+ if ((dirty & I_DIRTY_INODE) ||
wbc->sync_mode == WB_SYNC_ALL ||
unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
unlikely(time_after(jiffies,
@@ -1960,7 +1961,7 @@ void wb_workfn(struct work_struct *work)
}
if (!list_empty(&wb->work_list))
- mod_delayed_work(bdi_wq, &wb->dwork, 0);
+ wb_wakeup(wb);
else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
wb_wakeup_delayed(wb);
@@ -2112,7 +2113,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
*/
void __mark_inode_dirty(struct inode *inode, int flags)
{
-#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
struct super_block *sb = inode->i_sb;
int dirtytime;
@@ -2122,7 +2122,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
* Don't do this for I_DIRTY_PAGES - that doesn't actually
* dirty the inode itself
*/
- if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
+ if (flags & (I_DIRTY_INODE | I_DIRTY_TIME)) {
trace_writeback_dirty_inode_start(inode, flags);
if (sb->s_op->dirty_inode)
@@ -2197,7 +2197,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if (dirtytime)
inode->dirtied_time_when = jiffies;
- if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
+ if (inode->i_state & I_DIRTY)
dirty_list = &wb->b_dirty;
else
dirty_list = &wb->b_dirty_time;
@@ -2221,8 +2221,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
}
out_unlock_inode:
spin_unlock(&inode->i_lock);
-
-#undef I_DIRTY_INODE
}
EXPORT_SYMBOL(__mark_inode_dirty);
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index 56cce7fdd39e..c184c5a356ff 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -125,7 +125,7 @@ struct fscache_cache *fscache_select_cache_for_object(
}
/* the parent is unbacked */
- if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+ if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) {
/* cookie not an index and is unbacked */
spin_unlock(&cookie->lock);
_leave(" = NULL [cookie ub,ni]");
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index d705125665f0..97137d7ec5ee 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -21,12 +21,54 @@ struct kmem_cache *fscache_cookie_jar;
static atomic_t fscache_object_debug_id = ATOMIC_INIT(0);
-static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie);
+#define fscache_cookie_hash_shift 15
+static struct hlist_bl_head fscache_cookie_hash[1 << fscache_cookie_hash_shift];
+
+static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie,
+ loff_t object_size);
static int fscache_alloc_object(struct fscache_cache *cache,
struct fscache_cookie *cookie);
static int fscache_attach_object(struct fscache_cookie *cookie,
struct fscache_object *object);
+static void fscache_print_cookie(struct fscache_cookie *cookie, char prefix)
+{
+ struct hlist_node *object;
+ const u8 *k;
+ unsigned loop;
+
+ pr_err("%c-cookie c=%p [p=%p fl=%lx nc=%u na=%u]\n",
+ prefix, cookie, cookie->parent, cookie->flags,
+ atomic_read(&cookie->n_children),
+ atomic_read(&cookie->n_active));
+ pr_err("%c-cookie d=%p n=%p\n",
+ prefix, cookie->def, cookie->netfs_data);
+
+ object = READ_ONCE(cookie->backing_objects.first);
+ if (object)
+ pr_err("%c-cookie o=%p\n",
+ prefix, hlist_entry(object, struct fscache_object, cookie_link));
+
+ pr_err("%c-key=[%u] '", prefix, cookie->key_len);
+ k = (cookie->key_len <= sizeof(cookie->inline_key)) ?
+ cookie->inline_key : cookie->key;
+ for (loop = 0; loop < cookie->key_len; loop++)
+ pr_cont("%02x", k[loop]);
+ pr_cont("'\n");
+}
+
+void fscache_free_cookie(struct fscache_cookie *cookie)
+{
+ if (cookie) {
+ BUG_ON(!hlist_empty(&cookie->backing_objects));
+ if (cookie->aux_len > sizeof(cookie->inline_aux))
+ kfree(cookie->aux);
+ if (cookie->key_len > sizeof(cookie->inline_key))
+ kfree(cookie->key);
+ kmem_cache_free(fscache_cookie_jar, cookie);
+ }
+}
+
/*
* initialise an cookie jar slab element prior to any use
*/
@@ -41,6 +83,170 @@ void fscache_cookie_init_once(void *_cookie)
}
/*
+ * Set the index key in a cookie. The cookie struct has space for a 12-byte
+ * key plus length and hash, but if that's not big enough, it's instead a
+ * pointer to a buffer containing 3 bytes of hash, 1 byte of length and then
+ * the key data.
+ */
+static int fscache_set_key(struct fscache_cookie *cookie,
+ const void *index_key, size_t index_key_len)
+{
+ unsigned long long h;
+ u32 *buf;
+ int i;
+
+ cookie->key_len = index_key_len;
+
+ if (index_key_len > sizeof(cookie->inline_key)) {
+ buf = kzalloc(index_key_len, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ cookie->key = buf;
+ } else {
+ buf = (u32 *)cookie->inline_key;
+ buf[0] = 0;
+ buf[1] = 0;
+ buf[2] = 0;
+ }
+
+ memcpy(buf, index_key, index_key_len);
+
+ /* Calculate a hash and combine this with the length in the first word
+ * or first half word
+ */
+ h = (unsigned long)cookie->parent;
+ h += index_key_len + cookie->type;
+ for (i = 0; i < (index_key_len + sizeof(u32) - 1) / sizeof(u32); i++)
+ h += buf[i];
+
+ cookie->key_hash = h ^ (h >> 32);
+ return 0;
+}
+
+static long fscache_compare_cookie(const struct fscache_cookie *a,
+ const struct fscache_cookie *b)
+{
+ const void *ka, *kb;
+
+ if (a->key_hash != b->key_hash)
+ return (long)a->key_hash - (long)b->key_hash;
+ if (a->parent != b->parent)
+ return (long)a->parent - (long)b->parent;
+ if (a->key_len != b->key_len)
+ return (long)a->key_len - (long)b->key_len;
+ if (a->type != b->type)
+ return (long)a->type - (long)b->type;
+
+ if (a->key_len <= sizeof(a->inline_key)) {
+ ka = &a->inline_key;
+ kb = &b->inline_key;
+ } else {
+ ka = a->key;
+ kb = b->key;
+ }
+ return memcmp(ka, kb, a->key_len);
+}
+
+/*
+ * Allocate a cookie.
+ */
+struct fscache_cookie *fscache_alloc_cookie(
+ struct fscache_cookie *parent,
+ const struct fscache_cookie_def *def,
+ const void *index_key, size_t index_key_len,
+ const void *aux_data, size_t aux_data_len,
+ void *netfs_data,
+ loff_t object_size)
+{
+ struct fscache_cookie *cookie;
+
+ /* allocate and initialise a cookie */
+ cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL);
+ if (!cookie)
+ return NULL;
+
+ cookie->key_len = index_key_len;
+ cookie->aux_len = aux_data_len;
+
+ if (fscache_set_key(cookie, index_key, index_key_len) < 0)
+ goto nomem;
+
+ if (cookie->aux_len <= sizeof(cookie->inline_aux)) {
+ memcpy(cookie->inline_aux, aux_data, cookie->aux_len);
+ } else {
+ cookie->aux = kmemdup(aux_data, cookie->aux_len, GFP_KERNEL);
+ if (!cookie->aux)
+ goto nomem;
+ }
+
+ atomic_set(&cookie->usage, 1);
+ atomic_set(&cookie->n_children, 0);
+
+ /* We keep the active count elevated until relinquishment to prevent an
+ * attempt to wake up every time the object operations queue quiesces.
+ */
+ atomic_set(&cookie->n_active, 1);
+
+ cookie->def = def;
+ cookie->parent = parent;
+ cookie->netfs_data = netfs_data;
+ cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET);
+ cookie->type = def->type;
+
+ /* radix tree insertion won't use the preallocation pool unless it's
+ * told it may not wait */
+ INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+ return cookie;
+
+nomem:
+ fscache_free_cookie(cookie);
+ return NULL;
+}
+
+/*
+ * Attempt to insert the new cookie into the hash. If there's a collision, we
+ * return the old cookie if it's not in use and an error otherwise.
+ */
+struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *candidate)
+{
+ struct fscache_cookie *cursor;
+ struct hlist_bl_head *h;
+ struct hlist_bl_node *p;
+ unsigned int bucket;
+
+ bucket = candidate->key_hash & (ARRAY_SIZE(fscache_cookie_hash) - 1);
+ h = &fscache_cookie_hash[bucket];
+
+ hlist_bl_lock(h);
+ hlist_bl_for_each_entry(cursor, p, h, hash_link) {
+ if (fscache_compare_cookie(candidate, cursor) == 0)
+ goto collision;
+ }
+
+ __set_bit(FSCACHE_COOKIE_ACQUIRED, &candidate->flags);
+ fscache_cookie_get(candidate->parent, fscache_cookie_get_acquire_parent);
+ atomic_inc(&candidate->parent->n_children);
+ hlist_bl_add_head(&candidate->hash_link, h);
+ hlist_bl_unlock(h);
+ return candidate;
+
+collision:
+ if (test_and_set_bit(FSCACHE_COOKIE_ACQUIRED, &cursor->flags)) {
+ trace_fscache_cookie(cursor, fscache_cookie_collision,
+ atomic_read(&cursor->usage));
+ pr_err("Duplicate cookie detected\n");
+ fscache_print_cookie(cursor, 'O');
+ fscache_print_cookie(candidate, 'N');
+ hlist_bl_unlock(h);
+ return NULL;
+ }
+
+ fscache_cookie_get(cursor, fscache_cookie_get_reacquire);
+ hlist_bl_unlock(h);
+ return cursor;
+}
+
+/*
* request a cookie to represent an object (index, datafile, xattr, etc)
* - parent specifies the parent object
* - the top level index cookie for each netfs is stored in the fscache_netfs
@@ -58,10 +264,13 @@ void fscache_cookie_init_once(void *_cookie)
struct fscache_cookie *__fscache_acquire_cookie(
struct fscache_cookie *parent,
const struct fscache_cookie_def *def,
+ const void *index_key, size_t index_key_len,
+ const void *aux_data, size_t aux_data_len,
void *netfs_data,
+ loff_t object_size,
bool enable)
{
- struct fscache_cookie *cookie;
+ struct fscache_cookie *candidate, *cookie;
BUG_ON(!def);
@@ -69,6 +278,13 @@ struct fscache_cookie *__fscache_acquire_cookie(
parent ? (char *) parent->def->name : "<no-parent>",
def->name, netfs_data, enable);
+ if (!index_key || !index_key_len || index_key_len > 255 || aux_data_len > 255)
+ return NULL;
+ if (!aux_data || !aux_data_len) {
+ aux_data = NULL;
+ aux_data_len = 0;
+ }
+
fscache_stat(&fscache_n_acquires);
/* if there's no parent cookie, then we don't create one here either */
@@ -79,41 +295,31 @@ struct fscache_cookie *__fscache_acquire_cookie(
}
/* validate the definition */
- BUG_ON(!def->get_key);
BUG_ON(!def->name[0]);
BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX &&
- parent->def->type != FSCACHE_COOKIE_TYPE_INDEX);
+ parent->type != FSCACHE_COOKIE_TYPE_INDEX);
- /* allocate and initialise a cookie */
- cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL);
- if (!cookie) {
+ candidate = fscache_alloc_cookie(parent, def,
+ index_key, index_key_len,
+ aux_data, aux_data_len,
+ netfs_data, object_size);
+ if (!candidate) {
fscache_stat(&fscache_n_acquires_oom);
_leave(" [ENOMEM]");
return NULL;
}
- atomic_set(&cookie->usage, 1);
- atomic_set(&cookie->n_children, 0);
-
- /* We keep the active count elevated until relinquishment to prevent an
- * attempt to wake up every time the object operations queue quiesces.
- */
- atomic_set(&cookie->n_active, 1);
-
- atomic_inc(&parent->usage);
- atomic_inc(&parent->n_children);
+ cookie = fscache_hash_cookie(candidate);
+ if (!cookie) {
+ trace_fscache_cookie(candidate, fscache_cookie_discard, 1);
+ goto out;
+ }
- cookie->def = def;
- cookie->parent = parent;
- cookie->netfs_data = netfs_data;
- cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET);
+ if (cookie == candidate)
+ candidate = NULL;
- /* radix tree insertion won't use the preallocation pool unless it's
- * told it may not wait */
- INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
-
- switch (cookie->def->type) {
+ switch (cookie->type) {
case FSCACHE_COOKIE_TYPE_INDEX:
fscache_stat(&fscache_n_cookie_index);
break;
@@ -125,16 +331,19 @@ struct fscache_cookie *__fscache_acquire_cookie(
break;
}
+ trace_fscache_acquire(cookie);
+
if (enable) {
/* if the object is an index then we need do nothing more here
* - we create indices on disk when we need them as an index
* may exist in multiple caches */
- if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
- if (fscache_acquire_non_index_cookie(cookie) == 0) {
+ if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) {
+ if (fscache_acquire_non_index_cookie(cookie, object_size) == 0) {
set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
} else {
atomic_dec(&parent->n_children);
- __fscache_cookie_put(cookie);
+ fscache_cookie_put(cookie,
+ fscache_cookie_put_acquire_nobufs);
fscache_stat(&fscache_n_acquires_nobufs);
_leave(" = NULL");
return NULL;
@@ -145,7 +354,9 @@ struct fscache_cookie *__fscache_acquire_cookie(
}
fscache_stat(&fscache_n_acquires_ok);
- _leave(" = %p", cookie);
+
+out:
+ fscache_free_cookie(candidate);
return cookie;
}
EXPORT_SYMBOL(__fscache_acquire_cookie);
@@ -154,24 +365,30 @@ EXPORT_SYMBOL(__fscache_acquire_cookie);
* Enable a cookie to permit it to accept new operations.
*/
void __fscache_enable_cookie(struct fscache_cookie *cookie,
+ const void *aux_data,
+ loff_t object_size,
bool (*can_enable)(void *data),
void *data)
{
_enter("%p", cookie);
+ trace_fscache_enable(cookie);
+
wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
TASK_UNINTERRUPTIBLE);
+ fscache_update_aux(cookie, aux_data);
+
if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
goto out_unlock;
if (can_enable && !can_enable(data)) {
/* The netfs decided it didn't want to enable after all */
- } else if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+ } else if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) {
/* Wait for outstanding disablement to complete */
__fscache_wait_on_invalidate(cookie);
- if (fscache_acquire_non_index_cookie(cookie) == 0)
+ if (fscache_acquire_non_index_cookie(cookie, object_size) == 0)
set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
} else {
set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
@@ -188,11 +405,11 @@ EXPORT_SYMBOL(__fscache_enable_cookie);
* - this must make sure the index chain is instantiated and instantiate the
* object representation too
*/
-static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
+static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie,
+ loff_t object_size)
{
struct fscache_object *object;
struct fscache_cache *cache;
- uint64_t i_size;
int ret;
_enter("");
@@ -231,9 +448,6 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
return ret;
}
- /* pass on how big the object we're caching is supposed to be */
- cookie->def->get_attr(cookie->netfs_data, &i_size);
-
spin_lock(&cookie->lock);
if (hlist_empty(&cookie->backing_objects)) {
spin_unlock(&cookie->lock);
@@ -243,7 +457,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
object = hlist_entry(cookie->backing_objects.first,
struct fscache_object, cookie_link);
- fscache_set_store_limit(object, i_size);
+ fscache_set_store_limit(object, object_size);
/* initiate the process of looking up all the objects in the chain
* (done by fscache_initialise_object()) */
@@ -318,7 +532,7 @@ static int fscache_alloc_object(struct fscache_cache *cache,
* attached to the cookie */
if (fscache_attach_object(cookie, object) < 0) {
fscache_stat(&fscache_n_cop_put_object);
- cache->ops->put_object(object);
+ cache->ops->put_object(object, fscache_obj_put_attach_fail);
fscache_stat_d(&fscache_n_cop_put_object);
}
@@ -338,7 +552,7 @@ object_already_extant:
error_put:
fscache_stat(&fscache_n_cop_put_object);
- cache->ops->put_object(object);
+ cache->ops->put_object(object, fscache_obj_put_alloc_fail);
fscache_stat_d(&fscache_n_cop_put_object);
error:
_leave(" = %d", ret);
@@ -398,7 +612,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
/* attach to the cookie */
object->cookie = cookie;
- atomic_inc(&cookie->usage);
+ fscache_cookie_get(cookie, fscache_cookie_get_attach_object);
hlist_add_head(&object->cookie_link, &cookie->backing_objects);
fscache_objlist_add(object);
@@ -426,10 +640,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie)
* there, and if it's doing that, it may as well just retire the
* cookie.
*/
- ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE);
-
- /* We will be updating the cookie too. */
- BUG_ON(!cookie->def->get_aux);
+ ASSERTCMP(cookie->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE);
/* If there's an object, we tell the object state machine to handle the
* invalidation on our behalf, otherwise there's nothing to do.
@@ -473,7 +684,7 @@ EXPORT_SYMBOL(__fscache_wait_on_invalidate);
/*
* update the index entries backing a cookie
*/
-void __fscache_update_cookie(struct fscache_cookie *cookie)
+void __fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data)
{
struct fscache_object *object;
@@ -487,10 +698,10 @@ void __fscache_update_cookie(struct fscache_cookie *cookie)
_enter("{%s}", cookie->def->name);
- BUG_ON(!cookie->def->get_aux);
-
spin_lock(&cookie->lock);
+ fscache_update_aux(cookie, aux_data);
+
if (fscache_cookie_enabled(cookie)) {
/* update the index entry on disk in each cache backing this
* cookie.
@@ -509,13 +720,17 @@ EXPORT_SYMBOL(__fscache_update_cookie);
/*
* Disable a cookie to stop it from accepting new requests from the netfs.
*/
-void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
+void __fscache_disable_cookie(struct fscache_cookie *cookie,
+ const void *aux_data,
+ bool invalidate)
{
struct fscache_object *object;
bool awaken = false;
_enter("%p,%u", cookie, invalidate);
+ trace_fscache_disable(cookie);
+
ASSERTCMP(atomic_read(&cookie->n_active), >, 0);
if (atomic_read(&cookie->n_children) != 0) {
@@ -526,6 +741,9 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
TASK_UNINTERRUPTIBLE);
+
+ fscache_update_aux(cookie, aux_data);
+
if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
goto out_unlock_enable;
@@ -563,7 +781,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
}
/* Make sure any pending writes are cancelled. */
- if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
+ if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX)
fscache_invalidate_writes(cookie);
/* Reset the cookie state if it wasn't relinquished */
@@ -585,7 +803,9 @@ EXPORT_SYMBOL(__fscache_disable_cookie);
* - all dependents of this cookie must have already been unregistered
* (indices/files/pages)
*/
-void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire)
+void __fscache_relinquish_cookie(struct fscache_cookie *cookie,
+ const void *aux_data,
+ bool retire)
{
fscache_stat(&fscache_n_relinquishes);
if (retire)
@@ -601,15 +821,18 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire)
cookie, cookie->def->name, cookie->netfs_data,
atomic_read(&cookie->n_active), retire);
+ trace_fscache_relinquish(cookie, retire);
+
/* No further netfs-accessing operations on this cookie permitted */
- set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags);
+ if (test_and_set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags))
+ BUG();
- __fscache_disable_cookie(cookie, retire);
+ __fscache_disable_cookie(cookie, aux_data, retire);
/* Clear pointers back to the netfs */
cookie->netfs_data = NULL;
cookie->def = NULL;
- BUG_ON(cookie->stores.rnode);
+ BUG_ON(!radix_tree_empty(&cookie->stores));
if (cookie->parent) {
ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
@@ -619,35 +842,54 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire)
/* Dispose of the netfs's link to the cookie */
ASSERTCMP(atomic_read(&cookie->usage), >, 0);
- fscache_cookie_put(cookie);
+ fscache_cookie_put(cookie, fscache_cookie_put_relinquish);
_leave("");
}
EXPORT_SYMBOL(__fscache_relinquish_cookie);
/*
- * destroy a cookie
+ * Remove a cookie from the hash table.
*/
-void __fscache_cookie_put(struct fscache_cookie *cookie)
+static void fscache_unhash_cookie(struct fscache_cookie *cookie)
+{
+ struct hlist_bl_head *h;
+ unsigned int bucket;
+
+ bucket = cookie->key_hash & (ARRAY_SIZE(fscache_cookie_hash) - 1);
+ h = &fscache_cookie_hash[bucket];
+
+ hlist_bl_lock(h);
+ hlist_bl_del(&cookie->hash_link);
+ hlist_bl_unlock(h);
+}
+
+/*
+ * Drop a reference to a cookie.
+ */
+void fscache_cookie_put(struct fscache_cookie *cookie,
+ enum fscache_cookie_trace where)
{
struct fscache_cookie *parent;
+ int usage;
_enter("%p", cookie);
- for (;;) {
- _debug("FREE COOKIE %p", cookie);
- parent = cookie->parent;
- BUG_ON(!hlist_empty(&cookie->backing_objects));
- kmem_cache_free(fscache_cookie_jar, cookie);
+ do {
+ usage = atomic_dec_return(&cookie->usage);
+ trace_fscache_cookie(cookie, where, usage);
- if (!parent)
- break;
+ if (usage > 0)
+ return;
+ BUG_ON(usage < 0);
+
+ parent = cookie->parent;
+ fscache_unhash_cookie(cookie);
+ fscache_free_cookie(cookie);
cookie = parent;
- BUG_ON(atomic_read(&cookie->usage) <= 0);
- if (!atomic_dec_and_test(&cookie->usage))
- break;
- }
+ where = fscache_cookie_put_parent;
+ } while (cookie);
_leave("");
}
@@ -657,7 +899,8 @@ void __fscache_cookie_put(struct fscache_cookie *cookie)
*
* NOTE: it only serves no-index type
*/
-int __fscache_check_consistency(struct fscache_cookie *cookie)
+int __fscache_check_consistency(struct fscache_cookie *cookie,
+ const void *aux_data)
{
struct fscache_operation *op;
struct fscache_object *object;
@@ -666,7 +909,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
_enter("%p,", cookie);
- ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE);
+ ASSERTCMP(cookie->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE);
if (fscache_wait_for_deferred_lookup(cookie) < 0)
return -ERESTARTSYS;
@@ -678,13 +921,16 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
if (!op)
return -ENOMEM;
- fscache_operation_init(op, NULL, NULL, NULL);
+ fscache_operation_init(cookie, op, NULL, NULL, NULL);
op->flags = FSCACHE_OP_MYTHREAD |
(1 << FSCACHE_OP_WAITING) |
(1 << FSCACHE_OP_UNUSE_COOKIE);
+ trace_fscache_page_op(cookie, NULL, op, fscache_page_op_check_consistency);
spin_lock(&cookie->lock);
+ fscache_update_aux(cookie, aux_data);
+
if (!fscache_cookie_enabled(cookie) ||
hlist_empty(&cookie->backing_objects))
goto inconsistent;
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
index 5a117df2a9ef..aa46e48d8c75 100644
--- a/fs/fscache/fsdef.c
+++ b/fs/fscache/fsdef.c
@@ -13,16 +13,11 @@
#include <linux/module.h>
#include "internal.h"
-static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax);
-
-static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax);
-
static
enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data,
const void *data,
- uint16_t datalen);
+ uint16_t datalen,
+ loff_t object_size);
/*
* The root index is owned by FS-Cache itself.
@@ -60,6 +55,7 @@ struct fscache_cookie fscache_fsdef_index = {
.backing_objects = HLIST_HEAD_INIT,
.def = &fscache_fsdef_index_def,
.flags = 1 << FSCACHE_COOKIE_ENABLED,
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
};
EXPORT_SYMBOL(fscache_fsdef_index);
@@ -71,59 +67,18 @@ EXPORT_SYMBOL(fscache_fsdef_index);
struct fscache_cookie_def fscache_fsdef_netfs_def = {
.name = "FSDEF.netfs",
.type = FSCACHE_COOKIE_TYPE_INDEX,
- .get_key = fscache_fsdef_netfs_get_key,
- .get_aux = fscache_fsdef_netfs_get_aux,
.check_aux = fscache_fsdef_netfs_check_aux,
};
/*
- * get the key data for an FSDEF index record - this is the name of the netfs
- * for which this entry is created
- */
-static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct fscache_netfs *netfs = cookie_netfs_data;
- unsigned klen;
-
- _enter("{%s.%u},", netfs->name, netfs->version);
-
- klen = strlen(netfs->name);
- if (klen > bufmax)
- return 0;
-
- memcpy(buffer, netfs->name, klen);
- return klen;
-}
-
-/*
- * get the auxiliary data for an FSDEF index record - this is the index
- * structure version number of the netfs for which this version is created
- */
-static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct fscache_netfs *netfs = cookie_netfs_data;
- unsigned dlen;
-
- _enter("{%s.%u},", netfs->name, netfs->version);
-
- dlen = sizeof(uint32_t);
- if (dlen > bufmax)
- return 0;
-
- memcpy(buffer, &netfs->version, dlen);
- return dlen;
-}
-
-/*
* check that the index structure version number stored in the auxiliary data
* matches the one the netfs gave us
*/
static enum fscache_checkaux fscache_fsdef_netfs_check_aux(
void *cookie_netfs_data,
const void *data,
- uint16_t datalen)
+ uint16_t datalen,
+ loff_t object_size)
{
struct fscache_netfs *netfs = cookie_netfs_data;
uint32_t version;
diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c
index 15a3d042247e..9a13e9e15b69 100644
--- a/fs/fscache/histogram.c
+++ b/fs/fscache/histogram.c
@@ -83,24 +83,9 @@ static void fscache_histogram_stop(struct seq_file *m, void *v)
{
}
-static const struct seq_operations fscache_histogram_ops = {
+const struct seq_operations fscache_histogram_ops = {
.start = fscache_histogram_start,
.stop = fscache_histogram_stop,
.next = fscache_histogram_next,
.show = fscache_histogram_show,
};
-
-/*
- * open "/proc/fs/fscache/histogram" to provide latency data
- */
-static int fscache_histogram_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &fscache_histogram_ops);
-}
-
-const struct file_operations fscache_histogram_fops = {
- .open = fscache_histogram_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 0ff4b49a0037..f83328a7f048 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -29,7 +29,9 @@
#define pr_fmt(fmt) "FS-Cache: " fmt
#include <linux/fscache-cache.h>
+#include <trace/events/fscache.h>
#include <linux/sched.h>
+#include <linux/seq_file.h>
#define FSCACHE_MIN_THREADS 4
#define FSCACHE_MAX_THREADS 32
@@ -48,8 +50,16 @@ extern struct fscache_cache *fscache_select_cache_for_object(
*/
extern struct kmem_cache *fscache_cookie_jar;
+extern void fscache_free_cookie(struct fscache_cookie *);
extern void fscache_cookie_init_once(void *);
-extern void __fscache_cookie_put(struct fscache_cookie *);
+extern struct fscache_cookie *fscache_alloc_cookie(struct fscache_cookie *,
+ const struct fscache_cookie_def *,
+ const void *, size_t,
+ const void *, size_t,
+ void *, loff_t);
+extern struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *);
+extern void fscache_cookie_put(struct fscache_cookie *,
+ enum fscache_cookie_trace);
/*
* fsdef.c
@@ -75,7 +85,7 @@ static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif)
atomic_inc(&histogram[jif]);
}
-extern const struct file_operations fscache_histogram_fops;
+extern const struct seq_operations fscache_histogram_ops;
#else
#define fscache_hist(hist, start_jif) do {} while (0)
@@ -285,7 +295,7 @@ static inline void fscache_stat_d(atomic_t *stat)
#define __fscache_stat(stat) (stat)
-extern const struct file_operations fscache_stats_fops;
+int fscache_stats_show(struct seq_file *m, void *v);
#else
#define __fscache_stat(stat) (NULL)
@@ -311,14 +321,12 @@ static inline void fscache_raise_event(struct fscache_object *object,
fscache_enqueue_object(object);
}
-/*
- * drop a reference to a cookie
- */
-static inline void fscache_cookie_put(struct fscache_cookie *cookie)
+static inline void fscache_cookie_get(struct fscache_cookie *cookie,
+ enum fscache_cookie_trace where)
{
- BUG_ON(atomic_read(&cookie->usage) <= 0);
- if (atomic_dec_and_test(&cookie->usage))
- __fscache_cookie_put(cookie);
+ int usage = atomic_inc_return(&cookie->usage);
+
+ trace_fscache_cookie(cookie, where, usage);
}
/*
@@ -342,6 +350,27 @@ void fscache_put_context(struct fscache_cookie *cookie, void *context)
cookie->def->put_context(cookie->netfs_data, context);
}
+/*
+ * Update the auxiliary data on a cookie.
+ */
+static inline
+void fscache_update_aux(struct fscache_cookie *cookie, const void *aux_data)
+{
+ void *p;
+
+ if (!aux_data)
+ return;
+ if (cookie->aux_len <= sizeof(cookie->inline_aux))
+ p = cookie->inline_aux;
+ else
+ p = cookie->aux;
+
+ if (memcmp(p, aux_data, cookie->aux_len) != 0) {
+ memcpy(p, aux_data, cookie->aux_len);
+ set_bit(FSCACHE_COOKIE_AUX_UPDATED, &cookie->flags);
+ }
+}
+
/*****************************************************************************/
/*
* debug tracing
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index 249968dcbf5c..7dce110bf17d 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -16,6 +16,7 @@
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
+#define CREATE_TRACE_POINTS
#include "internal.h"
MODULE_DESCRIPTION("FS Cache Manager");
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index a8aa00be4444..c2f605483cc5 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -14,69 +14,51 @@
#include <linux/slab.h>
#include "internal.h"
-static LIST_HEAD(fscache_netfs_list);
-
/*
* register a network filesystem for caching
*/
int __fscache_register_netfs(struct fscache_netfs *netfs)
{
- struct fscache_netfs *ptr;
- struct fscache_cookie *cookie;
- int ret;
+ struct fscache_cookie *candidate, *cookie;
_enter("{%s}", netfs->name);
- INIT_LIST_HEAD(&netfs->link);
-
/* allocate a cookie for the primary index */
- cookie = kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
-
- if (!cookie) {
+ candidate = fscache_alloc_cookie(&fscache_fsdef_index,
+ &fscache_fsdef_netfs_def,
+ netfs->name, strlen(netfs->name),
+ &netfs->version, sizeof(netfs->version),
+ netfs, 0);
+ if (!candidate) {
_leave(" = -ENOMEM");
return -ENOMEM;
}
- /* initialise the primary index cookie */
- atomic_set(&cookie->usage, 1);
- atomic_set(&cookie->n_children, 0);
- atomic_set(&cookie->n_active, 1);
-
- cookie->def = &fscache_fsdef_netfs_def;
- cookie->parent = &fscache_fsdef_index;
- cookie->netfs_data = netfs;
- cookie->flags = 1 << FSCACHE_COOKIE_ENABLED;
-
- spin_lock_init(&cookie->lock);
- spin_lock_init(&cookie->stores_lock);
- INIT_HLIST_HEAD(&cookie->backing_objects);
+ candidate->flags = 1 << FSCACHE_COOKIE_ENABLED;
/* check the netfs type is not already present */
- down_write(&fscache_addremove_sem);
-
- ret = -EEXIST;
- list_for_each_entry(ptr, &fscache_netfs_list, link) {
- if (strcmp(ptr->name, netfs->name) == 0)
- goto already_registered;
+ cookie = fscache_hash_cookie(candidate);
+ if (!cookie)
+ goto already_registered;
+ if (cookie != candidate) {
+ trace_fscache_cookie(candidate, fscache_cookie_discard, 1);
+ fscache_free_cookie(candidate);
}
- atomic_inc(&cookie->parent->usage);
+ fscache_cookie_get(cookie->parent, fscache_cookie_get_register_netfs);
atomic_inc(&cookie->parent->n_children);
netfs->primary_index = cookie;
- list_add(&netfs->link, &fscache_netfs_list);
- ret = 0;
pr_notice("Netfs '%s' registered for caching\n", netfs->name);
+ trace_fscache_netfs(netfs);
+ _leave(" = 0");
+ return 0;
already_registered:
- up_write(&fscache_addremove_sem);
-
- if (ret < 0)
- kmem_cache_free(fscache_cookie_jar, cookie);
-
- _leave(" = %d", ret);
- return ret;
+ fscache_cookie_put(candidate, fscache_cookie_put_dup_netfs);
+ _leave(" = -EEXIST");
+ return -EEXIST;
}
EXPORT_SYMBOL(__fscache_register_netfs);
@@ -88,15 +70,8 @@ void __fscache_unregister_netfs(struct fscache_netfs *netfs)
{
_enter("{%s.%u}", netfs->name, netfs->version);
- down_write(&fscache_addremove_sem);
-
- list_del(&netfs->link);
- fscache_relinquish_cookie(netfs->primary_index, 0);
-
- up_write(&fscache_addremove_sem);
-
- pr_notice("Netfs '%s' unregistered from caching\n",
- netfs->name);
+ fscache_relinquish_cookie(netfs->primary_index, NULL, false);
+ pr_notice("Netfs '%s' unregistered from caching\n", netfs->name);
_leave("");
}
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 0438d4cd91ef..43e6e28c164f 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -36,8 +36,6 @@ struct fscache_objlist_data {
#define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */
#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with work */
#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without work */
-
- u8 buf[512]; /* key and aux data buffer */
};
/*
@@ -170,7 +168,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
struct fscache_cookie *cookie;
unsigned long config = data->config;
char _type[3], *type;
- u8 *buf = data->buf, *p;
+ u8 *p;
if ((unsigned long) v == 1) {
seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS"
@@ -254,7 +252,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
if (fscache_use_cookie(obj)) {
uint16_t keylen = 0, auxlen = 0;
- switch (cookie->def->type) {
+ switch (cookie->type) {
case 0:
type = "IX";
break;
@@ -263,7 +261,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
break;
default:
snprintf(_type, sizeof(_type), "%02u",
- cookie->def->type);
+ cookie->type);
type = _type;
break;
}
@@ -274,30 +272,30 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
cookie->flags,
cookie->netfs_data);
- if (cookie->def->get_key &&
- config & FSCACHE_OBJLIST_CONFIG_KEY)
- keylen = cookie->def->get_key(cookie->netfs_data,
- buf, 400);
+ if (config & FSCACHE_OBJLIST_CONFIG_KEY)
+ keylen = cookie->key_len;
- if (cookie->def->get_aux &&
- config & FSCACHE_OBJLIST_CONFIG_AUX)
- auxlen = cookie->def->get_aux(cookie->netfs_data,
- buf + keylen, 512 - keylen);
- fscache_unuse_cookie(obj);
+ if (config & FSCACHE_OBJLIST_CONFIG_AUX)
+ auxlen = cookie->aux_len;
if (keylen > 0 || auxlen > 0) {
seq_puts(m, " ");
- for (p = buf; keylen > 0; keylen--)
+ p = keylen <= sizeof(cookie->inline_key) ?
+ cookie->inline_key : cookie->key;
+ for (; keylen > 0; keylen--)
seq_printf(m, "%02x", *p++);
if (auxlen > 0) {
if (config & FSCACHE_OBJLIST_CONFIG_KEY)
seq_puts(m, ", ");
+ p = auxlen <= sizeof(cookie->inline_aux) ?
+ cookie->inline_aux : cookie->aux;
for (; auxlen > 0; auxlen--)
seq_printf(m, "%02x", *p++);
}
}
seq_puts(m, "\n");
+ fscache_unuse_cookie(obj);
} else {
seq_puts(m, "<no_netfs>\n");
}
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 7a182c87f378..20e0d0a4dc8c 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -138,10 +138,13 @@ static const struct fscache_transition fscache_osm_run_oob[] = {
{ 0, NULL }
};
-static int fscache_get_object(struct fscache_object *);
-static void fscache_put_object(struct fscache_object *);
+static int fscache_get_object(struct fscache_object *,
+ enum fscache_obj_ref_trace);
+static void fscache_put_object(struct fscache_object *,
+ enum fscache_obj_ref_trace);
static bool fscache_enqueue_dependents(struct fscache_object *, int);
static void fscache_dequeue_object(struct fscache_object *);
+static void fscache_update_aux_data(struct fscache_object *);
/*
* we need to notify the parent when an op completes that we had outstanding
@@ -170,6 +173,7 @@ static void fscache_object_sm_dispatcher(struct fscache_object *object)
const struct fscache_transition *t;
const struct fscache_state *state, *new_state;
unsigned long events, event_mask;
+ bool oob;
int event = -1;
ASSERT(object != NULL);
@@ -188,6 +192,7 @@ restart_masked:
if (events & object->oob_event_mask) {
_debug("{OBJ%x} oob %lx",
object->debug_id, events & object->oob_event_mask);
+ oob = true;
for (t = object->oob_table; t->events; t++) {
if (events & t->events) {
state = t->transit_to;
@@ -199,6 +204,7 @@ restart_masked:
}
}
}
+ oob = false;
/* Wait states are just transition tables */
if (!state->work) {
@@ -207,6 +213,8 @@ restart_masked:
if (events & t->events) {
new_state = t->transit_to;
event = fls(events & t->events) - 1;
+ trace_fscache_osm(object, state,
+ true, false, event);
clear_bit(event, &object->events);
_debug("{OBJ%x} ev %d: %s -> %s",
object->debug_id, event,
@@ -226,6 +234,7 @@ restart_masked:
execute_work_state:
_debug("{OBJ%x} exec %s", object->debug_id, state->name);
+ trace_fscache_osm(object, state, false, oob, event);
new_state = state->work(object, event);
event = -1;
if (new_state == NO_TRANSIT) {
@@ -279,7 +288,7 @@ static void fscache_object_work_func(struct work_struct *work)
start = jiffies;
fscache_object_sm_dispatcher(object);
fscache_hist(fscache_objs_histogram, start);
- fscache_put_object(object);
+ fscache_put_object(object, fscache_obj_put_work);
}
/**
@@ -397,7 +406,7 @@ static const struct fscache_state *fscache_initialise_object(struct fscache_obje
fscache_stat(&fscache_n_cop_grab_object);
success = false;
if (fscache_object_is_live(parent) &&
- object->cache->ops->grab_object(object)) {
+ object->cache->ops->grab_object(object, fscache_obj_get_add_to_deps)) {
list_add(&object->dep_link, &parent->dependents);
success = true;
}
@@ -703,6 +712,11 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob
ASSERT(cookie != NULL);
ASSERT(!hlist_unhashed(&object->cookie_link));
+ if (test_bit(FSCACHE_COOKIE_AUX_UPDATED, &cookie->flags)) {
+ _debug("final update");
+ fscache_update_aux_data(object);
+ }
+
/* Make sure the cookie no longer points here and that the netfs isn't
* waiting for us.
*/
@@ -745,7 +759,7 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob
}
/* this just shifts the object release to the work processor */
- fscache_put_object(object);
+ fscache_put_object(object, fscache_obj_put_drop_obj);
fscache_stat(&fscache_n_object_dead);
_leave("");
@@ -755,12 +769,13 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob
/*
* get a ref on an object
*/
-static int fscache_get_object(struct fscache_object *object)
+static int fscache_get_object(struct fscache_object *object,
+ enum fscache_obj_ref_trace why)
{
int ret;
fscache_stat(&fscache_n_cop_grab_object);
- ret = object->cache->ops->grab_object(object) ? 0 : -EAGAIN;
+ ret = object->cache->ops->grab_object(object, why) ? 0 : -EAGAIN;
fscache_stat_d(&fscache_n_cop_grab_object);
return ret;
}
@@ -768,10 +783,11 @@ static int fscache_get_object(struct fscache_object *object)
/*
* Discard a ref on an object
*/
-static void fscache_put_object(struct fscache_object *object)
+static void fscache_put_object(struct fscache_object *object,
+ enum fscache_obj_ref_trace why)
{
fscache_stat(&fscache_n_cop_put_object);
- object->cache->ops->put_object(object);
+ object->cache->ops->put_object(object, why);
fscache_stat_d(&fscache_n_cop_put_object);
}
@@ -786,7 +802,7 @@ void fscache_object_destroy(struct fscache_object *object)
fscache_objlist_remove(object);
/* We can get rid of the cookie now */
- fscache_cookie_put(object->cookie);
+ fscache_cookie_put(object->cookie, fscache_cookie_put_object);
object->cookie = NULL;
}
EXPORT_SYMBOL(fscache_object_destroy);
@@ -798,7 +814,7 @@ void fscache_enqueue_object(struct fscache_object *object)
{
_enter("{OBJ%x}", object->debug_id);
- if (fscache_get_object(object) >= 0) {
+ if (fscache_get_object(object, fscache_obj_get_queue) >= 0) {
wait_queue_head_t *cong_wq =
&get_cpu_var(fscache_object_cong_wait);
@@ -806,7 +822,7 @@ void fscache_enqueue_object(struct fscache_object *object)
if (fscache_object_congested())
wake_up(cong_wq);
} else
- fscache_put_object(object);
+ fscache_put_object(object, fscache_obj_put_queue);
put_cpu_var(fscache_object_cong_wait);
}
@@ -866,7 +882,7 @@ static bool fscache_enqueue_dependents(struct fscache_object *object, int event)
list_del_init(&dep->dep_link);
fscache_raise_event(dep, event);
- fscache_put_object(dep);
+ fscache_put_object(dep, fscache_obj_put_enq_dep);
if (!list_empty(&object->dependents) && need_resched()) {
ret = false;
@@ -906,7 +922,8 @@ static void fscache_dequeue_object(struct fscache_object *object)
* and creation).
*/
enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
- const void *data, uint16_t datalen)
+ const void *data, uint16_t datalen,
+ loff_t object_size)
{
enum fscache_checkaux result;
@@ -916,7 +933,7 @@ enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
}
result = object->cookie->def->check_aux(object->cookie->netfs_data,
- data, datalen);
+ data, datalen, object_size);
switch (result) {
/* entry okay as is */
case FSCACHE_CHECKAUX_OKAY:
@@ -956,7 +973,7 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj
* retire the object instead.
*/
if (!fscache_use_cookie(object)) {
- ASSERT(object->cookie->stores.rnode == NULL);
+ ASSERT(radix_tree_empty(&object->cookie->stores));
set_bit(FSCACHE_OBJECT_RETIRED, &object->flags);
_leave(" [no cookie]");
return transit_to(KILL_OBJECT);
@@ -972,11 +989,12 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj
if (!op)
goto nomem;
- fscache_operation_init(op, object->cache->ops->invalidate_object,
+ fscache_operation_init(cookie, op, object->cache->ops->invalidate_object,
NULL, NULL);
op->flags = FSCACHE_OP_ASYNC |
(1 << FSCACHE_OP_EXCLUSIVE) |
(1 << FSCACHE_OP_UNUSE_COOKIE);
+ trace_fscache_page_op(cookie, NULL, op, fscache_page_op_invalidate);
spin_lock(&cookie->lock);
if (fscache_submit_exclusive_op(object, op) < 0)
@@ -1026,6 +1044,17 @@ static const struct fscache_state *fscache_invalidate_object(struct fscache_obje
}
/*
+ * Update auxiliary data.
+ */
+static void fscache_update_aux_data(struct fscache_object *object)
+{
+ fscache_stat(&fscache_n_updates_run);
+ fscache_stat(&fscache_n_cop_update_object);
+ object->cache->ops->update_object(object);
+ fscache_stat_d(&fscache_n_cop_update_object);
+}
+
+/*
* Asynchronously update an object.
*/
static const struct fscache_state *fscache_update_object(struct fscache_object *object,
@@ -1033,10 +1062,7 @@ static const struct fscache_state *fscache_update_object(struct fscache_object *
{
_enter("{OBJ%x},%d", object->debug_id, event);
- fscache_stat(&fscache_n_updates_run);
- fscache_stat(&fscache_n_cop_update_object);
- object->cache->ops->update_object(object);
- fscache_stat_d(&fscache_n_cop_update_object);
+ fscache_update_aux_data(object);
_leave("");
return transit_to(WAIT_FOR_CMD);
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index de67745e1cd7..e30c5975ea58 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -32,7 +32,8 @@ static void fscache_operation_dummy_cancel(struct fscache_operation *op)
* Do basic initialisation of an operation. The caller must still set flags,
* object and processor if needed.
*/
-void fscache_operation_init(struct fscache_operation *op,
+void fscache_operation_init(struct fscache_cookie *cookie,
+ struct fscache_operation *op,
fscache_operation_processor_t processor,
fscache_operation_cancel_t cancel,
fscache_operation_release_t release)
@@ -46,6 +47,7 @@ void fscache_operation_init(struct fscache_operation *op,
op->release = release;
INIT_LIST_HEAD(&op->pend_link);
fscache_stat(&fscache_n_op_initialised);
+ trace_fscache_op(cookie, op, fscache_op_init);
}
EXPORT_SYMBOL(fscache_operation_init);
@@ -59,6 +61,8 @@ EXPORT_SYMBOL(fscache_operation_init);
*/
void fscache_enqueue_operation(struct fscache_operation *op)
{
+ struct fscache_cookie *cookie = op->object->cookie;
+
_enter("{OBJ%x OP%x,%u}",
op->object->debug_id, op->debug_id, atomic_read(&op->usage));
@@ -71,12 +75,14 @@ void fscache_enqueue_operation(struct fscache_operation *op)
fscache_stat(&fscache_n_op_enqueue);
switch (op->flags & FSCACHE_OP_TYPE) {
case FSCACHE_OP_ASYNC:
+ trace_fscache_op(cookie, op, fscache_op_enqueue_async);
_debug("queue async");
atomic_inc(&op->usage);
if (!queue_work(fscache_op_wq, &op->work))
fscache_put_operation(op);
break;
case FSCACHE_OP_MYTHREAD:
+ trace_fscache_op(cookie, op, fscache_op_enqueue_mythread);
_debug("queue for caller's attention");
break;
default:
@@ -101,6 +107,8 @@ static void fscache_run_op(struct fscache_object *object,
wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
if (op->processor)
fscache_enqueue_operation(op);
+ else
+ trace_fscache_op(object->cookie, op, fscache_op_run);
fscache_stat(&fscache_n_op_run);
}
@@ -155,6 +163,8 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
_enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
+ trace_fscache_op(object->cookie, op, fscache_op_submit_ex);
+
ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED);
ASSERTCMP(atomic_read(&op->usage), >, 0);
@@ -240,6 +250,8 @@ int fscache_submit_op(struct fscache_object *object,
_enter("{OBJ%x OP%x},{%u}",
object->debug_id, op->debug_id, atomic_read(&op->usage));
+ trace_fscache_op(object->cookie, op, fscache_op_submit);
+
ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED);
ASSERTCMP(atomic_read(&op->usage), >, 0);
@@ -357,6 +369,8 @@ int fscache_cancel_op(struct fscache_operation *op,
_enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id);
+ trace_fscache_op(object->cookie, op, fscache_op_cancel);
+
ASSERTCMP(op->state, >=, FSCACHE_OP_ST_PENDING);
ASSERTCMP(op->state, !=, FSCACHE_OP_ST_CANCELLED);
ASSERTCMP(atomic_read(&op->usage), >, 0);
@@ -419,6 +433,8 @@ void fscache_cancel_all_ops(struct fscache_object *object)
fscache_stat(&fscache_n_op_cancelled);
list_del_init(&op->pend_link);
+ trace_fscache_op(object->cookie, op, fscache_op_cancel_all);
+
ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
op->cancel(op);
op->state = FSCACHE_OP_ST_CANCELLED;
@@ -454,9 +470,11 @@ void fscache_op_complete(struct fscache_operation *op, bool cancelled)
spin_lock(&object->lock);
if (!cancelled) {
+ trace_fscache_op(object->cookie, op, fscache_op_completed);
op->state = FSCACHE_OP_ST_COMPLETE;
} else {
op->cancel(op);
+ trace_fscache_op(object->cookie, op, fscache_op_cancelled);
op->state = FSCACHE_OP_ST_CANCELLED;
}
@@ -488,6 +506,8 @@ void fscache_put_operation(struct fscache_operation *op)
if (!atomic_dec_and_test(&op->usage))
return;
+ trace_fscache_op(op->object ? op->object->cookie : NULL, op, fscache_op_put);
+
_debug("PUT OP");
ASSERTIFCMP(op->state != FSCACHE_OP_ST_INITIALISED &&
op->state != FSCACHE_OP_ST_COMPLETE,
@@ -563,6 +583,8 @@ void fscache_operation_gc(struct work_struct *work)
spin_unlock(&cache->op_gc_list_lock);
object = op->object;
+ trace_fscache_op(object->cookie, op, fscache_op_gc);
+
spin_lock(&object->lock);
_debug("GC DEFERRED REL OBJ%x OP%x",
@@ -601,6 +623,8 @@ void fscache_op_work_func(struct work_struct *work)
_enter("{OBJ%x OP%x,%d}",
op->object->debug_id, op->debug_id, atomic_read(&op->usage));
+ trace_fscache_op(op->object->cookie, op, fscache_op_work);
+
ASSERT(op->processor != NULL);
start = jiffies;
op->processor(op);
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 961029e04027..111349f67d98 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -27,6 +27,7 @@ bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page
rcu_read_lock();
val = radix_tree_lookup(&cookie->stores, page->index);
rcu_read_unlock();
+ trace_fscache_check_page(cookie, page, val, 0);
return val != NULL;
}
@@ -39,6 +40,8 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa
{
wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
+ trace_fscache_page(cookie, page, fscache_page_write_wait);
+
wait_event(*wq, !__fscache_check_page_write(cookie, page));
}
EXPORT_SYMBOL(__fscache_wait_on_page_write);
@@ -69,6 +72,8 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
_enter("%p,%p,%x", cookie, page, gfp);
+ trace_fscache_page(cookie, page, fscache_page_maybe_release);
+
try_again:
rcu_read_lock();
val = radix_tree_lookup(&cookie->stores, page->index);
@@ -101,6 +106,7 @@ try_again:
}
xpage = radix_tree_delete(&cookie->stores, page->index);
+ trace_fscache_page(cookie, page, fscache_page_radix_delete);
spin_unlock(&cookie->stores_lock);
if (xpage) {
@@ -112,6 +118,7 @@ try_again:
}
wake_up_bit(&cookie->flags, 0);
+ trace_fscache_wake_cookie(cookie);
if (xpage)
put_page(xpage);
__fscache_uncache_page(cookie, page);
@@ -144,7 +151,7 @@ static void fscache_end_page_write(struct fscache_object *object,
struct page *page)
{
struct fscache_cookie *cookie;
- struct page *xpage = NULL;
+ struct page *xpage = NULL, *val;
spin_lock(&object->lock);
cookie = object->cookie;
@@ -154,13 +161,24 @@ static void fscache_end_page_write(struct fscache_object *object,
spin_lock(&cookie->stores_lock);
radix_tree_tag_clear(&cookie->stores, page->index,
FSCACHE_COOKIE_STORING_TAG);
+ trace_fscache_page(cookie, page, fscache_page_radix_clear_store);
if (!radix_tree_tag_get(&cookie->stores, page->index,
FSCACHE_COOKIE_PENDING_TAG)) {
fscache_stat(&fscache_n_store_radix_deletes);
xpage = radix_tree_delete(&cookie->stores, page->index);
+ trace_fscache_page(cookie, page, fscache_page_radix_delete);
+ trace_fscache_page(cookie, page, fscache_page_write_end);
+
+ val = radix_tree_lookup(&cookie->stores, page->index);
+ trace_fscache_check_page(cookie, page, val, 1);
+ } else {
+ trace_fscache_page(cookie, page, fscache_page_write_end_pend);
}
spin_unlock(&cookie->stores_lock);
wake_up_bit(&cookie->flags, 0);
+ trace_fscache_wake_cookie(cookie);
+ } else {
+ trace_fscache_page(cookie, page, fscache_page_write_end_noc);
}
spin_unlock(&object->lock);
if (xpage)
@@ -185,9 +203,11 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
fscache_stat_d(&fscache_n_cop_attr_changed);
if (ret < 0)
fscache_abort_object(object);
+ fscache_op_complete(op, ret < 0);
+ } else {
+ fscache_op_complete(op, true);
}
- fscache_op_complete(op, true);
_leave("");
}
@@ -213,7 +233,8 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
return -ENOMEM;
}
- fscache_operation_init(op, fscache_attr_changed_op, NULL, NULL);
+ fscache_operation_init(cookie, op, fscache_attr_changed_op, NULL, NULL);
+ trace_fscache_page_op(cookie, NULL, op, fscache_page_op_attr_changed);
op->flags = FSCACHE_OP_ASYNC |
(1 << FSCACHE_OP_EXCLUSIVE) |
(1 << FSCACHE_OP_UNUSE_COOKIE);
@@ -297,7 +318,7 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
return NULL;
}
- fscache_operation_init(&op->op, NULL,
+ fscache_operation_init(cookie, &op->op, NULL,
fscache_do_cancel_retrieval,
fscache_release_retrieval_op);
op->op.flags = FSCACHE_OP_MYTHREAD |
@@ -368,6 +389,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object,
fscache_stat(stat_op_waits);
if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
TASK_INTERRUPTIBLE) != 0) {
+ trace_fscache_op(object->cookie, op, fscache_op_signal);
ret = fscache_cancel_op(op, false);
if (ret == 0)
return -ERESTARTSYS;
@@ -389,6 +411,7 @@ check_if_dead:
if (unlikely(fscache_object_is_dying(object) ||
fscache_cache_is_broken(object))) {
enum fscache_operation_state state = op->state;
+ trace_fscache_op(object->cookie, op, fscache_op_signal);
fscache_cancel_op(op, true);
if (stat_object_dead)
fscache_stat(stat_object_dead);
@@ -443,6 +466,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
return -ENOMEM;
}
atomic_set(&op->n_pages, 1);
+ trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_retr_one);
spin_lock(&cookie->lock);
@@ -571,6 +595,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
if (!op)
return -ENOMEM;
atomic_set(&op->n_pages, *nr_pages);
+ trace_fscache_page_op(cookie, NULL, &op->op, fscache_page_op_retr_multi);
spin_lock(&cookie->lock);
@@ -682,6 +707,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
if (!op)
return -ENOMEM;
atomic_set(&op->n_pages, 1);
+ trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_alloc_one);
spin_lock(&cookie->lock);
@@ -776,15 +802,17 @@ static void fscache_write_op(struct fscache_operation *_op)
_enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
+again:
spin_lock(&object->lock);
cookie = object->cookie;
if (!fscache_object_is_active(object)) {
- /* If we get here, then the on-disk cache object likely longer
- * exists, so we should just cancel this write operation.
+ /* If we get here, then the on-disk cache object likely no
+ * longer exists, so we should just cancel this write
+ * operation.
*/
spin_unlock(&object->lock);
- fscache_op_complete(&op->op, false);
+ fscache_op_complete(&op->op, true);
_leave(" [inactive]");
return;
}
@@ -797,7 +825,7 @@ static void fscache_write_op(struct fscache_operation *_op)
* cancel this write operation.
*/
spin_unlock(&object->lock);
- fscache_op_complete(&op->op, false);
+ fscache_op_complete(&op->op, true);
_leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}",
_op->flags, _op->state, object->state->short_name,
object->flags);
@@ -809,30 +837,33 @@ static void fscache_write_op(struct fscache_operation *_op)
fscache_stat(&fscache_n_store_calls);
/* find a page to store */
+ results[0] = NULL;
page = NULL;
n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1,
FSCACHE_COOKIE_PENDING_TAG);
+ trace_fscache_gang_lookup(cookie, &op->op, results, n, op->store_limit);
if (n != 1)
goto superseded;
page = results[0];
_debug("gang %d [%lx]", n, page->index);
- if (page->index >= op->store_limit) {
- fscache_stat(&fscache_n_store_pages_over_limit);
- goto superseded;
- }
radix_tree_tag_set(&cookie->stores, page->index,
FSCACHE_COOKIE_STORING_TAG);
radix_tree_tag_clear(&cookie->stores, page->index,
FSCACHE_COOKIE_PENDING_TAG);
+ trace_fscache_page(cookie, page, fscache_page_radix_pend2store);
spin_unlock(&cookie->stores_lock);
spin_unlock(&object->lock);
+ if (page->index >= op->store_limit)
+ goto discard_page;
+
fscache_stat(&fscache_n_store_pages);
fscache_stat(&fscache_n_cop_write_page);
ret = object->cache->ops->write_page(op, page);
fscache_stat_d(&fscache_n_cop_write_page);
+ trace_fscache_wrote_page(cookie, page, &op->op, ret);
fscache_end_page_write(object, page);
if (ret < 0) {
fscache_abort_object(object);
@@ -844,6 +875,12 @@ static void fscache_write_op(struct fscache_operation *_op)
_leave("");
return;
+discard_page:
+ fscache_stat(&fscache_n_store_pages_over_limit);
+ trace_fscache_wrote_page(cookie, page, &op->op, -ENOBUFS);
+ fscache_end_page_write(object, page);
+ goto again;
+
superseded:
/* this writer is going away and there aren't any more things to
* write */
@@ -851,7 +888,7 @@ superseded:
spin_unlock(&cookie->stores_lock);
clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
spin_unlock(&object->lock);
- fscache_op_complete(&op->op, true);
+ fscache_op_complete(&op->op, false);
_leave("");
}
@@ -879,6 +916,8 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
for (i = n - 1; i >= 0; i--) {
page = results[i];
radix_tree_delete(&cookie->stores, page->index);
+ trace_fscache_page(cookie, page, fscache_page_radix_delete);
+ trace_fscache_page(cookie, page, fscache_page_inval);
}
spin_unlock(&cookie->stores_lock);
@@ -888,6 +927,7 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
}
wake_up_bit(&cookie->flags, 0);
+ trace_fscache_wake_cookie(cookie);
_leave("");
}
@@ -923,6 +963,7 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
*/
int __fscache_write_page(struct fscache_cookie *cookie,
struct page *page,
+ loff_t object_size,
gfp_t gfp)
{
struct fscache_storage *op;
@@ -946,7 +987,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
if (!op)
goto nomem;
- fscache_operation_init(&op->op, fscache_write_op, NULL,
+ fscache_operation_init(cookie, &op->op, fscache_write_op, NULL,
fscache_release_write_op);
op->op.flags = FSCACHE_OP_ASYNC |
(1 << FSCACHE_OP_WAITING) |
@@ -956,6 +997,8 @@ int __fscache_write_page(struct fscache_cookie *cookie,
if (ret < 0)
goto nomem_free;
+ trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_write_one);
+
ret = -ENOBUFS;
spin_lock(&cookie->lock);
@@ -967,9 +1010,15 @@ int __fscache_write_page(struct fscache_cookie *cookie,
if (test_bit(FSCACHE_IOERROR, &object->cache->flags))
goto nobufs;
+ trace_fscache_page(cookie, page, fscache_page_write);
+
/* add the page to the pending-storage radix tree on the backing
* object */
spin_lock(&object->lock);
+
+ if (object->store_limit_l != object_size)
+ fscache_set_store_limit(object, object_size);
+
spin_lock(&cookie->stores_lock);
_debug("store limit %llx", (unsigned long long) object->store_limit);
@@ -982,8 +1031,10 @@ int __fscache_write_page(struct fscache_cookie *cookie,
goto nobufs_unlock_obj;
}
+ trace_fscache_page(cookie, page, fscache_page_radix_insert);
radix_tree_tag_set(&cookie->stores, page->index,
FSCACHE_COOKIE_PENDING_TAG);
+ trace_fscache_page(cookie, page, fscache_page_radix_set_pend);
get_page(page);
/* we only want one writer at a time, but we do need to queue new
@@ -1026,6 +1077,7 @@ already_pending:
submit_failed:
spin_lock(&cookie->stores_lock);
radix_tree_delete(&cookie->stores, page->index);
+ trace_fscache_page(cookie, page, fscache_page_radix_delete);
spin_unlock(&cookie->stores_lock);
wake_cookie = __fscache_unuse_cookie(cookie);
put_page(page);
@@ -1072,6 +1124,8 @@ void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page)
if (!PageFsCache(page))
goto done;
+ trace_fscache_page(cookie, page, fscache_page_uncache);
+
/* get the object */
spin_lock(&cookie->lock);
@@ -1120,6 +1174,8 @@ void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page)
atomic_inc(&fscache_n_marks);
#endif
+ trace_fscache_page(cookie, page, fscache_page_cached);
+
_debug("- mark %p{%lx}", page, page->index);
if (TestSetPageFsCache(page)) {
static bool once_only;
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
index 1d9e4951a597..49a8c90414bc 100644
--- a/fs/fscache/proc.c
+++ b/fs/fscache/proc.c
@@ -26,14 +26,14 @@ int __init fscache_proc_init(void)
goto error_dir;
#ifdef CONFIG_FSCACHE_STATS
- if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL,
- &fscache_stats_fops))
+ if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL,
+ fscache_stats_show))
goto error_stats;
#endif
#ifdef CONFIG_FSCACHE_HISTOGRAM
- if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL,
- &fscache_histogram_fops))
+ if (!proc_create_seq("fs/fscache/histogram", S_IFREG | 0444, NULL,
+ &fscache_histogram_ops))
goto error_histogram;
#endif
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 7ac6e839b065..00564a1dfd76 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -21,7 +21,6 @@
atomic_t fscache_n_op_pend;
atomic_t fscache_n_op_run;
atomic_t fscache_n_op_enqueue;
-atomic_t fscache_n_op_requeue;
atomic_t fscache_n_op_deferred_release;
atomic_t fscache_n_op_initialised;
atomic_t fscache_n_op_release;
@@ -139,7 +138,7 @@ atomic_t fscache_n_cache_culled_objects;
/*
* display the general statistics
*/
-static int fscache_stats_show(struct seq_file *m, void *v)
+int fscache_stats_show(struct seq_file *m, void *v)
{
seq_puts(m, "FS-Cache statistics\n");
@@ -285,18 +284,3 @@ static int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_cache_culled_objects));
return 0;
}
-
-/*
- * open "/proc/fs/fscache/stats" allowing provision of a statistical summary
- */
-static int fscache_stats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, fscache_stats_show, NULL);
-}
-
-const struct file_operations fscache_stats_fops = {
- .open = fscache_stats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 624f18bbfd2b..ef309958e060 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1080,6 +1080,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_time_gran = 1;
sb->s_export_op = &fuse_export_operations;
+ sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE;
+ if (sb->s_user_ns != &init_user_ns)
+ sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
file = fget(d.fd);
err = -EINVAL;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index f58716567972..35f5ee23566d 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -54,8 +54,7 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
continue;
if (start >= to)
break;
- if (gfs2_is_jdata(ip))
- set_buffer_uptodate(bh);
+ set_buffer_uptodate(bh);
gfs2_trans_add_data(ip->i_gl, bh);
}
}
@@ -747,18 +746,21 @@ out:
put_page(page);
gfs2_trans_end(sdp);
- if (pos + len > ip->i_inode.i_size)
- gfs2_trim_blocks(&ip->i_inode);
- goto out_trans_fail;
+ if (alloc_required) {
+ gfs2_inplace_release(ip);
+ if (pos + len > ip->i_inode.i_size)
+ gfs2_trim_blocks(&ip->i_inode);
+ }
+ goto out_qunlock;
out_endtrans:
gfs2_trans_end(sdp);
out_trans_fail:
- if (alloc_required) {
+ if (alloc_required)
gfs2_inplace_release(ip);
out_qunlock:
+ if (alloc_required)
gfs2_quota_unlock(ip);
- }
out_unlock:
if (&ip->i_inode == sdp->sd_rindex) {
gfs2_glock_dq(&m_ip->i_gh);
@@ -814,7 +816,6 @@ out:
* @inode: The inode
* @dibh: The buffer_head containing the on-disk inode
* @pos: The file position
- * @len: The length of the write
* @copied: How much was actually copied by the VFS
* @page: The page
*
@@ -824,17 +825,15 @@ out:
* Returns: errno
*/
static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
- loff_t pos, unsigned len, unsigned copied,
+ loff_t pos, unsigned copied,
struct page *page)
{
struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
u64 to = pos + copied;
void *kaddr;
unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
- BUG_ON(pos + len > gfs2_max_stuffed_size(ip));
+ BUG_ON(pos + copied > gfs2_max_stuffed_size(ip));
kaddr = kmap_atomic(page);
memcpy(buf + pos, kaddr + pos, copied);
@@ -850,20 +849,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
i_size_write(inode, to);
mark_inode_dirty(inode);
}
-
- if (inode == sdp->sd_rindex) {
- adjust_fs_space(inode);
- sdp->sd_rindex_uptodate = 0;
- }
-
- brelse(dibh);
- gfs2_trans_end(sdp);
- if (inode == sdp->sd_rindex) {
- gfs2_glock_dq(&m_ip->i_gh);
- gfs2_holder_uninit(&m_ip->i_gh);
- }
- gfs2_glock_dq(&ip->i_gh);
- gfs2_holder_uninit(&ip->i_gh);
return copied;
}
@@ -877,9 +862,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
* @page: The page that has been written
* @fsdata: The fsdata (unused in GFS2)
*
- * The main write_end function for GFS2. We have a separate one for
- * stuffed files as they are slightly different, otherwise we just
- * put our locking around the VFS provided functions.
+ * The main write_end function for GFS2. We just put our locking around the VFS
+ * provided functions.
*
* Returns: errno
*/
@@ -900,32 +884,39 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL);
ret = gfs2_meta_inode_buffer(ip, &dibh);
- if (unlikely(ret)) {
- unlock_page(page);
- put_page(page);
- goto failed;
- }
+ if (unlikely(ret))
+ goto out;
- if (gfs2_is_stuffed(ip))
- return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page);
+ if (gfs2_is_stuffed(ip)) {
+ ret = gfs2_stuffed_write_end(inode, dibh, pos, copied, page);
+ page = NULL;
+ goto out2;
+ }
- if (!gfs2_is_writeback(ip))
+ if (gfs2_is_jdata(ip))
gfs2_page_add_databufs(ip, page, pos & ~PAGE_MASK, len);
+ else
+ gfs2_ordered_add_inode(ip);
ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+ page = NULL;
if (tr->tr_num_buf_new)
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
else
gfs2_trans_add_meta(ip->i_gl, dibh);
-
+out2:
if (inode == sdp->sd_rindex) {
adjust_fs_space(inode);
sdp->sd_rindex_uptodate = 0;
}
brelse(dibh);
-failed:
+out:
+ if (page) {
+ unlock_page(page);
+ put_page(page);
+ }
gfs2_trans_end(sdp);
gfs2_inplace_release(ip);
if (ip->i_qadata && ip->i_qadata->qa_qd_num)
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 685c305cbeb6..ed6699705c13 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -89,10 +89,12 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
map_bh(bh, inode->i_sb, block);
set_buffer_uptodate(bh);
- if (!gfs2_is_jdata(ip))
- mark_buffer_dirty(bh);
- if (!gfs2_is_writeback(ip))
+ if (gfs2_is_jdata(ip))
gfs2_trans_add_data(ip->i_gl, bh);
+ else {
+ mark_buffer_dirty(bh);
+ gfs2_ordered_add_inode(ip);
+ }
if (release) {
unlock_page(page);
@@ -176,8 +178,8 @@ out:
/**
* find_metapath - Find path through the metadata tree
* @sdp: The superblock
- * @mp: The metapath to return the result in
* @block: The disk block to look up
+ * @mp: The metapath to return the result in
* @height: The pre-calculated height of the metadata tree
*
* This routine returns a struct metapath structure that defines a path
@@ -188,8 +190,7 @@ out:
* filesystem with a blocksize of 4096.
*
* find_metapath() would return a struct metapath structure set to:
- * mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48,
- * and mp_list[2] = 165.
+ * mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
*
* That means that in order to get to the block containing the byte at
* offset 101342453, we would load the indirect block pointed to by pointer
@@ -279,6 +280,21 @@ static inline __be64 *metapointer(unsigned int height, const struct metapath *mp
return p + mp->mp_list[height];
}
+static inline const __be64 *metaend(unsigned int height, const struct metapath *mp)
+{
+ const struct buffer_head *bh = mp->mp_bh[height];
+ return (const __be64 *)(bh->b_data + bh->b_size);
+}
+
+static void clone_metapath(struct metapath *clone, struct metapath *mp)
+{
+ unsigned int hgt;
+
+ *clone = *mp;
+ for (hgt = 0; hgt < mp->mp_aheight; hgt++)
+ get_bh(clone->mp_bh[hgt]);
+}
+
static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
{
const __be64 *t;
@@ -420,20 +436,140 @@ static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __b
return (ptr - first);
}
-static inline void bmap_lock(struct gfs2_inode *ip, int create)
+typedef const __be64 *(*gfs2_metadata_walker)(
+ struct metapath *mp,
+ const __be64 *start, const __be64 *end,
+ u64 factor, void *data);
+
+#define WALK_STOP ((__be64 *)0)
+#define WALK_NEXT ((__be64 *)1)
+
+static int gfs2_walk_metadata(struct inode *inode, sector_t lblock,
+ u64 len, struct metapath *mp, gfs2_metadata_walker walker,
+ void *data)
{
- if (create)
- down_write(&ip->i_rw_mutex);
- else
- down_read(&ip->i_rw_mutex);
+ struct metapath clone;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ const __be64 *start, *end, *ptr;
+ u64 factor = 1;
+ unsigned int hgt;
+ int ret = 0;
+
+ for (hgt = ip->i_height - 1; hgt >= mp->mp_aheight; hgt--)
+ factor *= sdp->sd_inptrs;
+
+ for (;;) {
+ u64 step;
+
+ /* Walk indirect block. */
+ start = metapointer(hgt, mp);
+ end = metaend(hgt, mp);
+
+ step = (end - start) * factor;
+ if (step > len)
+ end = start + DIV_ROUND_UP_ULL(len, factor);
+
+ ptr = walker(mp, start, end, factor, data);
+ if (ptr == WALK_STOP)
+ break;
+ if (step >= len)
+ break;
+ len -= step;
+ if (ptr != WALK_NEXT) {
+ BUG_ON(!*ptr);
+ mp->mp_list[hgt] += ptr - start;
+ goto fill_up_metapath;
+ }
+
+lower_metapath:
+ /* Decrease height of metapath. */
+ if (mp != &clone) {
+ clone_metapath(&clone, mp);
+ mp = &clone;
+ }
+ brelse(mp->mp_bh[hgt]);
+ mp->mp_bh[hgt] = NULL;
+ if (!hgt)
+ break;
+ hgt--;
+ factor *= sdp->sd_inptrs;
+
+ /* Advance in metadata tree. */
+ (mp->mp_list[hgt])++;
+ start = metapointer(hgt, mp);
+ end = metaend(hgt, mp);
+ if (start >= end) {
+ mp->mp_list[hgt] = 0;
+ if (!hgt)
+ break;
+ goto lower_metapath;
+ }
+
+fill_up_metapath:
+ /* Increase height of metapath. */
+ if (mp != &clone) {
+ clone_metapath(&clone, mp);
+ mp = &clone;
+ }
+ ret = fillup_metapath(ip, mp, ip->i_height - 1);
+ if (ret < 0)
+ break;
+ hgt += ret;
+ for (; ret; ret--)
+ do_div(factor, sdp->sd_inptrs);
+ mp->mp_aheight = hgt + 1;
+ }
+ if (mp == &clone)
+ release_metapath(mp);
+ return ret;
}
-static inline void bmap_unlock(struct gfs2_inode *ip, int create)
+struct gfs2_hole_walker_args {
+ u64 blocks;
+};
+
+static const __be64 *gfs2_hole_walker(struct metapath *mp,
+ const __be64 *start, const __be64 *end,
+ u64 factor, void *data)
{
- if (create)
- up_write(&ip->i_rw_mutex);
- else
- up_read(&ip->i_rw_mutex);
+ struct gfs2_hole_walker_args *args = data;
+ const __be64 *ptr;
+
+ for (ptr = start; ptr < end; ptr++) {
+ if (*ptr) {
+ args->blocks += (ptr - start) * factor;
+ if (mp->mp_aheight == mp->mp_fheight)
+ return WALK_STOP;
+ return ptr; /* increase height */
+ }
+ }
+ args->blocks += (end - start) * factor;
+ return WALK_NEXT;
+}
+
+/**
+ * gfs2_hole_size - figure out the size of a hole
+ * @inode: The inode
+ * @lblock: The logical starting block number
+ * @len: How far to look (in blocks)
+ * @mp: The metapath at lblock
+ * @iomap: The iomap to store the hole size in
+ *
+ * This function modifies @mp.
+ *
+ * Returns: errno on error
+ */
+static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
+ struct metapath *mp, struct iomap *iomap)
+{
+ struct gfs2_hole_walker_args args = { };
+ int ret = 0;
+
+ ret = gfs2_walk_metadata(inode, lblock, len, mp, gfs2_hole_walker, &args);
+ if (!ret)
+ iomap->length = args.blocks << inode->i_blkbits;
+ return ret;
}
static inline __be64 *gfs2_indirect_init(struct metapath *mp,
@@ -462,15 +598,11 @@ enum alloc_state {
};
/**
- * gfs2_bmap_alloc - Build a metadata tree of the requested height
+ * gfs2_iomap_alloc - Build a metadata tree of the requested height
* @inode: The GFS2 inode
- * @lblock: The logical starting block of the extent
- * @bh_map: This is used to return the mapping details
- * @zero_new: True if newly allocated blocks should be zeroed
+ * @iomap: The iomap structure
+ * @flags: iomap flags
* @mp: The metapath, with proper height information calculated
- * @maxlen: The max number of data blocks to alloc
- * @dblock: Pointer to return the resulting new block
- * @dblks: Pointer to return the number of blocks allocated
*
* In this routine we may have to alloc:
* i) Indirect blocks to grow the metadata tree height
@@ -483,6 +615,13 @@ enum alloc_state {
* blocks are available, there will only be one request per bmap call)
* and uses the state machine to initialise the blocks in order.
*
+ * Right now, this function will allocate at most one indirect block
+ * worth of data -- with a default block size of 4K, that's slightly
+ * less than 2M. If this limitation is ever removed to allow huge
+ * allocations, we would probably still want to limit the iomap size we
+ * return to avoid stalling other tasks during huge writes; the next
+ * iomap iteration would then find the blocks already allocated.
+ *
* Returns: errno on error
*/
@@ -497,6 +636,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
unsigned dblks = 0;
unsigned ptrs_per_blk;
const unsigned end_of_metadata = mp->mp_fheight - 1;
+ int ret;
enum alloc_state state;
__be64 *ptr;
__be64 zero_bn = 0;
@@ -507,6 +647,8 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
gfs2_trans_add_meta(ip->i_gl, dibh);
+ down_write(&ip->i_rw_mutex);
+
if (mp->mp_fheight == mp->mp_aheight) {
struct buffer_head *bh;
int eob;
@@ -542,11 +684,10 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
blks = dblks + iblks;
i = mp->mp_aheight;
do {
- int error;
n = blks - alloced;
- error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
- if (error)
- return error;
+ ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
+ if (ret)
+ goto out;
alloced += n;
if (state != ALLOC_DATA || gfs2_is_jdata(ip))
gfs2_trans_add_unrevoke(sdp, bn, n);
@@ -602,7 +743,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
dblks = n;
ptr = metapointer(end_of_metadata, mp);
iomap->addr = bn << inode->i_blkbits;
- iomap->flags |= IOMAP_F_NEW;
+ iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
while (n-- > 0)
*ptr++ = cpu_to_be64(bn++);
break;
@@ -612,64 +753,10 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
iomap->length = (u64)dblks << inode->i_blkbits;
ip->i_height = mp->mp_fheight;
gfs2_add_inode_blocks(&ip->i_inode, alloced);
- gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
- return 0;
-}
-
-/**
- * hole_size - figure out the size of a hole
- * @inode: The inode
- * @lblock: The logical starting block number
- * @mp: The metapath
- *
- * Returns: The hole size in bytes
- *
- */
-static u64 hole_size(struct inode *inode, sector_t lblock, struct metapath *mp)
-{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct metapath mp_eof;
- u64 factor = 1;
- int hgt;
- u64 holesz = 0;
- const __be64 *first, *end, *ptr;
- const struct buffer_head *bh;
- u64 lblock_stop = (i_size_read(inode) - 1) >> inode->i_blkbits;
- int zeroptrs;
- bool done = false;
-
- /* Get another metapath, to the very last byte */
- find_metapath(sdp, lblock_stop, &mp_eof, ip->i_height);
- for (hgt = ip->i_height - 1; hgt >= 0 && !done; hgt--) {
- bh = mp->mp_bh[hgt];
- if (bh) {
- zeroptrs = 0;
- first = metapointer(hgt, mp);
- end = (const __be64 *)(bh->b_data + bh->b_size);
-
- for (ptr = first; ptr < end; ptr++) {
- if (*ptr) {
- done = true;
- break;
- } else {
- zeroptrs++;
- }
- }
- } else {
- zeroptrs = sdp->sd_inptrs;
- }
- if (factor * zeroptrs >= lblock_stop - lblock + 1) {
- holesz = lblock_stop - lblock + 1;
- break;
- }
- holesz += factor * zeroptrs;
-
- factor *= sdp->sd_inptrs;
- if (hgt && (mp->mp_list[hgt - 1] < mp_eof.mp_list[hgt - 1]))
- (mp->mp_list[hgt - 1])++;
- }
- return holesz << inode->i_blkbits;
+ gfs2_dinode_out(ip, dibh->b_data);
+out:
+ up_write(&ip->i_rw_mutex);
+ return ret;
}
static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
@@ -680,126 +767,136 @@ static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
sizeof(struct gfs2_dinode);
iomap->offset = 0;
iomap->length = i_size_read(inode);
- iomap->type = IOMAP_MAPPED;
- iomap->flags = IOMAP_F_DATA_INLINE;
+ iomap->type = IOMAP_INLINE;
}
+#define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE
+
/**
- * gfs2_iomap_begin - Map blocks from an inode to disk blocks
+ * gfs2_iomap_get - Map blocks from an inode to disk blocks
* @inode: The inode
* @pos: Starting position in bytes
* @length: Length to map, in bytes
* @flags: iomap flags
* @iomap: The iomap structure
+ * @mp: The metapath
*
* Returns: errno
*/
-int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
- unsigned flags, struct iomap *iomap)
+static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
+ unsigned flags, struct iomap *iomap,
+ struct metapath *mp)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct metapath mp = { .mp_aheight = 1, };
- unsigned int factor = sdp->sd_sb.sb_bsize;
- const u64 *arr = sdp->sd_heightsize;
__be64 *ptr;
sector_t lblock;
- sector_t lend;
- int ret = 0;
+ sector_t lblock_stop;
+ int ret;
int eob;
- unsigned int len;
+ u64 len;
struct buffer_head *bh;
u8 height;
- trace_gfs2_iomap_start(ip, pos, length, flags);
- if (!length) {
- ret = -EINVAL;
- goto out;
- }
+ if (!length)
+ return -EINVAL;
if (gfs2_is_stuffed(ip)) {
if (flags & IOMAP_REPORT) {
+ if (pos >= i_size_read(inode))
+ return -ENOENT;
gfs2_stuffed_iomap(inode, iomap);
- if (pos >= iomap->length)
- ret = -ENOENT;
- goto out;
+ return 0;
}
BUG_ON(!(flags & IOMAP_WRITE));
}
-
lblock = pos >> inode->i_blkbits;
- lend = (pos + length + sdp->sd_sb.sb_bsize - 1) >> inode->i_blkbits;
-
iomap->offset = lblock << inode->i_blkbits;
- iomap->addr = IOMAP_NULL_ADDR;
- iomap->type = IOMAP_HOLE;
- iomap->length = (u64)(lend - lblock) << inode->i_blkbits;
- iomap->flags = IOMAP_F_MERGED;
- bmap_lock(ip, flags & IOMAP_WRITE);
+ lblock_stop = (pos + length - 1) >> inode->i_blkbits;
+ len = lblock_stop - lblock + 1;
- /*
- * Directory data blocks have a struct gfs2_meta_header header, so the
- * remaining size is smaller than the filesystem block size. Logical
- * block numbers for directories are in units of this remaining size!
- */
- if (gfs2_is_dir(ip)) {
- factor = sdp->sd_jbsize;
- arr = sdp->sd_jheightsize;
- }
+ down_read(&ip->i_rw_mutex);
- ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
+ ret = gfs2_meta_inode_buffer(ip, &mp->mp_bh[0]);
if (ret)
- goto out_release;
+ goto unlock;
height = ip->i_height;
- while ((lblock + 1) * factor > arr[height])
+ while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
height++;
- find_metapath(sdp, lblock, &mp, height);
+ find_metapath(sdp, lblock, mp, height);
if (height > ip->i_height || gfs2_is_stuffed(ip))
goto do_alloc;
- ret = lookup_metapath(ip, &mp);
+ ret = lookup_metapath(ip, mp);
if (ret)
- goto out_release;
+ goto unlock;
- if (mp.mp_aheight != ip->i_height)
+ if (mp->mp_aheight != ip->i_height)
goto do_alloc;
- ptr = metapointer(ip->i_height - 1, &mp);
+ ptr = metapointer(ip->i_height - 1, mp);
if (*ptr == 0)
goto do_alloc;
- iomap->type = IOMAP_MAPPED;
- iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
+ bh = mp->mp_bh[ip->i_height - 1];
+ len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, len, &eob);
- bh = mp.mp_bh[ip->i_height - 1];
- len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, lend - lblock, &eob);
+ iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
+ iomap->length = len << inode->i_blkbits;
+ iomap->type = IOMAP_MAPPED;
+ iomap->flags = IOMAP_F_MERGED;
if (eob)
- iomap->flags |= IOMAP_F_BOUNDARY;
- iomap->length = (u64)len << inode->i_blkbits;
+ iomap->flags |= IOMAP_F_GFS2_BOUNDARY;
-out_release:
- release_metapath(&mp);
- bmap_unlock(ip, flags & IOMAP_WRITE);
out:
- trace_gfs2_iomap_end(ip, iomap, ret);
+ iomap->bdev = inode->i_sb->s_bdev;
+unlock:
+ up_read(&ip->i_rw_mutex);
return ret;
do_alloc:
- if (flags & IOMAP_WRITE) {
- ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
- } else if (flags & IOMAP_REPORT) {
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->length = len << inode->i_blkbits;
+ iomap->type = IOMAP_HOLE;
+ iomap->flags = 0;
+ if (flags & IOMAP_REPORT) {
loff_t size = i_size_read(inode);
if (pos >= size)
ret = -ENOENT;
- else if (height <= ip->i_height)
- iomap->length = hole_size(inode, lblock, &mp);
+ else if (height == ip->i_height)
+ ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
else
iomap->length = size - pos;
}
- goto out_release;
+ goto out;
+}
+
+static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+ unsigned flags, struct iomap *iomap)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct metapath mp = { .mp_aheight = 1, };
+ int ret;
+
+ trace_gfs2_iomap_start(ip, pos, length, flags);
+ if (flags & IOMAP_WRITE) {
+ ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
+ if (!ret && iomap->type == IOMAP_HOLE)
+ ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
+ release_metapath(&mp);
+ } else {
+ ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
+ release_metapath(&mp);
+ }
+ trace_gfs2_iomap_end(ip, iomap, ret);
+ return ret;
}
+const struct iomap_ops gfs2_iomap_ops = {
+ .iomap_begin = gfs2_iomap_begin,
+};
+
/**
* gfs2_block_map - Map one or more blocks of an inode to a disk block
* @inode: The inode
@@ -825,34 +922,43 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
struct buffer_head *bh_map, int create)
{
struct gfs2_inode *ip = GFS2_I(inode);
- struct iomap iomap;
- int ret, flags = 0;
+ loff_t pos = (loff_t)lblock << inode->i_blkbits;
+ loff_t length = bh_map->b_size;
+ struct metapath mp = { .mp_aheight = 1, };
+ struct iomap iomap = { };
+ int ret;
clear_buffer_mapped(bh_map);
clear_buffer_new(bh_map);
clear_buffer_boundary(bh_map);
trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
- if (create)
- flags |= IOMAP_WRITE;
- ret = gfs2_iomap_begin(inode, (loff_t)lblock << inode->i_blkbits,
- bh_map->b_size, flags, &iomap);
- if (ret) {
- if (!create && ret == -ENOENT) {
- /* Return unmapped buffer beyond the end of file. */
+ if (create) {
+ ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, &iomap, &mp);
+ if (!ret && iomap.type == IOMAP_HOLE)
+ ret = gfs2_iomap_alloc(inode, &iomap, IOMAP_WRITE, &mp);
+ release_metapath(&mp);
+ } else {
+ ret = gfs2_iomap_get(inode, pos, length, 0, &iomap, &mp);
+ release_metapath(&mp);
+
+ /* Return unmapped buffer beyond the end of file. */
+ if (ret == -ENOENT) {
ret = 0;
+ goto out;
}
- goto out;
}
+ if (ret)
+ goto out;
if (iomap.length > bh_map->b_size) {
iomap.length = bh_map->b_size;
- iomap.flags &= ~IOMAP_F_BOUNDARY;
+ iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY;
}
if (iomap.addr != IOMAP_NULL_ADDR)
map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
bh_map->b_size = iomap.length;
- if (iomap.flags & IOMAP_F_BOUNDARY)
+ if (iomap.flags & IOMAP_F_GFS2_BOUNDARY)
set_buffer_boundary(bh_map);
if (iomap.flags & IOMAP_F_NEW)
set_buffer_new(bh_map);
@@ -945,8 +1051,10 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from,
err = 0;
}
- if (!gfs2_is_writeback(ip))
+ if (gfs2_is_jdata(ip))
gfs2_trans_add_data(ip->i_gl, bh);
+ else
+ gfs2_ordered_add_inode(ip);
zero_user(page, offset, length);
mark_buffer_dirty(bh);
@@ -1056,6 +1164,19 @@ out:
return error;
}
+int gfs2_iomap_get_alloc(struct inode *inode, loff_t pos, loff_t length,
+ struct iomap *iomap)
+{
+ struct metapath mp = { .mp_aheight = 1, };
+ int ret;
+
+ ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp);
+ if (!ret && iomap->type == IOMAP_HOLE)
+ ret = gfs2_iomap_alloc(inode, iomap, IOMAP_WRITE, &mp);
+ release_metapath(&mp);
+ return ret;
+}
+
/**
* sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
* @ip: inode
@@ -1744,7 +1865,7 @@ do_grow_qunlock:
* @newsize: the size to make the file
*
* The file size can grow, shrink, or stay the same size. This
- * is called holding i_mutex and an exclusive glock on the inode
+ * is called holding i_rwsem and an exclusive glock on the inode
* in question.
*
* Returns: errno
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index c3402fe00653..6b18fb323f0a 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -46,11 +46,13 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
}
}
+extern const struct iomap_ops gfs2_iomap_ops;
+
extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
extern int gfs2_block_map(struct inode *inode, sector_t lblock,
struct buffer_head *bh, int create);
-extern int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
- unsigned flags, struct iomap *iomap);
+extern int gfs2_iomap_get_alloc(struct inode *inode, loff_t pos, loff_t length,
+ struct iomap *iomap);
extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new,
u64 *dblock, unsigned *extlen);
extern int gfs2_setattr_size(struct inode *inode, u64 size);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4b71f021a9e2..7137db7b0119 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -733,7 +733,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
struct gfs2_inode *ip = GFS2_I(inode);
loff_t end = offset + len;
struct buffer_head *dibh;
- struct iomap iomap;
+ struct iomap iomap = { };
int error;
error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -749,8 +749,8 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
}
while (offset < end) {
- error = gfs2_iomap_begin(inode, offset, end - offset,
- IOMAP_WRITE, &iomap);
+ error = gfs2_iomap_get_alloc(inode, offset, end - offset,
+ &iomap);
if (error)
goto out;
offset = iomap.offset + iomap.length;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 82fb5583445c..097bd3c0f270 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1923,28 +1923,37 @@ void gfs2_glock_exit(void)
static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi, loff_t n)
{
- if (n == 0)
- gi->gl = rhashtable_walk_peek(&gi->hti);
- else {
- gi->gl = rhashtable_walk_next(&gi->hti);
- n--;
+ struct gfs2_glock *gl = gi->gl;
+
+ if (gl) {
+ if (n == 0)
+ return;
+ if (!lockref_put_not_zero(&gl->gl_lockref))
+ gfs2_glock_queue_put(gl);
}
for (;;) {
- if (IS_ERR_OR_NULL(gi->gl)) {
- if (!gi->gl)
- return;
- if (PTR_ERR(gi->gl) != -EAGAIN) {
- gi->gl = NULL;
- return;
+ gl = rhashtable_walk_next(&gi->hti);
+ if (IS_ERR_OR_NULL(gl)) {
+ if (gl == ERR_PTR(-EAGAIN)) {
+ n = 1;
+ continue;
}
- n = 0;
- } else if (gi->sdp == gi->gl->gl_name.ln_sbd &&
- !__lockref_is_dead(&gi->gl->gl_lockref)) {
- if (!n--)
- break;
+ gl = NULL;
+ break;
+ }
+ if (gl->gl_name.ln_sbd != gi->sdp)
+ continue;
+ if (n <= 1) {
+ if (!lockref_get_not_dead(&gl->gl_lockref))
+ continue;
+ break;
+ } else {
+ if (__lockref_is_dead(&gl->gl_lockref))
+ continue;
+ n--;
}
- gi->gl = rhashtable_walk_next(&gi->hti);
}
+ gi->gl = gl;
}
static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
@@ -1988,7 +1997,6 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
{
struct gfs2_glock_iter *gi = seq->private;
- gi->gl = NULL;
rhashtable_walk_stop(&gi->hti);
}
@@ -2076,7 +2084,8 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file)
struct seq_file *seq = file->private_data;
struct gfs2_glock_iter *gi = seq->private;
- gi->gl = NULL;
+ if (gi->gl)
+ gfs2_glock_put(gi->gl);
rhashtable_walk_exit(&gi->hti);
return seq_release_private(inode, file);
}
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 1b6b1e3f5caf..d2ad817e089f 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -116,6 +116,7 @@ static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm)
static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm)
{
+ BUG_ON(rbm->offset >= rbm->rgd->rd_data);
return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) +
rbm->offset;
}
@@ -696,8 +697,6 @@ struct gfs2_sbd {
u32 sd_max_dirres; /* Max blocks needed to add a directory entry */
u32 sd_max_height; /* Max height of a file's metadata tree */
u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
- u32 sd_max_jheight; /* Max height of journaled file's meta tree */
- u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1];
u32 sd_max_dents_per_leaf; /* Max number of dirents in a leaf block */
struct gfs2_args sd_args; /* Mount arguments */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 8700eb815638..feda55f67050 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -2006,10 +2006,6 @@ static int gfs2_getattr(const struct path *path, struct kstat *stat,
return 0;
}
-const struct iomap_ops gfs2_iomap_ops = {
- .iomap_begin = gfs2_iomap_begin,
-};
-
static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 1862e310a067..20241436126d 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -14,6 +14,7 @@
#include <linux/spinlock.h>
#include <linux/writeback.h>
#include "incore.h"
+#include "inode.h"
/**
* gfs2_log_lock - acquire the right to mess with the log manager
@@ -50,8 +51,12 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip)
{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_sbd *sdp;
+ if (!gfs2_is_ordered(ip))
+ return;
+
+ sdp = GFS2_SB(&ip->i_inode);
if (!test_bit(GIF_ORDERED, &ip->i_flags)) {
spin_lock(&sdp->sd_ordered_lock);
if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags))
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index e6a0a8a89ea7..c2469833b4fb 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -335,25 +335,6 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
sdp->sd_heightsize[x] = ~0;
gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
- sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
- sizeof(struct gfs2_dinode);
- sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
- for (x = 2;; x++) {
- u64 space, d;
- u32 m;
-
- space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
- d = space;
- m = do_div(d, sdp->sd_inptrs);
-
- if (d != sdp->sd_jheightsize[x - 1] || m)
- break;
- sdp->sd_jheightsize[x] = space;
- }
- sdp->sd_max_jheight = x;
- sdp->sd_jheightsize[x] = ~0;
- gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
-
sdp->sd_max_dents_per_leaf = (sdp->sd_sb.sb_bsize -
sizeof(struct gfs2_leaf)) /
GFS2_MIN_DIRENT_SIZE;
@@ -825,7 +806,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
goto fail_rindex;
}
/*
- * i_mutex on quota files is special. Since this inode is hidden system
+ * i_rwsem on quota files is special. Since this inode is hidden system
* file, we are safe to define locking ourselves.
*/
lockdep_set_class(&sdp->sd_quota_inode->i_rwsem,
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 7a98abd340ee..e8585dfd209f 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -735,7 +735,10 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
if (!buffer_uptodate(bh))
goto unlock_out;
}
- gfs2_trans_add_data(ip->i_gl, bh);
+ if (gfs2_is_jdata(ip))
+ gfs2_trans_add_data(ip->i_gl, bh);
+ else
+ gfs2_ordered_add_inode(ip);
/* If we need to write to the next block as well */
if (to_write > (bsize - boff)) {
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8b683917a27e..6bc5cfe710d1 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -372,8 +372,8 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
start = bi->bi_bh->b_data;
if (bi->bi_clone)
start = bi->bi_clone;
- end = start + bi->bi_bh->b_size;
start += bi->bi_offset;
+ end = start + bi->bi_len;
BUG_ON(rbm.offset & 3);
start += (rbm.offset / GFS2_NBBY);
bytes = min_t(u32, len / GFS2_NBBY, (end - start));
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 620be0521866..cf5c7f3080d2 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -800,7 +800,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
int need_endtrans = 0;
int ret;
- if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC)))
+ if (!(flags & I_DIRTY_INODE))
return;
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
return;
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index c75cacaa349b..064c9a0ef046 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -143,32 +143,21 @@ static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl,
* @gl: The inode glock associated with the buffer
* @bh: The buffer to add
*
- * This is used in two distinct cases:
- * i) In ordered write mode
- * We put the data buffer on a list so that we can ensure that it's
- * synced to disk at the right time
- * ii) In journaled data mode
- * We need to journal the data block in the same way as metadata in
- * the functions above. The difference is that here we have a tag
- * which is two __be64's being the block number (as per meta data)
- * and a flag which says whether the data block needs escaping or
- * not. This means we need a new log entry for each 251 or so data
- * blocks, which isn't an enormous overhead but twice as much as
- * for normal metadata blocks.
+ * This is used in journaled data mode.
+ * We need to journal the data block in the same way as metadata in
+ * the functions above. The difference is that here we have a tag
+ * which is two __be64's being the block number (as per meta data)
+ * and a flag which says whether the data block needs escaping or
+ * not. This means we need a new log entry for each 251 or so data
+ * blocks, which isn't an enormous overhead but twice as much as
+ * for normal metadata blocks.
*/
void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
{
struct gfs2_trans *tr = current->journal_info;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- struct address_space *mapping = bh->b_page->mapping;
- struct gfs2_inode *ip = GFS2_I(mapping->host);
struct gfs2_bufdata *bd;
- if (!gfs2_is_jdata(ip)) {
- gfs2_ordered_add_inode(ip);
- return;
- }
-
lock_buffer(bh);
if (buffer_pinned(bh)) {
set_bit(TR_TOUCHED, &tr->tr_flags);
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 05de20954659..f2bce1e0f6fb 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -308,7 +308,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
}
ip->i_inode.i_ctime = current_time(&ip->i_inode);
- __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
gfs2_trans_end(sdp);
@@ -768,7 +768,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
goto out_end_trans;
ip->i_inode.i_ctime = current_time(&ip->i_inode);
- __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
out_end_trans:
gfs2_trans_end(GFS2_SB(&ip->i_inode));
@@ -896,7 +896,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
ea_set_remove_stuffed(ip, es->es_el);
ip->i_inode.i_ctime = current_time(&ip->i_inode);
- __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
gfs2_trans_end(GFS2_SB(&ip->i_inode));
return error;
@@ -1114,7 +1114,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
}
ip->i_inode.i_ctime = current_time(&ip->i_inode);
- __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
gfs2_trans_end(GFS2_SB(&ip->i_inode));
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 75b254280ff6..3bf2ae0e467c 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -31,21 +31,15 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry,
hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name);
res = hfs_brec_read(&fd, &rec, sizeof(rec));
if (res) {
- hfs_find_exit(&fd);
- if (res == -ENOENT) {
- /* No such entry */
- inode = NULL;
- goto done;
- }
- return ERR_PTR(res);
+ if (res != -ENOENT)
+ inode = ERR_PTR(res);
+ } else {
+ inode = hfs_iget(dir->i_sb, &fd.search_key->cat, &rec);
+ if (!inode)
+ inode = ERR_PTR(-EACCES);
}
- inode = hfs_iget(dir->i_sb, &fd.search_key->cat, &rec);
hfs_find_exit(&fd);
- if (!inode)
- return ERR_PTR(-EACCES);
-done:
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
}
/*
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 2538b49cc349..b3309b83371a 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -543,9 +543,9 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
igrab(dir);
hlist_add_fake(&inode->i_hash);
mark_inode_dirty(inode);
+ dont_mount(dentry);
out:
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
}
void hfs_evict_inode(struct inode *inode)
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 15e06fb552da..b5254378f011 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -122,8 +122,7 @@ again:
if (S_ISREG(inode->i_mode))
HFSPLUS_I(inode)->linkid = linkid;
out:
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
fail:
hfs_find_exit(&fd);
return ERR_PTR(err);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 513c357c734b..a6c0f54c48c3 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -588,6 +588,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
return 0;
out_put_hidden_dir:
+ cancel_delayed_work_sync(&sbi->sync_work);
iput(sbi->hidden_dir);
out_put_root:
dput(sb->s_root);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b9a254dcc0e7..d508c7844681 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -138,10 +138,14 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
/*
* page based offset in vm_pgoff could be sufficiently large to
- * overflow a (l)off_t when converted to byte offset.
+ * overflow a loff_t when converted to byte offset. This can
+ * only happen on architectures where sizeof(loff_t) ==
+ * sizeof(unsigned long). So, only check in those instances.
*/
- if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
- return -EINVAL;
+ if (sizeof(unsigned long) == sizeof(loff_t)) {
+ if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
+ return -EINVAL;
+ }
/* must be huge page aligned */
if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
diff --git a/fs/inode.c b/fs/inode.c
index b153aeaa61ea..3b55391072f3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -178,6 +178,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
mapping->a_ops = &empty_aops;
mapping->host = inode;
mapping->flags = 0;
+ mapping->wb_err = 0;
atomic_set(&mapping->i_mmap_writable, 0);
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
mapping->private_data = NULL;
@@ -348,8 +349,7 @@ EXPORT_SYMBOL(inc_nlink);
static void __address_space_init_once(struct address_space *mapping)
{
- INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC | __GFP_ACCOUNT);
- spin_lock_init(&mapping->tree_lock);
+ INIT_RADIX_TREE(&mapping->i_pages, GFP_ATOMIC | __GFP_ACCOUNT);
init_rwsem(&mapping->i_mmap_rwsem);
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
@@ -504,14 +504,14 @@ EXPORT_SYMBOL(__remove_inode_hash);
void clear_inode(struct inode *inode)
{
/*
- * We have to cycle tree_lock here because reclaim can be still in the
+ * We have to cycle the i_pages lock here because reclaim can be in the
* process of removing the last page (in __delete_from_page_cache())
- * and we must not free mapping under it.
+ * and we must not free the mapping under it.
*/
- spin_lock_irq(&inode->i_data.tree_lock);
+ xa_lock_irq(&inode->i_data.i_pages);
BUG_ON(inode->i_data.nrpages);
BUG_ON(inode->i_data.nrexceptional);
- spin_unlock_irq(&inode->i_data.tree_lock);
+ xa_unlock_irq(&inode->i_data.i_pages);
BUG_ON(!list_empty(&inode->i_data.private_list));
BUG_ON(!(inode->i_state & I_FREEING));
BUG_ON(inode->i_state & I_CLEAR);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 4823431d1c9d..b445b13fc59b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -549,7 +549,7 @@ static int ioctl_fsfreeze(struct file *filp)
{
struct super_block *sb = file_inode(filp)->i_sb;
- if (!capable(CAP_SYS_ADMIN))
+ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
return -EPERM;
/* If filesystem doesn't support freeze feature, return. */
@@ -566,7 +566,7 @@ static int ioctl_fsthaw(struct file *filp)
{
struct super_block *sb = file_inode(filp)->i_sb;
- if (!capable(CAP_SYS_ADMIN))
+ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
return -EPERM;
/* Thaw */
diff --git a/fs/iomap.c b/fs/iomap.c
index afd163586aa0..206539d369a8 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -20,6 +20,7 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
+#include <linux/pagevec.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/backing-dev.h>
@@ -27,6 +28,7 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/dax.h>
#include <linux/sched/signal.h>
+#include <linux/swap.h>
#include "internal.h"
@@ -95,6 +97,12 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
return written ? written : ret;
}
+static sector_t
+iomap_sector(struct iomap *iomap, loff_t pos)
+{
+ return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
+}
+
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
@@ -352,11 +360,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
struct iomap *iomap)
{
- sector_t sector = (iomap->addr +
- (pos & PAGE_MASK) - iomap->offset) >> 9;
-
- return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
- offset, bytes);
+ return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
+ iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
}
static loff_t
@@ -501,10 +506,13 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
case IOMAP_DELALLOC:
flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
break;
+ case IOMAP_MAPPED:
+ break;
case IOMAP_UNWRITTEN:
flags |= FIEMAP_EXTENT_UNWRITTEN;
break;
- case IOMAP_MAPPED:
+ case IOMAP_INLINE:
+ flags |= FIEMAP_EXTENT_DATA_INLINE;
break;
}
@@ -512,8 +520,6 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
flags |= FIEMAP_EXTENT_MERGED;
if (iomap->flags & IOMAP_F_SHARED)
flags |= FIEMAP_EXTENT_SHARED;
- if (iomap->flags & IOMAP_F_DATA_INLINE)
- flags |= FIEMAP_EXTENT_DATA_INLINE;
return fiemap_fill_next_extent(fi, iomap->offset,
iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
@@ -587,6 +593,113 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
}
EXPORT_SYMBOL_GPL(iomap_fiemap);
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
+ * Returns true if found and updates @lastoff to the offset in file.
+ */
+static bool
+page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff,
+ int whence)
+{
+ const struct address_space_operations *ops = inode->i_mapping->a_ops;
+ unsigned int bsize = i_blocksize(inode), off;
+ bool seek_data = whence == SEEK_DATA;
+ loff_t poff = page_offset(page);
+
+ if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE))
+ return false;
+
+ if (*lastoff < poff) {
+ /*
+ * Last offset smaller than the start of the page means we found
+ * a hole:
+ */
+ if (whence == SEEK_HOLE)
+ return true;
+ *lastoff = poff;
+ }
+
+ /*
+ * Just check the page unless we can and should check block ranges:
+ */
+ if (bsize == PAGE_SIZE || !ops->is_partially_uptodate)
+ return PageUptodate(page) == seek_data;
+
+ lock_page(page);
+ if (unlikely(page->mapping != inode->i_mapping))
+ goto out_unlock_not_found;
+
+ for (off = 0; off < PAGE_SIZE; off += bsize) {
+ if ((*lastoff & ~PAGE_MASK) >= off + bsize)
+ continue;
+ if (ops->is_partially_uptodate(page, off, bsize) == seek_data) {
+ unlock_page(page);
+ return true;
+ }
+ *lastoff = poff + off + bsize;
+ }
+
+out_unlock_not_found:
+ unlock_page(page);
+ return false;
+}
+
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
+ *
+ * Within unwritten extents, the page cache determines which parts are holes
+ * and which are data: uptodate buffer heads count as data; everything else
+ * counts as a hole.
+ *
+ * Returns the resulting offset on successs, and -ENOENT otherwise.
+ */
+static loff_t
+page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
+ int whence)
+{
+ pgoff_t index = offset >> PAGE_SHIFT;
+ pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
+ loff_t lastoff = offset;
+ struct pagevec pvec;
+
+ if (length <= 0)
+ return -ENOENT;
+
+ pagevec_init(&pvec);
+
+ do {
+ unsigned nr_pages, i;
+
+ nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
+ end - 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (page_seek_hole_data(inode, page, &lastoff, whence))
+ goto check_range;
+ lastoff = page_offset(page) + PAGE_SIZE;
+ }
+ pagevec_release(&pvec);
+ } while (index < end);
+
+ /* When no page at lastoff and we are not done, we found a hole. */
+ if (whence != SEEK_HOLE)
+ goto not_found;
+
+check_range:
+ if (lastoff < offset + length)
+ goto out;
+not_found:
+ lastoff = -ENOENT;
+out:
+ pagevec_release(&pvec);
+ return lastoff;
+}
+
+
static loff_t
iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
void *data, struct iomap *iomap)
@@ -685,6 +798,8 @@ EXPORT_SYMBOL_GPL(iomap_seek_data);
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
*/
+#define IOMAP_DIO_WRITE_FUA (1 << 28)
+#define IOMAP_DIO_NEED_SYNC (1 << 29)
#define IOMAP_DIO_WRITE (1 << 30)
#define IOMAP_DIO_DIRTY (1 << 31)
@@ -759,6 +874,13 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
dio_warn_stale_pagecache(iocb->ki_filp);
}
+ /*
+ * If this is a DSYNC write, make sure we push it to stable storage now
+ * that we've written data.
+ */
+ if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
+ ret = generic_write_sync(iocb, ret);
+
inode_dio_end(file_inode(iocb->ki_filp));
kfree(dio);
@@ -769,13 +891,8 @@ static void iomap_dio_complete_work(struct work_struct *work)
{
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
struct kiocb *iocb = dio->iocb;
- bool is_write = (dio->flags & IOMAP_DIO_WRITE);
- ssize_t ret;
- ret = iomap_dio_complete(dio);
- if (is_write && ret > 0)
- ret = generic_write_sync(iocb, ret);
- iocb->ki_complete(iocb, ret, 0);
+ iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
}
/*
@@ -833,14 +950,12 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
bio = bio_alloc(GFP_KERNEL, 1);
bio_set_dev(bio, iomap->bdev);
- bio->bi_iter.bi_sector =
- (iomap->addr + pos - iomap->offset) >> 9;
+ bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
get_page(page);
- if (bio_add_page(bio, page, len, 0) != len)
- BUG();
+ __bio_add_page(bio, page, len, 0);
bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
atomic_inc(&dio->ref);
@@ -858,6 +973,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
struct iov_iter iter;
struct bio *bio;
bool need_zeroout = false;
+ bool use_fua = false;
int nr_pages, ret;
size_t copied = 0;
@@ -881,8 +997,20 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
case IOMAP_MAPPED:
if (iomap->flags & IOMAP_F_SHARED)
dio->flags |= IOMAP_DIO_COW;
- if (iomap->flags & IOMAP_F_NEW)
+ if (iomap->flags & IOMAP_F_NEW) {
need_zeroout = true;
+ } else {
+ /*
+ * Use a FUA write if we need datasync semantics, this
+ * is a pure data IO that doesn't require any metadata
+ * updates and the underlying device supports FUA. This
+ * allows us to avoid cache flushes on IO completion.
+ */
+ if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
+ (dio->flags & IOMAP_DIO_WRITE_FUA) &&
+ blk_queue_fua(bdev_get_queue(iomap->bdev)))
+ use_fua = true;
+ }
break;
default:
WARN_ON_ONCE(1);
@@ -916,8 +1044,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
bio = bio_alloc(GFP_KERNEL, nr_pages);
bio_set_dev(bio, iomap->bdev);
- bio->bi_iter.bi_sector =
- (iomap->addr + pos - iomap->offset) >> 9;
+ bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
bio->bi_write_hint = dio->iocb->ki_hint;
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
@@ -930,10 +1057,14 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
n = bio->bi_iter.bi_size;
if (dio->flags & IOMAP_DIO_WRITE) {
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
+ if (use_fua)
+ bio->bi_opf |= REQ_FUA;
+ else
+ dio->flags &= ~IOMAP_DIO_WRITE_FUA;
task_io_account_write(n);
} else {
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bio->bi_opf = REQ_OP_READ;
if (dio->flags & IOMAP_DIO_DIRTY)
bio_set_pages_dirty(bio);
}
@@ -961,6 +1092,15 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
return copied;
}
+/*
+ * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
+ * is being issued as AIO or not. This allows us to optimise pure data writes
+ * to use REQ_FUA rather than requiring generic_write_sync() to issue a
+ * REQ_FLUSH post write. This is slightly tricky because a single request here
+ * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
+ * may be pure data writes. In that case, we still need to do a full data sync
+ * completion.
+ */
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
@@ -1005,8 +1145,21 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iter->type == ITER_IOVEC)
dio->flags |= IOMAP_DIO_DIRTY;
} else {
- dio->flags |= IOMAP_DIO_WRITE;
flags |= IOMAP_WRITE;
+ dio->flags |= IOMAP_DIO_WRITE;
+
+ /* for data sync or sync, we need sync completion processing */
+ if (iocb->ki_flags & IOCB_DSYNC)
+ dio->flags |= IOMAP_DIO_NEED_SYNC;
+
+ /*
+ * For datasync only writes, we optimistically try using FUA for
+ * this IO. Any non-FUA write that occurs will clear this flag,
+ * hence we know before completion whether a cache flush is
+ * necessary.
+ */
+ if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
+ dio->flags |= IOMAP_DIO_WRITE_FUA;
}
if (iocb->ki_flags & IOCB_NOWAIT) {
@@ -1062,6 +1215,13 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (ret < 0)
iomap_dio_set_error(dio, ret);
+ /*
+ * If all the writes we issued were FUA, we don't need to flush the
+ * cache on IO completion. Clear the sync flag for this case.
+ */
+ if (dio->flags & IOMAP_DIO_WRITE_FUA)
+ dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+
if (!atomic_dec_and_test(&dio->ref)) {
if (!is_sync_kiocb(iocb))
return -EIOCBQUEUED;
@@ -1089,3 +1249,203 @@ out_free_dio:
return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
+
+/* Swapfile activation */
+
+#ifdef CONFIG_SWAP
+struct iomap_swapfile_info {
+ struct iomap iomap; /* accumulated iomap */
+ struct swap_info_struct *sis;
+ uint64_t lowest_ppage; /* lowest physical addr seen (pages) */
+ uint64_t highest_ppage; /* highest physical addr seen (pages) */
+ unsigned long nr_pages; /* number of pages collected */
+ int nr_extents; /* extent count */
+};
+
+/*
+ * Collect physical extents for this swap file. Physical extents reported to
+ * the swap code must be trimmed to align to a page boundary. The logical
+ * offset within the file is irrelevant since the swapfile code maps logical
+ * page numbers of the swap device to the physical page-aligned extents.
+ */
+static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
+{
+ struct iomap *iomap = &isi->iomap;
+ unsigned long nr_pages;
+ uint64_t first_ppage;
+ uint64_t first_ppage_reported;
+ uint64_t next_ppage;
+ int error;
+
+ /*
+ * Round the start up and the end down so that the physical
+ * extent aligns to a page boundary.
+ */
+ first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT;
+ next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >>
+ PAGE_SHIFT;
+
+ /* Skip too-short physical extents. */
+ if (first_ppage >= next_ppage)
+ return 0;
+ nr_pages = next_ppage - first_ppage;
+
+ /*
+ * Calculate how much swap space we're adding; the first page contains
+ * the swap header and doesn't count. The mm still wants that first
+ * page fed to add_swap_extent, however.
+ */
+ first_ppage_reported = first_ppage;
+ if (iomap->offset == 0)
+ first_ppage_reported++;
+ if (isi->lowest_ppage > first_ppage_reported)
+ isi->lowest_ppage = first_ppage_reported;
+ if (isi->highest_ppage < (next_ppage - 1))
+ isi->highest_ppage = next_ppage - 1;
+
+ /* Add extent, set up for the next call. */
+ error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage);
+ if (error < 0)
+ return error;
+ isi->nr_extents += error;
+ isi->nr_pages += nr_pages;
+ return 0;
+}
+
+/*
+ * Accumulate iomaps for this swap file. We have to accumulate iomaps because
+ * swap only cares about contiguous page-aligned physical extents and makes no
+ * distinction between written and unwritten extents.
+ */
+static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
+ loff_t count, void *data, struct iomap *iomap)
+{
+ struct iomap_swapfile_info *isi = data;
+ int error;
+
+ switch (iomap->type) {
+ case IOMAP_MAPPED:
+ case IOMAP_UNWRITTEN:
+ /* Only real or unwritten extents. */
+ break;
+ case IOMAP_INLINE:
+ /* No inline data. */
+ pr_err("swapon: file is inline\n");
+ return -EINVAL;
+ default:
+ pr_err("swapon: file has unallocated extents\n");
+ return -EINVAL;
+ }
+
+ /* No uncommitted metadata or shared blocks. */
+ if (iomap->flags & IOMAP_F_DIRTY) {
+ pr_err("swapon: file is not committed\n");
+ return -EINVAL;
+ }
+ if (iomap->flags & IOMAP_F_SHARED) {
+ pr_err("swapon: file has shared extents\n");
+ return -EINVAL;
+ }
+
+ /* Only one bdev per swap file. */
+ if (iomap->bdev != isi->sis->bdev) {
+ pr_err("swapon: file is on multiple devices\n");
+ return -EINVAL;
+ }
+
+ if (isi->iomap.length == 0) {
+ /* No accumulated extent, so just store it. */
+ memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
+ } else if (isi->iomap.addr + isi->iomap.length == iomap->addr) {
+ /* Append this to the accumulated extent. */
+ isi->iomap.length += iomap->length;
+ } else {
+ /* Otherwise, add the retained iomap and store this one. */
+ error = iomap_swapfile_add_extent(isi);
+ if (error)
+ return error;
+ memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
+ }
+ return count;
+}
+
+/*
+ * Iterate a swap file's iomaps to construct physical extents that can be
+ * passed to the swapfile subsystem.
+ */
+int iomap_swapfile_activate(struct swap_info_struct *sis,
+ struct file *swap_file, sector_t *pagespan,
+ const struct iomap_ops *ops)
+{
+ struct iomap_swapfile_info isi = {
+ .sis = sis,
+ .lowest_ppage = (sector_t)-1ULL,
+ };
+ struct address_space *mapping = swap_file->f_mapping;
+ struct inode *inode = mapping->host;
+ loff_t pos = 0;
+ loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE);
+ loff_t ret;
+
+ ret = filemap_write_and_wait(inode->i_mapping);
+ if (ret)
+ return ret;
+
+ while (len > 0) {
+ ret = iomap_apply(inode, pos, len, IOMAP_REPORT,
+ ops, &isi, iomap_swapfile_activate_actor);
+ if (ret <= 0)
+ return ret;
+
+ pos += ret;
+ len -= ret;
+ }
+
+ if (isi.iomap.length) {
+ ret = iomap_swapfile_add_extent(&isi);
+ if (ret)
+ return ret;
+ }
+
+ *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage;
+ sis->max = isi.nr_pages;
+ sis->pages = isi.nr_pages - 1;
+ sis->highest_bit = isi.nr_pages - 1;
+ return isi.nr_extents;
+}
+EXPORT_SYMBOL_GPL(iomap_swapfile_activate);
+#endif /* CONFIG_SWAP */
+
+static loff_t
+iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
+ void *data, struct iomap *iomap)
+{
+ sector_t *bno = data, addr;
+
+ if (iomap->type == IOMAP_MAPPED) {
+ addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits;
+ if (addr > INT_MAX)
+ WARN(1, "would truncate bmap result\n");
+ else
+ *bno = addr;
+ }
+ return 0;
+}
+
+/* legacy ->bmap interface. 0 is the error return (!) */
+sector_t
+iomap_bmap(struct address_space *mapping, sector_t bno,
+ const struct iomap_ops *ops)
+{
+ struct inode *inode = mapping->host;
+ loff_t pos = bno >> inode->i_blkbits;
+ unsigned blocksize = i_blocksize(inode);
+
+ if (filemap_write_and_wait(mapping))
+ return 0;
+
+ bno = 0;
+ iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor);
+ return bno;
+}
+EXPORT_SYMBOL_GPL(iomap_bmap);
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index 9bb2fe35799d..10205ececc27 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -20,6 +20,7 @@
#include <linux/init.h>
#include <linux/bio.h>
+#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/zlib.h>
@@ -59,7 +60,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
>> bufshift;
int haveblocks;
blkcnt_t blocknum;
- struct buffer_head *bhs[needblocks + 1];
+ struct buffer_head **bhs;
int curbh, curpage;
if (block_size > deflateBound(1UL << zisofs_block_shift)) {
@@ -80,7 +81,11 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
/* Because zlib is not thread-safe, do all the I/O at the top. */
blocknum = block_start >> bufshift;
- memset(bhs, 0, (needblocks + 1) * sizeof(struct buffer_head *));
+ bhs = kcalloc(needblocks + 1, sizeof(*bhs), GFP_KERNEL);
+ if (!bhs) {
+ *errp = -ENOMEM;
+ return 0;
+ }
haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks);
ll_rw_block(REQ_OP_READ, 0, haveblocks, bhs);
@@ -190,6 +195,7 @@ z_eio:
b_eio:
for (i = 0; i < haveblocks; i++)
brelse(bhs[i]);
+ kfree(bhs);
return stream.total_out;
}
@@ -305,7 +311,7 @@ static int zisofs_readpage(struct file *file, struct page *page)
unsigned int zisofs_pages_per_cblock =
PAGE_SHIFT <= zisofs_block_shift ?
(1 << (zisofs_block_shift - PAGE_SHIFT)) : 0;
- struct page *pages[max_t(unsigned, zisofs_pages_per_cblock, 1)];
+ struct page **pages;
pgoff_t index = page->index, end_index;
end_index = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -330,6 +336,12 @@ static int zisofs_readpage(struct file *file, struct page *page)
full_page = 0;
pcount = 1;
}
+ pages = kcalloc(max_t(unsigned int, zisofs_pages_per_cblock, 1),
+ sizeof(*pages), GFP_KERNEL);
+ if (!pages) {
+ unlock_page(page);
+ return -ENOMEM;
+ }
pages[full_page] = page;
for (i = 0; i < pcount; i++, index++) {
@@ -357,6 +369,7 @@ static int zisofs_readpage(struct file *file, struct page *page)
}
/* At this point, err contains 0 or -EIO depending on the "critical" page */
+ kfree(pages);
return err;
}
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index bc258a4402f6..ec3fba7d492f 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -394,7 +394,10 @@ static int parse_options(char *options, struct iso9660_options *popt)
break;
#ifdef CONFIG_JOLIET
case Opt_iocharset:
+ kfree(popt->iocharset);
popt->iocharset = match_strdup(&args[0]);
+ if (!popt->iocharset)
+ return 0;
break;
#endif
case Opt_map_a:
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index dfb057900e79..8ef6b6daaa7a 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -114,7 +114,7 @@ void __jbd2_debug(int level, const char *file, const char *func,
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
+ printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf);
va_end(args);
}
EXPORT_SYMBOL(__jbd2_debug);
@@ -2302,8 +2302,7 @@ static void jbd2_journal_destroy_slabs(void)
int i;
for (i = 0; i < JBD2_MAX_SLABS; i++) {
- if (jbd2_slab[i])
- kmem_cache_destroy(jbd2_slab[i]);
+ kmem_cache_destroy(jbd2_slab[i]);
jbd2_slab[i] = NULL;
}
}
@@ -2404,10 +2403,8 @@ static int jbd2_journal_init_journal_head_cache(void)
static void jbd2_journal_destroy_journal_head_cache(void)
{
- if (jbd2_journal_head_cache) {
- kmem_cache_destroy(jbd2_journal_head_cache);
- jbd2_journal_head_cache = NULL;
- }
+ kmem_cache_destroy(jbd2_journal_head_cache);
+ jbd2_journal_head_cache = NULL;
}
/*
@@ -2665,11 +2662,10 @@ static int __init jbd2_journal_init_handle_cache(void)
static void jbd2_journal_destroy_handle_cache(void)
{
- if (jbd2_handle_cache)
- kmem_cache_destroy(jbd2_handle_cache);
- if (jbd2_inode_cache)
- kmem_cache_destroy(jbd2_inode_cache);
-
+ kmem_cache_destroy(jbd2_handle_cache);
+ jbd2_handle_cache = NULL;
+ kmem_cache_destroy(jbd2_inode_cache);
+ jbd2_inode_cache = NULL;
}
/*
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 696ef15ec942..240779e4689c 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -180,14 +180,10 @@ static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal,
void jbd2_journal_destroy_revoke_caches(void)
{
- if (jbd2_revoke_record_cache) {
- kmem_cache_destroy(jbd2_revoke_record_cache);
- jbd2_revoke_record_cache = NULL;
- }
- if (jbd2_revoke_table_cache) {
- kmem_cache_destroy(jbd2_revoke_table_cache);
- jbd2_revoke_table_cache = NULL;
- }
+ kmem_cache_destroy(jbd2_revoke_record_cache);
+ jbd2_revoke_record_cache = NULL;
+ kmem_cache_destroy(jbd2_revoke_table_cache);
+ jbd2_revoke_table_cache = NULL;
}
int __init jbd2_journal_init_revoke_caches(void)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index ac311037d7a5..51dd68e67b0f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -49,10 +49,8 @@ int __init jbd2_journal_init_transaction_cache(void)
void jbd2_journal_destroy_transaction_cache(void)
{
- if (transaction_cache) {
- kmem_cache_destroy(transaction_cache);
- transaction_cache = NULL;
- }
+ kmem_cache_destroy(transaction_cache);
+ transaction_cache = NULL;
}
void jbd2_journal_free_transaction(transaction_t *transaction)
@@ -532,6 +530,7 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
*/
ret = start_this_handle(journal, handle, GFP_NOFS);
if (ret < 0) {
+ handle->h_journal = journal;
jbd2_journal_free_reserved(handle);
return ret;
}
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 0a754f38462e..e5a6deb38e1e 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -209,8 +209,7 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry,
__func__, inode->i_ino, inode->i_mode, inode->i_nlink,
f->inocache->pino_nlink, inode->i_mapping->nrpages);
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
return 0;
fail:
@@ -430,8 +429,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
mutex_unlock(&dir_f->sem);
jffs2_complete_reservation(c);
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
return 0;
fail:
@@ -575,8 +573,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, umode_t mode
mutex_unlock(&dir_f->sem);
jffs2_complete_reservation(c);
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
return 0;
fail:
@@ -747,8 +744,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode
mutex_unlock(&dir_f->sem);
jffs2_complete_reservation(c);
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
return 0;
fail:
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 4a6cf289be24..83b8f06b4a64 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -21,14 +21,6 @@
#include <linux/pagemap.h>
#include "nodelist.h"
-struct erase_priv_struct {
- struct jffs2_eraseblock *jeb;
- struct jffs2_sb_info *c;
-};
-
-#ifndef __ECOS
-static void jffs2_erase_callback(struct erase_info *);
-#endif
static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset);
static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
@@ -51,7 +43,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
jffs2_dbg(1, "%s(): erase block %#08x (range %#08x-%#08x)\n",
__func__,
jeb->offset, jeb->offset, jeb->offset + c->sector_size);
- instr = kmalloc(sizeof(struct erase_info) + sizeof(struct erase_priv_struct), GFP_KERNEL);
+ instr = kmalloc(sizeof(struct erase_info), GFP_KERNEL);
if (!instr) {
pr_warn("kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n");
mutex_lock(&c->erase_free_sem);
@@ -67,18 +59,15 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
memset(instr, 0, sizeof(*instr));
- instr->mtd = c->mtd;
instr->addr = jeb->offset;
instr->len = c->sector_size;
- instr->callback = jffs2_erase_callback;
- instr->priv = (unsigned long)(&instr[1]);
-
- ((struct erase_priv_struct *)instr->priv)->jeb = jeb;
- ((struct erase_priv_struct *)instr->priv)->c = c;
ret = mtd_erase(c->mtd, instr);
- if (!ret)
+ if (!ret) {
+ jffs2_erase_succeeded(c, jeb);
+ kfree(instr);
return;
+ }
bad_offset = instr->fail_addr;
kfree(instr);
@@ -214,22 +203,6 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
wake_up(&c->erase_wait);
}
-#ifndef __ECOS
-static void jffs2_erase_callback(struct erase_info *instr)
-{
- struct erase_priv_struct *priv = (void *)instr->priv;
-
- if(instr->state != MTD_ERASE_DONE) {
- pr_warn("Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n",
- (unsigned long long)instr->addr, instr->state);
- jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr);
- } else {
- jffs2_erase_succeeded(priv->c, priv->jeb);
- }
- kfree(instr);
-}
-#endif /* !__ECOS */
-
/* Hmmm. Maybe we should accept the extra space it takes and make
this a standard doubly-linked list? */
static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index f60dee7faf03..87bdf0f4cba1 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -342,7 +342,7 @@ static void jffs2_put_super (struct super_block *sb)
static void jffs2_kill_sb(struct super_block *sb)
{
struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
- if (!sb_rdonly(sb))
+ if (c && !sb_rdonly(sb))
jffs2_stop_garbage_collect_thread(c);
kill_mtd_super(sb);
kfree(c);
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index a70907606025..35a5b2a81ae0 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -29,7 +29,6 @@
#ifdef PROC_FS_JFS /* see jfs_debug.h */
-static struct proc_dir_entry *base;
#ifdef CONFIG_JFS_DEBUG
static int jfs_loglevel_proc_show(struct seq_file *m, void *v)
{
@@ -66,43 +65,29 @@ static const struct file_operations jfs_loglevel_proc_fops = {
};
#endif
-static struct {
- const char *name;
- const struct file_operations *proc_fops;
-} Entries[] = {
-#ifdef CONFIG_JFS_STATISTICS
- { "lmstats", &jfs_lmstats_proc_fops, },
- { "txstats", &jfs_txstats_proc_fops, },
- { "xtstat", &jfs_xtstat_proc_fops, },
- { "mpstat", &jfs_mpstat_proc_fops, },
-#endif
-#ifdef CONFIG_JFS_DEBUG
- { "TxAnchor", &jfs_txanchor_proc_fops, },
- { "loglevel", &jfs_loglevel_proc_fops }
-#endif
-};
-#define NPROCENT ARRAY_SIZE(Entries)
-
void jfs_proc_init(void)
{
- int i;
+ struct proc_dir_entry *base;
- if (!(base = proc_mkdir("fs/jfs", NULL)))
+ base = proc_mkdir("fs/jfs", NULL);
+ if (!base)
return;
- for (i = 0; i < NPROCENT; i++)
- proc_create(Entries[i].name, 0, base, Entries[i].proc_fops);
+#ifdef CONFIG_JFS_STATISTICS
+ proc_create_single("lmstats", 0, base, jfs_lmstats_proc_show);
+ proc_create_single("txstats", 0, base, jfs_txstats_proc_show);
+ proc_create_single("xtstat", 0, base, jfs_xtstat_proc_show);
+ proc_create_single("mpstat", 0, base, jfs_mpstat_proc_show);
+#endif
+#ifdef CONFIG_JFS_DEBUG
+ proc_create_single("TxAnchor", 0, base, jfs_txanchor_proc_show);
+ proc_create("loglevel", 0, base, &jfs_loglevel_proc_fops);
+#endif
}
void jfs_proc_clean(void)
{
- int i;
-
- if (base) {
- for (i = 0; i < NPROCENT; i++)
- remove_proc_entry(Entries[i].name, base);
- remove_proc_entry("fs/jfs", NULL);
- }
+ remove_proc_subtree("fs/jfs", NULL);
}
#endif /* PROC_FS_JFS */
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
index eafd1300a00b..0d9e35da8462 100644
--- a/fs/jfs/jfs_debug.h
+++ b/fs/jfs/jfs_debug.h
@@ -62,7 +62,7 @@ extern void jfs_proc_clean(void);
extern int jfsloglevel;
-extern const struct file_operations jfs_txanchor_proc_fops;
+int jfs_txanchor_proc_show(struct seq_file *m, void *v);
/* information message: e.g., configuration, major event */
#define jfs_info(fmt, arg...) do { \
@@ -105,10 +105,10 @@ extern const struct file_operations jfs_txanchor_proc_fops;
* ----------
*/
#ifdef CONFIG_JFS_STATISTICS
-extern const struct file_operations jfs_lmstats_proc_fops;
-extern const struct file_operations jfs_txstats_proc_fops;
-extern const struct file_operations jfs_mpstat_proc_fops;
-extern const struct file_operations jfs_xtstat_proc_fops;
+int jfs_lmstats_proc_show(struct seq_file *m, void *v);
+int jfs_txstats_proc_show(struct seq_file *m, void *v);
+int jfs_mpstat_proc_show(struct seq_file *m, void *v);
+int jfs_xtstat_proc_show(struct seq_file *m, void *v);
#define INCREMENT(x) ((x)++)
#define DECREMENT(x) ((x)--)
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 0e5d412c0b01..6b68df395892 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2493,7 +2493,7 @@ exit:
}
#ifdef CONFIG_JFS_STATISTICS
-static int jfs_lmstats_proc_show(struct seq_file *m, void *v)
+int jfs_lmstats_proc_show(struct seq_file *m, void *v)
{
seq_printf(m,
"JFS Logmgr stats\n"
@@ -2510,16 +2510,4 @@ static int jfs_lmstats_proc_show(struct seq_file *m, void *v)
lmStat.partial_page);
return 0;
}
-
-static int jfs_lmstats_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, jfs_lmstats_proc_show, NULL);
-}
-
-const struct file_operations jfs_lmstats_proc_fops = {
- .open = jfs_lmstats_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
#endif /* CONFIG_JFS_STATISTICS */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 1a3b0cc22ad3..fa2c6824c7f2 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -815,7 +815,7 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len)
}
#ifdef CONFIG_JFS_STATISTICS
-static int jfs_mpstat_proc_show(struct seq_file *m, void *v)
+int jfs_mpstat_proc_show(struct seq_file *m, void *v)
{
seq_printf(m,
"JFS Metapage statistics\n"
@@ -828,16 +828,4 @@ static int jfs_mpstat_proc_show(struct seq_file *m, void *v)
mpStat.lockwait);
return 0;
}
-
-static int jfs_mpstat_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, jfs_mpstat_proc_show, NULL);
-}
-
-const struct file_operations jfs_mpstat_proc_fops = {
- .open = jfs_mpstat_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
#endif
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 4d973524c887..a5663cb621d8 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2998,7 +2998,7 @@ int jfs_sync(void *arg)
}
#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG)
-static int jfs_txanchor_proc_show(struct seq_file *m, void *v)
+int jfs_txanchor_proc_show(struct seq_file *m, void *v)
{
char *freewait;
char *freelockwait;
@@ -3032,22 +3032,10 @@ static int jfs_txanchor_proc_show(struct seq_file *m, void *v)
list_empty(&TxAnchor.unlock_queue) ? "" : "not ");
return 0;
}
-
-static int jfs_txanchor_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, jfs_txanchor_proc_show, NULL);
-}
-
-const struct file_operations jfs_txanchor_proc_fops = {
- .open = jfs_txanchor_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
#endif
#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS)
-static int jfs_txstats_proc_show(struct seq_file *m, void *v)
+int jfs_txstats_proc_show(struct seq_file *m, void *v)
{
seq_printf(m,
"JFS TxStats\n"
@@ -3072,16 +3060,4 @@ static int jfs_txstats_proc_show(struct seq_file *m, void *v)
TxStat.txLockAlloc_freelock);
return 0;
}
-
-static int jfs_txstats_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, jfs_txstats_proc_show, NULL);
-}
-
-const struct file_operations jfs_txstats_proc_fops = {
- .open = jfs_txstats_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
#endif
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 5cde6d2fcfca..2c200b5256a6 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -3874,7 +3874,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
}
#ifdef CONFIG_JFS_STATISTICS
-static int jfs_xtstat_proc_show(struct seq_file *m, void *v)
+int jfs_xtstat_proc_show(struct seq_file *m, void *v)
{
seq_printf(m,
"JFS Xtree statistics\n"
@@ -3887,16 +3887,4 @@ static int jfs_xtstat_proc_show(struct seq_file *m, void *v)
xtStat.split);
return 0;
}
-
-static int jfs_xtstat_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, jfs_xtstat_proc_show, NULL);
-}
-
-const struct file_operations jfs_xtstat_proc_fops = {
- .open = jfs_xtstat_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
#endif
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index b41596d71858..56c3fcbfe80e 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -178,8 +178,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode,
unlock_new_inode(ip);
iput(ip);
} else {
- unlock_new_inode(ip);
- d_instantiate(dentry, ip);
+ d_instantiate_new(dentry, ip);
}
out2:
@@ -313,8 +312,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
unlock_new_inode(ip);
iput(ip);
} else {
- unlock_new_inode(ip);
- d_instantiate(dentry, ip);
+ d_instantiate_new(dentry, ip);
}
out2:
@@ -1059,8 +1057,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
unlock_new_inode(ip);
iput(ip);
} else {
- unlock_new_inode(ip);
- d_instantiate(dentry, ip);
+ d_instantiate_new(dentry, ip);
}
out2:
@@ -1447,8 +1444,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
unlock_new_inode(ip);
iput(ip);
} else {
- unlock_new_inode(ip);
- d_instantiate(dentry, ip);
+ d_instantiate_new(dentry, ip);
}
out1:
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index fd5ce883072e..2015d8c45e4a 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -348,11 +348,11 @@ static void kernfs_vma_open(struct vm_area_struct *vma)
kernfs_put_active(of->kn);
}
-static int kernfs_vma_fault(struct vm_fault *vmf)
+static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf)
{
struct file *file = vmf->vma->vm_file;
struct kernfs_open_file *of = kernfs_of(file);
- int ret;
+ vm_fault_t ret;
if (!of->vm_ops)
return VM_FAULT_SIGBUS;
@@ -368,11 +368,11 @@ static int kernfs_vma_fault(struct vm_fault *vmf)
return ret;
}
-static int kernfs_vma_page_mkwrite(struct vm_fault *vmf)
+static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf)
{
struct file *file = vmf->vma->vm_file;
struct kernfs_open_file *of = kernfs_of(file);
- int ret;
+ vm_fault_t ret;
if (!of->vm_ops)
return VM_FAULT_SIGBUS;
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 26dd9a50f383..ff2716f9322e 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -316,6 +316,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
info->root = root;
info->ns = ns;
+ INIT_LIST_HEAD(&info->node);
sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags,
&init_user_ns, info);
diff --git a/fs/libfs.c b/fs/libfs.c
index 7ff3cb904acd..0fb590d79f30 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1060,6 +1060,45 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
}
EXPORT_SYMBOL(noop_fsync);
+int noop_set_page_dirty(struct page *page)
+{
+ /*
+ * Unlike __set_page_dirty_no_writeback that handles dirty page
+ * tracking in the page object, dax does all dirty tracking in
+ * the inode address_space in response to mkwrite faults. In the
+ * dax case we only need to worry about potentially dirty CPU
+ * caches, not dirty page cache pages to write back.
+ *
+ * This callback is defined to prevent fallback to
+ * __set_page_dirty_buffers() in set_page_dirty().
+ */
+ return 0;
+}
+EXPORT_SYMBOL_GPL(noop_set_page_dirty);
+
+void noop_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
+{
+ /*
+ * There is no page cache to invalidate in the dax case, however
+ * we need this callback defined to prevent falling back to
+ * block_invalidatepage() in do_invalidatepage().
+ */
+}
+EXPORT_SYMBOL_GPL(noop_invalidatepage);
+
+ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+ /*
+ * iomap based filesystems support direct I/O without need for
+ * this callback. However, it still needs to be set in
+ * inode->a_ops so that open/fcntl know that direct I/O is
+ * generally supported.
+ */
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(noop_direct_IO);
+
/* Because kfree isn't assignment-compatible with void(void*) ;-/ */
void kfree_link(void *p)
{
diff --git a/fs/locks.c b/fs/locks.c
index 62bbe8b31f26..05e211be8684 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2788,22 +2788,10 @@ static const struct seq_operations locks_seq_operations = {
.show = locks_show,
};
-static int locks_open(struct inode *inode, struct file *filp)
-{
- return seq_open_private(filp, &locks_seq_operations,
- sizeof(struct locks_iterator));
-}
-
-static const struct file_operations proc_locks_operations = {
- .open = locks_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-
static int __init proc_locks_init(void)
{
- proc_create("locks", 0, NULL, &proc_locks_operations);
+ proc_create_seq_private("locks", 0, NULL, &locks_seq_operations,
+ sizeof(struct locks_iterator), NULL);
return 0;
}
fs_initcall(proc_locks_init);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index ccf0f00030bf..1a6084d2b02e 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -28,13 +28,9 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, un
return ERR_PTR(-ENAMETOOLONG);
ino = minix_inode_by_name(dentry);
- if (ino) {
+ if (ino)
inode = minix_iget(dir->i_sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
- }
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
}
static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev)
diff --git a/fs/namei.c b/fs/namei.c
index a09419379f5d..6df1f61855d6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -39,6 +39,7 @@
#include <linux/bitops.h>
#include <linux/init_task.h>
#include <linux/uaccess.h>
+#include <linux/build_bug.h>
#include "internal.h"
#include "mount.h"
@@ -130,6 +131,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
struct filename *result;
char *kname;
int len;
+ BUILD_BUG_ON(offsetof(struct filename, iname) % sizeof(long) != 0);
result = audit_reusename(filename);
if (result)
@@ -222,9 +224,10 @@ getname_kernel(const char * filename)
if (len <= EMBEDDED_NAME_MAX) {
result->name = (char *)result->iname;
} else if (len <= PATH_MAX) {
+ const size_t size = offsetof(struct filename, iname[1]);
struct filename *tmp;
- tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
+ tmp = kmalloc(size, GFP_KERNEL);
if (unlikely(!tmp)) {
__putname(result);
return ERR_PTR(-ENOMEM);
@@ -927,7 +930,8 @@ static inline int may_follow_link(struct nameidata *nd)
if (nd->flags & LOOKUP_RCU)
return -ECHILD;
- audit_log_link_denied("follow_link", &nd->stack[0].link);
+ audit_inode(nd->name, nd->stack[0].link.dentry, 0);
+ audit_log_link_denied("follow_link");
return -EACCES;
}
@@ -980,20 +984,22 @@ static bool safe_hardlink_source(struct inode *inode)
*/
static int may_linkat(struct path *link)
{
- struct inode *inode;
+ struct inode *inode = link->dentry->d_inode;
+
+ /* Inode writeback is not safe when the uid or gid are invalid. */
+ if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+ return -EOVERFLOW;
if (!sysctl_protected_hardlinks)
return 0;
- inode = link->dentry->d_inode;
-
/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
* otherwise, it must be a safe source.
*/
if (safe_hardlink_source(inode) || inode_owner_or_capable(inode))
return 0;
- audit_log_link_denied("linkat", link);
+ audit_log_link_denied("linkat");
return -EPERM;
}
@@ -1434,10 +1440,8 @@ static int path_parent_directory(struct path *path)
static int follow_dotdot(struct nameidata *nd)
{
while(1) {
- if (nd->path.dentry == nd->root.dentry &&
- nd->path.mnt == nd->root.mnt) {
+ if (path_equal(&nd->path, &nd->root))
break;
- }
if (nd->path.dentry != nd->path.mnt->mnt_root) {
int ret = path_parent_directory(&nd->path);
if (ret)
@@ -1594,22 +1598,21 @@ static int lookup_fast(struct nameidata *nd,
}
/* Fast lookup failed, do it the slow way */
-static struct dentry *lookup_slow(const struct qstr *name,
- struct dentry *dir,
- unsigned int flags)
+static struct dentry *__lookup_slow(const struct qstr *name,
+ struct dentry *dir,
+ unsigned int flags)
{
- struct dentry *dentry = ERR_PTR(-ENOENT), *old;
+ struct dentry *dentry, *old;
struct inode *inode = dir->d_inode;
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
- inode_lock_shared(inode);
/* Don't go there if it's already dead */
if (unlikely(IS_DEADDIR(inode)))
- goto out;
+ return ERR_PTR(-ENOENT);
again:
dentry = d_alloc_parallel(dir, name, &wq);
if (IS_ERR(dentry))
- goto out;
+ return dentry;
if (unlikely(!d_in_lookup(dentry))) {
if (!(flags & LOOKUP_NO_REVAL)) {
int error = d_revalidate(dentry, flags);
@@ -1631,11 +1634,21 @@ again:
dentry = old;
}
}
-out:
- inode_unlock_shared(inode);
return dentry;
}
+static struct dentry *lookup_slow(const struct qstr *name,
+ struct dentry *dir,
+ unsigned int flags)
+{
+ struct inode *inode = dir->d_inode;
+ struct dentry *res;
+ inode_lock_shared(inode);
+ res = __lookup_slow(name, dir, flags);
+ inode_unlock_shared(inode);
+ return res;
+}
+
static inline int may_lookup(struct nameidata *nd)
{
if (nd->flags & LOOKUP_RCU) {
@@ -2418,56 +2431,63 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
}
EXPORT_SYMBOL(vfs_path_lookup);
-/**
- * lookup_one_len - filesystem helper to lookup single pathname component
- * @name: pathname component to lookup
- * @base: base directory to lookup from
- * @len: maximum length @len should be interpreted to
- *
- * Note that this routine is purely a helper for filesystem usage and should
- * not be called by generic code.
- *
- * The caller must hold base->i_mutex.
- */
-struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
+static int lookup_one_len_common(const char *name, struct dentry *base,
+ int len, struct qstr *this)
{
- struct qstr this;
- unsigned int c;
- int err;
-
- WARN_ON_ONCE(!inode_is_locked(base->d_inode));
-
- this.name = name;
- this.len = len;
- this.hash = full_name_hash(base, name, len);
+ this->name = name;
+ this->len = len;
+ this->hash = full_name_hash(base, name, len);
if (!len)
- return ERR_PTR(-EACCES);
+ return -EACCES;
if (unlikely(name[0] == '.')) {
if (len < 2 || (len == 2 && name[1] == '.'))
- return ERR_PTR(-EACCES);
+ return -EACCES;
}
while (len--) {
- c = *(const unsigned char *)name++;
+ unsigned int c = *(const unsigned char *)name++;
if (c == '/' || c == '\0')
- return ERR_PTR(-EACCES);
+ return -EACCES;
}
/*
* See if the low-level filesystem might want
* to use its own hash..
*/
if (base->d_flags & DCACHE_OP_HASH) {
- int err = base->d_op->d_hash(base, &this);
+ int err = base->d_op->d_hash(base, this);
if (err < 0)
- return ERR_PTR(err);
+ return err;
}
- err = inode_permission(base->d_inode, MAY_EXEC);
+ return inode_permission(base->d_inode, MAY_EXEC);
+}
+
+/**
+ * lookup_one_len - filesystem helper to lookup single pathname component
+ * @name: pathname component to lookup
+ * @base: base directory to lookup from
+ * @len: maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
+ */
+struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
+{
+ struct dentry *dentry;
+ struct qstr this;
+ int err;
+
+ WARN_ON_ONCE(!inode_is_locked(base->d_inode));
+
+ err = lookup_one_len_common(name, base, len, &this);
if (err)
return ERR_PTR(err);
- return __lookup_hash(&this, base, 0);
+ dentry = lookup_dcache(&this, base, 0);
+ return dentry ? dentry : __lookup_slow(&this, base, 0);
}
EXPORT_SYMBOL(lookup_one_len);
@@ -2487,37 +2507,10 @@ struct dentry *lookup_one_len_unlocked(const char *name,
struct dentry *base, int len)
{
struct qstr this;
- unsigned int c;
int err;
struct dentry *ret;
- this.name = name;
- this.len = len;
- this.hash = full_name_hash(base, name, len);
- if (!len)
- return ERR_PTR(-EACCES);
-
- if (unlikely(name[0] == '.')) {
- if (len < 2 || (len == 2 && name[1] == '.'))
- return ERR_PTR(-EACCES);
- }
-
- while (len--) {
- c = *(const unsigned char *)name++;
- if (c == '/' || c == '\0')
- return ERR_PTR(-EACCES);
- }
- /*
- * See if the low-level filesystem might want
- * to use its own hash..
- */
- if (base->d_flags & DCACHE_OP_HASH) {
- int err = base->d_op->d_hash(base, &this);
- if (err < 0)
- return ERR_PTR(err);
- }
-
- err = inode_permission(base->d_inode, MAY_EXEC);
+ err = lookup_one_len_common(name, base, len, &this);
if (err)
return ERR_PTR(err);
@@ -2756,6 +2749,11 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
BUG_ON(!inode);
BUG_ON(victim->d_parent->d_inode != dir);
+
+ /* Inode writeback is not safe when the uid or gid are invalid. */
+ if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+ return -EOVERFLOW;
+
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
@@ -3684,7 +3682,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
if (error)
return error;
- if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
+ if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
+ !ns_capable(dentry->d_sb->s_user_ns, CAP_MKNOD))
return -EPERM;
if (!dir->i_op->mknod)
@@ -3859,11 +3858,11 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
if (error)
goto out;
- shrink_dcache_parent(dentry);
error = dir->i_op->rmdir(dir, dentry);
if (error)
goto out;
+ shrink_dcache_parent(dentry);
dentry->d_inode->i_flags |= S_DEAD;
dont_mount(dentry);
detach_mounts(dentry);
@@ -4446,8 +4445,6 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
old_dir->i_nlink >= max_links)
goto out;
}
- if (is_dir && !(flags & RENAME_EXCHANGE) && target)
- shrink_dcache_parent(new_dentry);
if (!is_dir) {
error = try_break_deleg(source, delegated_inode);
if (error)
@@ -4464,8 +4461,10 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out;
if (!(flags & RENAME_EXCHANGE) && target) {
- if (is_dir)
+ if (is_dir) {
+ shrink_dcache_parent(new_dentry);
target->i_flags |= S_DEAD;
+ }
dont_mount(new_dentry);
detach_mounts(new_dentry);
}
diff --git a/fs/namespace.c b/fs/namespace.c
index e398f32d7541..8ddd14806799 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1089,7 +1089,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
goto out_free;
}
- mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
+ mnt->mnt.mnt_flags = old->mnt.mnt_flags;
+ mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
/* Don't allow unprivileged users to change mount flags */
if (flag & CL_UNPRIVILEGED) {
mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;
@@ -1589,7 +1590,7 @@ static int do_umount(struct mount *mnt, int flags)
* Special case for "unmounting" root ...
* we just try to remount it readonly.
*/
- if (!capable(CAP_SYS_ADMIN))
+ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
return -EPERM;
down_write(&sb->s_umount);
if (!sb_rdonly(sb))
@@ -2332,7 +2333,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
down_write(&sb->s_umount);
if (ms_flags & MS_BIND)
err = change_mount_flags(path->mnt, ms_flags);
- else if (!capable(CAP_SYS_ADMIN))
+ else if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
err = -EPERM;
else
err = do_remount_sb(sb, sb_flags, data, 0);
@@ -2814,7 +2815,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
mnt_flags |= MNT_NODIRATIME;
if (flags & MS_STRICTATIME)
mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
- if (flags & SB_RDONLY)
+ if (flags & MS_RDONLY)
mnt_flags |= MNT_READONLY;
/* The default atime for remount is preservation */
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 123c069429a7..a813979b5be0 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -535,35 +535,10 @@ static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char
return 0;
}
-#define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE)
-#define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY)
-static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, __be32 **savep)
+static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, size_t sz)
{
- __be32 bm[2];
- __be32 *p;
-
- bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0);
- bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1);
- if (bm[1] != 0) {
- p = xdr_reserve_space(xdr, 16);
- if (unlikely(p == NULL))
- return htonl(NFS4ERR_RESOURCE);
- *p++ = htonl(2);
- *p++ = bm[0];
- *p++ = bm[1];
- } else if (bm[0] != 0) {
- p = xdr_reserve_space(xdr, 12);
- if (unlikely(p == NULL))
- return htonl(NFS4ERR_RESOURCE);
- *p++ = htonl(1);
- *p++ = bm[0];
- } else {
- p = xdr_reserve_space(xdr, 8);
- if (unlikely(p == NULL))
- return htonl(NFS4ERR_RESOURCE);
- *p++ = htonl(0);
- }
- *savep = p;
+ if (xdr_stream_encode_uint32_array(xdr, bitmap, sz) < 0)
+ return cpu_to_be32(NFS4ERR_RESOURCE);
return 0;
}
@@ -656,9 +631,13 @@ static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr,
if (unlikely(status != 0))
goto out;
- status = encode_attr_bitmap(xdr, res->bitmap, &savep);
+ status = encode_attr_bitmap(xdr, res->bitmap, ARRAY_SIZE(res->bitmap));
if (unlikely(status != 0))
goto out;
+ status = cpu_to_be32(NFS4ERR_RESOURCE);
+ savep = xdr_reserve_space(xdr, sizeof(*savep));
+ if (unlikely(!savep))
+ goto out;
status = encode_attr_change(xdr, res->bitmap, res->change_attr);
if (unlikely(status != 0))
goto out;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index b9129e2befea..bbc91d7ca1bd 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1067,7 +1067,6 @@ void nfs_clients_init(struct net *net)
}
#ifdef CONFIG_PROC_FS
-static int nfs_server_list_open(struct inode *inode, struct file *file);
static void *nfs_server_list_start(struct seq_file *p, loff_t *pos);
static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos);
static void nfs_server_list_stop(struct seq_file *p, void *v);
@@ -1080,14 +1079,6 @@ static const struct seq_operations nfs_server_list_ops = {
.show = nfs_server_list_show,
};
-static const struct file_operations nfs_server_list_fops = {
- .open = nfs_server_list_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_net,
-};
-
-static int nfs_volume_list_open(struct inode *inode, struct file *file);
static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos);
static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos);
static void nfs_volume_list_stop(struct seq_file *p, void *v);
@@ -1100,23 +1091,6 @@ static const struct seq_operations nfs_volume_list_ops = {
.show = nfs_volume_list_show,
};
-static const struct file_operations nfs_volume_list_fops = {
- .open = nfs_volume_list_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_net,
-};
-
-/*
- * open "/proc/fs/nfsfs/servers" which provides a summary of servers with which
- * we're dealing
- */
-static int nfs_server_list_open(struct inode *inode, struct file *file)
-{
- return seq_open_net(inode, file, &nfs_server_list_ops,
- sizeof(struct seq_net_private));
-}
-
/*
* set up the iterator to start reading from the server list and return the first item
*/
@@ -1185,15 +1159,6 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
}
/*
- * open "/proc/fs/nfsfs/volumes" which provides a summary of extant volumes
- */
-static int nfs_volume_list_open(struct inode *inode, struct file *file)
-{
- return seq_open_net(inode, file, &nfs_volume_list_ops,
- sizeof(struct seq_net_private));
-}
-
-/*
* set up the iterator to start reading from the volume list and return the first item
*/
static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
@@ -1278,14 +1243,14 @@ int nfs_fs_proc_net_init(struct net *net)
goto error_0;
/* a file of servers with which we're dealing */
- p = proc_create("servers", S_IFREG|S_IRUGO,
- nn->proc_nfsfs, &nfs_server_list_fops);
+ p = proc_create_net("servers", S_IFREG|S_IRUGO, nn->proc_nfsfs,
+ &nfs_server_list_ops, sizeof(struct seq_net_private));
if (!p)
goto error_1;
/* a file of volumes that we have mounted */
- p = proc_create("volumes", S_IFREG|S_IRUGO,
- nn->proc_nfsfs, &nfs_volume_list_fops);
+ p = proc_create_net("volumes", S_IFREG|S_IRUGO, nn->proc_nfsfs,
+ &nfs_volume_list_ops, sizeof(struct seq_net_private));
if (!p)
goto error_1;
return 0;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index d8b47624fee2..1819d0d0ba4b 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -19,6 +19,7 @@
#include <linux/nfs_xdr.h>
#include "nfs4_fs.h"
+#include "nfs4session.h"
#include "delegation.h"
#include "internal.h"
#include "nfs4trace.h"
@@ -171,11 +172,15 @@ again:
* nfs_inode_reclaim_delegation - process a delegation reclaim request
* @inode: inode to process
* @cred: credential to use for request
- * @res: new delegation state from server
+ * @type: delegation type
+ * @stateid: delegation stateid
+ * @pagemod_limit: write delegation "space_limit"
*
*/
void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
- struct nfs_openres *res)
+ fmode_t type,
+ const nfs4_stateid *stateid,
+ unsigned long pagemod_limit)
{
struct nfs_delegation *delegation;
struct rpc_cred *oldcred = NULL;
@@ -185,9 +190,9 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
if (delegation != NULL) {
spin_lock(&delegation->lock);
if (delegation->inode != NULL) {
- nfs4_stateid_copy(&delegation->stateid, &res->delegation);
- delegation->type = res->delegation_type;
- delegation->pagemod_limit = res->pagemod_limit;
+ nfs4_stateid_copy(&delegation->stateid, stateid);
+ delegation->type = type;
+ delegation->pagemod_limit = pagemod_limit;
oldcred = delegation->cred;
delegation->cred = get_rpccred(cred);
clear_bit(NFS_DELEGATION_NEED_RECLAIM,
@@ -195,14 +200,14 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
spin_unlock(&delegation->lock);
rcu_read_unlock();
put_rpccred(oldcred);
- trace_nfs4_reclaim_delegation(inode, res->delegation_type);
+ trace_nfs4_reclaim_delegation(inode, type);
return;
}
/* We appear to have raced with a delegation return. */
spin_unlock(&delegation->lock);
}
rcu_read_unlock();
- nfs_inode_set_delegation(inode, cred, res);
+ nfs_inode_set_delegation(inode, cred, type, stateid, pagemod_limit);
}
static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
@@ -329,11 +334,16 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation,
* nfs_inode_set_delegation - set up a delegation on an inode
* @inode: inode to which delegation applies
* @cred: cred to use for subsequent delegation processing
- * @res: new delegation state from server
+ * @type: delegation type
+ * @stateid: delegation stateid
+ * @pagemod_limit: write delegation "space_limit"
*
* Returns zero on success, or a negative errno value.
*/
-int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
+int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred,
+ fmode_t type,
+ const nfs4_stateid *stateid,
+ unsigned long pagemod_limit)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs_client *clp = server->nfs_client;
@@ -345,9 +355,9 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
if (delegation == NULL)
return -ENOMEM;
- nfs4_stateid_copy(&delegation->stateid, &res->delegation);
- delegation->type = res->delegation_type;
- delegation->pagemod_limit = res->pagemod_limit;
+ nfs4_stateid_copy(&delegation->stateid, stateid);
+ delegation->type = type;
+ delegation->pagemod_limit = pagemod_limit;
delegation->change_attr = inode_peek_iversion_raw(inode);
delegation->cred = get_rpccred(cred);
delegation->inode = inode;
@@ -392,7 +402,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
rcu_assign_pointer(nfsi->delegation, delegation);
delegation = NULL;
- trace_nfs4_set_delegation(inode, res->delegation_type);
+ trace_nfs4_set_delegation(inode, type);
out:
spin_unlock(&clp->cl_lock);
@@ -547,6 +557,22 @@ int nfs4_inode_return_delegation(struct inode *inode)
return err;
}
+/**
+ * nfs4_inode_make_writeable
+ * @inode: pointer to inode
+ *
+ * Make the inode writeable by returning the delegation if necessary
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs4_inode_make_writeable(struct inode *inode)
+{
+ if (!nfs4_has_session(NFS_SERVER(inode)->nfs_client) ||
+ !nfs4_check_delegation(inode, FMODE_WRITE))
+ return nfs4_inode_return_delegation(inode);
+ return 0;
+}
+
static void nfs_mark_return_if_closed_delegation(struct nfs_server *server,
struct nfs_delegation *delegation)
{
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 185a09f37a89..bb1ef8c37af4 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -36,8 +36,10 @@ enum {
NFS_DELEGATION_TEST_EXPIRED,
};
-int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
-void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
+int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred,
+ fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit);
+void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
+ fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit);
int nfs4_inode_return_delegation(struct inode *inode);
int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
void nfs_inode_return_delegation_noreclaim(struct inode *inode);
@@ -70,6 +72,7 @@ int nfs4_check_delegation(struct inode *inode, fmode_t flags);
bool nfs4_delegation_flush_on_close(const struct inode *inode);
void nfs_inode_find_delegation_state_and_recover(struct inode *inode,
const nfs4_stateid *stateid);
+int nfs4_inode_make_writeable(struct inode *inode);
#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2f3f86726f5b..73f8b43d988c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1272,7 +1272,9 @@ static void nfs_drop_nlink(struct inode *inode)
/* drop the inode if we're reasonably sure this is the last link */
if (inode->i_nlink == 1)
clear_nlink(inode);
- NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
+ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE
+ | NFS_INO_INVALID_CTIME
+ | NFS_INO_INVALID_OTHER;
spin_unlock(&inode->i_lock);
}
@@ -1798,12 +1800,11 @@ static int nfs_safe_remove(struct dentry *dentry)
trace_nfs_remove_enter(dir, dentry);
if (inode != NULL) {
- NFS_PROTO(inode)->return_delegation(inode);
- error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
+ error = NFS_PROTO(dir)->remove(dir, dentry);
if (error == 0)
nfs_drop_nlink(inode);
} else
- error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
+ error = NFS_PROTO(dir)->remove(dir, dentry);
if (error == -ENOENT)
nfs_dentry_handle_enoent(dentry);
trace_nfs_remove_exit(dir, dentry, error);
@@ -1932,8 +1933,6 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
old_dentry, dentry);
trace_nfs_link_enter(inode, dir, dentry);
- NFS_PROTO(inode)->return_delegation(inode);
-
d_drop(dentry);
error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
if (error == 0) {
@@ -2023,10 +2022,6 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
}
- NFS_PROTO(old_inode)->return_delegation(old_inode);
- if (new_inode != NULL)
- NFS_PROTO(new_inode)->return_delegation(new_inode);
-
task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
if (IS_ERR(task)) {
error = PTR_ERR(task);
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 0ee4b93d36ea..1c5d8d31fc0a 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -50,59 +50,6 @@ void nfs_fscache_unregister(void)
}
/*
- * Layout of the key for an NFS server cache object.
- */
-struct nfs_server_key {
- uint16_t nfsversion; /* NFS protocol version */
- uint16_t family; /* address family */
- uint16_t port; /* IP port */
- union {
- struct in_addr ipv4_addr; /* IPv4 address */
- struct in6_addr ipv6_addr; /* IPv6 address */
- } addr[0];
-};
-
-/*
- * Generate a key to describe a server in the main NFS index
- * - We return the length of the key, or 0 if we can't generate one
- */
-static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct nfs_client *clp = cookie_netfs_data;
- const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr;
- const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr;
- struct nfs_server_key *key = buffer;
- uint16_t len = sizeof(struct nfs_server_key);
-
- memset(key, 0, len);
- key->nfsversion = clp->rpc_ops->version;
- key->family = clp->cl_addr.ss_family;
-
- switch (clp->cl_addr.ss_family) {
- case AF_INET:
- key->port = sin->sin_port;
- key->addr[0].ipv4_addr = sin->sin_addr;
- len += sizeof(key->addr[0].ipv4_addr);
- break;
-
- case AF_INET6:
- key->port = sin6->sin6_port;
- key->addr[0].ipv6_addr = sin6->sin6_addr;
- len += sizeof(key->addr[0].ipv6_addr);
- break;
-
- default:
- printk(KERN_WARNING "NFS: Unknown network family '%d'\n",
- clp->cl_addr.ss_family);
- len = 0;
- break;
- }
-
- return len;
-}
-
-/*
* Define the server object for FS-Cache. This is used to describe a server
* object to fscache_acquire_cookie(). It is keyed by the NFS protocol and
* server address parameters.
@@ -110,33 +57,9 @@ static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
const struct fscache_cookie_def nfs_fscache_server_index_def = {
.name = "NFS.server",
.type = FSCACHE_COOKIE_TYPE_INDEX,
- .get_key = nfs_server_get_key,
};
/*
- * Generate a key to describe a superblock key in the main NFS index
- */
-static uint16_t nfs_super_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct nfs_fscache_key *key;
- const struct nfs_server *nfss = cookie_netfs_data;
- uint16_t len;
-
- key = nfss->fscache_key;
- len = sizeof(key->key) + key->key.uniq_len;
- if (len > bufmax) {
- len = 0;
- } else {
- memcpy(buffer, &key->key, sizeof(key->key));
- memcpy(buffer + sizeof(key->key),
- key->key.uniquifier, key->key.uniq_len);
- }
-
- return len;
-}
-
-/*
* Define the superblock object for FS-Cache. This is used to describe a
* superblock object to fscache_acquire_cookie(). It is keyed by all the NFS
* parameters that might cause a separate superblock.
@@ -144,84 +67,9 @@ static uint16_t nfs_super_get_key(const void *cookie_netfs_data,
const struct fscache_cookie_def nfs_fscache_super_index_def = {
.name = "NFS.super",
.type = FSCACHE_COOKIE_TYPE_INDEX,
- .get_key = nfs_super_get_key,
};
/*
- * Definition of the auxiliary data attached to NFS inode storage objects
- * within the cache.
- *
- * The contents of this struct are recorded in the on-disk local cache in the
- * auxiliary data attached to the data storage object backing an inode. This
- * permits coherency to be managed when a new inode binds to an already extant
- * cache object.
- */
-struct nfs_fscache_inode_auxdata {
- struct timespec mtime;
- struct timespec ctime;
- loff_t size;
- u64 change_attr;
-};
-
-/*
- * Generate a key to describe an NFS inode in an NFS server's index
- */
-static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- const struct nfs_inode *nfsi = cookie_netfs_data;
- uint16_t nsize;
-
- /* use the inode's NFS filehandle as the key */
- nsize = nfsi->fh.size;
- memcpy(buffer, nfsi->fh.data, nsize);
- return nsize;
-}
-
-/*
- * Get certain file attributes from the netfs data
- * - This function can be absent for an index
- * - Not permitted to return an error
- * - The netfs data from the cookie being used as the source is presented
- */
-static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data,
- uint64_t *size)
-{
- const struct nfs_inode *nfsi = cookie_netfs_data;
-
- *size = nfsi->vfs_inode.i_size;
-}
-
-/*
- * Get the auxiliary data from netfs data
- * - This function can be absent if the index carries no state data
- * - Should store the auxiliary data in the buffer
- * - Should return the amount of amount stored
- * - Not permitted to return an error
- * - The netfs data from the cookie being used as the source is presented
- */
-static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
- void *buffer, uint16_t bufmax)
-{
- struct nfs_fscache_inode_auxdata auxdata;
- const struct nfs_inode *nfsi = cookie_netfs_data;
-
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.size = nfsi->vfs_inode.i_size;
- auxdata.mtime = nfsi->vfs_inode.i_mtime;
- auxdata.ctime = nfsi->vfs_inode.i_ctime;
-
- if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
- auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode);
-
- if (bufmax > sizeof(auxdata))
- bufmax = sizeof(auxdata);
-
- memcpy(buffer, &auxdata, bufmax);
- return bufmax;
-}
-
-/*
* Consult the netfs about the state of an object
* - This function can be absent if the index carries no state data
* - The netfs data from the cookie being used as the target is
@@ -230,7 +78,8 @@ static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
static
enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
const void *data,
- uint16_t datalen)
+ uint16_t datalen,
+ loff_t object_size)
{
struct nfs_fscache_inode_auxdata auxdata;
struct nfs_inode *nfsi = cookie_netfs_data;
@@ -239,7 +88,6 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
return FSCACHE_CHECKAUX_OBSOLETE;
memset(&auxdata, 0, sizeof(auxdata));
- auxdata.size = nfsi->vfs_inode.i_size;
auxdata.mtime = nfsi->vfs_inode.i_mtime;
auxdata.ctime = nfsi->vfs_inode.i_ctime;
@@ -288,9 +136,6 @@ static void nfs_fh_put_context(void *cookie_netfs_data, void *context)
const struct fscache_cookie_def nfs_fscache_inode_object_def = {
.name = "NFS.fh",
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
- .get_key = nfs_fscache_inode_get_key,
- .get_attr = nfs_fscache_inode_get_attr,
- .get_aux = nfs_fscache_inode_get_aux,
.check_aux = nfs_fscache_inode_check_aux,
.get_context = nfs_fh_get_context,
.put_context = nfs_fh_put_context,
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index d63bea8bbfbb..b55fc7920c3b 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -18,6 +18,7 @@
#include <linux/in6.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
+#include <linux/iversion.h>
#include "internal.h"
#include "iostat.h"
@@ -29,6 +30,21 @@ static struct rb_root nfs_fscache_keys = RB_ROOT;
static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
/*
+ * Layout of the key for an NFS server cache object.
+ */
+struct nfs_server_key {
+ struct {
+ uint16_t nfsversion; /* NFS protocol version */
+ uint16_t family; /* address family */
+ __be16 port; /* IP port */
+ } hdr;
+ union {
+ struct in_addr ipv4_addr; /* IPv4 address */
+ struct in6_addr ipv6_addr; /* IPv6 address */
+ };
+} __packed;
+
+/*
* Get the per-client index cookie for an NFS client if the appropriate mount
* flag was set
* - We always try and get an index cookie for the client, but get filehandle
@@ -36,10 +52,41 @@ static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
*/
void nfs_fscache_get_client_cookie(struct nfs_client *clp)
{
+ const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr;
+ const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr;
+ struct nfs_server_key key;
+ uint16_t len = sizeof(key.hdr);
+
+ memset(&key, 0, sizeof(key));
+ key.hdr.nfsversion = clp->rpc_ops->version;
+ key.hdr.family = clp->cl_addr.ss_family;
+
+ switch (clp->cl_addr.ss_family) {
+ case AF_INET:
+ key.hdr.port = sin->sin_port;
+ key.ipv4_addr = sin->sin_addr;
+ len += sizeof(key.ipv4_addr);
+ break;
+
+ case AF_INET6:
+ key.hdr.port = sin6->sin6_port;
+ key.ipv6_addr = sin6->sin6_addr;
+ len += sizeof(key.ipv6_addr);
+ break;
+
+ default:
+ printk(KERN_WARNING "NFS: Unknown network family '%d'\n",
+ clp->cl_addr.ss_family);
+ clp->fscache = NULL;
+ return;
+ }
+
/* create a cache index for looking up filehandles */
clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
&nfs_fscache_server_index_def,
- clp, true);
+ &key, len,
+ NULL, 0,
+ clp, 0, true);
dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",
clp, clp->fscache);
}
@@ -52,7 +99,7 @@ void nfs_fscache_release_client_cookie(struct nfs_client *clp)
dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n",
clp, clp->fscache);
- fscache_relinquish_cookie(clp->fscache, 0);
+ fscache_relinquish_cookie(clp->fscache, NULL, false);
clp->fscache = NULL;
}
@@ -139,7 +186,9 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int
/* create a cache index for looking up filehandles */
nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
&nfs_fscache_super_index_def,
- nfss, true);
+ key, sizeof(*key) + ulen,
+ NULL, 0,
+ nfss, 0, true);
dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
nfss, nfss->fscache);
return;
@@ -163,7 +212,7 @@ void nfs_fscache_release_super_cookie(struct super_block *sb)
dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n",
nfss, nfss->fscache);
- fscache_relinquish_cookie(nfss->fscache, 0);
+ fscache_relinquish_cookie(nfss->fscache, NULL, false);
nfss->fscache = NULL;
if (nfss->fscache_key) {
@@ -180,14 +229,25 @@ void nfs_fscache_release_super_cookie(struct super_block *sb)
*/
void nfs_fscache_init_inode(struct inode *inode)
{
+ struct nfs_fscache_inode_auxdata auxdata;
struct nfs_inode *nfsi = NFS_I(inode);
nfsi->fscache = NULL;
if (!S_ISREG(inode->i_mode))
return;
+
+ memset(&auxdata, 0, sizeof(auxdata));
+ auxdata.mtime = nfsi->vfs_inode.i_mtime;
+ auxdata.ctime = nfsi->vfs_inode.i_ctime;
+
+ if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
+ auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode);
+
nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache,
&nfs_fscache_inode_object_def,
- nfsi, false);
+ nfsi->fh.data, nfsi->fh.size,
+ &auxdata, sizeof(auxdata),
+ nfsi, nfsi->vfs_inode.i_size, false);
}
/*
@@ -195,12 +255,16 @@ void nfs_fscache_init_inode(struct inode *inode)
*/
void nfs_fscache_clear_inode(struct inode *inode)
{
+ struct nfs_fscache_inode_auxdata auxdata;
struct nfs_inode *nfsi = NFS_I(inode);
struct fscache_cookie *cookie = nfs_i_fscache(inode);
dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie);
- fscache_relinquish_cookie(cookie, false);
+ memset(&auxdata, 0, sizeof(auxdata));
+ auxdata.mtime = nfsi->vfs_inode.i_mtime;
+ auxdata.ctime = nfsi->vfs_inode.i_ctime;
+ fscache_relinquish_cookie(cookie, &auxdata, false);
nfsi->fscache = NULL;
}
@@ -232,20 +296,26 @@ static bool nfs_fscache_can_enable(void *data)
*/
void nfs_fscache_open_file(struct inode *inode, struct file *filp)
{
+ struct nfs_fscache_inode_auxdata auxdata;
struct nfs_inode *nfsi = NFS_I(inode);
struct fscache_cookie *cookie = nfs_i_fscache(inode);
if (!fscache_cookie_valid(cookie))
return;
+ memset(&auxdata, 0, sizeof(auxdata));
+ auxdata.mtime = nfsi->vfs_inode.i_mtime;
+ auxdata.ctime = nfsi->vfs_inode.i_ctime;
+
if (inode_is_open_for_write(inode)) {
dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi);
clear_bit(NFS_INO_FSCACHE, &nfsi->flags);
- fscache_disable_cookie(cookie, true);
+ fscache_disable_cookie(cookie, &auxdata, true);
fscache_uncache_all_inode_pages(cookie, inode);
} else {
dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi);
- fscache_enable_cookie(cookie, nfs_fscache_can_enable, inode);
+ fscache_enable_cookie(cookie, &auxdata, nfsi->vfs_inode.i_size,
+ nfs_fscache_can_enable, inode);
if (fscache_cookie_enabled(cookie))
set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
}
@@ -422,7 +492,8 @@ void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
"NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n",
nfs_i_fscache(inode), page, page->index, page->flags, sync);
- ret = fscache_write_page(nfs_i_fscache(inode), page, GFP_KERNEL);
+ ret = fscache_write_page(nfs_i_fscache(inode), page,
+ inode->i_size, GFP_KERNEL);
dfprintk(FSCACHE,
"NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
page, page->index, page->flags, ret);
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index d7fe3e799f2f..161ba2edb9d0 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -57,6 +57,21 @@ struct nfs_fscache_key {
};
/*
+ * Definition of the auxiliary data attached to NFS inode storage objects
+ * within the cache.
+ *
+ * The contents of this struct are recorded in the on-disk local cache in the
+ * auxiliary data attached to the data storage object backing an inode. This
+ * permits coherency to be managed when a new inode binds to an already extant
+ * cache object.
+ */
+struct nfs_fscache_inode_auxdata {
+ struct timespec mtime;
+ struct timespec ctime;
+ u64 change_attr;
+};
+
+/*
* fscache-index.c
*/
extern struct fscache_netfs nfs_fscache_netfs;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d17a90c4fa37..bd15d0b57626 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -195,7 +195,10 @@ bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags)
static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
{
struct nfs_inode *nfsi = NFS_I(inode);
+ bool have_delegation = nfs_have_delegated_attributes(inode);
+ if (have_delegation)
+ flags &= ~(NFS_INO_INVALID_CHANGE|NFS_INO_REVAL_PAGECACHE);
if (inode->i_mapping->nrpages == 0)
flags &= ~NFS_INO_INVALID_DATA;
nfsi->cache_validity |= flags;
@@ -447,7 +450,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
inode->i_mode = fattr->mode;
if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
&& nfs_server_capable(inode, NFS_CAP_MODE))
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
/* Why so? Because we want revalidate for devices/FIFOs, and
* that's precisely what we have in nfs_file_inode_operations.
*/
@@ -493,37 +496,35 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
inode->i_atime = fattr->atime;
else if (nfs_server_capable(inode, NFS_CAP_ATIME))
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
if (fattr->valid & NFS_ATTR_FATTR_MTIME)
inode->i_mtime = fattr->mtime;
else if (nfs_server_capable(inode, NFS_CAP_MTIME))
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
if (fattr->valid & NFS_ATTR_FATTR_CTIME)
inode->i_ctime = fattr->ctime;
else if (nfs_server_capable(inode, NFS_CAP_CTIME))
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME);
if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
inode_set_iversion_raw(inode, fattr->change_attr);
else
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
- | NFS_INO_REVAL_PAGECACHE);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE);
if (fattr->valid & NFS_ATTR_FATTR_SIZE)
inode->i_size = nfs_size_to_loff_t(fattr->size);
else
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
- | NFS_INO_REVAL_PAGECACHE);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_SIZE);
if (fattr->valid & NFS_ATTR_FATTR_NLINK)
set_nlink(inode, fattr->nlink);
else if (nfs_server_capable(inode, NFS_CAP_NLINK))
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
if (fattr->valid & NFS_ATTR_FATTR_OWNER)
inode->i_uid = fattr->uid;
else if (nfs_server_capable(inode, NFS_CAP_OWNER))
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
if (fattr->valid & NFS_ATTR_FATTR_GROUP)
inode->i_gid = fattr->gid;
else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
inode->i_blocks = fattr->du.nfs2.blocks;
if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
@@ -608,11 +609,6 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
goto out;
}
- /*
- * Return any delegations if we're going to change ACLs
- */
- if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
- NFS_PROTO(inode)->return_delegation(inode);
error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
if (error == 0)
error = nfs_refresh_inode(inode, fattr);
@@ -645,6 +641,7 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
/* Optimisation */
if (offset == 0)
NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE;
spin_unlock(&inode->i_lock);
truncate_pagecache(inode, offset);
@@ -657,6 +654,7 @@ out:
* nfs_setattr_update_inode - Update inode metadata after a setattr call.
* @inode: pointer to struct inode
* @attr: pointer to struct iattr
+ * @fattr: pointer to struct nfs_fattr
*
* Note: we do this in the *proc.c in order to ensure that
* it works for things like exclusive creates too.
@@ -669,6 +667,8 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
spin_lock(&inode->i_lock);
NFS_I(inode)->attr_gencount = fattr->gencount;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE
+ | NFS_INO_INVALID_CTIME);
if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) {
if ((attr->ia_valid & ATTR_MODE) != 0) {
int mode = attr->ia_mode & S_IALLUGO;
@@ -683,13 +683,12 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
| NFS_INO_INVALID_ACL);
}
if ((attr->ia_valid & ATTR_SIZE) != 0) {
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
nfs_vmtruncate(inode, attr->ia_size);
}
if (fattr->valid)
nfs_update_inode(inode, fattr);
- else
- NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
@@ -1303,24 +1302,20 @@ static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi)
return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi);
}
-static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
{
- unsigned long ret = 0;
-
if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
&& (fattr->valid & NFS_ATTR_FATTR_CHANGE)
&& inode_eq_iversion_raw(inode, fattr->pre_change_attr)) {
inode_set_iversion_raw(inode, fattr->change_attr);
if (S_ISDIR(inode->i_mode))
nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
- ret |= NFS_INO_INVALID_ATTR;
}
/* If we have atomic WCC data, we may update some attributes */
if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
&& (fattr->valid & NFS_ATTR_FATTR_CTIME)
&& timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
- ret |= NFS_INO_INVALID_ATTR;
}
if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
@@ -1329,17 +1324,13 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
if (S_ISDIR(inode->i_mode))
nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
- ret |= NFS_INO_INVALID_ATTR;
}
if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
&& (fattr->valid & NFS_ATTR_FATTR_SIZE)
&& i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
&& !nfs_have_writebacks(inode)) {
i_size_write(inode, nfs_size_to_loff_t(fattr->size));
- ret |= NFS_INO_INVALID_ATTR;
}
-
- return ret;
}
/**
@@ -1369,33 +1360,41 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
if (!nfs_file_has_buffered_writers(nfsi)) {
/* Verify a few of the more important attributes */
if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr))
- invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE;
+ invalid |= NFS_INO_INVALID_CHANGE
+ | NFS_INO_REVAL_PAGECACHE;
if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
- invalid |= NFS_INO_INVALID_ATTR;
+ invalid |= NFS_INO_INVALID_MTIME;
if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime))
- invalid |= NFS_INO_INVALID_ATTR;
+ invalid |= NFS_INO_INVALID_CTIME;
if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
cur_size = i_size_read(inode);
new_isize = nfs_size_to_loff_t(fattr->size);
if (cur_size != new_isize)
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ invalid |= NFS_INO_INVALID_SIZE
+ | NFS_INO_REVAL_PAGECACHE;
}
}
/* Have any file permissions changed? */
if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
- invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+ invalid |= NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_INVALID_OTHER;
if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid))
- invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+ invalid |= NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_INVALID_OTHER;
if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid))
- invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+ invalid |= NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_INVALID_OTHER;
/* Has the link count changed? */
if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
- invalid |= NFS_INO_INVALID_ATTR;
+ invalid |= NFS_INO_INVALID_OTHER;
if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime))
invalid |= NFS_INO_INVALID_ATIME;
@@ -1597,10 +1596,9 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
}
EXPORT_SYMBOL_GPL(nfs_refresh_inode);
-static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
+static int nfs_post_op_update_inode_locked(struct inode *inode,
+ struct nfs_fattr *fattr, unsigned int invalid)
{
- unsigned long invalid = NFS_INO_INVALID_ATTR;
-
if (S_ISDIR(inode->i_mode))
invalid |= NFS_INO_INVALID_DATA;
nfs_set_cache_invalid(inode, invalid);
@@ -1629,7 +1627,9 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
spin_lock(&inode->i_lock);
nfs_fattr_set_barrier(fattr);
- status = nfs_post_op_update_inode_locked(inode, fattr);
+ status = nfs_post_op_update_inode_locked(inode, fattr,
+ NFS_INO_INVALID_CHANGE
+ | NFS_INO_INVALID_CTIME);
spin_unlock(&inode->i_lock);
return status;
@@ -1681,7 +1681,10 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa
fattr->valid |= NFS_ATTR_FATTR_PRESIZE;
}
out_noforce:
- status = nfs_post_op_update_inode_locked(inode, fattr);
+ status = nfs_post_op_update_inode_locked(inode, fattr,
+ NFS_INO_INVALID_CHANGE
+ | NFS_INO_INVALID_CTIME
+ | NFS_INO_INVALID_MTIME);
return status;
}
@@ -1789,7 +1792,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
| NFS_INO_REVAL_PAGECACHE);
/* Do atomic weak cache consistency updates */
- invalid |= nfs_wcc_update_inode(inode, fattr);
+ nfs_wcc_update_inode(inode, fattr);
if (pnfs_layoutcommit_outstanding(inode)) {
nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR;
@@ -1803,17 +1806,25 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_sb->s_id, inode->i_ino);
/* Could it be a race with writeback? */
if (!have_writers) {
- invalid |= NFS_INO_INVALID_ATTR
+ invalid |= NFS_INO_INVALID_CHANGE
| NFS_INO_INVALID_DATA
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL;
+ /* Force revalidate of all attributes */
+ save_cache_validity |= NFS_INO_INVALID_CTIME
+ | NFS_INO_INVALID_MTIME
+ | NFS_INO_INVALID_SIZE
+ | NFS_INO_INVALID_OTHER;
if (S_ISDIR(inode->i_mode))
nfs_force_lookup_revalidate(inode);
}
inode_set_iversion_raw(inode, fattr->change_attr);
}
} else {
- nfsi->cache_validity |= save_cache_validity;
+ nfsi->cache_validity |= save_cache_validity &
+ (NFS_INO_INVALID_CHANGE
+ | NFS_INO_REVAL_PAGECACHE
+ | NFS_INO_REVAL_FORCED);
cache_revalidated = false;
}
@@ -1821,7 +1832,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
} else if (server->caps & NFS_CAP_MTIME) {
nfsi->cache_validity |= save_cache_validity &
- (NFS_INO_INVALID_ATTR
+ (NFS_INO_INVALID_MTIME
| NFS_INO_REVAL_FORCED);
cache_revalidated = false;
}
@@ -1830,7 +1841,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
} else if (server->caps & NFS_CAP_CTIME) {
nfsi->cache_validity |= save_cache_validity &
- (NFS_INO_INVALID_ATTR
+ (NFS_INO_INVALID_CTIME
| NFS_INO_REVAL_FORCED);
cache_revalidated = false;
}
@@ -1845,7 +1856,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
if (!nfs_have_writebacks(inode) || new_isize > cur_isize) {
i_size_write(inode, new_isize);
if (!have_writers)
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+ invalid |= NFS_INO_INVALID_DATA;
}
dprintk("NFS: isize change on server for file %s/%ld "
"(%Ld to %Ld)\n",
@@ -1856,7 +1867,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
}
} else {
nfsi->cache_validity |= save_cache_validity &
- (NFS_INO_INVALID_ATTR
+ (NFS_INO_INVALID_SIZE
| NFS_INO_REVAL_PAGECACHE
| NFS_INO_REVAL_FORCED);
cache_revalidated = false;
@@ -1877,55 +1888,61 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
umode_t newmode = inode->i_mode & S_IFMT;
newmode |= fattr->mode & S_IALLUGO;
inode->i_mode = newmode;
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ invalid |= NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_INVALID_OTHER;
}
} else if (server->caps & NFS_CAP_MODE) {
nfsi->cache_validity |= save_cache_validity &
- (NFS_INO_INVALID_ATTR
- | NFS_INO_INVALID_ACCESS
+ (NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
+ | NFS_INO_INVALID_OTHER
| NFS_INO_REVAL_FORCED);
cache_revalidated = false;
}
if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
if (!uid_eq(inode->i_uid, fattr->uid)) {
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ invalid |= NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_INVALID_OTHER;
inode->i_uid = fattr->uid;
}
} else if (server->caps & NFS_CAP_OWNER) {
nfsi->cache_validity |= save_cache_validity &
- (NFS_INO_INVALID_ATTR
- | NFS_INO_INVALID_ACCESS
+ (NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
+ | NFS_INO_INVALID_OTHER
| NFS_INO_REVAL_FORCED);
cache_revalidated = false;
}
if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
if (!gid_eq(inode->i_gid, fattr->gid)) {
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ invalid |= NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_INVALID_OTHER;
inode->i_gid = fattr->gid;
}
} else if (server->caps & NFS_CAP_OWNER_GROUP) {
nfsi->cache_validity |= save_cache_validity &
- (NFS_INO_INVALID_ATTR
- | NFS_INO_INVALID_ACCESS
+ (NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
+ | NFS_INO_INVALID_OTHER
| NFS_INO_REVAL_FORCED);
cache_revalidated = false;
}
if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
if (inode->i_nlink != fattr->nlink) {
- invalid |= NFS_INO_INVALID_ATTR;
+ invalid |= NFS_INO_INVALID_OTHER;
if (S_ISDIR(inode->i_mode))
invalid |= NFS_INO_INVALID_DATA;
set_nlink(inode, fattr->nlink);
}
} else if (server->caps & NFS_CAP_NLINK) {
nfsi->cache_validity |= save_cache_validity &
- (NFS_INO_INVALID_ATTR
+ (NFS_INO_INVALID_OTHER
| NFS_INO_REVAL_FORCED);
cache_revalidated = false;
}
@@ -1942,6 +1959,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* Update attrtimeo value if we're out of the unstable period */
if (invalid & NFS_INO_INVALID_ATTR) {
+ invalid &= ~NFS_INO_INVALID_ATTR;
nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
@@ -1962,10 +1980,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfsi->attr_gencount = fattr->gencount;
}
- /* Don't declare attrcache up to date if there were no attrs! */
- if (cache_revalidated)
- invalid &= ~NFS_INO_INVALID_ATTR;
-
/* Don't invalidate the data if we were to blame */
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
|| S_ISLNK(inode->i_mode)))
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 7327930ad970..eadf1ab31d16 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -138,8 +138,11 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
msg.rpc_cred = nfs_file_cred(sattr->ia_file);
nfs_fattr_init(fattr);
status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
- if (status == 0)
+ if (status == 0) {
+ if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
+ nfs_zap_acl_cache(inode);
nfs_setattr_update_inode(inode, sattr, fattr);
+ }
dprintk("NFS reply setattr: %d\n", status);
return status;
}
@@ -383,11 +386,11 @@ out:
}
static int
-nfs3_proc_remove(struct inode *dir, const struct qstr *name)
+nfs3_proc_remove(struct inode *dir, struct dentry *dentry)
{
struct nfs_removeargs arg = {
.fh = NFS_FH(dir),
- .name = *name,
+ .name = dentry->d_name,
};
struct nfs_removeres res;
struct rpc_message msg = {
@@ -397,7 +400,7 @@ nfs3_proc_remove(struct inode *dir, const struct qstr *name)
};
int status = -ENOMEM;
- dprintk("NFS call remove %s\n", name->name);
+ dprintk("NFS call remove %pd2\n", dentry);
res.dir_attr = nfs_alloc_fattr();
if (res.dir_attr == NULL)
goto out;
@@ -411,7 +414,7 @@ out:
}
static void
-nfs3_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
+nfs3_proc_unlink_setup(struct rpc_message *msg, struct dentry *dentry)
{
msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE];
}
@@ -433,7 +436,9 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
}
static void
-nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+nfs3_proc_rename_setup(struct rpc_message *msg,
+ struct dentry *old_dentry,
+ struct dentry *new_dentry)
{
msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME];
}
@@ -908,12 +913,6 @@ static int nfs3_have_delegation(struct inode *inode, fmode_t flags)
return 0;
}
-static int nfs3_return_delegation(struct inode *inode)
-{
- nfs_wb_all(inode);
- return 0;
-}
-
static const struct inode_operations nfs3_dir_inode_operations = {
.create = nfs_create,
.lookup = nfs_lookup,
@@ -990,7 +989,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
.clear_acl_cache = forget_all_cached_acls,
.close_context = nfs_close_context,
.have_delegation = nfs3_have_delegation,
- .return_delegation = nfs3_return_delegation,
.alloc_client = nfs_alloc_client,
.init_client = nfs_init_client,
.free_client = nfs_free_client,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 6cd33bd5da87..09ee36dd8426 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1997,6 +1997,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
struct nfs_entry old = *entry;
__be32 *p;
int error;
+ u64 new_cookie;
p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
@@ -2019,8 +2020,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
if (unlikely(error))
return error;
- entry->prev_cookie = entry->cookie;
- error = decode_cookie3(xdr, &entry->cookie);
+ error = decode_cookie3(xdr, &new_cookie);
if (unlikely(error))
return error;
@@ -2054,6 +2054,9 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
zero_nfs_fh3(entry->fh);
}
+ entry->prev_cookie = entry->cookie;
+ entry->cookie = new_cookie;
+
return 0;
out_overflow:
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 47f3c273245e..b71757e85066 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1045,7 +1045,9 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo,
struct nfs_inode *nfsi = NFS_I(dir);
spin_lock(&dir->i_lock);
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+ nfsi->cache_validity |= NFS_INO_INVALID_CTIME
+ | NFS_INO_INVALID_MTIME
+ | NFS_INO_INVALID_DATA;
if (cinfo->atomic && cinfo->before == inode_peek_iversion_raw(dir)) {
nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
nfsi->attrtimeo_timestamp = jiffies;
@@ -1669,6 +1671,7 @@ static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmo
{
struct nfs_delegation *delegation;
+ fmode &= FMODE_READ|FMODE_WRITE;
rcu_read_lock();
delegation = rcu_dereference(NFS_I(inode)->delegation);
if (delegation == NULL || (delegation->type & fmode) == fmode) {
@@ -1751,12 +1754,16 @@ nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state)
}
if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
nfs_inode_set_delegation(state->inode,
- data->owner->so_cred,
- &data->o_res);
+ data->owner->so_cred,
+ data->o_res.delegation_type,
+ &data->o_res.delegation,
+ data->o_res.pagemod_limit);
else
nfs_inode_reclaim_delegation(state->inode,
- data->owner->so_cred,
- &data->o_res);
+ data->owner->so_cred,
+ data->o_res.delegation_type,
+ &data->o_res.delegation,
+ data->o_res.pagemod_limit);
}
/*
@@ -2743,27 +2750,40 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
* fields corresponding to attributes that were used to store the verifier.
* Make sure we clobber those fields in the later setattr call
*/
-static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata,
+static unsigned nfs4_exclusive_attrset(struct nfs4_opendata *opendata,
struct iattr *sattr, struct nfs4_label **label)
{
- const u32 *attrset = opendata->o_res.attrset;
+ const __u32 *bitmask = opendata->o_arg.server->exclcreat_bitmask;
+ __u32 attrset[3];
+ unsigned ret;
+ unsigned i;
- if ((attrset[1] & FATTR4_WORD1_TIME_ACCESS) &&
- !(sattr->ia_valid & ATTR_ATIME_SET))
- sattr->ia_valid |= ATTR_ATIME;
+ for (i = 0; i < ARRAY_SIZE(attrset); i++) {
+ attrset[i] = opendata->o_res.attrset[i];
+ if (opendata->o_arg.createmode == NFS4_CREATE_EXCLUSIVE4_1)
+ attrset[i] &= ~bitmask[i];
+ }
+
+ ret = (opendata->o_arg.createmode == NFS4_CREATE_EXCLUSIVE) ?
+ sattr->ia_valid : 0;
- if ((attrset[1] & FATTR4_WORD1_TIME_MODIFY) &&
- !(sattr->ia_valid & ATTR_MTIME_SET))
- sattr->ia_valid |= ATTR_MTIME;
+ if ((attrset[1] & (FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET))) {
+ if (sattr->ia_valid & ATTR_ATIME_SET)
+ ret |= ATTR_ATIME_SET;
+ else
+ ret |= ATTR_ATIME;
+ }
- /* Except MODE, it seems harmless of setting twice. */
- if (opendata->o_arg.createmode != NFS4_CREATE_EXCLUSIVE &&
- (attrset[1] & FATTR4_WORD1_MODE ||
- attrset[2] & FATTR4_WORD2_MODE_UMASK))
- sattr->ia_valid &= ~ATTR_MODE;
+ if ((attrset[1] & (FATTR4_WORD1_TIME_MODIFY|FATTR4_WORD1_TIME_MODIFY_SET))) {
+ if (sattr->ia_valid & ATTR_MTIME_SET)
+ ret |= ATTR_MTIME_SET;
+ else
+ ret |= ATTR_MTIME;
+ }
- if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL)
+ if (!(attrset[2] & FATTR4_WORD2_SECURITY_LABEL))
*label = NULL;
+ return ret;
}
static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
@@ -2892,12 +2912,15 @@ static int _nfs4_do_open(struct inode *dir,
if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
(opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
- nfs4_exclusive_attrset(opendata, sattr, &label);
+ unsigned attrs = nfs4_exclusive_attrset(opendata, sattr, &label);
/*
* send create attributes which was not set by open
* with an extra setattr.
*/
- if (sattr->ia_valid & NFS4_VALID_ATTRS) {
+ if (attrs || label) {
+ unsigned ia_old = sattr->ia_valid;
+
+ sattr->ia_valid = attrs;
nfs_fattr_init(opendata->o_res.f_attr);
status = nfs4_do_setattr(state->inode, cred,
opendata->o_res.f_attr, sattr,
@@ -2907,6 +2930,7 @@ static int _nfs4_do_open(struct inode *dir,
opendata->o_res.f_attr);
nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
}
+ sattr->ia_valid = ia_old;
}
}
if (opened && opendata->file_created)
@@ -3874,6 +3898,10 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
if (IS_ERR(label))
return PTR_ERR(label);
+ /* Return any delegations if we're going to change ACLs */
+ if ((sattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
+ nfs4_inode_make_writeable(inode);
+
status = nfs4_do_setattr(inode, cred, fattr, sattr, ctx, NULL, label);
if (status == 0) {
nfs_setattr_update_inode(inode, sattr, fattr);
@@ -4048,7 +4076,6 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
struct nfs_server *server = NFS_SERVER(inode);
struct nfs4_accessargs args = {
.fh = NFS_FH(inode),
- .bitmask = server->cache_consistency_bitmask,
.access = entry->mask,
};
struct nfs4_accessres res = {
@@ -4062,14 +4089,18 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
};
int status = 0;
- res.fattr = nfs_alloc_fattr();
- if (res.fattr == NULL)
- return -ENOMEM;
+ if (!nfs_have_delegated_attributes(inode)) {
+ res.fattr = nfs_alloc_fattr();
+ if (res.fattr == NULL)
+ return -ENOMEM;
+ args.bitmask = server->cache_consistency_bitmask;
+ }
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
if (!status) {
nfs_access_set_mask(entry, res.access);
- nfs_refresh_inode(inode, res.fattr);
+ if (res.fattr)
+ nfs_refresh_inode(inode, res.fattr);
}
nfs_free_fattr(res.fattr);
return status;
@@ -4199,10 +4230,32 @@ static int _nfs4_proc_remove(struct inode *dir, const struct qstr *name)
return status;
}
-static int nfs4_proc_remove(struct inode *dir, const struct qstr *name)
+static int nfs4_proc_remove(struct inode *dir, struct dentry *dentry)
+{
+ struct nfs4_exception exception = { };
+ struct inode *inode = d_inode(dentry);
+ int err;
+
+ if (inode) {
+ if (inode->i_nlink == 1)
+ nfs4_inode_return_delegation(inode);
+ else
+ nfs4_inode_make_writeable(inode);
+ }
+ do {
+ err = _nfs4_proc_remove(dir, &dentry->d_name);
+ trace_nfs4_remove(dir, &dentry->d_name, err);
+ err = nfs4_handle_exception(NFS_SERVER(dir), err,
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+static int nfs4_proc_rmdir(struct inode *dir, const struct qstr *name)
{
struct nfs4_exception exception = { };
int err;
+
do {
err = _nfs4_proc_remove(dir, name);
trace_nfs4_remove(dir, name, err);
@@ -4212,17 +4265,20 @@ static int nfs4_proc_remove(struct inode *dir, const struct qstr *name)
return err;
}
-static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
+static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct dentry *dentry)
{
- struct nfs_server *server = NFS_SERVER(dir);
struct nfs_removeargs *args = msg->rpc_argp;
struct nfs_removeres *res = msg->rpc_resp;
+ struct inode *inode = d_inode(dentry);
- res->server = server;
+ res->server = NFS_SB(dentry->d_sb);
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
nfs4_init_sequence(&args->seq_args, &res->seq_res, 1);
nfs_fattr_init(res->dir_attr);
+
+ if (inode)
+ nfs4_inode_return_delegation(inode);
}
static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
@@ -4248,14 +4304,21 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
return 1;
}
-static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+static void nfs4_proc_rename_setup(struct rpc_message *msg,
+ struct dentry *old_dentry,
+ struct dentry *new_dentry)
{
- struct nfs_server *server = NFS_SERVER(dir);
struct nfs_renameargs *arg = msg->rpc_argp;
struct nfs_renameres *res = msg->rpc_resp;
+ struct inode *old_inode = d_inode(old_dentry);
+ struct inode *new_inode = d_inode(new_dentry);
+ if (old_inode)
+ nfs4_inode_make_writeable(old_inode);
+ if (new_inode)
+ nfs4_inode_return_delegation(new_inode);
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
- res->server = server;
+ res->server = NFS_SB(old_dentry->d_sb);
nfs4_init_sequence(&arg->seq_args, &res->seq_res, 1);
}
@@ -4317,6 +4380,8 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct
}
arg.bitmask = nfs4_bitmask(server, res.label);
+ nfs4_inode_make_writeable(inode);
+
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
if (!status) {
update_changeattr(dir, &res.cinfo, res.fattr->time_start);
@@ -5310,7 +5375,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
i = buf_to_pages_noslab(buf, buflen, arg.acl_pages);
if (i < 0)
return i;
- nfs4_inode_return_delegation(inode);
+ nfs4_inode_make_writeable(inode);
ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
/*
@@ -5325,7 +5390,8 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
* so mark the attribute cache invalid.
*/
spin_lock(&inode->i_lock);
- NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
+ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE
+ | NFS_INO_INVALID_CTIME;
spin_unlock(&inode->i_lock);
nfs_access_zap_cache(inode);
nfs_zap_acl_cache(inode);
@@ -6621,22 +6687,24 @@ static int
nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key)
{
int ret;
- struct cb_notify_lock_args *cbnl = key;
struct nfs4_lock_waiter *waiter = wait->private;
- struct nfs_lowner *lowner = &cbnl->cbnl_owner,
- *wowner = waiter->owner;
- /* Only wake if the callback was for the same owner */
- if (lowner->clientid != wowner->clientid ||
- lowner->id != wowner->id ||
- lowner->s_dev != wowner->s_dev)
- return 0;
+ /* NULL key means to wake up everyone */
+ if (key) {
+ struct cb_notify_lock_args *cbnl = key;
+ struct nfs_lowner *lowner = &cbnl->cbnl_owner,
+ *wowner = waiter->owner;
- /* Make sure it's for the right inode */
- if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh))
- return 0;
+ /* Only wake if the callback was for the same owner. */
+ if (lowner->id != wowner->id || lowner->s_dev != wowner->s_dev)
+ return 0;
- waiter->notified = true;
+ /* Make sure it's for the right inode */
+ if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh))
+ return 0;
+
+ waiter->notified = true;
+ }
/* override "private" so we can use default_wake_function */
wait->private = waiter->task;
@@ -6673,6 +6741,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
add_wait_queue(q, &wait);
while(!signalled()) {
+ waiter.notified = false;
status = nfs4_proc_setlk(state, cmd, request);
if ((status != -EAGAIN) || IS_SETLK(cmd))
break;
@@ -8414,6 +8483,8 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
{
switch(task->tk_status) {
case 0:
+ wake_up_all(&clp->cl_lock_waitq);
+ /* Fallthrough */
case -NFS4ERR_COMPLETE_ALREADY:
case -NFS4ERR_WRONG_CRED: /* What to do here? */
break;
@@ -9593,7 +9664,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.link = nfs4_proc_link,
.symlink = nfs4_proc_symlink,
.mkdir = nfs4_proc_mkdir,
- .rmdir = nfs4_proc_remove,
+ .rmdir = nfs4_proc_rmdir,
.readdir = nfs4_proc_readdir,
.mknod = nfs4_proc_mknod,
.statfs = nfs4_proc_statfs,
@@ -9614,7 +9685,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.close_context = nfs4_close_context,
.open_context = nfs4_atomic_open,
.have_delegation = nfs4_have_delegation,
- .return_delegation = nfs4_inode_return_delegation,
.alloc_client = nfs4_alloc_client,
.init_client = nfs4_init_client,
.free_client = nfs4_free_client,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 91a4d4eeb235..c10a422efe6f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -428,7 +428,6 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
struct rb_node **p = &server->state_owners.rb_node,
*parent = NULL;
struct nfs4_state_owner *sp;
- int err;
while (*p != NULL) {
parent = *p;
@@ -445,9 +444,6 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
return sp;
}
}
- err = ida_get_new(&server->openowner_id, &new->so_seqid.owner_id);
- if (err)
- return ERR_PTR(err);
rb_link_node(&new->so_server_node, parent, p);
rb_insert_color(&new->so_server_node, &server->state_owners);
return new;
@@ -460,7 +456,6 @@ nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)
if (!RB_EMPTY_NODE(&sp->so_server_node))
rb_erase(&sp->so_server_node, &server->state_owners);
- ida_remove(&server->openowner_id, sp->so_seqid.owner_id);
}
static void
@@ -495,6 +490,12 @@ nfs4_alloc_state_owner(struct nfs_server *server,
sp = kzalloc(sizeof(*sp), gfp_flags);
if (!sp)
return NULL;
+ sp->so_seqid.owner_id = ida_simple_get(&server->openowner_id, 0, 0,
+ gfp_flags);
+ if (sp->so_seqid.owner_id < 0) {
+ kfree(sp);
+ return NULL;
+ }
sp->so_server = server;
sp->so_cred = get_rpccred(cred);
spin_lock_init(&sp->so_lock);
@@ -526,6 +527,7 @@ static void nfs4_free_state_owner(struct nfs4_state_owner *sp)
{
nfs4_destroy_seqid_counter(&sp->so_seqid);
put_rpccred(sp->so_cred);
+ ida_simple_remove(&sp->so_server->openowner_id, sp->so_seqid.owner_id);
kfree(sp);
}
@@ -576,13 +578,9 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
new = nfs4_alloc_state_owner(server, cred, gfp_flags);
if (new == NULL)
goto out;
- do {
- if (ida_pre_get(&server->openowner_id, gfp_flags) == 0)
- break;
- spin_lock(&clp->cl_lock);
- sp = nfs4_insert_state_owner_locked(new);
- spin_unlock(&clp->cl_lock);
- } while (sp == ERR_PTR(-EAGAIN));
+ spin_lock(&clp->cl_lock);
+ sp = nfs4_insert_state_owner_locked(new);
+ spin_unlock(&clp->cl_lock);
if (sp != new)
nfs4_free_state_owner(new);
out:
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 65c9c4175145..9b7392032321 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -52,7 +52,6 @@
#include <linux/nfs.h>
#include <linux/nfs4.h>
#include <linux/nfs_fs.h>
-#include <linux/fs_struct.h>
#include "nfs4_fs.h"
#include "internal.h"
@@ -99,6 +98,7 @@ static int nfs4_stat_to_errno(int);
((3+NFS4_FHSIZE) >> 2))
#define nfs4_fattr_bitmap_maxsz 4
#define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
+#define nfstime4_maxsz (3)
#define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2))
#define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
@@ -113,7 +113,8 @@ static int nfs4_stat_to_errno(int);
#define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
/* This is based on getfattr, which uses the most attributes: */
#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \
- 3 + 3 + 3 + nfs4_owner_maxsz + \
+ 3*nfstime4_maxsz + \
+ nfs4_owner_maxsz + \
nfs4_group_maxsz + nfs4_label_maxsz + \
decode_mdsthreshold_maxsz))
#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \
@@ -124,7 +125,8 @@ static int nfs4_stat_to_errno(int);
nfs4_owner_maxsz + \
nfs4_group_maxsz + \
nfs4_label_maxsz + \
- 4 + 4)
+ 1 + nfstime4_maxsz + \
+ 1 + nfstime4_maxsz)
#define encode_savefh_maxsz (op_encode_hdr_maxsz)
#define decode_savefh_maxsz (op_decode_hdr_maxsz)
#define encode_restorefh_maxsz (op_encode_hdr_maxsz)
@@ -958,6 +960,35 @@ static void encode_uint64(struct xdr_stream *xdr, u64 n)
WARN_ON_ONCE(xdr_stream_encode_u64(xdr, n) < 0);
}
+static ssize_t xdr_encode_bitmap4(struct xdr_stream *xdr,
+ const __u32 *bitmap, size_t len)
+{
+ ssize_t ret;
+
+ /* Trim empty words */
+ while (len > 0 && bitmap[len-1] == 0)
+ len--;
+ ret = xdr_stream_encode_uint32_array(xdr, bitmap, len);
+ if (WARN_ON_ONCE(ret < 0))
+ return ret;
+ return len;
+}
+
+static size_t mask_bitmap4(const __u32 *bitmap, const __u32 *mask,
+ __u32 *res, size_t len)
+{
+ size_t i;
+ __u32 tmp;
+
+ while (len > 0 && (bitmap[len-1] == 0 || mask[len-1] == 0))
+ len--;
+ for (i = len; i-- > 0;) {
+ tmp = bitmap[i] & mask[i];
+ res[i] = tmp;
+ }
+ return len;
+}
+
static void encode_nfs4_seqid(struct xdr_stream *xdr,
const struct nfs_seqid *seqid)
{
@@ -1012,6 +1043,14 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE);
}
+static __be32 *
+xdr_encode_nfstime4(__be32 *p, const struct timespec *t)
+{
+ p = xdr_encode_hyper(p, (__s64)t->tv_sec);
+ *p++ = cpu_to_be32(t->tv_nsec);
+ return p;
+}
+
static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
const struct nfs4_label *label,
const umode_t *umask,
@@ -1023,9 +1062,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
int owner_namelen = 0;
int owner_grouplen = 0;
__be32 *p;
- unsigned i;
uint32_t len = 0;
- uint32_t bmval_len;
uint32_t bmval[3] = { 0 };
/*
@@ -1073,7 +1110,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
if (attrmask[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
if (iap->ia_valid & ATTR_ATIME_SET) {
bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
- len += 16;
+ len += 4 + (nfstime4_maxsz << 2);
} else if (iap->ia_valid & ATTR_ATIME) {
bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
len += 4;
@@ -1082,7 +1119,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
if (attrmask[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
if (iap->ia_valid & ATTR_MTIME_SET) {
bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
- len += 16;
+ len += 4 + (nfstime4_maxsz << 2);
} else if (iap->ia_valid & ATTR_MTIME) {
bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
len += 4;
@@ -1094,19 +1131,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
bmval[2] |= FATTR4_WORD2_SECURITY_LABEL;
}
- if (bmval[2] != 0)
- bmval_len = 3;
- else if (bmval[1] != 0)
- bmval_len = 2;
- else
- bmval_len = 1;
-
- p = reserve_space(xdr, 4 + (bmval_len << 2) + 4 + len);
-
- *p++ = cpu_to_be32(bmval_len);
- for (i = 0; i < bmval_len; i++)
- *p++ = cpu_to_be32(bmval[i]);
- *p++ = cpu_to_be32(len);
+ xdr_encode_bitmap4(xdr, bmval, ARRAY_SIZE(bmval));
+ xdr_stream_encode_opaque_inline(xdr, (void **)&p, len);
if (bmval[0] & FATTR4_WORD0_SIZE)
p = xdr_encode_hyper(p, iap->ia_size);
@@ -1119,16 +1145,14 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
if (iap->ia_valid & ATTR_ATIME_SET) {
*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
- p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec);
- *p++ = cpu_to_be32(iap->ia_atime.tv_nsec);
+ p = xdr_encode_nfstime4(p, &iap->ia_atime);
} else
*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
}
if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
if (iap->ia_valid & ATTR_MTIME_SET) {
*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
- p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec);
- *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
+ p = xdr_encode_nfstime4(p, &iap->ia_mtime);
} else
*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
}
@@ -1200,85 +1224,45 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
create->server, create->server->attr_bitmask);
}
-static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
-{
- __be32 *p;
-
- encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr);
- p = reserve_space(xdr, 8);
- *p++ = cpu_to_be32(1);
- *p = cpu_to_be32(bitmap);
-}
-
-static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr)
-{
- __be32 *p;
-
- encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr);
- p = reserve_space(xdr, 12);
- *p++ = cpu_to_be32(2);
- *p++ = cpu_to_be32(bm0);
- *p = cpu_to_be32(bm1);
-}
-
-static void
-encode_getattr_three(struct xdr_stream *xdr,
- uint32_t bm0, uint32_t bm1, uint32_t bm2,
- struct compound_hdr *hdr)
+static void encode_getattr(struct xdr_stream *xdr,
+ const __u32 *bitmap, const __u32 *mask, size_t len,
+ struct compound_hdr *hdr)
{
- __be32 *p;
+ __u32 masked_bitmap[nfs4_fattr_bitmap_maxsz];
encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr);
- if (bm2) {
- p = reserve_space(xdr, 16);
- *p++ = cpu_to_be32(3);
- *p++ = cpu_to_be32(bm0);
- *p++ = cpu_to_be32(bm1);
- *p = cpu_to_be32(bm2);
- } else if (bm1) {
- p = reserve_space(xdr, 12);
- *p++ = cpu_to_be32(2);
- *p++ = cpu_to_be32(bm0);
- *p = cpu_to_be32(bm1);
- } else {
- p = reserve_space(xdr, 8);
- *p++ = cpu_to_be32(1);
- *p = cpu_to_be32(bm0);
+ if (mask) {
+ if (WARN_ON_ONCE(len > ARRAY_SIZE(masked_bitmap)))
+ len = ARRAY_SIZE(masked_bitmap);
+ len = mask_bitmap4(bitmap, mask, masked_bitmap, len);
+ bitmap = masked_bitmap;
}
+ xdr_encode_bitmap4(xdr, bitmap, len);
}
static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
{
- encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
- bitmask[1] & nfs4_fattr_bitmap[1],
- bitmask[2] & nfs4_fattr_bitmap[2],
- hdr);
+ encode_getattr(xdr, nfs4_fattr_bitmap, bitmask,
+ ARRAY_SIZE(nfs4_fattr_bitmap), hdr);
}
static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask,
const u32 *open_bitmap,
struct compound_hdr *hdr)
{
- encode_getattr_three(xdr,
- bitmask[0] & open_bitmap[0],
- bitmask[1] & open_bitmap[1],
- bitmask[2] & open_bitmap[2],
- hdr);
+ encode_getattr(xdr, open_bitmap, bitmask, 3, hdr);
}
static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
{
- encode_getattr_three(xdr,
- bitmask[0] & nfs4_fsinfo_bitmap[0],
- bitmask[1] & nfs4_fsinfo_bitmap[1],
- bitmask[2] & nfs4_fsinfo_bitmap[2],
- hdr);
+ encode_getattr(xdr, nfs4_fsinfo_bitmap, bitmask,
+ ARRAY_SIZE(nfs4_fsinfo_bitmap), hdr);
}
static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
{
- encode_getattr_two(xdr, bitmask[0] & nfs4_fs_locations_bitmap[0],
- bitmask[1] & nfs4_fs_locations_bitmap[1], hdr);
+ encode_getattr(xdr, nfs4_fs_locations_bitmap, bitmask,
+ ARRAY_SIZE(nfs4_fs_locations_bitmap), hdr);
}
static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
@@ -2117,7 +2101,8 @@ static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->fh, &hdr);
encode_access(xdr, args->access, &hdr);
- encode_getfattr(xdr, args->bitmask, &hdr);
+ if (args->bitmask)
+ encode_getfattr(xdr, args->bitmask, &hdr);
encode_nops(&hdr);
}
@@ -2559,13 +2544,17 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
struct compound_hdr hdr = {
.minorversion = nfs4_xdr_minorversion(&args->seq_args),
};
+ const __u32 nfs4_acl_bitmap[1] = {
+ [0] = FATTR4_WORD0_ACL,
+ };
uint32_t replen;
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->fh, &hdr);
replen = hdr.replen + op_decode_hdr_maxsz;
- encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
+ encode_getattr(xdr, nfs4_acl_bitmap, NULL,
+ ARRAY_SIZE(nfs4_acl_bitmap), &hdr);
xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
args->acl_pages, 0, args->acl_len);
@@ -2644,8 +2633,8 @@ static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->fh, &hdr);
- encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
- &hdr);
+ encode_getattr(xdr, nfs4_pathconf_bitmap, args->bitmask,
+ ARRAY_SIZE(nfs4_pathconf_bitmap), &hdr);
encode_nops(&hdr);
}
@@ -2663,8 +2652,8 @@ static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->fh, &hdr);
- encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
- args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);
+ encode_getattr(xdr, nfs4_statfs_bitmap, args->bitmask,
+ ARRAY_SIZE(nfs4_statfs_bitmap), &hdr);
encode_nops(&hdr);
}
@@ -2684,7 +2673,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->fhandle, &hdr);
- encode_getattr_three(xdr, bitmask[0], bitmask[1], bitmask[2], &hdr);
+ encode_getattr(xdr, bitmask, NULL, 3, &hdr);
encode_nops(&hdr);
}
@@ -3218,34 +3207,27 @@ static int decode_ace(struct xdr_stream *xdr, void *ace)
return -EIO;
}
-static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
+static ssize_t
+decode_bitmap4(struct xdr_stream *xdr, uint32_t *bitmap, size_t sz)
{
- uint32_t bmlen;
- __be32 *p;
-
- p = xdr_inline_decode(xdr, 4);
- if (unlikely(!p))
- goto out_overflow;
- bmlen = be32_to_cpup(p);
+ ssize_t ret;
- bitmap[0] = bitmap[1] = bitmap[2] = 0;
- p = xdr_inline_decode(xdr, (bmlen << 2));
- if (unlikely(!p))
- goto out_overflow;
- if (bmlen > 0) {
- bitmap[0] = be32_to_cpup(p++);
- if (bmlen > 1) {
- bitmap[1] = be32_to_cpup(p++);
- if (bmlen > 2)
- bitmap[2] = be32_to_cpup(p);
- }
- }
- return 0;
-out_overflow:
+ ret = xdr_stream_decode_uint32_array(xdr, bitmap, sz);
+ if (likely(ret >= 0))
+ return ret;
+ if (ret == -EMSGSIZE)
+ return sz;
print_overflow_msg(__func__, xdr);
return -EIO;
}
+static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
+{
+ ssize_t ret;
+ ret = decode_bitmap4(xdr, bitmap, 3);
+ return ret < 0 ? ret : 0;
+}
+
static int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, unsigned int *savep)
{
__be32 *p;
@@ -3981,7 +3963,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
bitmap[1] &= ~FATTR4_WORD1_OWNER;
if (owner_name != NULL) {
- len = decode_nfs4_string(xdr, owner_name, GFP_NOWAIT);
+ len = decode_nfs4_string(xdr, owner_name, GFP_NOIO);
if (len <= 0)
goto out;
dprintk("%s: name=%s\n", __func__, owner_name->data);
@@ -4016,7 +3998,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
if (group_name != NULL) {
- len = decode_nfs4_string(xdr, group_name, GFP_NOWAIT);
+ len = decode_nfs4_string(xdr, group_name, GFP_NOIO);
if (len <= 0)
goto out;
dprintk("%s: name=%s\n", __func__, group_name->data);
@@ -4156,19 +4138,25 @@ out_overflow:
return -EIO;
}
+static __be32 *
+xdr_decode_nfstime4(__be32 *p, struct timespec *t)
+{
+ __u64 sec;
+
+ p = xdr_decode_hyper(p, &sec);
+ t-> tv_sec = (time_t)sec;
+ t->tv_nsec = be32_to_cpup(p++);
+ return p;
+}
+
static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
{
__be32 *p;
- uint64_t sec;
- uint32_t nsec;
- p = xdr_inline_decode(xdr, 12);
+ p = xdr_inline_decode(xdr, nfstime4_maxsz << 2);
if (unlikely(!p))
goto out_overflow;
- p = xdr_decode_hyper(p, &sec);
- nsec = be32_to_cpup(p);
- time->tv_sec = (time_t)sec;
- time->tv_nsec = (long)nsec;
+ xdr_decode_nfstime4(p, time);
return 0;
out_overflow:
print_overflow_msg(__func__, xdr);
@@ -5471,21 +5459,13 @@ decode_savefh(struct xdr_stream *xdr)
static int decode_setattr(struct xdr_stream *xdr)
{
- __be32 *p;
- uint32_t bmlen;
int status;
status = decode_op_hdr(xdr, OP_SETATTR);
if (status)
return status;
- p = xdr_inline_decode(xdr, 4);
- if (unlikely(!p))
- goto out_overflow;
- bmlen = be32_to_cpup(p);
- p = xdr_inline_decode(xdr, bmlen << 2);
- if (likely(p))
+ if (decode_bitmap4(xdr, NULL, 0) >= 0)
return 0;
-out_overflow:
print_overflow_msg(__func__, xdr);
return -EIO;
}
@@ -6256,7 +6236,8 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_access(xdr, &res->supported, &res->access);
if (status != 0)
goto out;
- decode_getfattr(xdr, res->fattr, res->server);
+ if (res->fattr)
+ decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -7536,6 +7517,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
unsigned int savep;
uint32_t bitmap[3] = {0};
uint32_t len;
+ uint64_t new_cookie;
__be32 *p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
goto out_overflow;
@@ -7552,8 +7534,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
p = xdr_inline_decode(xdr, 12);
if (unlikely(!p))
goto out_overflow;
- entry->prev_cookie = entry->cookie;
- p = xdr_decode_hyper(p, &entry->cookie);
+ p = xdr_decode_hyper(p, &new_cookie);
entry->len = be32_to_cpup(p);
p = xdr_inline_decode(xdr, entry->len);
@@ -7587,6 +7568,9 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
+ entry->prev_cookie = entry->cookie;
+ entry->cookie = new_cookie;
+
return 0;
out_overflow:
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index f7fd9192d4bc..4e93d6308733 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -300,11 +300,11 @@ out:
}
static int
-nfs_proc_remove(struct inode *dir, const struct qstr *name)
+nfs_proc_remove(struct inode *dir, struct dentry *dentry)
{
struct nfs_removeargs arg = {
.fh = NFS_FH(dir),
- .name = *name,
+ .name = dentry->d_name,
};
struct rpc_message msg = {
.rpc_proc = &nfs_procedures[NFSPROC_REMOVE],
@@ -312,7 +312,7 @@ nfs_proc_remove(struct inode *dir, const struct qstr *name)
};
int status;
- dprintk("NFS call remove %s\n", name->name);
+ dprintk("NFS call remove %pd2\n",dentry);
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_mark_for_revalidate(dir);
@@ -321,7 +321,7 @@ nfs_proc_remove(struct inode *dir, const struct qstr *name)
}
static void
-nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
+nfs_proc_unlink_setup(struct rpc_message *msg, struct dentry *dentry)
{
msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE];
}
@@ -338,7 +338,9 @@ static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
}
static void
-nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
+nfs_proc_rename_setup(struct rpc_message *msg,
+ struct dentry *old_dentry,
+ struct dentry *new_dentry)
{
msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME];
}
@@ -671,12 +673,6 @@ static int nfs_have_delegation(struct inode *inode, fmode_t flags)
return 0;
}
-static int nfs_return_delegation(struct inode *inode)
-{
- nfs_wb_all(inode);
- return 0;
-}
-
static const struct inode_operations nfs_dir_inode_operations = {
.create = nfs_create,
.lookup = nfs_lookup,
@@ -741,7 +737,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
.lock_check_bounds = nfs_lock_check_bounds,
.close_context = nfs_close_context,
.have_delegation = nfs_have_delegation,
- .return_delegation = nfs_return_delegation,
.alloc_client = nfs_alloc_client,
.init_client = nfs_init_client,
.free_client = nfs_free_client,
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 630b4a3c1a93..bf54fc9ae135 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -105,7 +105,7 @@ static void nfs_do_call_unlink(struct nfs_unlinkdata *data)
data->args.fh = NFS_FH(dir);
nfs_fattr_init(data->res.dir_attr);
- NFS_PROTO(dir)->unlink_setup(&msg, dir);
+ NFS_PROTO(dir)->unlink_setup(&msg, data->dentry);
task_setup_data.rpc_client = NFS_CLIENT(dir);
task = rpc_run_task(&task_setup_data);
@@ -386,7 +386,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
nfs_sb_active(old_dir->i_sb);
- NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir);
+ NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dentry, new_dentry);
return rpc_run_task(&task_setup_data);
}
@@ -463,9 +463,6 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
fileid = NFS_FILEID(d_inode(dentry));
- /* Return delegation in anticipation of the rename */
- NFS_PROTO(d_inode(dentry))->return_delegation(d_inode(dentry));
-
sdentry = NULL;
do {
int slen;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 6579f3b367bd..0193053bc139 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -231,6 +231,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c
if (i_size >= end)
goto out;
i_size_write(inode, end);
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE;
nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
out:
spin_unlock(&inode->i_lock);
@@ -1562,8 +1563,11 @@ static int nfs_writeback_done(struct rpc_task *task,
}
/* Deal with the suid/sgid bit corner case */
- if (nfs_should_remove_suid(inode))
- nfs_mark_for_revalidate(inode);
+ if (nfs_should_remove_suid(inode)) {
+ spin_lock(&inode->i_lock);
+ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_OTHER;
+ spin_unlock(&inode->i_lock);
+ }
return 0;
}
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 70b8bf781fce..a43dfedd69ec 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -227,7 +227,7 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev,
if (!buf)
return -ENOMEM;
- rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL);
+ rq = blk_get_request(q, REQ_OP_SCSI_IN, 0);
if (IS_ERR(rq)) {
error = -ENOMEM;
goto out_free_buf;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 2410b093a2e6..b0555d7d8200 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1201,6 +1201,28 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
break;
case S_IFDIR:
host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
+ if (!host_err && unlikely(d_unhashed(dchild))) {
+ struct dentry *d;
+ d = lookup_one_len(dchild->d_name.name,
+ dchild->d_parent,
+ dchild->d_name.len);
+ if (IS_ERR(d)) {
+ host_err = PTR_ERR(d);
+ break;
+ }
+ if (unlikely(d_is_negative(d))) {
+ dput(d);
+ err = nfserr_serverfault;
+ goto out;
+ }
+ dput(resfhp->fh_dentry);
+ resfhp->fh_dentry = dget(d);
+ err = fh_update(resfhp);
+ dput(dchild);
+ dchild = d;
+ if (err)
+ goto out;
+ }
break;
case S_IFCHR:
case S_IFBLK:
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index c21e0b4454a6..dec98cab729d 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -193,9 +193,9 @@ retry:
(unsigned long long)oldkey,
(unsigned long long)newkey);
- spin_lock_irq(&btnc->tree_lock);
- err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
- spin_unlock_irq(&btnc->tree_lock);
+ xa_lock_irq(&btnc->i_pages);
+ err = radix_tree_insert(&btnc->i_pages, newkey, obh->b_page);
+ xa_unlock_irq(&btnc->i_pages);
/*
* Note: page->index will not change to newkey until
* nilfs_btnode_commit_change_key() will be called.
@@ -251,11 +251,11 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
(unsigned long long)newkey);
mark_buffer_dirty(obh);
- spin_lock_irq(&btnc->tree_lock);
- radix_tree_delete(&btnc->page_tree, oldkey);
- radix_tree_tag_set(&btnc->page_tree, newkey,
+ xa_lock_irq(&btnc->i_pages);
+ radix_tree_delete(&btnc->i_pages, oldkey);
+ radix_tree_tag_set(&btnc->i_pages, newkey,
PAGECACHE_TAG_DIRTY);
- spin_unlock_irq(&btnc->tree_lock);
+ xa_unlock_irq(&btnc->i_pages);
opage->index = obh->b_blocknr = newkey;
unlock_page(opage);
@@ -283,9 +283,9 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc,
return;
if (nbh == NULL) { /* blocksize == pagesize */
- spin_lock_irq(&btnc->tree_lock);
- radix_tree_delete(&btnc->page_tree, newkey);
- spin_unlock_irq(&btnc->tree_lock);
+ xa_lock_irq(&btnc->i_pages);
+ radix_tree_delete(&btnc->i_pages, newkey);
+ xa_unlock_irq(&btnc->i_pages);
unlock_page(ctxt->bh->b_page);
} else
brelse(nbh);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 1a2894aa0194..dd52d3f82e8d 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -46,8 +46,7 @@ static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
int err = nilfs_add_link(dentry, inode);
if (!err) {
- d_instantiate(dentry, inode);
- unlock_new_inode(inode);
+ d_instantiate_new(dentry, inode);
return 0;
}
inode_dec_link_count(inode);
@@ -243,8 +242,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
goto out_fail;
nilfs_mark_inode_dirty(inode);
- d_instantiate(dentry, inode);
- unlock_new_inode(inode);
+ d_instantiate_new(dentry, inode);
out:
if (!err)
err = nilfs_transaction_commit(dir->i_sb);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 68241512d7c1..4cb850a6f1c2 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -331,15 +331,15 @@ repeat:
struct page *page2;
/* move the page to the destination cache */
- spin_lock_irq(&smap->tree_lock);
- page2 = radix_tree_delete(&smap->page_tree, offset);
+ xa_lock_irq(&smap->i_pages);
+ page2 = radix_tree_delete(&smap->i_pages, offset);
WARN_ON(page2 != page);
smap->nrpages--;
- spin_unlock_irq(&smap->tree_lock);
+ xa_unlock_irq(&smap->i_pages);
- spin_lock_irq(&dmap->tree_lock);
- err = radix_tree_insert(&dmap->page_tree, offset, page);
+ xa_lock_irq(&dmap->i_pages);
+ err = radix_tree_insert(&dmap->i_pages, offset, page);
if (unlikely(err < 0)) {
WARN_ON(err == -EEXIST);
page->mapping = NULL;
@@ -348,11 +348,11 @@ repeat:
page->mapping = dmap;
dmap->nrpages++;
if (PageDirty(page))
- radix_tree_tag_set(&dmap->page_tree,
+ radix_tree_tag_set(&dmap->i_pages,
offset,
PAGECACHE_TAG_DIRTY);
}
- spin_unlock_irq(&dmap->tree_lock);
+ xa_unlock_irq(&dmap->i_pages);
}
unlock_page(page);
}
@@ -474,15 +474,15 @@ int __nilfs_clear_page_dirty(struct page *page)
struct address_space *mapping = page->mapping;
if (mapping) {
- spin_lock_irq(&mapping->tree_lock);
+ xa_lock_irq(&mapping->i_pages);
if (test_bit(PG_dirty, &page->flags)) {
- radix_tree_tag_clear(&mapping->page_tree,
+ radix_tree_tag_clear(&mapping->i_pages,
page_index(page),
PAGECACHE_TAG_DIRTY);
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
return clear_page_dirty_for_io(page);
}
- spin_unlock_irq(&mapping->tree_lock);
+ xa_unlock_irq(&mapping->i_pages);
return 0;
}
return TestClearPageDirty(page);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index d51e1bb781cf..d94e8031fe5f 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -92,7 +92,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
u32 event_mask,
const void *data, int data_type)
{
- __u32 marks_mask, marks_ignored_mask;
+ __u32 marks_mask = 0, marks_ignored_mask = 0;
const struct path *path = data;
pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p"
@@ -108,24 +108,20 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
!d_can_lookup(path->dentry))
return false;
- if (inode_mark && vfsmnt_mark) {
- marks_mask = (vfsmnt_mark->mask | inode_mark->mask);
- marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask);
- } else if (inode_mark) {
- /*
- * if the event is for a child and this inode doesn't care about
- * events on the child, don't send it!
- */
- if ((event_mask & FS_EVENT_ON_CHILD) &&
- !(inode_mark->mask & FS_EVENT_ON_CHILD))
- return false;
- marks_mask = inode_mark->mask;
- marks_ignored_mask = inode_mark->ignored_mask;
- } else if (vfsmnt_mark) {
- marks_mask = vfsmnt_mark->mask;
- marks_ignored_mask = vfsmnt_mark->ignored_mask;
- } else {
- BUG();
+ /*
+ * if the event is for a child and this inode doesn't care about
+ * events on the child, don't send it!
+ */
+ if (inode_mark &&
+ (!(event_mask & FS_EVENT_ON_CHILD) ||
+ (inode_mark->mask & FS_EVENT_ON_CHILD))) {
+ marks_mask |= inode_mark->mask;
+ marks_ignored_mask |= inode_mark->ignored_mask;
+ }
+
+ if (vfsmnt_mark) {
+ marks_mask |= vfsmnt_mark->mask;
+ marks_ignored_mask |= vfsmnt_mark->ignored_mask;
}
if (d_is_dir(path->dentry) &&
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 219b269c737e..613ec7e5a465 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -192,8 +192,9 @@ static int send_to_group(struct inode *to_tell,
struct fsnotify_iter_info *iter_info)
{
struct fsnotify_group *group = NULL;
- __u32 inode_test_mask = 0;
- __u32 vfsmount_test_mask = 0;
+ __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
+ __u32 marks_mask = 0;
+ __u32 marks_ignored_mask = 0;
if (unlikely(!inode_mark && !vfsmount_mark)) {
BUG();
@@ -213,29 +214,25 @@ static int send_to_group(struct inode *to_tell,
/* does the inode mark tell us to do something? */
if (inode_mark) {
group = inode_mark->group;
- inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
- inode_test_mask &= inode_mark->mask;
- inode_test_mask &= ~inode_mark->ignored_mask;
+ marks_mask |= inode_mark->mask;
+ marks_ignored_mask |= inode_mark->ignored_mask;
}
/* does the vfsmount_mark tell us to do something? */
if (vfsmount_mark) {
- vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
group = vfsmount_mark->group;
- vfsmount_test_mask &= vfsmount_mark->mask;
- vfsmount_test_mask &= ~vfsmount_mark->ignored_mask;
- if (inode_mark)
- vfsmount_test_mask &= ~inode_mark->ignored_mask;
+ marks_mask |= vfsmount_mark->mask;
+ marks_ignored_mask |= vfsmount_mark->ignored_mask;
}
pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p"
- " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
+ " vfsmount_mark=%p marks_mask=%x marks_ignored_mask=%x"
" data=%p data_is=%d cookie=%d\n",
- __func__, group, to_tell, mask, inode_mark,
- inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
+ __func__, group, to_tell, mask, inode_mark, vfsmount_mark,
+ marks_mask, marks_ignored_mask, data,
data_is, cookie);
- if (!inode_test_mask && !vfsmount_test_mask)
+ if (!(test_mask & marks_mask & ~marks_ignored_mask))
return 0;
return group->ops->handle_event(group, to_tell, inode_mark,
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 2831f495a674..32c523cf5a2d 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -381,7 +381,7 @@ unm_err_out:
* vfs inode dirty. This ensures that any changes to the mft record are
* written out to disk.
*
- * NOTE: We only set I_DIRTY_SYNC and I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
+ * NOTE: We only set I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
* on the base vfs inode, because even though file data may have been modified,
* it is dirty in the inode meta data rather than the data page cache of the
* inode, and thus there are no data pages that need writing out. Therefore, a
@@ -407,7 +407,7 @@ void __mark_mft_record_dirty(ntfs_inode *ni)
else
base_ni = ni->ext.base_ntfs_ino;
mutex_unlock(&ni->extent_lock);
- __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_DATASYNC);
}
static const char *ntfs_please_email = "Please email "
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9a876bb07cac..0f157bbd3e0f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7119,7 +7119,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
goto out_commit;
did_quota = 1;
- data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+ data_ac->ac_resv = &oi->ip_la_data_resv;
ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
&num);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index e8e205bf2e41..302cd7caa4a7 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -346,7 +346,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
unlock = 0;
out_alloc:
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
+ up_read(&oi->ip_alloc_sem);
out_inode_unlock:
ocfs2_inode_unlock(inode, 0);
out:
@@ -2213,7 +2213,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
down_write(&oi->ip_alloc_sem);
if (first_get_block) {
- if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ if (ocfs2_sparse_alloc(osb))
ret = ocfs2_zero_tail(inode, di_bh, pos);
else
ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 8614ff069d99..3494a62ed749 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -78,7 +78,7 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
/*
* Using a named enum representing lock types in terms of #N bit stored in
* iocb->private, which is going to be used for communication between
- * ocfs2_dio_end_io() and ocfs2_file_aio_write/read().
+ * ocfs2_dio_end_io() and ocfs2_file_write/read_iter().
*/
enum ocfs2_iocb_lock_bits {
OCFS2_IOCB_RW_LOCK = 0,
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 977763d4c27d..b048d4fa3959 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3072,7 +3072,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
* We need to return the correct block within the
* cluster which should hold our entry.
*/
- off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb),
+ off = ocfs2_dx_dir_hash_idx(osb,
&lookup->dl_hinfo);
get_bh(dx_leaves[off]);
lookup->dl_dx_leaf_bh = dx_leaves[off];
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index fd6bbbbd7d78..39831fc2fd52 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -224,14 +224,12 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
dlm_astlockfunc_t *fn;
- struct dlm_lockstatus *lksb;
mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name,
res->lockname.len, res->lockname.name,
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
- lksb = lock->lksb;
fn = lock->ast;
BUG_ON(lock->ml.node != dlm->node_num);
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index e9f3705c4c9f..d06e27ec4be4 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -140,6 +140,7 @@ struct dlm_ctxt
u8 node_num;
u32 key;
u8 joining_node;
+ u8 migrate_done; /* set to 1 means node has migrated all lock resources */
wait_queue_head_t dlm_join_events;
unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@ -960,13 +961,10 @@ static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm,
void dlm_print_one_lock_resource(struct dlm_lock_resource *res);
void __dlm_print_one_lock_resource(struct dlm_lock_resource *res);
-u8 dlm_nm_this_node(struct dlm_ctxt *dlm);
void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
-int dlm_nm_init(struct dlm_ctxt *dlm);
-int dlm_heartbeat_init(struct dlm_ctxt *dlm);
void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data);
void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index e1fea149f50b..425081be6161 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -461,6 +461,19 @@ redo_bucket:
cond_resched_lock(&dlm->spinlock);
num += n;
}
+
+ if (!num) {
+ if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
+ mlog(0, "%s: perhaps there are more lock resources "
+ "need to be migrated after dlm recovery\n", dlm->name);
+ ret = -EAGAIN;
+ } else {
+ mlog(0, "%s: we won't do dlm recovery after migrating "
+ "all lock resources\n", dlm->name);
+ dlm->migrate_done = 1;
+ }
+ }
+
spin_unlock(&dlm->spinlock);
wake_up(&dlm->dlm_thread_wq);
@@ -675,20 +688,6 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
spin_unlock(&dlm->spinlock);
}
-int dlm_shutting_down(struct dlm_ctxt *dlm)
-{
- int ret = 0;
-
- spin_lock(&dlm_domain_lock);
-
- if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
- ret = 1;
-
- spin_unlock(&dlm_domain_lock);
-
- return ret;
-}
-
void dlm_unregister_domain(struct dlm_ctxt *dlm)
{
int leave = 0;
@@ -2052,6 +2051,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
init_waitqueue_head(&dlm->dlm_join_events);
+ dlm->migrate_done = 0;
+
dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h
index fd6122a38dbd..8a9281411c18 100644
--- a/fs/ocfs2/dlm/dlmdomain.h
+++ b/fs/ocfs2/dlm/dlmdomain.h
@@ -28,7 +28,30 @@
extern spinlock_t dlm_domain_lock;
extern struct list_head dlm_domains;
-int dlm_shutting_down(struct dlm_ctxt *dlm);
+static inline int dlm_joined(struct dlm_ctxt *dlm)
+{
+ int ret = 0;
+
+ spin_lock(&dlm_domain_lock);
+ if (dlm->dlm_state == DLM_CTXT_JOINED)
+ ret = 1;
+ spin_unlock(&dlm_domain_lock);
+
+ return ret;
+}
+
+static inline int dlm_shutting_down(struct dlm_ctxt *dlm)
+{
+ int ret = 0;
+
+ spin_lock(&dlm_domain_lock);
+ if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
+ ret = 1;
+ spin_unlock(&dlm_domain_lock);
+
+ return ret;
+}
+
void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
int node_num);
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 66c2a491f68d..74962315794e 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -77,8 +77,7 @@ int dlm_init_lock_cache(void)
void dlm_destroy_lock_cache(void)
{
- if (dlm_lock_cache)
- kmem_cache_destroy(dlm_lock_cache);
+ kmem_cache_destroy(dlm_lock_cache);
}
/* Tell us whether we can grant a new lock request.
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a7df226f9449..aaca0949fe53 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -414,8 +414,7 @@ int dlm_init_mle_cache(void)
void dlm_destroy_mle_cache(void)
{
- if (dlm_mle_cache)
- kmem_cache_destroy(dlm_mle_cache);
+ kmem_cache_destroy(dlm_mle_cache);
}
static void dlm_mle_release(struct kref *kref)
@@ -472,15 +471,11 @@ bail:
void dlm_destroy_master_caches(void)
{
- if (dlm_lockname_cache) {
- kmem_cache_destroy(dlm_lockname_cache);
- dlm_lockname_cache = NULL;
- }
+ kmem_cache_destroy(dlm_lockname_cache);
+ dlm_lockname_cache = NULL;
- if (dlm_lockres_cache) {
- kmem_cache_destroy(dlm_lockres_cache);
- dlm_lockres_cache = NULL;
- }
+ kmem_cache_destroy(dlm_lockres_cache);
+ dlm_lockres_cache = NULL;
}
static void dlm_lockres_release(struct kref *kref)
@@ -2495,13 +2490,13 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
}
/*
- * A migrateable resource is one that is :
+ * A migratable resource is one that is :
* 1. locally mastered, and,
* 2. zero local locks, and,
* 3. one or more non-local locks, or, one or more references
* Returns 1 if yes, 0 if not.
*/
-static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
+static int dlm_is_lockres_migratable(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
enum dlm_lockres_list idx;
@@ -2532,7 +2527,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
continue;
}
cookie = be64_to_cpu(lock->ml.cookie);
- mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
+ mlog(0, "%s: Not migratable res %.*s, lock %u:%llu on "
"%s list\n", dlm->name, res->lockname.len,
res->lockname.name,
dlm_get_lock_cookie_node(cookie),
@@ -2548,7 +2543,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
return 0;
}
- mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
+ mlog(0, "%s: res %.*s, Migratable\n", dlm->name, res->lockname.len,
res->lockname.name);
return 1;
@@ -2792,7 +2787,7 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
assert_spin_locked(&dlm->spinlock);
spin_lock(&res->spinlock);
- if (dlm_is_lockres_migrateable(dlm, res))
+ if (dlm_is_lockres_migratable(dlm, res))
target = dlm_pick_migration_target(dlm, res);
spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index ec8f75813beb..802636d50365 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -62,7 +62,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node);
static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
static int dlm_request_all_locks(struct dlm_ctxt *dlm,
u8 request_from, u8 dead_node);
-static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
+static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm);
static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res);
static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
@@ -423,12 +423,11 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
static void dlm_begin_recovery(struct dlm_ctxt *dlm)
{
- spin_lock(&dlm->spinlock);
+ assert_spin_locked(&dlm->spinlock);
BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
dlm->name, dlm->reco.dead_node);
dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
- spin_unlock(&dlm->spinlock);
}
static void dlm_end_recovery(struct dlm_ctxt *dlm)
@@ -456,6 +455,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
spin_lock(&dlm->spinlock);
+ if (dlm->migrate_done) {
+ mlog(0, "%s: no need do recovery after migrating all "
+ "lock resources\n", dlm->name);
+ spin_unlock(&dlm->spinlock);
+ return 0;
+ }
+
/* check to see if the new master has died */
if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
test_bit(dlm->reco.new_master, dlm->recovery_map)) {
@@ -490,12 +496,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
dlm->reco.dead_node);
- spin_unlock(&dlm->spinlock);
/* take write barrier */
/* (stops the list reshuffling thread, proxy ast handling) */
dlm_begin_recovery(dlm);
+ spin_unlock(&dlm->spinlock);
+
if (dlm->reco.new_master == dlm->node_num)
goto master_here;
@@ -739,7 +746,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
}
if (destroy)
- dlm_destroy_recovery_area(dlm, dead_node);
+ dlm_destroy_recovery_area(dlm);
return status;
}
@@ -764,7 +771,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
ndata = kzalloc(sizeof(*ndata), GFP_NOFS);
if (!ndata) {
- dlm_destroy_recovery_area(dlm, dead_node);
+ dlm_destroy_recovery_area(dlm);
return -ENOMEM;
}
ndata->node_num = num;
@@ -778,7 +785,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
return 0;
}
-static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
+static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm)
{
struct dlm_reco_node_data *ndata, *next;
LIST_HEAD(tmplist);
@@ -1378,6 +1385,15 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
if (!dlm_grab(dlm))
return -EINVAL;
+ if (!dlm_joined(dlm)) {
+ mlog(ML_ERROR, "Domain %s not joined! "
+ "lockres %.*s, master %u\n",
+ dlm->name, mres->lockname_len,
+ mres->lockname, mres->master);
+ dlm_put(dlm);
+ return -EINVAL;
+ }
+
BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
real_master = mres->master;
@@ -1807,7 +1823,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
int i, j, bad;
struct dlm_lock *lock;
u8 from = O2NM_MAX_NODES;
- unsigned int added = 0;
__be64 c;
mlog(0, "running %d locks for this lockres\n", mres->num_locks);
@@ -1823,7 +1838,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
dlm_lockres_set_refmap_bit(dlm, res, from);
spin_unlock(&res->spinlock);
- added++;
break;
}
BUG_ON(ml->highest_blocked != LKM_IVMODE);
@@ -1911,7 +1925,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
/* do not alter lock refcount. switching lists. */
list_move_tail(&lock->list, queue);
spin_unlock(&res->spinlock);
- added++;
mlog(0, "just reordered a local lock!\n");
continue;
@@ -2037,7 +2050,6 @@ skip_lvb:
"setting refmap bit\n", dlm->name,
res->lockname.len, res->lockname.name, ml->node);
dlm_lockres_set_refmap_bit(dlm, res, ml->node);
- added++;
}
spin_unlock(&res->spinlock);
}
@@ -2331,13 +2343,6 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
__dlm_dirty_lockres(dlm, res);
}
-/* if this node is the recovery master, and there are no
- * locks for a given lockres owned by this node that are in
- * either PR or EX mode, zero out the lvb before requesting.
- *
- */
-
-
static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
{
struct dlm_lock_resource *res;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 9479f99c2145..97a972efab83 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1756,8 +1756,7 @@ int ocfs2_rw_lock(struct inode *inode, int write)
level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
- status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
- 0);
+ status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
if (status < 0)
mlog_errno(status);
@@ -1796,7 +1795,7 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
write ? "EXMODE" : "PRMODE");
if (!ocfs2_mount_local(osb))
- ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+ ocfs2_cluster_unlock(osb, lockres, level);
}
/*
@@ -1816,8 +1815,7 @@ int ocfs2_open_lock(struct inode *inode)
lockres = &OCFS2_I(inode)->ip_open_lockres;
- status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
- DLM_LOCK_PR, 0, 0);
+ status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_PR, 0, 0);
if (status < 0)
mlog_errno(status);
@@ -1854,8 +1852,7 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
* other nodes and the -EAGAIN will indicate to the caller that
* this inode is still in use.
*/
- status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
- level, DLM_LKF_NOQUEUE, 0);
+ status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0);
out:
return status;
@@ -1876,11 +1873,9 @@ void ocfs2_open_unlock(struct inode *inode)
goto out;
if(lockres->l_ro_holders)
- ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
- DLM_LOCK_PR);
+ ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_PR);
if(lockres->l_ex_holders)
- ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
- DLM_LOCK_EX);
+ ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
out:
return;
@@ -2601,9 +2596,9 @@ void ocfs2_inode_unlock(struct inode *inode,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
ex ? "EXMODE" : "PRMODE");
- if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
+ if (!ocfs2_is_hard_readonly(osb) &&
!ocfs2_mount_local(osb))
- ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+ ocfs2_cluster_unlock(osb, lockres, level);
}
/*
@@ -3537,7 +3532,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
* On DLM_LKF_VALBLK, fsdlm behaves differently with o2cb. It always
* expects DLM_LKF_VALBLK being set if the LKB has LVB, so that
* we can recover correctly from node failure. Otherwise, we may get
- * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
+ * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
*/
if (!ocfs2_is_o2cb_active() &&
lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 5d1784a365a3..6ee94bc23f5b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -101,7 +101,7 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
trace_ocfs2_file_open(inode, file, file->f_path.dentry,
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)oi->ip_blkno,
file->f_path.dentry->d_name.len,
file->f_path.dentry->d_name.name, mode);
@@ -116,7 +116,7 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
/* Check that the inode hasn't been wiped from disk by another
* node. If it hasn't then we're safe as long as we hold the
* spin lock until our increment of open count. */
- if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+ if (oi->ip_flags & OCFS2_INODE_DELETED) {
spin_unlock(&oi->ip_lock);
status = -ENOENT;
@@ -190,7 +190,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
bool needs_barrier = false;
trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
- OCFS2_I(inode)->ip_blkno,
+ oi->ip_blkno,
file->f_path.dentry->d_name.len,
file->f_path.dentry->d_name.name,
(unsigned long long)datasync);
@@ -296,7 +296,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
ocfs2_journal_dirty(handle, bh);
out_commit:
- ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+ ocfs2_commit_trans(osb, handle);
out:
return ret;
}
@@ -2257,7 +2257,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
- trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
+ trace_ocfs2_file_write_iter(inode, file, file->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
file->f_path.dentry->d_name.len,
file->f_path.dentry->d_name.name,
@@ -2405,7 +2405,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
- trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
+ trace_ocfs2_file_read_iter(inode, filp, filp->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
filp->f_path.dentry->d_name.len,
filp->f_path.dentry->d_name.name,
@@ -2448,7 +2448,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
*
* Take and drop the meta data lock to update inode fields
* like i_size. This allows the checks down below
- * generic_file_aio_read() a chance of actually working.
+ * generic_file_read_iter() a chance of actually working.
*/
ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level,
!nowait);
@@ -2460,7 +2460,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
ocfs2_inode_unlock(inode, lock_level);
ret = generic_file_read_iter(iocb, to);
- trace_generic_file_aio_read_ret(ret);
+ trace_generic_file_read_iter_ret(ret);
/* buffered aio wouldn't have proper lock coverage today */
BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
index 6b92cb241138..f65f2b2f594d 100644
--- a/fs/ocfs2/filecheck.c
+++ b/fs/ocfs2/filecheck.c
@@ -53,36 +53,6 @@ static const char * const ocfs2_filecheck_errs[] = {
"UNSUPPORTED"
};
-static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock);
-static LIST_HEAD(ocfs2_filecheck_sysfs_list);
-
-struct ocfs2_filecheck {
- struct list_head fc_head; /* File check entry list head */
- spinlock_t fc_lock;
- unsigned int fc_max; /* Maximum number of entry in list */
- unsigned int fc_size; /* Current entry count in list */
- unsigned int fc_done; /* Finished entry count in list */
-};
-
-struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mounting */
- struct list_head fs_list;
- atomic_t fs_count;
- struct super_block *fs_sb;
- struct kset *fs_devicekset;
- struct kset *fs_fcheckkset;
- struct ocfs2_filecheck *fs_fcheck;
-};
-
-#define OCFS2_FILECHECK_MAXSIZE 100
-#define OCFS2_FILECHECK_MINSIZE 10
-
-/* File check operation type */
-enum {
- OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */
- OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */
- OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */
-};
-
struct ocfs2_filecheck_entry {
struct list_head fe_list;
unsigned long fe_ino;
@@ -110,35 +80,84 @@ ocfs2_filecheck_error(int errno)
return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1];
}
-static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
- struct kobj_attribute *attr,
- char *buf);
-static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count);
-static struct kobj_attribute ocfs2_attr_filecheck_chk =
+static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf);
+static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count);
+static struct kobj_attribute ocfs2_filecheck_attr_chk =
__ATTR(check, S_IRUSR | S_IWUSR,
- ocfs2_filecheck_show,
- ocfs2_filecheck_store);
-static struct kobj_attribute ocfs2_attr_filecheck_fix =
+ ocfs2_filecheck_attr_show,
+ ocfs2_filecheck_attr_store);
+static struct kobj_attribute ocfs2_filecheck_attr_fix =
__ATTR(fix, S_IRUSR | S_IWUSR,
- ocfs2_filecheck_show,
- ocfs2_filecheck_store);
-static struct kobj_attribute ocfs2_attr_filecheck_set =
+ ocfs2_filecheck_attr_show,
+ ocfs2_filecheck_attr_store);
+static struct kobj_attribute ocfs2_filecheck_attr_set =
__ATTR(set, S_IRUSR | S_IWUSR,
- ocfs2_filecheck_show,
- ocfs2_filecheck_store);
+ ocfs2_filecheck_attr_show,
+ ocfs2_filecheck_attr_store);
+static struct attribute *ocfs2_filecheck_attrs[] = {
+ &ocfs2_filecheck_attr_chk.attr,
+ &ocfs2_filecheck_attr_fix.attr,
+ &ocfs2_filecheck_attr_set.attr,
+ NULL
+};
+
+static void ocfs2_filecheck_release(struct kobject *kobj)
+{
+ struct ocfs2_filecheck_sysfs_entry *entry = container_of(kobj,
+ struct ocfs2_filecheck_sysfs_entry, fs_kobj);
+
+ complete(&entry->fs_kobj_unregister);
+}
+
+static ssize_t
+ocfs2_filecheck_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ ssize_t ret = -EIO;
+ struct kobj_attribute *kattr = container_of(attr,
+ struct kobj_attribute, attr);
+
+ kobject_get(kobj);
+ if (kattr->show)
+ ret = kattr->show(kobj, kattr, buf);
+ kobject_put(kobj);
+ return ret;
+}
+
+static ssize_t
+ocfs2_filecheck_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ ssize_t ret = -EIO;
+ struct kobj_attribute *kattr = container_of(attr,
+ struct kobj_attribute, attr);
+
+ kobject_get(kobj);
+ if (kattr->store)
+ ret = kattr->store(kobj, kattr, buf, count);
+ kobject_put(kobj);
+ return ret;
+}
+
+static const struct sysfs_ops ocfs2_filecheck_ops = {
+ .show = ocfs2_filecheck_show,
+ .store = ocfs2_filecheck_store,
+};
+
+static struct kobj_type ocfs2_ktype_filecheck = {
+ .default_attrs = ocfs2_filecheck_attrs,
+ .sysfs_ops = &ocfs2_filecheck_ops,
+ .release = ocfs2_filecheck_release,
+};
static void
ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry)
{
struct ocfs2_filecheck_entry *p;
- if (!atomic_dec_and_test(&entry->fs_count)) {
- wait_var_event(&entry->fs_count,
- !atomic_read(&entry->fs_count));
- }
-
spin_lock(&entry->fs_fcheck->fc_lock);
while (!list_empty(&entry->fs_fcheck->fc_head)) {
p = list_first_entry(&entry->fs_fcheck->fc_head,
@@ -149,151 +168,48 @@ ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry)
}
spin_unlock(&entry->fs_fcheck->fc_lock);
- kset_unregister(entry->fs_fcheckkset);
- kset_unregister(entry->fs_devicekset);
kfree(entry->fs_fcheck);
- kfree(entry);
-}
-
-static void
-ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry)
-{
- spin_lock(&ocfs2_filecheck_sysfs_lock);
- list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list);
- spin_unlock(&ocfs2_filecheck_sysfs_lock);
+ entry->fs_fcheck = NULL;
}
-static int ocfs2_filecheck_sysfs_del(const char *devname)
+int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb)
{
- struct ocfs2_filecheck_sysfs_entry *p;
-
- spin_lock(&ocfs2_filecheck_sysfs_lock);
- list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) {
- if (!strcmp(p->fs_sb->s_id, devname)) {
- list_del(&p->fs_list);
- spin_unlock(&ocfs2_filecheck_sysfs_lock);
- ocfs2_filecheck_sysfs_free(p);
- return 0;
- }
- }
- spin_unlock(&ocfs2_filecheck_sysfs_lock);
- return 1;
-}
-
-static void
-ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry)
-{
- if (atomic_dec_and_test(&entry->fs_count))
- wake_up_var(&entry->fs_count);
-}
-
-static struct ocfs2_filecheck_sysfs_entry *
-ocfs2_filecheck_sysfs_get(const char *devname)
-{
- struct ocfs2_filecheck_sysfs_entry *p = NULL;
-
- spin_lock(&ocfs2_filecheck_sysfs_lock);
- list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) {
- if (!strcmp(p->fs_sb->s_id, devname)) {
- atomic_inc(&p->fs_count);
- spin_unlock(&ocfs2_filecheck_sysfs_lock);
- return p;
- }
- }
- spin_unlock(&ocfs2_filecheck_sysfs_lock);
- return NULL;
-}
-
-int ocfs2_filecheck_create_sysfs(struct super_block *sb)
-{
- int ret = 0;
- struct kset *device_kset = NULL;
- struct kset *fcheck_kset = NULL;
- struct ocfs2_filecheck *fcheck = NULL;
- struct ocfs2_filecheck_sysfs_entry *entry = NULL;
- struct attribute **attrs = NULL;
- struct attribute_group attrgp;
-
- if (!ocfs2_kset)
- return -ENOMEM;
-
- attrs = kmalloc(sizeof(struct attribute *) * 4, GFP_NOFS);
- if (!attrs) {
- ret = -ENOMEM;
- goto error;
- } else {
- attrs[0] = &ocfs2_attr_filecheck_chk.attr;
- attrs[1] = &ocfs2_attr_filecheck_fix.attr;
- attrs[2] = &ocfs2_attr_filecheck_set.attr;
- attrs[3] = NULL;
- memset(&attrgp, 0, sizeof(attrgp));
- attrgp.attrs = attrs;
- }
+ int ret;
+ struct ocfs2_filecheck *fcheck;
+ struct ocfs2_filecheck_sysfs_entry *entry = &osb->osb_fc_ent;
fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS);
- if (!fcheck) {
- ret = -ENOMEM;
- goto error;
- } else {
- INIT_LIST_HEAD(&fcheck->fc_head);
- spin_lock_init(&fcheck->fc_lock);
- fcheck->fc_max = OCFS2_FILECHECK_MINSIZE;
- fcheck->fc_size = 0;
- fcheck->fc_done = 0;
- }
-
- if (strlen(sb->s_id) <= 0) {
- mlog(ML_ERROR,
- "Cannot get device basename when create filecheck sysfs\n");
- ret = -ENODEV;
- goto error;
- }
-
- device_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj);
- if (!device_kset) {
- ret = -ENOMEM;
- goto error;
- }
-
- fcheck_kset = kset_create_and_add("filecheck", NULL,
- &device_kset->kobj);
- if (!fcheck_kset) {
- ret = -ENOMEM;
- goto error;
- }
-
- ret = sysfs_create_group(&fcheck_kset->kobj, &attrgp);
- if (ret)
- goto error;
+ if (!fcheck)
+ return -ENOMEM;
- entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS);
- if (!entry) {
- ret = -ENOMEM;
- goto error;
- } else {
- atomic_set(&entry->fs_count, 1);
- entry->fs_sb = sb;
- entry->fs_devicekset = device_kset;
- entry->fs_fcheckkset = fcheck_kset;
- entry->fs_fcheck = fcheck;
- ocfs2_filecheck_sysfs_add(entry);
+ INIT_LIST_HEAD(&fcheck->fc_head);
+ spin_lock_init(&fcheck->fc_lock);
+ fcheck->fc_max = OCFS2_FILECHECK_MINSIZE;
+ fcheck->fc_size = 0;
+ fcheck->fc_done = 0;
+
+ entry->fs_kobj.kset = osb->osb_dev_kset;
+ init_completion(&entry->fs_kobj_unregister);
+ ret = kobject_init_and_add(&entry->fs_kobj, &ocfs2_ktype_filecheck,
+ NULL, "filecheck");
+ if (ret) {
+ kfree(fcheck);
+ return ret;
}
- kfree(attrs);
+ entry->fs_fcheck = fcheck;
return 0;
-
-error:
- kfree(attrs);
- kfree(entry);
- kfree(fcheck);
- kset_unregister(fcheck_kset);
- kset_unregister(device_kset);
- return ret;
}
-int ocfs2_filecheck_remove_sysfs(struct super_block *sb)
+void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb)
{
- return ocfs2_filecheck_sysfs_del(sb->s_id);
+ if (!osb->osb_fc_ent.fs_fcheck)
+ return;
+
+ kobject_del(&osb->osb_fc_ent.fs_kobj);
+ kobject_put(&osb->osb_fc_ent.fs_kobj);
+ wait_for_completion(&osb->osb_fc_ent.fs_kobj_unregister);
+ ocfs2_filecheck_sysfs_free(&osb->osb_fc_ent);
}
static int
@@ -310,7 +226,7 @@ ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent,
spin_lock(&ent->fs_fcheck->fc_lock);
if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) {
- mlog(ML_ERROR,
+ mlog(ML_NOTICE,
"Cannot set online file check maximum entry number "
"to %u due to too many pending entries(%u)\n",
len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done);
@@ -387,7 +303,7 @@ ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count,
return 0;
}
-static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
+static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
@@ -395,19 +311,12 @@ static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
unsigned int type;
struct ocfs2_filecheck_entry *p;
- struct ocfs2_filecheck_sysfs_entry *ent;
+ struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj,
+ struct ocfs2_filecheck_sysfs_entry, fs_kobj);
if (ocfs2_filecheck_type_parse(attr->attr.name, &type))
return -EINVAL;
- ent = ocfs2_filecheck_sysfs_get(kobj->parent->name);
- if (!ent) {
- mlog(ML_ERROR,
- "Cannot get the corresponding entry via device basename %s\n",
- kobj->name);
- return -ENODEV;
- }
-
if (type == OCFS2_FILECHECK_TYPE_SET) {
spin_lock(&ent->fs_fcheck->fc_lock);
total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max);
@@ -441,11 +350,26 @@ static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
spin_unlock(&ent->fs_fcheck->fc_lock);
exit:
- ocfs2_filecheck_sysfs_put(ent);
return total;
}
-static int
+static inline int
+ocfs2_filecheck_is_dup_entry(struct ocfs2_filecheck_sysfs_entry *ent,
+ unsigned long ino)
+{
+ struct ocfs2_filecheck_entry *p;
+
+ list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
+ if (!p->fe_done) {
+ if (p->fe_ino == ino)
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static inline int
ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent)
{
struct ocfs2_filecheck_entry *p;
@@ -484,21 +408,21 @@ static void
ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent,
struct ocfs2_filecheck_entry *entry)
{
- entry->fe_done = 1;
spin_lock(&ent->fs_fcheck->fc_lock);
+ entry->fe_done = 1;
ent->fs_fcheck->fc_done++;
spin_unlock(&ent->fs_fcheck->fc_lock);
}
static unsigned int
-ocfs2_filecheck_handle(struct super_block *sb,
+ocfs2_filecheck_handle(struct ocfs2_super *osb,
unsigned long ino, unsigned int flags)
{
unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS;
struct inode *inode = NULL;
int rc;
- inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0);
+ inode = ocfs2_iget(osb, ino, flags, 0);
if (IS_ERR(inode)) {
rc = (int)(-(long)inode);
if (rc >= OCFS2_FILECHECK_ERR_START &&
@@ -516,11 +440,14 @@ static void
ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent,
struct ocfs2_filecheck_entry *entry)
{
+ struct ocfs2_super *osb = container_of(ent, struct ocfs2_super,
+ osb_fc_ent);
+
if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK)
- entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb,
+ entry->fe_status = ocfs2_filecheck_handle(osb,
entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK);
else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX)
- entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb,
+ entry->fe_status = ocfs2_filecheck_handle(osb,
entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX);
else
entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED;
@@ -528,30 +455,21 @@ ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent,
ocfs2_filecheck_done_entry(ent, entry);
}
-static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
+static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
+ ssize_t ret = 0;
struct ocfs2_filecheck_args args;
struct ocfs2_filecheck_entry *entry;
- struct ocfs2_filecheck_sysfs_entry *ent;
- ssize_t ret = 0;
+ struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj,
+ struct ocfs2_filecheck_sysfs_entry, fs_kobj);
if (count == 0)
return count;
- if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) {
- mlog(ML_ERROR, "Invalid arguments for online file check\n");
+ if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args))
return -EINVAL;
- }
-
- ent = ocfs2_filecheck_sysfs_get(kobj->parent->name);
- if (!ent) {
- mlog(ML_ERROR,
- "Cannot get the corresponding entry via device basename %s\n",
- kobj->parent->name);
- return -ENODEV;
- }
if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) {
ret = ocfs2_filecheck_adjust_max(ent, args.fa_len);
@@ -565,13 +483,16 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
}
spin_lock(&ent->fs_fcheck->fc_lock);
- if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
- (ent->fs_fcheck->fc_done == 0)) {
- mlog(ML_ERROR,
+ if (ocfs2_filecheck_is_dup_entry(ent, args.fa_ino)) {
+ ret = -EEXIST;
+ kfree(entry);
+ } else if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
+ (ent->fs_fcheck->fc_done == 0)) {
+ mlog(ML_NOTICE,
"Cannot do more file check "
"since file check queue(%u) is full now\n",
ent->fs_fcheck->fc_max);
- ret = -EBUSY;
+ ret = -EAGAIN;
kfree(entry);
} else {
if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
@@ -596,6 +517,5 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
ocfs2_filecheck_handle_entry(ent, entry);
exit:
- ocfs2_filecheck_sysfs_put(ent);
return (!ret ? count : ret);
}
diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h
index e5cd002a2c09..6a22ee79e8d0 100644
--- a/fs/ocfs2/filecheck.h
+++ b/fs/ocfs2/filecheck.h
@@ -43,7 +43,32 @@ enum {
#define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED
#define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED
-int ocfs2_filecheck_create_sysfs(struct super_block *sb);
-int ocfs2_filecheck_remove_sysfs(struct super_block *sb);
+struct ocfs2_filecheck {
+ struct list_head fc_head; /* File check entry list head */
+ spinlock_t fc_lock;
+ unsigned int fc_max; /* Maximum number of entry in list */
+ unsigned int fc_size; /* Current entry count in list */
+ unsigned int fc_done; /* Finished entry count in list */
+};
+
+#define OCFS2_FILECHECK_MAXSIZE 100
+#define OCFS2_FILECHECK_MINSIZE 10
+
+/* File check operation type */
+enum {
+ OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */
+ OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */
+ OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */
+};
+
+struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per partition */
+ struct kobject fs_kobj;
+ struct completion fs_kobj_unregister;
+ struct ocfs2_filecheck *fs_fcheck;
+};
+
+
+int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb);
+void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb);
#endif /* FILECHECK_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index d51b80edd972..ddc3e9470c87 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1135,7 +1135,7 @@ static void ocfs2_clear_inode(struct inode *inode)
trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
inode->i_nlink);
- mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
+ mlog_bug_on_msg(osb == NULL,
"Inode=%lu\n", inode->i_ino);
dquot_drop(inode);
@@ -1150,7 +1150,7 @@ static void ocfs2_clear_inode(struct inode *inode)
ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres);
ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);
- ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
+ ocfs2_resv_discard(&osb->osb_la_resmap,
&oi->ip_la_data_resv);
ocfs2_resv_init_once(&oi->ip_la_data_resv);
@@ -1160,7 +1160,7 @@ static void ocfs2_clear_inode(struct inode *inode)
* exception here are successfully wiped inodes - their
* metadata can now be considered to be part of the system
* inodes from which it came. */
- if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED))
+ if (!(oi->ip_flags & OCFS2_INODE_DELETED))
ocfs2_checkpoint_inode(inode);
mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
@@ -1223,7 +1223,7 @@ static void ocfs2_clear_inode(struct inode *inode)
* the journal is flushed before journal shutdown. Thus it is safe to
* have inodes get cleaned up after journal shutdown.
*/
- jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
+ jbd2_journal_release_jbd_inode(osb->journal->j_journal,
&oi->ip_jinode);
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index c801eddc4bf3..8dd6f703c819 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -525,7 +525,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
* these are used by the support functions here and in
* callers. */
inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
- OCFS2_I(inode)->ip_blkno = fe_blkno;
+ oi->ip_blkno = fe_blkno;
spin_lock(&osb->osb_lock);
inode->i_generation = osb->s_next_generation++;
spin_unlock(&osb->osb_lock);
@@ -1186,8 +1186,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
}
trace_ocfs2_double_lock_end(
- (unsigned long long)OCFS2_I(inode1)->ip_blkno,
- (unsigned long long)OCFS2_I(inode2)->ip_blkno);
+ (unsigned long long)oi1->ip_blkno,
+ (unsigned long long)oi2->ip_blkno);
bail:
if (status)
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 6867eef2e06b..4f86ac0027b5 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -50,6 +50,8 @@
#include "reservations.h"
+#include "filecheck.h"
+
/* Caching of metadata buffers */
/* Most user visible OCFS2 inodes will have very few pieces of
@@ -472,6 +474,12 @@ struct ocfs2_super
* workqueue and schedule on our own.
*/
struct workqueue_struct *ocfs2_wq;
+
+ /* sysfs directory per partition */
+ struct kset *osb_dev_kset;
+
+ /* file check related stuff */
+ struct ocfs2_filecheck_sysfs_entry osb_fc_ent;
};
#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index e2a11aaece10..2ee76a90ba8f 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1311,11 +1311,11 @@ DEFINE_OCFS2_FILE_OPS(ocfs2_file_release);
DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file);
-DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write);
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_write_iter);
DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write);
-DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read);
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_read_iter);
DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file);
@@ -1467,7 +1467,7 @@ TRACE_EVENT(ocfs2_prepare_inode_for_write,
__entry->saved_pos, __entry->count, __entry->wait)
);
-DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
+DEFINE_OCFS2_INT_EVENT(generic_file_read_iter_ret);
/* End of trace events for fs/ocfs2/file.c. */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index ab156e35ec00..7869622af22a 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -573,7 +573,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
BUG_ON(ocfs2_is_refcount_inode(inode));
trace_ocfs2_create_refcount_tree(
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ (unsigned long long)oi->ip_blkno);
ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
if (ret) {
@@ -3359,7 +3359,7 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
unsigned int ext_flags;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+ if (!ocfs2_refcount_tree(osb)) {
return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
inode->i_ino);
}
@@ -3707,7 +3707,7 @@ int ocfs2_add_refcount_flag(struct inode *inode,
trace_ocfs2_add_refcount_flag(ref_blocks, credits);
if (ref_blocks) {
- ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
+ ret = ocfs2_reserve_new_metadata_blocks(osb,
ref_blocks, &meta_ac);
if (ret) {
mlog_errno(ret);
@@ -4250,10 +4250,11 @@ out:
static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry, bool preserve)
{
- int error;
+ int error, had_lock;
struct inode *inode = d_inode(old_dentry);
struct buffer_head *old_bh = NULL;
struct inode *new_orphan_inode = NULL;
+ struct ocfs2_lock_holder oh;
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
return -EOPNOTSUPP;
@@ -4295,6 +4296,14 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
goto out;
}
+ had_lock = ocfs2_inode_lock_tracker(new_orphan_inode, NULL, 1,
+ &oh);
+ if (had_lock < 0) {
+ error = had_lock;
+ mlog_errno(error);
+ goto out;
+ }
+
/* If the security isn't preserved, we need to re-initialize them. */
if (!preserve) {
error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
@@ -4302,14 +4311,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
if (error)
mlog_errno(error);
}
-out:
if (!error) {
error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
new_dentry);
if (error)
mlog_errno(error);
}
+ ocfs2_inode_unlock_tracker(new_orphan_inode, 1, &oh, had_lock);
+out:
if (new_orphan_inode) {
/*
* We need to open_unlock the inode no matter whether we
@@ -4766,8 +4776,8 @@ static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
*bh2 = *bh1;
trace_ocfs2_double_lock_end(
- (unsigned long long)OCFS2_I(inode1)->ip_blkno,
- (unsigned long long)OCFS2_I(inode2)->ip_blkno);
+ (unsigned long long)oi1->ip_blkno,
+ (unsigned long long)oi2->ip_blkno);
return 0;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index d8f5f6ce99dc..f7c972fbed6a 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -79,8 +79,6 @@ static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
}
-static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
-static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
static int ocfs2_block_group_fill(handle_t *handle,
struct inode *alloc_inode,
@@ -387,7 +385,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
memset(bg, 0, sb->s_blocksize);
strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
- bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+ bg->bg_generation = cpu_to_le32(osb->fs_generation);
bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
osb->s_feature_incompat));
bg->bg_chain = cpu_to_le16(my_chain);
@@ -1521,7 +1519,7 @@ static int ocfs2_cluster_group_search(struct inode *inode,
OCFS2_I(inode)->ip_clusters, max_bits);
}
- ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
+ ret = ocfs2_block_group_find_clear_bits(osb,
group_bh, bits_wanted,
max_bits, res);
if (ret)
@@ -2626,53 +2624,6 @@ int ocfs2_release_clusters(handle_t *handle,
_ocfs2_clear_bit);
}
-static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
-{
- printk("Block Group:\n");
- printk("bg_signature: %s\n", bg->bg_signature);
- printk("bg_size: %u\n", bg->bg_size);
- printk("bg_bits: %u\n", bg->bg_bits);
- printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
- printk("bg_chain: %u\n", bg->bg_chain);
- printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation));
- printk("bg_next_group: %llu\n",
- (unsigned long long)bg->bg_next_group);
- printk("bg_parent_dinode: %llu\n",
- (unsigned long long)bg->bg_parent_dinode);
- printk("bg_blkno: %llu\n",
- (unsigned long long)bg->bg_blkno);
-}
-
-static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
-{
- int i;
-
- printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
- printk("i_signature: %s\n", fe->i_signature);
- printk("i_size: %llu\n",
- (unsigned long long)fe->i_size);
- printk("i_clusters: %u\n", fe->i_clusters);
- printk("i_generation: %u\n",
- le32_to_cpu(fe->i_generation));
- printk("id1.bitmap1.i_used: %u\n",
- le32_to_cpu(fe->id1.bitmap1.i_used));
- printk("id1.bitmap1.i_total: %u\n",
- le32_to_cpu(fe->id1.bitmap1.i_total));
- printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg);
- printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc);
- printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count);
- printk("id2.i_chain.cl_next_free_rec: %u\n",
- fe->id2.i_chain.cl_next_free_rec);
- for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
- printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i,
- fe->id2.i_chain.cl_recs[i].c_free);
- printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
- fe->id2.i_chain.cl_recs[i].c_total);
- printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
- (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
- }
-}
-
/*
* For a given allocation, determine which allocators will need to be
* accessed, and lock them, reserving the appropriate number of bits.
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index ffa4952d432b..3415e0b09398 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -423,10 +423,10 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
ocfs2_schedule_truncate_log_flush(osb, 0);
}
- if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal,
+ if (jbd2_journal_start_commit(osb->journal->j_journal,
&target)) {
if (wait)
- jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
+ jbd2_log_wait_commit(osb->journal->j_journal,
target);
}
return 0;
@@ -1161,6 +1161,23 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
ocfs2_complete_mount_recovery(osb);
+ osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL,
+ &ocfs2_kset->kobj);
+ if (!osb->osb_dev_kset) {
+ status = -ENOMEM;
+ mlog(ML_ERROR, "Unable to create device kset %s.\n", sb->s_id);
+ goto read_super_error;
+ }
+
+ /* Create filecheck sysfs related directories/files at
+ * /sys/fs/ocfs2/<devname>/filecheck */
+ if (ocfs2_filecheck_create_sysfs(osb)) {
+ status = -ENOMEM;
+ mlog(ML_ERROR, "Unable to create filecheck sysfs directory at "
+ "/sys/fs/ocfs2/%s/filecheck.\n", sb->s_id);
+ goto read_super_error;
+ }
+
if (ocfs2_mount_local(osb))
snprintf(nodestr, sizeof(nodestr), "local");
else
@@ -1199,9 +1216,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
/* Start this when the mount is almost sure of being successful */
ocfs2_orphan_scan_start(osb);
- /* Create filecheck sysfile /sys/fs/ocfs2/<devname>/filecheck */
- ocfs2_filecheck_create_sysfs(sb);
-
return status;
read_super_error:
@@ -1653,7 +1667,6 @@ static void ocfs2_put_super(struct super_block *sb)
ocfs2_sync_blockdev(sb);
ocfs2_dismount_volume(sb, 0);
- ocfs2_filecheck_remove_sysfs(sb);
}
static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1768,12 +1781,9 @@ static int ocfs2_initialize_mem_caches(void)
NULL);
if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
!ocfs2_qf_chunk_cachep) {
- if (ocfs2_inode_cachep)
- kmem_cache_destroy(ocfs2_inode_cachep);
- if (ocfs2_dquot_cachep)
- kmem_cache_destroy(ocfs2_dquot_cachep);
- if (ocfs2_qf_chunk_cachep)
- kmem_cache_destroy(ocfs2_qf_chunk_cachep);
+ kmem_cache_destroy(ocfs2_inode_cachep);
+ kmem_cache_destroy(ocfs2_dquot_cachep);
+ kmem_cache_destroy(ocfs2_qf_chunk_cachep);
return -ENOMEM;
}
@@ -1787,16 +1797,13 @@ static void ocfs2_free_mem_caches(void)
* destroy cache.
*/
rcu_barrier();
- if (ocfs2_inode_cachep)
- kmem_cache_destroy(ocfs2_inode_cachep);
+ kmem_cache_destroy(ocfs2_inode_cachep);
ocfs2_inode_cachep = NULL;
- if (ocfs2_dquot_cachep)
- kmem_cache_destroy(ocfs2_dquot_cachep);
+ kmem_cache_destroy(ocfs2_dquot_cachep);
ocfs2_dquot_cachep = NULL;
- if (ocfs2_qf_chunk_cachep)
- kmem_cache_destroy(ocfs2_qf_chunk_cachep);
+ kmem_cache_destroy(ocfs2_qf_chunk_cachep);
ocfs2_qf_chunk_cachep = NULL;
}
@@ -1899,6 +1906,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
osb = OCFS2_SB(sb);
BUG_ON(!osb);
+ /* Remove file check sysfs related directores/files,
+ * and wait for the pending file check operations */
+ ocfs2_filecheck_remove_sysfs(osb);
+
+ kset_unregister(osb->osb_dev_kset);
+
debugfs_remove(osb->osb_ctxt);
/* Orphan scan should be stopped as early as possible */
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 82e17b076ce7..78f09c76ab3c 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -633,6 +633,5 @@ int __init init_ocfs2_uptodate_cache(void)
void exit_ocfs2_uptodate_cache(void)
{
- if (ocfs2_uptodate_cachep)
- kmem_cache_destroy(ocfs2_uptodate_cachep);
+ kmem_cache_destroy(ocfs2_uptodate_cachep);
}
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index c261c1dfd374..3a24ce3deb01 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3564,7 +3564,7 @@ int ocfs2_xattr_set(struct inode *inode,
.not_found = -ENODATA,
};
- if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+ if (!ocfs2_supports_xattr(osb))
return -EOPNOTSUPP;
/*
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index b7146526afff..4bee3a72b9f3 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -305,11 +305,10 @@ static struct dentry *omfs_lookup(struct inode *dir, struct dentry *dentry,
ino_t ino = be64_to_cpu(oi->i_head.h_self);
brelse(bh);
inode = omfs_iget(dir->i_sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
+ } else if (bh != ERR_PTR(-ENOENT)) {
+ inode = ERR_CAST(bh);
}
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
}
/* sanity check block's self pointer */
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 2200662a9bf1..607092f367ad 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -256,8 +256,7 @@ found:
break;
}
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
}
static int openpromfs_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index 480ea059a680..10587413b20e 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -9,7 +9,6 @@
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"
#include <linux/posix_acl_xattr.h>
-#include <linux/fs_struct.h>
struct posix_acl *orangefs_get_acl(struct inode *inode, int type)
{
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
index b03057afac2a..66369ec90020 100644
--- a/fs/orangefs/devorangefs-req.c
+++ b/fs/orangefs/devorangefs-req.c
@@ -463,11 +463,10 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
if (op->downcall.type != ORANGEFS_VFS_OP_READDIR)
goto wakeup;
- op->downcall.trailer_buf = vmalloc(op->downcall.trailer_size);
+ op->downcall.trailer_buf = vzalloc(op->downcall.trailer_size);
if (!op->downcall.trailer_buf)
goto Enomem;
- memset(op->downcall.trailer_buf, 0, op->downcall.trailer_size);
if (!copy_from_iter_full(op->downcall.trailer_buf,
op->downcall.trailer_size, iter)) {
gossip_err("%s: failed to copy trailer.\n", __func__);
@@ -779,9 +778,35 @@ static long orangefs_devreq_compat_ioctl(struct file *filp, unsigned int cmd,
#endif /* CONFIG_COMPAT is in .config */
+static __poll_t orangefs_devreq_poll(struct file *file,
+ struct poll_table_struct *poll_table)
+{
+ __poll_t poll_revent_mask = 0;
+
+ poll_wait(file, &orangefs_request_list_waitq, poll_table);
+
+ if (!list_empty(&orangefs_request_list))
+ poll_revent_mask |= EPOLLIN;
+ return poll_revent_mask;
+}
+
/* the assigned character device major number */
static int orangefs_dev_major;
+static const struct file_operations orangefs_devreq_file_operations = {
+ .owner = THIS_MODULE,
+ .read = orangefs_devreq_read,
+ .write_iter = orangefs_devreq_write_iter,
+ .open = orangefs_devreq_open,
+ .release = orangefs_devreq_release,
+ .unlocked_ioctl = orangefs_devreq_ioctl,
+
+#ifdef CONFIG_COMPAT /* CONFIG_COMPAT is in .config */
+ .compat_ioctl = orangefs_devreq_compat_ioctl,
+#endif
+ .poll = orangefs_devreq_poll
+};
+
/*
* Initialize orangefs device specific state:
* Must be called at module load time only
@@ -814,29 +839,3 @@ void orangefs_dev_cleanup(void)
"*** /dev/%s character device unregistered ***\n",
ORANGEFS_REQDEVICE_NAME);
}
-
-static __poll_t orangefs_devreq_poll(struct file *file,
- struct poll_table_struct *poll_table)
-{
- __poll_t poll_revent_mask = 0;
-
- poll_wait(file, &orangefs_request_list_waitq, poll_table);
-
- if (!list_empty(&orangefs_request_list))
- poll_revent_mask |= EPOLLIN;
- return poll_revent_mask;
-}
-
-const struct file_operations orangefs_devreq_file_operations = {
- .owner = THIS_MODULE,
- .read = orangefs_devreq_read,
- .write_iter = orangefs_devreq_write_iter,
- .open = orangefs_devreq_open,
- .release = orangefs_devreq_release,
- .unlocked_ioctl = orangefs_devreq_ioctl,
-
-#ifdef CONFIG_COMPAT /* CONFIG_COMPAT is in .config */
- .compat_ioctl = orangefs_devreq_compat_ioctl,
-#endif
- .poll = orangefs_devreq_poll
-};
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 0d228cd087e6..26358efbf794 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -42,70 +42,6 @@ static int flush_racache(struct inode *inode)
}
/*
- * Copy to client-core's address space from the buffers specified
- * by the iovec upto total_size bytes.
- * NOTE: the iovector can either contain addresses which
- * can futher be kernel-space or user-space addresses.
- * or it can pointers to struct page's
- */
-static int precopy_buffers(int buffer_index,
- struct iov_iter *iter,
- size_t total_size)
-{
- int ret = 0;
- /*
- * copy data from application/kernel by pulling it out
- * of the iovec.
- */
-
-
- if (total_size) {
- ret = orangefs_bufmap_copy_from_iovec(iter,
- buffer_index,
- total_size);
- if (ret < 0)
- gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
- __func__,
- (long)ret);
- }
-
- if (ret < 0)
- gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
- __func__,
- (long)ret);
- return ret;
-}
-
-/*
- * Copy from client-core's address space to the buffers specified
- * by the iovec upto total_size bytes.
- * NOTE: the iovector can either contain addresses which
- * can futher be kernel-space or user-space addresses.
- * or it can pointers to struct page's
- */
-static int postcopy_buffers(int buffer_index,
- struct iov_iter *iter,
- size_t total_size)
-{
- int ret = 0;
- /*
- * copy data to application/kernel by pushing it out to
- * the iovec. NOTE; target buffers can be addresses or
- * struct page pointers.
- */
- if (total_size) {
- ret = orangefs_bufmap_copy_to_iovec(iter,
- buffer_index,
- total_size);
- if (ret < 0)
- gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
- __func__,
- (long)ret);
- }
- return ret;
-}
-
-/*
* Post and wait for the I/O upcall to finish
*/
static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
@@ -157,14 +93,15 @@ populate_shared_memory:
total_size);
/*
* Stage 1: copy the buffers into client-core's address space
- * precopy_buffers only pertains to writes.
*/
- if (type == ORANGEFS_IO_WRITE) {
- ret = precopy_buffers(buffer_index,
- iter,
- total_size);
- if (ret < 0)
+ if (type == ORANGEFS_IO_WRITE && total_size) {
+ ret = orangefs_bufmap_copy_from_iovec(iter, buffer_index,
+ total_size);
+ if (ret < 0) {
+ gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
+ __func__, (long)ret);
goto out;
+ }
}
gossip_debug(GOSSIP_FILE_DEBUG,
@@ -260,14 +197,20 @@ populate_shared_memory:
/*
* Stage 3: Post copy buffers from client-core's address space
- * postcopy_buffers only pertains to reads.
*/
- if (type == ORANGEFS_IO_READ) {
- ret = postcopy_buffers(buffer_index,
- iter,
- new_op->downcall.resp.io.amt_complete);
- if (ret < 0)
+ if (type == ORANGEFS_IO_READ && new_op->downcall.resp.io.amt_complete) {
+ /*
+ * NOTE: the iovector can either contain addresses which
+ * can futher be kernel-space or user-space addresses.
+ * or it can pointers to struct page's
+ */
+ ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index,
+ new_op->downcall.resp.io.amt_complete);
+ if (ret < 0) {
+ gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
+ __func__, (long)ret);
goto out;
+ }
}
gossip_debug(GOSSIP_FILE_DEBUG,
"%s(%pU): Amount %s, returned by the sys-io call:%d\n",
@@ -585,6 +528,28 @@ static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long ar
return ret;
}
+static int orangefs_fault(struct vm_fault *vmf)
+{
+ struct file *file = vmf->vma->vm_file;
+ int rc;
+ rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
+ STATX_SIZE);
+ if (rc == -ESTALE)
+ rc = -EIO;
+ if (rc) {
+ gossip_err("%s: orangefs_inode_getattr failed, "
+ "rc:%d:.\n", __func__, rc);
+ return rc;
+ }
+ return filemap_fault(vmf);
+}
+
+const struct vm_operations_struct orangefs_file_vm_ops = {
+ .fault = orangefs_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = filemap_page_mkwrite,
+};
+
/*
* Memory map a region of a file.
*/
@@ -596,12 +561,16 @@ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
(char *)file->f_path.dentry->d_name.name :
(char *)"Unknown"));
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+ return -EINVAL;
+
/* set the sequential readahead hint */
vma->vm_flags |= VM_SEQ_READ;
vma->vm_flags &= ~VM_RAND_READ;
- /* Use readonly mmap since we cannot support writable maps. */
- return generic_file_readonly_mmap(file, vma);
+ file_accessed(file);
+ vma->vm_ops = &orangefs_file_vm_ops;
+ return 0;
}
#define mapping_nrpages(idata) ((idata)->nrpages)
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index fe1d705ad91f..79c61da8b1bc 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -138,7 +138,7 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
}
/** ORANGEFS2 implementation of address space operations */
-const struct address_space_operations orangefs_address_operations = {
+static const struct address_space_operations orangefs_address_operations = {
.readpage = orangefs_readpage,
.readpages = orangefs_readpages,
.invalidatepage = orangefs_invalidatepage,
@@ -307,7 +307,7 @@ int orangefs_update_time(struct inode *inode, struct timespec *time, int flags)
}
/* ORANGEDS2 implementation of VFS inode operations for files */
-const struct inode_operations orangefs_file_inode_operations = {
+static const struct inode_operations orangefs_file_inode_operations = {
.get_acl = orangefs_get_acl,
.set_acl = orangefs_set_acl,
.setattr = orangefs_setattr,
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 6e3134e6d98a..365cd73d9109 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -75,8 +75,7 @@ static int orangefs_create(struct inode *dir,
get_khandle_from_ino(inode),
dentry);
- d_instantiate(dentry, inode);
- unlock_new_inode(inode);
+ d_instantiate_new(dentry, inode);
orangefs_set_timeout(dentry);
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
@@ -111,7 +110,6 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
struct orangefs_inode_s *parent = ORANGEFS_I(dir);
struct orangefs_kernel_op_s *new_op;
struct inode *inode;
- struct dentry *res;
int ret = -EINVAL;
/*
@@ -159,65 +157,18 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
new_op->downcall.resp.lookup.refn.fs_id,
ret);
- if (ret < 0) {
- if (ret == -ENOENT) {
- /*
- * if no inode was found, add a negative dentry to
- * dcache anyway; if we don't, we don't hold expected
- * lookup semantics and we most noticeably break
- * during directory renames.
- *
- * however, if the operation failed or exited, do not
- * add the dentry (e.g. in the case that a touch is
- * issued on a file that already exists that was
- * interrupted during this lookup -- no need to add
- * another negative dentry for an existing file)
- */
-
- gossip_debug(GOSSIP_NAME_DEBUG,
- "orangefs_lookup: Adding *negative* dentry "
- "%p for %pd\n",
- dentry,
- dentry);
-
- d_add(dentry, NULL);
- res = NULL;
- goto out;
- }
-
+ if (ret >= 0) {
+ orangefs_set_timeout(dentry);
+ inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
+ } else if (ret == -ENOENT) {
+ inode = NULL;
+ } else {
/* must be a non-recoverable error */
- res = ERR_PTR(ret);
- goto out;
- }
-
- orangefs_set_timeout(dentry);
-
- inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
- if (IS_ERR(inode)) {
- gossip_debug(GOSSIP_NAME_DEBUG,
- "error %ld from iget\n", PTR_ERR(inode));
- res = ERR_CAST(inode);
- goto out;
+ inode = ERR_PTR(ret);
}
- gossip_debug(GOSSIP_NAME_DEBUG,
- "%s:%s:%d "
- "Found good inode [%lu] with count [%d]\n",
- __FILE__,
- __func__,
- __LINE__,
- inode->i_ino,
- (int)atomic_read(&inode->i_count));
-
- /* update dentry/inode pair into dcache */
- res = d_splice_alias(inode, dentry);
-
- gossip_debug(GOSSIP_NAME_DEBUG,
- "Lookup success (inode ct = %d)\n",
- (int)atomic_read(&inode->i_count));
-out:
op_release(new_op);
- return res;
+ return d_splice_alias(inode, dentry);
}
/* return 0 on success; non-zero otherwise */
@@ -332,8 +283,7 @@ static int orangefs_symlink(struct inode *dir,
"Assigned symlink inode new number of %pU\n",
get_khandle_from_ino(inode));
- d_instantiate(dentry, inode);
- unlock_new_inode(inode);
+ d_instantiate_new(dentry, inode);
orangefs_set_timeout(dentry);
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
@@ -402,8 +352,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
"Assigned dir inode new number of %pU\n",
get_khandle_from_ino(inode));
- d_instantiate(dentry, inode);
- unlock_new_inode(inode);
+ d_instantiate_new(dentry, inode);
orangefs_set_timeout(dentry);
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index 59f444dced9b..4f927023d095 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -71,9 +71,9 @@ static void put(struct slot_map *m, int slot)
spin_lock(&m->q.lock);
__clear_bit(slot, m->map);
v = ++m->c;
- if (unlikely(v == 1)) /* no free slots -> one free slot */
+ if (v > 0)
wake_up_locked(&m->q);
- else if (unlikely(v == -1)) /* finished dying */
+ if (unlikely(v == -1)) /* finished dying */
wake_up_all_locked(&m->q);
spin_unlock(&m->q.lock);
}
diff --git a/fs/orangefs/orangefs-debug.h b/fs/orangefs/orangefs-debug.h
index c7db56a31b92..6e079d4230d0 100644
--- a/fs/orangefs/orangefs-debug.h
+++ b/fs/orangefs/orangefs-debug.h
@@ -43,12 +43,6 @@
#define GOSSIP_MAX_NR 16
#define GOSSIP_MAX_DEBUG (((__u64)1 << GOSSIP_MAX_NR) - 1)
-/*function prototypes*/
-__u64 ORANGEFS_kmod_eventlog_to_mask(const char *event_logging);
-__u64 ORANGEFS_debug_eventlog_to_mask(const char *event_logging);
-char *ORANGEFS_debug_mask_to_eventlog(__u64 mask);
-char *ORANGEFS_kmod_mask_to_eventlog(__u64 mask);
-
/* a private internal type */
struct __keyword_mask_s {
const char *keyword;
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index eebbaece85ef..c29bb0ebc6bb 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -65,11 +65,7 @@
#define ORANGEFS_REQDEVICE_NAME "pvfs2-req"
#define ORANGEFS_DEVREQ_MAGIC 0x20030529
-#define ORANGEFS_LINK_MAX 0x000000FF
#define ORANGEFS_PURGE_RETRY_COUNT 0x00000005
-#define ORANGEFS_MAX_NUM_OPTIONS 0x00000004
-#define ORANGEFS_MAX_MOUNT_OPT_LEN 0x00000080
-#define ORANGEFS_MAX_FSKEY_LEN 64
#define MAX_DEV_REQ_UPSIZE (2 * sizeof(__s32) + \
sizeof(__u64) + sizeof(struct orangefs_upcall_s))
@@ -113,15 +109,6 @@ extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type);
extern int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
/*
- * Redefine xtvec structure so that we could move helper functions out of
- * the define
- */
-struct xtvec {
- __kernel_off_t xtv_off; /* must be off_t */
- __kernel_size_t xtv_len; /* must be size_t */
-};
-
-/*
* orangefs data structures
*/
struct orangefs_kernel_op_s {
@@ -224,39 +211,6 @@ struct orangefs_sb_info_s {
struct list_head list;
};
-/*
- * structure that holds the state of any async I/O operation issued
- * through the VFS. Needed especially to handle cancellation requests
- * or even completion notification so that the VFS client-side daemon
- * can free up its vfs_request slots.
- */
-struct orangefs_kiocb_s {
- /* the pointer to the task that initiated the AIO */
- struct task_struct *tsk;
-
- /* pointer to the kiocb that kicked this operation */
- struct kiocb *kiocb;
-
- /* buffer index that was used for the I/O */
- struct orangefs_bufmap *bufmap;
- int buffer_index;
-
- /* orangefs kernel operation type */
- struct orangefs_kernel_op_s *op;
-
- /* set to indicate the type of the operation */
- int rw;
-
- /* file offset */
- loff_t offset;
-
- /* and the count in bytes */
- size_t bytes_to_be_copied;
-
- ssize_t bytes_copied;
- int needs_cleanup;
-};
-
struct orangefs_stats {
unsigned long cache_hits;
unsigned long cache_misses;
@@ -305,21 +259,6 @@ static inline struct orangefs_khandle *get_khandle_from_ino(struct inode *inode)
return &(ORANGEFS_I(inode)->refn.khandle);
}
-static inline ino_t get_ino_from_khandle(struct inode *inode)
-{
- struct orangefs_khandle *khandle;
- ino_t ino;
-
- khandle = get_khandle_from_ino(inode);
- ino = orangefs_khandle_to_ino(khandle);
- return ino;
-}
-
-static inline ino_t get_parent_ino_from_dentry(struct dentry *dentry)
-{
- return get_ino_from_khandle(dentry->d_parent->d_inode);
-}
-
static inline int is_root_handle(struct inode *inode)
{
gossip_debug(GOSSIP_DCACHE_DEBUG,
@@ -391,7 +330,6 @@ void fsid_key_table_finalize(void);
/*
* defined in inode.c
*/
-__u32 convert_to_orangefs_mask(unsigned long lite_mask);
struct inode *orangefs_new_inode(struct super_block *sb,
struct inode *dir,
int mode,
@@ -410,17 +348,6 @@ int orangefs_update_time(struct inode *, struct timespec *, int);
/*
* defined in xattr.c
*/
-int orangefs_setxattr(struct dentry *dentry,
- const char *name,
- const void *value,
- size_t size,
- int flags);
-
-ssize_t orangefs_getxattr(struct dentry *dentry,
- const char *name,
- void *buffer,
- size_t size);
-
ssize_t orangefs_listxattr(struct dentry *dentry, char *buffer, size_t size);
/*
@@ -467,8 +394,6 @@ int orangefs_inode_check_changed(struct inode *inode);
int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr);
-int orangefs_unmount_sb(struct super_block *sb);
-
bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op);
int orangefs_normalize_to_errno(__s32 error_code);
@@ -487,16 +412,11 @@ extern struct list_head *orangefs_htable_ops_in_progress;
extern spinlock_t orangefs_htable_ops_in_progress_lock;
extern int hash_table_size;
-extern const struct address_space_operations orangefs_address_operations;
-extern const struct inode_operations orangefs_file_inode_operations;
extern const struct file_operations orangefs_file_operations;
extern const struct inode_operations orangefs_symlink_inode_operations;
extern const struct inode_operations orangefs_dir_inode_operations;
extern const struct file_operations orangefs_dir_operations;
extern const struct dentry_operations orangefs_dentry_operations;
-extern const struct file_operations orangefs_devreq_file_operations;
-
-extern wait_queue_head_t orangefs_bufmap_init_waitq;
/*
* misc convenience macros
diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h
index dc6e3e6269c3..61ee8d64c842 100644
--- a/fs/orangefs/protocol.h
+++ b/fs/orangefs/protocol.h
@@ -5,11 +5,6 @@
#include <linux/slab.h>
#include <linux/ioctl.h>
-/* pvfs2-config.h ***********************************************************/
-#define ORANGEFS_VERSION_MAJOR 2
-#define ORANGEFS_VERSION_MINOR 9
-#define ORANGEFS_VERSION_SUB 0
-
/* khandle stuff ***********************************************************/
/*
@@ -70,16 +65,6 @@ static inline void ORANGEFS_khandle_from(struct orangefs_khandle *kh,
}
/* pvfs2-types.h ************************************************************/
-typedef __u32 ORANGEFS_uid;
-typedef __u32 ORANGEFS_gid;
-typedef __s32 ORANGEFS_fs_id;
-typedef __u32 ORANGEFS_permissions;
-typedef __u64 ORANGEFS_time;
-typedef __s64 ORANGEFS_size;
-typedef __u64 ORANGEFS_flags;
-typedef __u64 ORANGEFS_ds_position;
-typedef __s32 ORANGEFS_error;
-typedef __s64 ORANGEFS_offset;
#define ORANGEFS_SUPER_MAGIC 0x20030528
@@ -145,7 +130,6 @@ typedef __s64 ORANGEFS_offset;
#define ORANGEFS_APPEND_FL FS_APPEND_FL
#define ORANGEFS_NOATIME_FL FS_NOATIME_FL
#define ORANGEFS_MIRROR_FL 0x01000000ULL
-#define ORANGEFS_O_EXECUTE (1 << 0)
#define ORANGEFS_FS_ID_NULL ((__s32)0)
#define ORANGEFS_ATTR_SYS_UID (1 << 0)
#define ORANGEFS_ATTR_SYS_GID (1 << 1)
@@ -229,35 +213,6 @@ enum orangefs_ds_type {
ORANGEFS_TYPE_INTERNAL = (1 << 5) /* for the server's private use */
};
-/*
- * ORANGEFS_certificate simply stores a buffer with the buffer size.
- * The buffer can be converted to an OpenSSL X509 struct for use.
- */
-struct ORANGEFS_certificate {
- __u32 buf_size;
- unsigned char *buf;
-};
-
-/*
- * A credential identifies a user and is signed by the client/user
- * private key.
- */
-struct ORANGEFS_credential {
- __u32 userid; /* user id */
- __u32 num_groups; /* length of group_array */
- __u32 *group_array; /* groups for which the user is a member */
- char *issuer; /* alias of the issuing server */
- __u64 timeout; /* seconds after epoch to time out */
- __u32 sig_size; /* length of the signature in bytes */
- unsigned char *signature; /* digital signature */
- struct ORANGEFS_certificate certificate; /* user certificate buffer */
-};
-#define extra_size_ORANGEFS_credential (ORANGEFS_REQ_LIMIT_GROUPS * \
- sizeof(__u32) + \
- ORANGEFS_REQ_LIMIT_ISSUER + \
- ORANGEFS_REQ_LIMIT_SIGNATURE + \
- extra_size_ORANGEFS_certificate)
-
/* This structure is used by the VFS-client interaction alone */
struct ORANGEFS_keyval_pair {
char key[ORANGEFS_MAX_XATTR_NAMELEN];
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 3ae5fdba0225..10796d3fe27d 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -579,6 +579,11 @@ void orangefs_kill_sb(struct super_block *sb)
/* provided sb cleanup */
kill_anon_super(sb);
+ if (!ORANGEFS_SB(sb)) {
+ mutex_lock(&orangefs_request_mutex);
+ mutex_unlock(&orangefs_request_mutex);
+ return;
+ }
/*
* issue the unmount to userspace to tell it to remove the
* dynamic mount info it has for this superblock
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index ce6ff5a0a6e4..17032631c5cf 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -86,3 +86,20 @@ config OVERLAY_FS_NFS_EXPORT
case basis with the "nfs_export=on" mount option.
Say N unless you fully understand the consequences.
+
+config OVERLAY_FS_XINO_AUTO
+ bool "Overlayfs: auto enable inode number mapping"
+ default n
+ depends on OVERLAY_FS
+ help
+ If this config option is enabled then overlay filesystems will use
+ unused high bits in undelying filesystem inode numbers to map all
+ inodes to a unified address space. The mapped 64bit inode numbers
+ might not be compatible with applications that expect 32bit inodes.
+
+ If compatibility with applications that expect 32bit inodes is not an
+ issue, then it is safe and recommended to say Y here.
+
+ For more information, see Documentation/filesystems/overlayfs.txt
+
+ If unsure, say N.
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index d855f508fa20..8bede0742619 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -232,7 +232,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
return err;
}
-struct ovl_fh *ovl_encode_fh(struct dentry *real, bool is_upper)
+struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper)
{
struct ovl_fh *fh;
int fh_type, fh_len, dwords;
@@ -300,7 +300,7 @@ int ovl_set_origin(struct dentry *dentry, struct dentry *lower,
* up and a pure upper inode.
*/
if (ovl_can_decode_fh(lower->d_sb)) {
- fh = ovl_encode_fh(lower, false);
+ fh = ovl_encode_real_fh(lower, false);
if (IS_ERR(fh))
return PTR_ERR(fh);
}
@@ -321,7 +321,7 @@ static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index)
const struct ovl_fh *fh;
int err;
- fh = ovl_encode_fh(upper, true);
+ fh = ovl_encode_real_fh(upper, true);
if (IS_ERR(fh))
return PTR_ERR(fh);
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 87bd4148f4fb..425a94672300 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -228,8 +228,8 @@ static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen)
goto fail;
/* Encode an upper or lower file handle */
- fh = ovl_encode_fh(enc_lower ? ovl_dentry_lower(dentry) :
- ovl_dentry_upper(dentry), !enc_lower);
+ fh = ovl_encode_real_fh(enc_lower ? ovl_dentry_lower(dentry) :
+ ovl_dentry_upper(dentry), !enc_lower);
err = PTR_ERR(fh);
if (IS_ERR(fh))
goto fail;
@@ -267,8 +267,8 @@ static int ovl_dentry_to_fh(struct dentry *dentry, u32 *fid, int *max_len)
return OVL_FILEID;
}
-static int ovl_encode_inode_fh(struct inode *inode, u32 *fid, int *max_len,
- struct inode *parent)
+static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len,
+ struct inode *parent)
{
struct dentry *dentry;
int type;
@@ -305,15 +305,12 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb,
if (d_is_dir(upper ?: lower))
return ERR_PTR(-EIO);
- inode = ovl_get_inode(sb, dget(upper), lower, index, !!lower);
+ inode = ovl_get_inode(sb, dget(upper), lowerpath, index, !!lower);
if (IS_ERR(inode)) {
dput(upper);
return ERR_CAST(inode);
}
- if (index)
- ovl_set_flag(OVL_INDEX, inode);
-
dentry = d_find_any_alias(inode);
if (!dentry) {
dentry = d_alloc_anon(inode->i_sb);
@@ -685,7 +682,7 @@ static struct dentry *ovl_upper_fh_to_d(struct super_block *sb,
if (!ofs->upper_mnt)
return ERR_PTR(-EACCES);
- upper = ovl_decode_fh(fh, ofs->upper_mnt);
+ upper = ovl_decode_real_fh(fh, ofs->upper_mnt, true);
if (IS_ERR_OR_NULL(upper))
return upper;
@@ -703,25 +700,39 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb,
struct ovl_path *stack = &origin;
struct dentry *dentry = NULL;
struct dentry *index = NULL;
- struct inode *inode = NULL;
- bool is_deleted = false;
+ struct inode *inode;
int err;
- /* First lookup indexed upper by fh */
+ /* First lookup overlay inode in inode cache by origin fh */
+ err = ovl_check_origin_fh(ofs, fh, false, NULL, &stack);
+ if (err)
+ return ERR_PTR(err);
+
+ if (!d_is_dir(origin.dentry) ||
+ !(origin.dentry->d_flags & DCACHE_DISCONNECTED)) {
+ inode = ovl_lookup_inode(sb, origin.dentry, false);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_err;
+ if (inode) {
+ dentry = d_find_any_alias(inode);
+ iput(inode);
+ if (dentry)
+ goto out;
+ }
+ }
+
+ /* Then lookup indexed upper/whiteout by origin fh */
if (ofs->indexdir) {
index = ovl_get_index_fh(ofs, fh);
err = PTR_ERR(index);
if (IS_ERR(index)) {
- if (err != -ESTALE)
- return ERR_PTR(err);
-
- /* Found a whiteout index - treat as deleted inode */
- is_deleted = true;
index = NULL;
+ goto out_err;
}
}
- /* Then try to get upper dir by index */
+ /* Then try to get a connected upper dir by index */
if (index && d_is_dir(index)) {
struct dentry *upper = ovl_index_upper(ofs, index);
@@ -734,24 +745,19 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb,
goto out;
}
- /* Then lookup origin by fh */
- err = ovl_check_origin_fh(ofs, fh, NULL, &stack);
- if (err) {
- goto out_err;
- } else if (index) {
- err = ovl_verify_origin(index, origin.dentry, false);
+ /* Otherwise, get a connected non-upper dir or disconnected non-dir */
+ if (d_is_dir(origin.dentry) &&
+ (origin.dentry->d_flags & DCACHE_DISCONNECTED)) {
+ dput(origin.dentry);
+ origin.dentry = NULL;
+ err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack);
if (err)
goto out_err;
- } else if (is_deleted) {
- /* Lookup deleted non-dir by origin inode */
- if (!d_is_dir(origin.dentry))
- inode = ovl_lookup_inode(sb, origin.dentry, false);
- err = -ESTALE;
- if (!inode || atomic_read(&inode->i_count) == 1)
+ }
+ if (index) {
+ err = ovl_verify_origin(index, origin.dentry, false);
+ if (err)
goto out_err;
-
- /* Deleted but still open? */
- index = dget(ovl_i_dentry_upper(inode));
}
dentry = ovl_get_dentry(sb, NULL, &origin, index);
@@ -759,7 +765,6 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb,
out:
dput(origin.dentry);
dput(index);
- iput(inode);
return dentry;
out_err:
@@ -829,7 +834,7 @@ static struct dentry *ovl_get_parent(struct dentry *dentry)
}
const struct export_operations ovl_export_operations = {
- .encode_fh = ovl_encode_inode_fh,
+ .encode_fh = ovl_encode_fh,
.fh_to_dentry = ovl_fh_to_dentry,
.fh_to_parent = ovl_fh_to_parent,
.get_name = ovl_get_name,
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 3b1bd469accd..6e3815fb006b 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -16,13 +16,6 @@
#include "overlayfs.h"
-static dev_t ovl_get_pseudo_dev(struct dentry *dentry)
-{
- struct ovl_entry *oe = dentry->d_fsdata;
-
- return oe->lowerstack[0].layer->pseudo_dev;
-}
-
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
@@ -66,6 +59,69 @@ out:
return err;
}
+static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat,
+ struct ovl_layer *lower_layer)
+{
+ bool samefs = ovl_same_sb(dentry->d_sb);
+ unsigned int xinobits = ovl_xino_bits(dentry->d_sb);
+
+ if (samefs) {
+ /*
+ * When all layers are on the same fs, all real inode
+ * number are unique, so we use the overlay st_dev,
+ * which is friendly to du -x.
+ */
+ stat->dev = dentry->d_sb->s_dev;
+ return 0;
+ } else if (xinobits) {
+ unsigned int shift = 64 - xinobits;
+ /*
+ * All inode numbers of underlying fs should not be using the
+ * high xinobits, so we use high xinobits to partition the
+ * overlay st_ino address space. The high bits holds the fsid
+ * (upper fsid is 0). This way overlay inode numbers are unique
+ * and all inodes use overlay st_dev. Inode numbers are also
+ * persistent for a given layer configuration.
+ */
+ if (stat->ino >> shift) {
+ pr_warn_ratelimited("overlayfs: inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
+ dentry, stat->ino, xinobits);
+ } else {
+ if (lower_layer)
+ stat->ino |= ((u64)lower_layer->fsid) << shift;
+
+ stat->dev = dentry->d_sb->s_dev;
+ return 0;
+ }
+ }
+
+ /* The inode could not be mapped to a unified st_ino address space */
+ if (S_ISDIR(dentry->d_inode->i_mode)) {
+ /*
+ * Always use the overlay st_dev for directories, so 'find
+ * -xdev' will scan the entire overlay mount and won't cross the
+ * overlay mount boundaries.
+ *
+ * If not all layers are on the same fs the pair {real st_ino;
+ * overlay st_dev} is not unique, so use the non persistent
+ * overlay st_ino for directories.
+ */
+ stat->dev = dentry->d_sb->s_dev;
+ stat->ino = dentry->d_inode->i_ino;
+ } else if (lower_layer && lower_layer->fsid) {
+ /*
+ * For non-samefs setup, if we cannot map all layers st_ino
+ * to a unified address space, we need to make sure that st_dev
+ * is unique per lower fs. Upper layer uses real st_dev and
+ * lower layers use the unique anonymous bdev assigned to the
+ * lower fs.
+ */
+ stat->dev = lower_layer->fs->pseudo_dev;
+ }
+
+ return 0;
+}
+
int ovl_getattr(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags)
{
@@ -75,6 +131,7 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
const struct cred *old_cred;
bool is_dir = S_ISDIR(dentry->d_inode->i_mode);
bool samefs = ovl_same_sb(dentry->d_sb);
+ struct ovl_layer *lower_layer = NULL;
int err;
type = ovl_path_real(dentry, &realpath);
@@ -84,14 +141,18 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
goto out;
/*
- * For non-dir or same fs, we use st_ino of the copy up origin, if we
- * know it. This guaranties constant st_dev/st_ino across copy up.
+ * For non-dir or same fs, we use st_ino of the copy up origin.
+ * This guaranties constant st_dev/st_ino across copy up.
+ * With xino feature and non-samefs, we use st_ino of the copy up
+ * origin masked with high bits that represent the layer id.
*
- * If filesystem supports NFS export ops, this also guaranties
+ * If lower filesystem supports NFS file handles, this also guaranties
* persistent st_ino across mount cycle.
*/
- if (!is_dir || samefs) {
- if (OVL_TYPE_ORIGIN(type)) {
+ if (!is_dir || samefs || ovl_xino_bits(dentry->d_sb)) {
+ if (!OVL_TYPE_UPPER(type)) {
+ lower_layer = ovl_layer_lower(dentry);
+ } else if (OVL_TYPE_ORIGIN(type)) {
struct kstat lowerstat;
u32 lowermask = STATX_INO | (!is_dir ? STATX_NLINK : 0);
@@ -118,43 +179,17 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
*/
if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) ||
(!ovl_verify_lower(dentry->d_sb) &&
- (is_dir || lowerstat.nlink == 1)))
+ (is_dir || lowerstat.nlink == 1))) {
stat->ino = lowerstat.ino;
-
- if (samefs)
- WARN_ON_ONCE(stat->dev != lowerstat.dev);
- else
- stat->dev = ovl_get_pseudo_dev(dentry);
- }
- if (samefs) {
- /*
- * When all layers are on the same fs, all real inode
- * number are unique, so we use the overlay st_dev,
- * which is friendly to du -x.
- */
- stat->dev = dentry->d_sb->s_dev;
- } else if (!OVL_TYPE_UPPER(type)) {
- /*
- * For non-samefs setup, to make sure that st_dev/st_ino
- * pair is unique across the system, we use a unique
- * anonymous st_dev for lower layer inode.
- */
- stat->dev = ovl_get_pseudo_dev(dentry);
+ lower_layer = ovl_layer_lower(dentry);
+ }
}
- } else {
- /*
- * Always use the overlay st_dev for directories, so 'find
- * -xdev' will scan the entire overlay mount and won't cross the
- * overlay mount boundaries.
- *
- * If not all layers are on the same fs the pair {real st_ino;
- * overlay st_dev} is not unique, so use the non persistent
- * overlay st_ino for directories.
- */
- stat->dev = dentry->d_sb->s_dev;
- stat->ino = dentry->d_inode->i_ino;
}
+ err = ovl_map_dev_ino(dentry, stat, lower_layer);
+ if (err)
+ goto out;
+
/*
* It's probably not worth it to count subdirs to get the
* correct link count. nlink=1 seems to pacify 'find' and
@@ -383,24 +418,18 @@ int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags)
int ovl_update_time(struct inode *inode, struct timespec *ts, int flags)
{
- struct dentry *alias;
- struct path upperpath;
-
- if (!(flags & S_ATIME))
- return 0;
-
- alias = d_find_any_alias(inode);
- if (!alias)
- return 0;
-
- ovl_path_upper(alias, &upperpath);
- if (upperpath.dentry) {
- touch_atime(&upperpath);
- inode->i_atime = d_inode(upperpath.dentry)->i_atime;
+ if (flags & S_ATIME) {
+ struct ovl_fs *ofs = inode->i_sb->s_fs_info;
+ struct path upperpath = {
+ .mnt = ofs->upper_mnt,
+ .dentry = ovl_upperdentry_dereference(OVL_I(inode)),
+ };
+
+ if (upperpath.dentry) {
+ touch_atime(&upperpath);
+ inode->i_atime = d_inode(upperpath.dentry)->i_atime;
+ }
}
-
- dput(alias);
-
return 0;
}
@@ -459,9 +488,27 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
#endif
}
-static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
+static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev,
+ unsigned long ino, int fsid)
{
- inode->i_ino = get_next_ino();
+ int xinobits = ovl_xino_bits(inode->i_sb);
+
+ /*
+ * When NFS export is enabled and d_ino is consistent with st_ino
+ * (samefs or i_ino has enough bits to encode layer), set the same
+ * value used for d_ino to i_ino, because nfsd readdirplus compares
+ * d_ino values to i_ino values of child entries. When called from
+ * ovl_new_inode(), ino arg is 0, so i_ino will be updated to real
+ * upper inode i_ino on ovl_inode_init() or ovl_inode_update().
+ */
+ if (inode->i_sb->s_export_op &&
+ (ovl_same_sb(inode->i_sb) || xinobits)) {
+ inode->i_ino = ino;
+ if (xinobits && fsid && !(ino >> (64 - xinobits)))
+ inode->i_ino |= (unsigned long)fsid << (64 - xinobits);
+ } else {
+ inode->i_ino = get_next_ino();
+ }
inode->i_mode = mode;
inode->i_flags |= S_NOCMTIME;
#ifdef CONFIG_FS_POSIX_ACL
@@ -597,7 +644,7 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev)
inode = new_inode(sb);
if (inode)
- ovl_fill_inode(inode, mode, rdev);
+ ovl_fill_inode(inode, mode, rdev, 0, 0);
return inode;
}
@@ -703,13 +750,16 @@ static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
}
struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry,
- struct dentry *lowerdentry, struct dentry *index,
+ struct ovl_path *lowerpath, struct dentry *index,
unsigned int numlower)
{
struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
struct inode *inode;
+ struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL;
bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, index);
+ int fsid = bylower ? lowerpath->layer->fsid : 0;
bool is_dir;
+ unsigned long ino = 0;
if (!realinode)
realinode = d_inode(lowerdentry);
@@ -748,18 +798,22 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry,
if (!is_dir)
nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink);
set_nlink(inode, nlink);
+ ino = key->i_ino;
} else {
/* Lower hardlink that will be broken on copy up */
inode = new_inode(sb);
if (!inode)
goto out_nomem;
}
- ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev);
+ ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino, fsid);
ovl_inode_init(inode, upperdentry, lowerdentry);
if (upperdentry && ovl_is_impuredir(upperdentry))
ovl_set_flag(OVL_IMPURE, inode);
+ if (index)
+ ovl_set_flag(OVL_INDEX, inode);
+
/* Check for non-merge dir that may have whiteouts */
if (is_dir) {
if (((upperdentry && lowerdentry) || numlower > 1) ||
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 70fcfcc684cc..2dba29eadde6 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -56,6 +56,15 @@ static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d,
if (s == next)
goto invalid;
}
+ /*
+ * One of the ancestor path elements in an absolute path
+ * lookup in ovl_lookup_layer() could have been opaque and
+ * that will stop further lookup in lower layers (d->stop=true)
+ * But we have found an absolute redirect in decendant path
+ * element and that should force continue lookup in lower
+ * layers (reset d->stop).
+ */
+ d->stop = false;
} else {
if (strchr(buf, '/') != NULL)
goto invalid;
@@ -171,7 +180,8 @@ invalid:
goto out;
}
-struct dentry *ovl_decode_fh(struct ovl_fh *fh, struct vfsmount *mnt)
+struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt,
+ bool connected)
{
struct dentry *real;
int bytes;
@@ -186,7 +196,7 @@ struct dentry *ovl_decode_fh(struct ovl_fh *fh, struct vfsmount *mnt)
bytes = (fh->len - offsetof(struct ovl_fh, fid));
real = exportfs_decode_fh(mnt, (struct fid *)fh->fid,
bytes >> 2, (int)fh->type,
- ovl_acceptable, mnt);
+ connected ? ovl_acceptable : NULL, mnt);
if (IS_ERR(real)) {
/*
* Treat stale file handle to lower file as "origin unknown".
@@ -220,6 +230,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
{
struct dentry *this;
int err;
+ bool last_element = !post[0];
this = lookup_one_len_unlocked(name, base, namelen);
if (IS_ERR(this)) {
@@ -245,11 +256,23 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
d->stop = true;
if (d->is_dir)
goto put_and_out;
+
+ /*
+ * NB: handle failure to lookup non-last element when non-dir
+ * redirects become possible
+ */
+ WARN_ON(!last_element);
goto out;
}
- d->is_dir = true;
- if (!d->last && ovl_is_opaquedir(this)) {
- d->stop = d->opaque = true;
+ if (last_element)
+ d->is_dir = true;
+ if (d->last)
+ goto out;
+
+ if (ovl_is_opaquedir(this)) {
+ d->stop = true;
+ if (last_element)
+ d->opaque = true;
goto out;
}
err = ovl_check_redirect(this, d, prelen, post);
@@ -310,14 +333,15 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d,
}
-int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
+int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
struct dentry *upperdentry, struct ovl_path **stackp)
{
struct dentry *origin = NULL;
int i;
for (i = 0; i < ofs->numlower; i++) {
- origin = ovl_decode_fh(fh, ofs->lower_layers[i].mnt);
+ origin = ovl_decode_real_fh(fh, ofs->lower_layers[i].mnt,
+ connected);
if (origin)
break;
}
@@ -361,7 +385,7 @@ static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry,
if (IS_ERR_OR_NULL(fh))
return PTR_ERR(fh);
- err = ovl_check_origin_fh(ofs, fh, upperdentry, stackp);
+ err = ovl_check_origin_fh(ofs, fh, false, upperdentry, stackp);
kfree(fh);
if (err) {
@@ -415,7 +439,7 @@ int ovl_verify_set_fh(struct dentry *dentry, const char *name,
struct ovl_fh *fh;
int err;
- fh = ovl_encode_fh(real, is_upper);
+ fh = ovl_encode_real_fh(real, is_upper);
err = PTR_ERR(fh);
if (IS_ERR(fh))
goto fail;
@@ -451,7 +475,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index)
if (IS_ERR_OR_NULL(fh))
return ERR_CAST(fh);
- upper = ovl_decode_fh(fh, ofs->upper_mnt);
+ upper = ovl_decode_real_fh(fh, ofs->upper_mnt, true);
kfree(fh);
if (IS_ERR_OR_NULL(upper))
@@ -558,7 +582,7 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index)
/* Check if non-dir index is orphan and don't warn before cleaning it */
if (!d_is_dir(index) && d_inode(index)->i_nlink == 1) {
- err = ovl_check_origin_fh(ofs, fh, index, &stack);
+ err = ovl_check_origin_fh(ofs, fh, false, index, &stack);
if (err)
goto fail;
@@ -619,7 +643,7 @@ int ovl_get_index_name(struct dentry *origin, struct qstr *name)
struct ovl_fh *fh;
int err;
- fh = ovl_encode_fh(origin, false);
+ fh = ovl_encode_real_fh(origin, false);
if (IS_ERR(fh))
return PTR_ERR(fh);
@@ -815,7 +839,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
.is_dir = false,
.opaque = false,
.stop = false,
- .last = !poe->numlower,
+ .last = ofs->config.redirect_follow ? false : !poe->numlower,
.redirect = NULL,
};
@@ -873,7 +897,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
for (i = 0; !d.stop && i < poe->numlower; i++) {
struct ovl_path lower = poe->lowerstack[i];
- d.last = i == poe->numlower - 1;
+ if (!ofs->config.redirect_follow)
+ d.last = i == poe->numlower - 1;
+ else
+ d.last = lower.layer->idx == roe->numlower;
+
err = ovl_lookup_layer(lower.dentry, &d, &this);
if (err)
goto out_put;
@@ -976,17 +1004,18 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
upperdentry = dget(index);
if (upperdentry || ctr) {
- if (ctr)
- origin = stack[0].dentry;
- inode = ovl_get_inode(dentry->d_sb, upperdentry, origin, index,
+ inode = ovl_get_inode(dentry->d_sb, upperdentry, stack, index,
ctr);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_free_oe;
+ /*
+ * NB: handle redirected hard links when non-dir redirects
+ * become possible
+ */
+ WARN_ON(OVL_I(inode)->redirect);
OVL_I(inode)->redirect = upperredirect;
- if (index)
- ovl_set_flag(OVL_INDEX, inode);
}
revert_creds(old_cred);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 225ff1171147..e0b7de799f6b 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -202,7 +202,7 @@ void ovl_drop_write(struct dentry *dentry);
struct dentry *ovl_workdir(struct dentry *dentry);
const struct cred *ovl_override_creds(struct super_block *sb);
struct super_block *ovl_same_sb(struct super_block *sb);
-bool ovl_can_decode_fh(struct super_block *sb);
+int ovl_can_decode_fh(struct super_block *sb);
struct dentry *ovl_indexdir(struct super_block *sb);
bool ovl_index_all(struct super_block *sb);
bool ovl_verify_lower(struct super_block *sb);
@@ -215,6 +215,7 @@ void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
+struct ovl_layer *ovl_layer_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_i_dentry_upper(struct inode *inode);
struct inode *ovl_inode_upper(struct inode *inode);
@@ -263,11 +264,19 @@ static inline bool ovl_is_impuredir(struct dentry *dentry)
return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE);
}
+static inline unsigned int ovl_xino_bits(struct super_block *sb)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ return ofs->xino_bits;
+}
+
/* namei.c */
int ovl_check_fh_len(struct ovl_fh *fh, int fh_len);
-struct dentry *ovl_decode_fh(struct ovl_fh *fh, struct vfsmount *mnt);
-int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
+struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt,
+ bool connected);
+int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
struct dentry *upperdentry, struct ovl_path **stackp);
int ovl_verify_set_fh(struct dentry *dentry, const char *name,
struct dentry *real, bool is_upper, bool set);
@@ -329,7 +338,7 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev);
struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
bool is_upper);
struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry,
- struct dentry *lowerdentry, struct dentry *index,
+ struct ovl_path *lowerpath, struct dentry *index,
unsigned int numlower);
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
@@ -361,7 +370,7 @@ int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_flags(struct dentry *dentry, int flags);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);
-struct ovl_fh *ovl_encode_fh(struct dentry *real, bool is_upper);
+struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper);
int ovl_set_origin(struct dentry *dentry, struct dentry *lower,
struct dentry *upper);
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index bfef6edcc111..41655a7d6894 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -18,13 +18,21 @@ struct ovl_config {
const char *redirect_mode;
bool index;
bool nfs_export;
+ int xino;
+};
+
+struct ovl_sb {
+ struct super_block *sb;
+ dev_t pseudo_dev;
};
struct ovl_layer {
struct vfsmount *mnt;
- dev_t pseudo_dev;
- /* Index of this layer in fs root (upper == 0) */
+ struct ovl_sb *fs;
+ /* Index of this layer in fs root (upper idx == 0) */
int idx;
+ /* One fsid per unique underlying sb (upper fsid == 0) */
+ int fsid;
};
struct ovl_path {
@@ -35,8 +43,11 @@ struct ovl_path {
/* private information held for overlayfs's superblock */
struct ovl_fs {
struct vfsmount *upper_mnt;
- unsigned numlower;
+ unsigned int numlower;
+ /* Number of unique lower sb that differ from upper sb */
+ unsigned int numlowerfs;
struct ovl_layer *lower_layers;
+ struct ovl_sb *lower_fs;
/* workbasedir is the path at workdir= mount option */
struct dentry *workbasedir;
/* workdir is the 'work' directory under workbasedir */
@@ -50,11 +61,11 @@ struct ovl_fs {
const struct cred *creator_cred;
bool tmpfile;
bool noxattr;
- /* sb common to all layers */
- struct super_block *same_sb;
/* Did we take the inuse lock? */
bool upperdir_locked;
bool workdir_locked;
+ /* Inode numbers in all layers do not use the high xino_bits */
+ unsigned int xino_bits;
};
/* private information held for every overlayfs dentry */
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index c11f5c0906c3..ef1fe42ff7bb 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -120,6 +120,10 @@ static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd,
if (!rdd->dentry)
return false;
+ /* Always recalc d_ino when remapping lower inode numbers */
+ if (ovl_xino_bits(rdd->dentry->d_sb))
+ return true;
+
/* Always recalc d_ino for parent */
if (strcmp(p->name, "..") == 0)
return true;
@@ -435,6 +439,19 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
return cache;
}
+/* Map inode number to lower fs unique range */
+static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid,
+ const char *name, int namelen)
+{
+ if (ino >> (64 - xinobits)) {
+ pr_warn_ratelimited("overlayfs: d_ino too big (%.*s, ino=%llu, xinobits=%d)\n",
+ namelen, name, ino, xinobits);
+ return ino;
+ }
+
+ return ino | ((u64)fsid) << (64 - xinobits);
+}
+
/*
* Set d_ino for upper entries. Non-upper entries should always report
* the uppermost real inode ino and should not call this function.
@@ -452,9 +469,10 @@ static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p)
struct dentry *this = NULL;
enum ovl_path_type type;
u64 ino = p->real_ino;
+ int xinobits = ovl_xino_bits(dir->d_sb);
int err = 0;
- if (!ovl_same_sb(dir->d_sb))
+ if (!ovl_same_sb(dir->d_sb) && !xinobits)
goto out;
if (p->name[0] == '.') {
@@ -491,6 +509,10 @@ get:
WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev);
ino = stat.ino;
+ } else if (xinobits && !OVL_TYPE_UPPER(type)) {
+ ino = ovl_remap_lower_ino(ino, xinobits,
+ ovl_layer_lower(this)->fsid,
+ p->name, p->len);
}
out:
@@ -618,6 +640,8 @@ struct ovl_readdir_translate {
struct ovl_dir_cache *cache;
struct dir_context ctx;
u64 parent_ino;
+ int fsid;
+ int xinobits;
};
static int ovl_fill_real(struct dir_context *ctx, const char *name,
@@ -628,14 +652,17 @@ static int ovl_fill_real(struct dir_context *ctx, const char *name,
container_of(ctx, struct ovl_readdir_translate, ctx);
struct dir_context *orig_ctx = rdt->orig_ctx;
- if (rdt->parent_ino && strcmp(name, "..") == 0)
+ if (rdt->parent_ino && strcmp(name, "..") == 0) {
ino = rdt->parent_ino;
- else if (rdt->cache) {
+ } else if (rdt->cache) {
struct ovl_cache_entry *p;
p = ovl_cache_entry_find(&rdt->cache->root, name, namelen);
if (p)
ino = p->ino;
+ } else if (rdt->xinobits) {
+ ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid,
+ name, namelen);
}
return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
@@ -646,11 +673,16 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx)
int err;
struct ovl_dir_file *od = file->private_data;
struct dentry *dir = file->f_path.dentry;
+ struct ovl_layer *lower_layer = ovl_layer_lower(dir);
struct ovl_readdir_translate rdt = {
.ctx.actor = ovl_fill_real,
.orig_ctx = ctx,
+ .xinobits = ovl_xino_bits(dir->d_sb),
};
+ if (rdt.xinobits && lower_layer)
+ rdt.fsid = lower_layer->fsid;
+
if (OVL_TYPE_MERGE(ovl_path_type(dir->d_parent))) {
struct kstat stat;
struct path statpath = file->f_path;
@@ -693,9 +725,10 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
* dir is impure then need to adjust d_ino for copied up
* entries.
*/
- if (ovl_same_sb(dentry->d_sb) &&
- (ovl_test_flag(OVL_IMPURE, d_inode(dentry)) ||
- OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent)))) {
+ if (ovl_xino_bits(dentry->d_sb) ||
+ (ovl_same_sb(dentry->d_sb) &&
+ (ovl_test_flag(OVL_IMPURE, d_inode(dentry)) ||
+ OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) {
return ovl_iterate_real(file, ctx);
}
return iterate_dir(od->realfile, ctx);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 7c24619ae7fc..e8551c97de51 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -17,6 +17,7 @@
#include <linux/statfs.h>
#include <linux/seq_file.h>
#include <linux/posix_acl_xattr.h>
+#include <linux/exportfs.h>
#include "overlayfs.h"
MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
@@ -50,6 +51,11 @@ module_param_named(nfs_export, ovl_nfs_export_def, bool, 0644);
MODULE_PARM_DESC(ovl_nfs_export_def,
"Default to on or off for the NFS export feature");
+static bool ovl_xino_auto_def = IS_ENABLED(CONFIG_OVERLAY_FS_XINO_AUTO);
+module_param_named(xino_auto, ovl_xino_auto_def, bool, 0644);
+MODULE_PARM_DESC(ovl_xino_auto_def,
+ "Auto enable xino feature");
+
static void ovl_entry_stack_free(struct ovl_entry *oe)
{
unsigned int i;
@@ -236,11 +242,12 @@ static void ovl_free_fs(struct ovl_fs *ofs)
if (ofs->upperdir_locked)
ovl_inuse_unlock(ofs->upper_mnt->mnt_root);
mntput(ofs->upper_mnt);
- for (i = 0; i < ofs->numlower; i++) {
+ for (i = 0; i < ofs->numlower; i++)
mntput(ofs->lower_layers[i].mnt);
- free_anon_bdev(ofs->lower_layers[i].pseudo_dev);
- }
+ for (i = 0; i < ofs->numlowerfs; i++)
+ free_anon_bdev(ofs->lower_fs[i].pseudo_dev);
kfree(ofs->lower_layers);
+ kfree(ofs->lower_fs);
kfree(ofs->config.lowerdir);
kfree(ofs->config.upperdir);
@@ -325,6 +332,23 @@ static const char *ovl_redirect_mode_def(void)
return ovl_redirect_dir_def ? "on" : "off";
}
+enum {
+ OVL_XINO_OFF,
+ OVL_XINO_AUTO,
+ OVL_XINO_ON,
+};
+
+static const char * const ovl_xino_str[] = {
+ "off",
+ "auto",
+ "on",
+};
+
+static inline int ovl_xino_def(void)
+{
+ return ovl_xino_auto_def ? OVL_XINO_AUTO : OVL_XINO_OFF;
+}
+
/**
* ovl_show_options
*
@@ -350,6 +374,8 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
if (ofs->config.nfs_export != ovl_nfs_export_def)
seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ?
"on" : "off");
+ if (ofs->config.xino != ovl_xino_def())
+ seq_printf(m, ",xino=%s", ovl_xino_str[ofs->config.xino]);
return 0;
}
@@ -384,6 +410,9 @@ enum {
OPT_INDEX_OFF,
OPT_NFS_EXPORT_ON,
OPT_NFS_EXPORT_OFF,
+ OPT_XINO_ON,
+ OPT_XINO_OFF,
+ OPT_XINO_AUTO,
OPT_ERR,
};
@@ -397,6 +426,9 @@ static const match_table_t ovl_tokens = {
{OPT_INDEX_OFF, "index=off"},
{OPT_NFS_EXPORT_ON, "nfs_export=on"},
{OPT_NFS_EXPORT_OFF, "nfs_export=off"},
+ {OPT_XINO_ON, "xino=on"},
+ {OPT_XINO_OFF, "xino=off"},
+ {OPT_XINO_AUTO, "xino=auto"},
{OPT_ERR, NULL}
};
@@ -511,6 +543,18 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
config->nfs_export = false;
break;
+ case OPT_XINO_ON:
+ config->xino = OVL_XINO_ON;
+ break;
+
+ case OPT_XINO_OFF:
+ config->xino = OVL_XINO_OFF;
+ break;
+
+ case OPT_XINO_AUTO:
+ config->xino = OVL_XINO_AUTO;
+ break;
+
default:
pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p);
return -EINVAL;
@@ -700,6 +744,7 @@ static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs,
static int ovl_lower_dir(const char *name, struct path *path,
struct ovl_fs *ofs, int *stack_depth, bool *remote)
{
+ int fh_type;
int err;
err = ovl_mount_dir_noesc(name, path);
@@ -719,15 +764,19 @@ static int ovl_lower_dir(const char *name, struct path *path,
* The inodes index feature and NFS export need to encode and decode
* file handles, so they require that all layers support them.
*/
+ fh_type = ovl_can_decode_fh(path->dentry->d_sb);
if ((ofs->config.nfs_export ||
- (ofs->config.index && ofs->config.upperdir)) &&
- !ovl_can_decode_fh(path->dentry->d_sb)) {
+ (ofs->config.index && ofs->config.upperdir)) && !fh_type) {
ofs->config.index = false;
ofs->config.nfs_export = false;
pr_warn("overlayfs: fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n",
name);
}
+ /* Check if lower fs has 32bit inode numbers */
+ if (fh_type != FILEID_INO32_GEN)
+ ofs->xino_bits = 0;
+
return 0;
out_put:
@@ -951,6 +1000,7 @@ static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath)
{
struct vfsmount *mnt = ofs->upper_mnt;
struct dentry *temp;
+ int fh_type;
int err;
err = mnt_want_write(mnt);
@@ -1000,12 +1050,16 @@ static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath)
}
/* Check if upper/work fs supports file handles */
- if (ofs->config.index &&
- !ovl_can_decode_fh(ofs->workdir->d_sb)) {
+ fh_type = ovl_can_decode_fh(ofs->workdir->d_sb);
+ if (ofs->config.index && !fh_type) {
ofs->config.index = false;
pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n");
}
+ /* Check if upper fs has 32bit inode numbers */
+ if (fh_type != FILEID_INO32_GEN)
+ ofs->xino_bits = 0;
+
/* NFS export of r/w mount depends on index */
if (ofs->config.nfs_export && !ofs->config.index) {
pr_warn("overlayfs: NFS export requires \"index=on\", falling back to nfs_export=off.\n");
@@ -1108,6 +1162,35 @@ out:
return err;
}
+/* Get a unique fsid for the layer */
+static int ovl_get_fsid(struct ovl_fs *ofs, struct super_block *sb)
+{
+ unsigned int i;
+ dev_t dev;
+ int err;
+
+ /* fsid 0 is reserved for upper fs even with non upper overlay */
+ if (ofs->upper_mnt && ofs->upper_mnt->mnt_sb == sb)
+ return 0;
+
+ for (i = 0; i < ofs->numlowerfs; i++) {
+ if (ofs->lower_fs[i].sb == sb)
+ return i + 1;
+ }
+
+ err = get_anon_bdev(&dev);
+ if (err) {
+ pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n");
+ return err;
+ }
+
+ ofs->lower_fs[ofs->numlowerfs].sb = sb;
+ ofs->lower_fs[ofs->numlowerfs].pseudo_dev = dev;
+ ofs->numlowerfs++;
+
+ return ofs->numlowerfs;
+}
+
static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack,
unsigned int numlower)
{
@@ -1119,23 +1202,27 @@ static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack,
GFP_KERNEL);
if (ofs->lower_layers == NULL)
goto out;
+
+ ofs->lower_fs = kcalloc(numlower, sizeof(struct ovl_sb),
+ GFP_KERNEL);
+ if (ofs->lower_fs == NULL)
+ goto out;
+
for (i = 0; i < numlower; i++) {
struct vfsmount *mnt;
- dev_t dev;
+ int fsid;
- err = get_anon_bdev(&dev);
- if (err) {
- pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n");
+ err = fsid = ovl_get_fsid(ofs, stack[i].mnt->mnt_sb);
+ if (err < 0)
goto out;
- }
mnt = clone_private_mount(&stack[i]);
err = PTR_ERR(mnt);
if (IS_ERR(mnt)) {
pr_err("overlayfs: failed to clone lowerpath\n");
- free_anon_bdev(dev);
goto out;
}
+
/*
* Make lower layers R/O. That way fchmod/fchown on lower file
* will fail instead of modifying lower fs.
@@ -1143,16 +1230,41 @@ static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack,
mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME;
ofs->lower_layers[ofs->numlower].mnt = mnt;
- ofs->lower_layers[ofs->numlower].pseudo_dev = dev;
ofs->lower_layers[ofs->numlower].idx = i + 1;
+ ofs->lower_layers[ofs->numlower].fsid = fsid;
+ if (fsid) {
+ ofs->lower_layers[ofs->numlower].fs =
+ &ofs->lower_fs[fsid - 1];
+ }
ofs->numlower++;
+ }
+
+ /*
+ * When all layers on same fs, overlay can use real inode numbers.
+ * With mount option "xino=on", mounter declares that there are enough
+ * free high bits in underlying fs to hold the unique fsid.
+ * If overlayfs does encounter underlying inodes using the high xino
+ * bits reserved for fsid, it emits a warning and uses the original
+ * inode number.
+ */
+ if (!ofs->numlowerfs || (ofs->numlowerfs == 1 && !ofs->upper_mnt)) {
+ ofs->xino_bits = 0;
+ ofs->config.xino = OVL_XINO_OFF;
+ } else if (ofs->config.xino == OVL_XINO_ON && !ofs->xino_bits) {
+ /*
+ * This is a roundup of number of bits needed for numlowerfs+1
+ * (i.e. ilog2(numlowerfs+1 - 1) + 1). fsid 0 is reserved for
+ * upper fs even with non upper overlay.
+ */
+ BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 31);
+ ofs->xino_bits = ilog2(ofs->numlowerfs) + 1;
+ }
- /* Check if all lower layers are on same sb */
- if (i == 0)
- ofs->same_sb = mnt->mnt_sb;
- else if (ofs->same_sb != mnt->mnt_sb)
- ofs->same_sb = NULL;
+ if (ofs->xino_bits) {
+ pr_info("overlayfs: \"xino\" feature enabled using %d upper inode bits.\n",
+ ofs->xino_bits);
}
+
err = 0;
out:
return err;
@@ -1263,6 +1375,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
ofs->config.index = ovl_index_def;
ofs->config.nfs_export = ovl_nfs_export_def;
+ ofs->config.xino = ovl_xino_def();
err = ovl_parse_opt((char *) data, &ofs->config);
if (err)
goto out_err;
@@ -1276,6 +1389,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_stack_depth = 0;
sb->s_maxbytes = MAX_LFS_FILESIZE;
+ /* Assume underlaying fs uses 32bit inodes unless proven otherwise */
+ if (ofs->config.xino != OVL_XINO_OFF)
+ ofs->xino_bits = BITS_PER_LONG - 32;
+
if (ofs->config.upperdir) {
if (!ofs->config.workdir) {
pr_err("overlayfs: missing 'workdir'\n");
@@ -1305,8 +1422,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
/* If the upper fs is nonexistent, we mark overlayfs r/o too */
if (!ofs->upper_mnt)
sb->s_flags |= SB_RDONLY;
- else if (ofs->upper_mnt->mnt_sb != ofs->same_sb)
- ofs->same_sb = NULL;
if (!(ovl_force_readonly(ofs)) && ofs->config.index) {
err = ovl_get_indexdir(ofs, oe, &upperpath);
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 930784a26623..6f1078028c66 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -47,13 +47,29 @@ struct super_block *ovl_same_sb(struct super_block *sb)
{
struct ovl_fs *ofs = sb->s_fs_info;
- return ofs->same_sb;
+ if (!ofs->numlowerfs)
+ return ofs->upper_mnt->mnt_sb;
+ else if (ofs->numlowerfs == 1 && !ofs->upper_mnt)
+ return ofs->lower_fs[0].sb;
+ else
+ return NULL;
}
-bool ovl_can_decode_fh(struct super_block *sb)
+/*
+ * Check if underlying fs supports file handles and try to determine encoding
+ * type, in order to deduce maximum inode number used by fs.
+ *
+ * Return 0 if file handles are not supported.
+ * Return 1 (FILEID_INO32_GEN) if fs uses the default 32bit inode encoding.
+ * Return -1 if fs uses a non default encoding with unknown inode size.
+ */
+int ovl_can_decode_fh(struct super_block *sb)
{
- return (sb->s_export_op && sb->s_export_op->fh_to_dentry &&
- !uuid_is_null(&sb->s_uuid));
+ if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry ||
+ uuid_is_null(&sb->s_uuid))
+ return 0;
+
+ return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN;
}
struct dentry *ovl_indexdir(struct super_block *sb)
@@ -172,6 +188,13 @@ struct dentry *ovl_dentry_lower(struct dentry *dentry)
return oe->numlower ? oe->lowerstack[0].dentry : NULL;
}
+struct ovl_layer *ovl_layer_lower(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ return oe->numlower ? oe->lowerstack[0].layer : NULL;
+}
+
struct dentry *ovl_dentry_real(struct dentry *dentry)
{
return ovl_dentry_upper(dentry) ?: ovl_dentry_lower(dentry);
@@ -279,12 +302,16 @@ void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect)
void ovl_inode_init(struct inode *inode, struct dentry *upperdentry,
struct dentry *lowerdentry)
{
+ struct inode *realinode = d_inode(upperdentry ?: lowerdentry);
+
if (upperdentry)
OVL_I(inode)->__upperdentry = upperdentry;
if (lowerdentry)
OVL_I(inode)->lower = igrab(d_inode(lowerdentry));
- ovl_copyattr(d_inode(upperdentry ?: lowerdentry), inode);
+ ovl_copyattr(realinode, inode);
+ if (!inode->i_ino)
+ inode->i_ino = realinode->i_ino;
}
void ovl_inode_update(struct inode *inode, struct dentry *upperdentry)
@@ -299,6 +326,8 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry)
smp_wmb();
OVL_I(inode)->__upperdentry = upperdentry;
if (inode_unhashed(inode)) {
+ if (!inode->i_ino)
+ inode->i_ino = upperinode->i_ino;
inode->i_private = upperinode;
__insert_inode_hash(inode, (unsigned long) upperinode);
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 39d6f431da83..bb0840e234f3 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -509,19 +509,22 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
}
-/* No kernel lock held - fine */
-static __poll_t
-pipe_poll(struct file *filp, poll_table *wait)
+static struct wait_queue_head *
+pipe_get_poll_head(struct file *filp, __poll_t events)
{
- __poll_t mask;
struct pipe_inode_info *pipe = filp->private_data;
- int nrbufs;
- poll_wait(filp, &pipe->wait, wait);
+ return &pipe->wait;
+}
+
+/* No kernel lock held - fine */
+static __poll_t pipe_poll_mask(struct file *filp, __poll_t events)
+{
+ struct pipe_inode_info *pipe = filp->private_data;
+ int nrbufs = pipe->nrbufs;
+ __poll_t mask = 0;
/* Reading only -- no need for acquiring the semaphore. */
- nrbufs = pipe->nrbufs;
- mask = 0;
if (filp->f_mode & FMODE_READ) {
mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0;
if (!pipe->writers && filp->f_version != pipe->w_counter)
@@ -1020,7 +1023,8 @@ const struct file_operations pipefifo_fops = {
.llseek = no_llseek,
.read_iter = pipe_read,
.write_iter = pipe_write,
- .poll = pipe_poll,
+ .get_poll_head = pipe_get_poll_head,
+ .poll_mask = pipe_poll_mask,
.unlocked_ioctl = pipe_ioctl,
.release = pipe_release,
.fasync = pipe_fasync,
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 598803576e4c..004077f1a7bf 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -85,6 +85,7 @@
#include <linux/delayacct.h>
#include <linux/seq_file.h>
#include <linux/pid_namespace.h>
+#include <linux/prctl.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
#include <linux/string_helpers.h>
@@ -95,22 +96,29 @@
#include <asm/processor.h>
#include "internal.h"
-static inline void task_name(struct seq_file *m, struct task_struct *p)
+void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape)
{
char *buf;
size_t size;
- char tcomm[sizeof(p->comm)];
+ char tcomm[64];
int ret;
- get_task_comm(tcomm, p);
-
- seq_puts(m, "Name:\t");
+ if (p->flags & PF_WQ_WORKER)
+ wq_worker_comm(tcomm, sizeof(tcomm), p);
+ else
+ __get_task_comm(tcomm, sizeof(tcomm), p);
size = seq_get_buf(m, &buf);
- ret = string_escape_str(tcomm, buf, size, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
- seq_commit(m, ret < size ? ret : -1);
+ if (escape) {
+ ret = string_escape_str(tcomm, buf, size,
+ ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+ if (ret >= size)
+ ret = -1;
+ } else {
+ ret = strscpy(buf, tcomm, size);
+ }
- seq_putc(m, '\n');
+ seq_commit(m, ret);
}
/*
@@ -141,25 +149,12 @@ static inline const char *get_task_state(struct task_struct *tsk)
return task_state_array[task_state_index(tsk)];
}
-static inline int get_task_umask(struct task_struct *tsk)
-{
- struct fs_struct *fs;
- int umask = -ENOENT;
-
- task_lock(tsk);
- fs = tsk->fs;
- if (fs)
- umask = fs->umask;
- task_unlock(tsk);
- return umask;
-}
-
static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *p)
{
struct user_namespace *user_ns = seq_user_ns(m);
struct group_info *group_info;
- int g, umask;
+ int g, umask = -1;
struct task_struct *tracer;
const struct cred *cred;
pid_t ppid, tpid = 0, tgid, ngid;
@@ -177,17 +172,18 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
ngid = task_numa_group_id(p);
cred = get_task_cred(p);
- umask = get_task_umask(p);
- if (umask >= 0)
- seq_printf(m, "Umask:\t%#04o\n", umask);
-
task_lock(p);
+ if (p->fs)
+ umask = p->fs->umask;
if (p->files)
max_fds = files_fdtable(p->files)->max_fds;
task_unlock(p);
rcu_read_unlock();
- seq_printf(m, "State:\t%s", get_task_state(p));
+ if (umask >= 0)
+ seq_printf(m, "Umask:\t%#04o\n", umask);
+ seq_puts(m, "State:\t");
+ seq_puts(m, get_task_state(p));
seq_put_decimal_ull(m, "\nTgid:\t", tgid);
seq_put_decimal_ull(m, "\nNgid:\t", ngid);
@@ -313,8 +309,8 @@ static void render_cap_t(struct seq_file *m, const char *header,
seq_puts(m, header);
CAP_FOR_EACH_U32(__capi) {
- seq_printf(m, "%08x",
- a->cap[CAP_LAST_U32 - __capi]);
+ seq_put_hex_ll(m, NULL,
+ a->cap[CAP_LAST_U32 - __capi], 8);
}
seq_putc(m, '\n');
}
@@ -347,6 +343,30 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
#ifdef CONFIG_SECCOMP
seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
#endif
+ seq_printf(m, "\nSpeculation_Store_Bypass:\t");
+ switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
+ case -EINVAL:
+ seq_printf(m, "unknown");
+ break;
+ case PR_SPEC_NOT_AFFECTED:
+ seq_printf(m, "not vulnerable");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
+ seq_printf(m, "thread force mitigated");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
+ seq_printf(m, "thread mitigated");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
+ seq_printf(m, "thread vulnerable");
+ break;
+ case PR_SPEC_DISABLE:
+ seq_printf(m, "globally mitigated");
+ break;
+ default:
+ seq_printf(m, "vulnerable");
+ break;
+ }
seq_putc(m, '\n');
}
@@ -368,7 +388,8 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
{
- seq_printf(m, "CoreDumping:\t%d\n", !!mm->core_state);
+ seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state);
+ seq_putc(m, '\n');
}
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
@@ -376,7 +397,10 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
{
struct mm_struct *mm = get_task_mm(task);
- task_name(m, task);
+ seq_puts(m, "Name:\t");
+ proc_task_name(m, task, true);
+ seq_putc(m, '\n');
+
task_state(m, ns, pid, task);
if (mm) {
@@ -411,7 +435,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
u64 cutime, cstime, utime, stime;
u64 cgtime, gtime;
unsigned long rsslim = 0;
- char tcomm[sizeof(task->comm)];
unsigned long flags;
state = *get_task_state(task);
@@ -438,8 +461,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
}
}
- get_task_comm(tcomm, task);
-
sigemptyset(&sigign);
sigemptyset(&sigcatch);
cutime = cstime = utime = stime = 0;
@@ -504,7 +525,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
/* convert nsec -> ticks */
start_time = nsec_to_clock_t(task->real_start_time);
- seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
+ seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns));
+ seq_puts(m, " (");
+ proc_task_name(m, task, false);
+ seq_puts(m, ") ");
+ seq_putc(m, state);
seq_put_decimal_ll(m, " ", ppid);
seq_put_decimal_ll(m, " ", pgid);
seq_put_decimal_ll(m, " ", sid);
@@ -684,25 +709,22 @@ out:
static int children_seq_show(struct seq_file *seq, void *v)
{
- struct inode *inode = seq->private;
- pid_t pid;
-
- pid = pid_nr_ns(v, inode->i_sb->s_fs_info);
- seq_printf(seq, "%d ", pid);
+ struct inode *inode = file_inode(seq->file);
+ seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode)));
return 0;
}
static void *children_seq_start(struct seq_file *seq, loff_t *pos)
{
- return get_children_pid(seq->private, NULL, *pos);
+ return get_children_pid(file_inode(seq->file), NULL, *pos);
}
static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct pid *pid;
- pid = get_children_pid(seq->private, v, *pos + 1);
+ pid = get_children_pid(file_inode(seq->file), v, *pos + 1);
put_pid(v);
++*pos;
@@ -723,17 +745,7 @@ static const struct seq_operations children_seq_ops = {
static int children_seq_open(struct inode *inode, struct file *file)
{
- struct seq_file *m;
- int ret;
-
- ret = seq_open(file, &children_seq_ops);
- if (ret)
- return ret;
-
- m = file->private_data;
- m->private = inode;
-
- return ret;
+ return seq_open(file, &children_seq_ops);
}
const struct file_operations proc_tid_children_operations = {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index d53246863cfb..af128b374143 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -261,7 +261,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
* Inherently racy -- command line shares address space
* with code and data.
*/
- rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0);
+ rv = access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_ANON);
if (rv <= 0)
goto out_free_page;
@@ -279,7 +279,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
int nr_read;
_count = min3(count, len, PAGE_SIZE);
- nr_read = access_remote_vm(mm, p, page, _count, 0);
+ nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON);
if (nr_read < 0)
rv = nr_read;
if (nr_read <= 0)
@@ -325,7 +325,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
bool final;
_count = min3(count, len, PAGE_SIZE);
- nr_read = access_remote_vm(mm, p, page, _count, 0);
+ nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON);
if (nr_read < 0)
rv = nr_read;
if (nr_read <= 0)
@@ -388,14 +388,17 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
unsigned long wchan;
char symname[KSYM_NAME_LEN];
- wchan = get_wchan(task);
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+ goto print0;
- if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)
- && !lookup_symbol_name(wchan, symname))
- seq_printf(m, "%s", symname);
- else
- seq_putc(m, '0');
+ wchan = get_wchan(task);
+ if (wchan && !lookup_symbol_name(wchan, symname)) {
+ seq_puts(m, symname);
+ return 0;
+ }
+print0:
+ seq_putc(m, '0');
return 0;
}
#endif /* CONFIG_KALLSYMS */
@@ -695,7 +698,7 @@ static bool has_pid_permissions(struct pid_namespace *pid,
static int proc_pid_permission(struct inode *inode, int mask)
{
- struct pid_namespace *pid = inode->i_sb->s_fs_info;
+ struct pid_namespace *pid = proc_pid_ns(inode);
struct task_struct *task;
bool has_perms;
@@ -730,13 +733,11 @@ static const struct inode_operations proc_def_inode_operations = {
static int proc_single_show(struct seq_file *m, void *v)
{
struct inode *inode = m->private;
- struct pid_namespace *ns;
- struct pid *pid;
+ struct pid_namespace *ns = proc_pid_ns(inode);
+ struct pid *pid = proc_pid(inode);
struct task_struct *task;
int ret;
- ns = inode->i_sb->s_fs_info;
- pid = proc_pid(inode);
task = get_pid_task(pid, PIDTYPE_PID);
if (!task)
return -ESRCH;
@@ -943,7 +944,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
max_len = min_t(size_t, PAGE_SIZE, count);
this_len = min(max_len, this_len);
- retval = access_remote_vm(mm, (env_start + src), page, this_len, 0);
+ retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON);
if (retval <= 0) {
ret = retval;
@@ -1407,7 +1408,7 @@ static const struct file_operations proc_fail_nth_operations = {
static int sched_show(struct seq_file *m, void *v)
{
struct inode *inode = m->private;
- struct pid_namespace *ns = inode->i_sb->s_fs_info;
+ struct pid_namespace *ns = proc_pid_ns(inode);
struct task_struct *p;
p = get_proc_task(inode);
@@ -1562,9 +1563,8 @@ static int comm_show(struct seq_file *m, void *v)
if (!p)
return -ESRCH;
- task_lock(p);
- seq_printf(m, "%s\n", p->comm);
- task_unlock(p);
+ proc_task_name(m, p, false);
+ seq_putc(m, '\n');
put_task_struct(p);
@@ -1690,6 +1690,12 @@ void task_dump_owner(struct task_struct *task, umode_t mode,
kuid_t uid;
kgid_t gid;
+ if (unlikely(task->flags & PF_KTHREAD)) {
+ *ruid = GLOBAL_ROOT_UID;
+ *rgid = GLOBAL_ROOT_GID;
+ return;
+ }
+
/* Default to the tasks effective ownership */
rcu_read_lock();
cred = __task_cred(task);
@@ -1773,8 +1779,8 @@ int pid_getattr(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
+ struct pid_namespace *pid = proc_pid_ns(inode);
struct task_struct *task;
- struct pid_namespace *pid = path->dentry->d_sb->s_fs_info;
generic_fillattr(inode, stat);
@@ -1800,15 +1806,22 @@ int pid_getattr(const struct path *path, struct kstat *stat,
/* dentry stuff */
/*
- * Exceptional case: normally we are not allowed to unhash a busy
- * directory. In this case, however, we can do it - no aliasing problems
- * due to the way we treat inodes.
- *
+ * Set <pid>/... inode ownership (can change due to setuid(), etc.)
+ */
+void pid_update_inode(struct task_struct *task, struct inode *inode)
+{
+ task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
+
+ inode->i_mode &= ~(S_ISUID | S_ISGID);
+ security_task_to_inode(task, inode);
+}
+
+/*
* Rewrite the inode's ownerships here because the owning task may have
* performed a setuid(), etc.
*
*/
-int pid_revalidate(struct dentry *dentry, unsigned int flags)
+static int pid_revalidate(struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
struct task_struct *task;
@@ -1820,10 +1833,7 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
task = get_proc_task(inode);
if (task) {
- task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
-
- inode->i_mode &= ~(S_ISUID | S_ISGID);
- security_task_to_inode(task, inode);
+ pid_update_inode(task, inode);
put_task_struct(task);
return 1;
}
@@ -1871,8 +1881,8 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
struct dentry *child, *dir = file->f_path.dentry;
struct qstr qname = QSTR_INIT(name, len);
struct inode *inode;
- unsigned type;
- ino_t ino;
+ unsigned type = DT_UNKNOWN;
+ ino_t ino = 1;
child = d_hash_and_lookup(dir, &qname);
if (!child) {
@@ -1881,22 +1891,23 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
if (IS_ERR(child))
goto end_instantiate;
if (d_in_lookup(child)) {
- int err = instantiate(d_inode(dir), child, task, ptr);
+ struct dentry *res;
+ res = instantiate(child, task, ptr);
d_lookup_done(child);
- if (err < 0) {
- dput(child);
+ if (IS_ERR(res))
goto end_instantiate;
+ if (unlikely(res)) {
+ dput(child);
+ child = res;
}
}
}
inode = d_inode(child);
ino = inode->i_ino;
type = inode->i_mode >> 12;
+end_instantiate:
dput(child);
return dir_emit(ctx, name, len, ino, type);
-
-end_instantiate:
- return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
}
/*
@@ -1910,6 +1921,8 @@ static int dname_to_vma_addr(struct dentry *dentry,
unsigned long long sval, eval;
unsigned int len;
+ if (str[0] == '0' && str[1] != '-')
+ return -EINVAL;
len = _parse_integer(str, 16, &sval);
if (len & KSTRTOX_OVERFLOW)
return -EINVAL;
@@ -1921,6 +1934,8 @@ static int dname_to_vma_addr(struct dentry *dentry,
return -EINVAL;
str++;
+ if (str[0] == '0' && str[1])
+ return -EINVAL;
len = _parse_integer(str, 16, &eval);
if (len & KSTRTOX_OVERFLOW)
return -EINVAL;
@@ -2054,19 +2069,19 @@ static const struct inode_operations proc_map_files_link_inode_operations = {
.setattr = proc_setattr,
};
-static int
-proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+static struct dentry *
+proc_map_files_instantiate(struct dentry *dentry,
struct task_struct *task, const void *ptr)
{
fmode_t mode = (fmode_t)(unsigned long)ptr;
struct proc_inode *ei;
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK |
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK |
((mode & FMODE_READ ) ? S_IRUSR : 0) |
((mode & FMODE_WRITE) ? S_IWUSR : 0));
if (!inode)
- return -ENOENT;
+ return ERR_PTR(-ENOENT);
ei = PROC_I(inode);
ei->op.proc_get_link = map_files_get_link;
@@ -2075,9 +2090,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
inode->i_size = 64;
d_set_d_op(dentry, &tid_map_files_dentry_operations);
- d_add(dentry, inode);
-
- return 0;
+ return d_splice_alias(inode, dentry);
}
static struct dentry *proc_map_files_lookup(struct inode *dir,
@@ -2086,19 +2099,19 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
unsigned long vm_start, vm_end;
struct vm_area_struct *vma;
struct task_struct *task;
- int result;
+ struct dentry *result;
struct mm_struct *mm;
- result = -ENOENT;
+ result = ERR_PTR(-ENOENT);
task = get_proc_task(dir);
if (!task)
goto out;
- result = -EACCES;
+ result = ERR_PTR(-EACCES);
if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
goto out_put_task;
- result = -ENOENT;
+ result = ERR_PTR(-ENOENT);
if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
goto out_put_task;
@@ -2112,7 +2125,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
goto out_no_vma;
if (vma->vm_file)
- result = proc_map_files_instantiate(dir, dentry, task,
+ result = proc_map_files_instantiate(dentry, task,
(void *)(unsigned long)vma->vm_file->f_mode);
out_no_vma:
@@ -2121,7 +2134,7 @@ out_no_vma:
out_put_task:
put_task_struct(task);
out:
- return ERR_PTR(result);
+ return result;
}
static const struct inode_operations proc_map_files_inode_operations = {
@@ -2204,6 +2217,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
}
}
up_read(&mm->mmap_sem);
+ mmput(mm);
for (i = 0; i < nr_files; i++) {
char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */
@@ -2221,7 +2235,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
}
if (fa)
flex_array_free(fa);
- mmput(mm);
out_put_task:
put_task_struct(task);
@@ -2324,7 +2337,7 @@ static int proc_timers_open(struct inode *inode, struct file *file)
return -ENOMEM;
tp->pid = proc_pid(inode);
- tp->ns = inode->i_sb->s_fs_info;
+ tp->ns = proc_pid_ns(inode);
return 0;
}
@@ -2422,16 +2435,16 @@ static const struct file_operations proc_pid_set_timerslack_ns_operations = {
.release = single_release,
};
-static int proc_pident_instantiate(struct inode *dir,
- struct dentry *dentry, struct task_struct *task, const void *ptr)
+static struct dentry *proc_pident_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
{
const struct pid_entry *p = ptr;
struct inode *inode;
struct proc_inode *ei;
- inode = proc_pid_make_inode(dir->i_sb, task, p->mode);
+ inode = proc_pid_make_inode(dentry->d_sb, task, p->mode);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOENT);
ei = PROC_I(inode);
if (S_ISDIR(inode->i_mode))
@@ -2441,13 +2454,9 @@ static int proc_pident_instantiate(struct inode *dir,
if (p->fop)
inode->i_fop = p->fop;
ei->op = p->op;
+ pid_update_inode(task, inode);
d_set_d_op(dentry, &pid_dentry_operations);
- d_add(dentry, inode);
- /* Close the race of the process dying before we return the dentry */
- if (pid_revalidate(dentry, 0))
- return 0;
-out:
- return -ENOENT;
+ return d_splice_alias(inode, dentry);
}
static struct dentry *proc_pident_lookup(struct inode *dir,
@@ -2455,11 +2464,9 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
const struct pid_entry *ents,
unsigned int nents)
{
- int error;
struct task_struct *task = get_proc_task(dir);
const struct pid_entry *p, *last;
-
- error = -ENOENT;
+ struct dentry *res = ERR_PTR(-ENOENT);
if (!task)
goto out_no_task;
@@ -2478,11 +2485,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
if (p >= last)
goto out;
- error = proc_pident_instantiate(dir, dentry, task, p);
+ res = proc_pident_instantiate(dentry, task, p);
out:
put_task_struct(task);
out_no_task:
- return ERR_PTR(error);
+ return res;
}
static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
@@ -3125,38 +3132,32 @@ void proc_flush_task(struct task_struct *task)
}
}
-static int proc_pid_instantiate(struct inode *dir,
- struct dentry * dentry,
+static struct dentry *proc_pid_instantiate(struct dentry * dentry,
struct task_struct *task, const void *ptr)
{
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOENT);
inode->i_op = &proc_tgid_base_inode_operations;
inode->i_fop = &proc_tgid_base_operations;
inode->i_flags|=S_IMMUTABLE;
set_nlink(inode, nlink_tgid);
+ pid_update_inode(task, inode);
d_set_d_op(dentry, &pid_dentry_operations);
-
- d_add(dentry, inode);
- /* Close the race of the process dying before we return the dentry */
- if (pid_revalidate(dentry, 0))
- return 0;
-out:
- return -ENOENT;
+ return d_splice_alias(inode, dentry);
}
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
- int result = -ENOENT;
struct task_struct *task;
unsigned tgid;
struct pid_namespace *ns;
+ struct dentry *result = ERR_PTR(-ENOENT);
tgid = name_to_int(&dentry->d_name);
if (tgid == ~0U)
@@ -3171,10 +3172,10 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsign
if (!task)
goto out;
- result = proc_pid_instantiate(dir, dentry, task, NULL);
+ result = proc_pid_instantiate(dentry, task, NULL);
put_task_struct(task);
out:
- return ERR_PTR(result);
+ return result;
}
/*
@@ -3226,7 +3227,7 @@ retry:
int proc_pid_readdir(struct file *file, struct dir_context *ctx)
{
struct tgid_iter iter;
- struct pid_namespace *ns = file_inode(file)->i_sb->s_fs_info;
+ struct pid_namespace *ns = proc_pid_ns(file_inode(file));
loff_t pos = ctx->pos;
if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
@@ -3422,37 +3423,32 @@ static const struct inode_operations proc_tid_base_inode_operations = {
.setattr = proc_setattr,
};
-static int proc_task_instantiate(struct inode *dir,
- struct dentry *dentry, struct task_struct *task, const void *ptr)
+static struct dentry *proc_task_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
{
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
-
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOENT);
+
inode->i_op = &proc_tid_base_inode_operations;
inode->i_fop = &proc_tid_base_operations;
- inode->i_flags|=S_IMMUTABLE;
+ inode->i_flags |= S_IMMUTABLE;
set_nlink(inode, nlink_tid);
+ pid_update_inode(task, inode);
d_set_d_op(dentry, &pid_dentry_operations);
-
- d_add(dentry, inode);
- /* Close the race of the process dying before we return the dentry */
- if (pid_revalidate(dentry, 0))
- return 0;
-out:
- return -ENOENT;
+ return d_splice_alias(inode, dentry);
}
static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
- int result = -ENOENT;
struct task_struct *task;
struct task_struct *leader = get_proc_task(dir);
unsigned tid;
struct pid_namespace *ns;
+ struct dentry *result = ERR_PTR(-ENOENT);
if (!leader)
goto out_no_task;
@@ -3472,13 +3468,13 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
if (!same_thread_group(leader, task))
goto out_drop_task;
- result = proc_task_instantiate(dir, dentry, task, NULL);
+ result = proc_task_instantiate(dentry, task, NULL);
out_drop_task:
put_task_struct(task);
out:
put_task_struct(leader);
out_no_task:
- return ERR_PTR(result);
+ return result;
}
/*
@@ -3575,7 +3571,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
/* f_version caches the tgid value that the last readdir call couldn't
* return. lseek aka telldir automagically resets f_version to 0.
*/
- ns = inode->i_sb->s_fs_info;
+ ns = proc_pid_ns(inode);
tid = (int)file->f_version;
file->f_version = 0;
for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c
index 403cbb12a6e9..fa762c5fbcb2 100644
--- a/fs/proc/cmdline.c
+++ b/fs/proc/cmdline.c
@@ -6,25 +6,14 @@
static int cmdline_proc_show(struct seq_file *m, void *v)
{
- seq_printf(m, "%s\n", saved_command_line);
+ seq_puts(m, saved_command_line);
+ seq_putc(m, '\n');
return 0;
}
-static int cmdline_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, cmdline_proc_show, NULL);
-}
-
-static const struct file_operations cmdline_proc_fops = {
- .open = cmdline_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int __init proc_cmdline_init(void)
{
- proc_create("cmdline", 0, NULL, &cmdline_proc_fops);
+ proc_create_single("cmdline", 0, NULL, cmdline_proc_show);
return 0;
}
fs_initcall(proc_cmdline_init);
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index a8ac48aebd59..954caf0b7fee 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -91,21 +91,9 @@ static const struct seq_operations consoles_op = {
.show = show_console_dev
};
-static int consoles_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &consoles_op);
-}
-
-static const struct file_operations proc_consoles_operations = {
- .open = consoles_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
static int __init proc_consoles_init(void)
{
- proc_create("consoles", 0, NULL, &proc_consoles_operations);
+ proc_create_seq("consoles", 0, NULL, &consoles_op);
return 0;
}
fs_initcall(proc_consoles_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 2c7f22b14489..37d38697eaf8 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -51,21 +51,9 @@ static const struct seq_operations devinfo_ops = {
.show = devinfo_show
};
-static int devinfo_open(struct inode *inode, struct file *filp)
-{
- return seq_open(filp, &devinfo_ops);
-}
-
-static const struct file_operations proc_devinfo_operations = {
- .open = devinfo_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
static int __init proc_devices_init(void)
{
- proc_create("devices", 0, NULL, &proc_devinfo_operations);
+ proc_create_seq("devices", 0, NULL, &devinfo_ops);
return 0;
}
fs_initcall(proc_devices_init);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6b80cd1e419a..05b9893e9a22 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -81,9 +81,41 @@ static const struct file_operations proc_fdinfo_file_operations = {
.release = single_release,
};
+static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
+{
+ struct files_struct *files = get_files_struct(task);
+ struct file *file;
+
+ if (!files)
+ return false;
+
+ rcu_read_lock();
+ file = fcheck_files(files, fd);
+ if (file)
+ *mode = file->f_mode;
+ rcu_read_unlock();
+ put_files_struct(files);
+ return !!file;
+}
+
+static void tid_fd_update_inode(struct task_struct *task, struct inode *inode,
+ fmode_t f_mode)
+{
+ task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
+
+ if (S_ISLNK(inode->i_mode)) {
+ unsigned i_mode = S_IFLNK;
+ if (f_mode & FMODE_READ)
+ i_mode |= S_IRUSR | S_IXUSR;
+ if (f_mode & FMODE_WRITE)
+ i_mode |= S_IWUSR | S_IXUSR;
+ inode->i_mode = i_mode;
+ }
+ security_task_to_inode(task, inode);
+}
+
static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
{
- struct files_struct *files;
struct task_struct *task;
struct inode *inode;
unsigned int fd;
@@ -96,35 +128,11 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
fd = proc_fd(inode);
if (task) {
- files = get_files_struct(task);
- if (files) {
- struct file *file;
-
- rcu_read_lock();
- file = fcheck_files(files, fd);
- if (file) {
- unsigned f_mode = file->f_mode;
-
- rcu_read_unlock();
- put_files_struct(files);
-
- task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
-
- if (S_ISLNK(inode->i_mode)) {
- unsigned i_mode = S_IFLNK;
- if (f_mode & FMODE_READ)
- i_mode |= S_IRUSR | S_IXUSR;
- if (f_mode & FMODE_WRITE)
- i_mode |= S_IWUSR | S_IXUSR;
- inode->i_mode = i_mode;
- }
-
- security_task_to_inode(task, inode);
- put_task_struct(task);
- return 1;
- }
- rcu_read_unlock();
- put_files_struct(files);
+ fmode_t f_mode;
+ if (tid_fd_mode(task, fd, &f_mode)) {
+ tid_fd_update_inode(task, inode, f_mode);
+ put_task_struct(task);
+ return 1;
}
put_task_struct(task);
}
@@ -166,34 +174,33 @@ static int proc_fd_link(struct dentry *dentry, struct path *path)
return ret;
}
-static int
-proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
- struct task_struct *task, const void *ptr)
+struct fd_data {
+ fmode_t mode;
+ unsigned fd;
+};
+
+static struct dentry *proc_fd_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
{
- unsigned fd = (unsigned long)ptr;
+ const struct fd_data *data = ptr;
struct proc_inode *ei;
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK);
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOENT);
ei = PROC_I(inode);
- ei->fd = fd;
+ ei->fd = data->fd;
inode->i_op = &proc_pid_link_inode_operations;
inode->i_size = 64;
ei->op.proc_get_link = proc_fd_link;
+ tid_fd_update_inode(task, inode, data->mode);
d_set_d_op(dentry, &tid_fd_dentry_operations);
- d_add(dentry, inode);
-
- /* Close the race of the process dying before we return the dentry */
- if (tid_fd_revalidate(dentry, 0))
- return 0;
- out:
- return -ENOENT;
+ return d_splice_alias(inode, dentry);
}
static struct dentry *proc_lookupfd_common(struct inode *dir,
@@ -201,19 +208,21 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
instantiate_t instantiate)
{
struct task_struct *task = get_proc_task(dir);
- int result = -ENOENT;
- unsigned fd = name_to_int(&dentry->d_name);
+ struct fd_data data = {.fd = name_to_int(&dentry->d_name)};
+ struct dentry *result = ERR_PTR(-ENOENT);
if (!task)
goto out_no_task;
- if (fd == ~0U)
+ if (data.fd == ~0U)
+ goto out;
+ if (!tid_fd_mode(task, data.fd, &data.mode))
goto out;
- result = instantiate(dir, dentry, task, (void *)(unsigned long)fd);
+ result = instantiate(dentry, task, &data);
out:
put_task_struct(task);
out_no_task:
- return ERR_PTR(result);
+ return result;
}
static int proc_readfd_common(struct file *file, struct dir_context *ctx,
@@ -236,17 +245,22 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
for (fd = ctx->pos - 2;
fd < files_fdtable(files)->max_fds;
fd++, ctx->pos++) {
+ struct file *f;
+ struct fd_data data;
char name[10 + 1];
int len;
- if (!fcheck_files(files, fd))
+ f = fcheck_files(files, fd);
+ if (!f)
continue;
+ data.mode = f->f_mode;
rcu_read_unlock();
+ data.fd = fd;
len = snprintf(name, sizeof(name), "%u", fd);
if (!proc_fill_cache(file, ctx,
name, len, instantiate, p,
- (void *)(unsigned long)fd))
+ &data))
goto out_fd_loop;
cond_resched();
rcu_read_lock();
@@ -304,31 +318,25 @@ const struct inode_operations proc_fd_inode_operations = {
.setattr = proc_setattr,
};
-static int
-proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
- struct task_struct *task, const void *ptr)
+static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
{
- unsigned fd = (unsigned long)ptr;
+ const struct fd_data *data = ptr;
struct proc_inode *ei;
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task, S_IFREG | S_IRUSR);
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUSR);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOENT);
ei = PROC_I(inode);
- ei->fd = fd;
+ ei->fd = data->fd;
inode->i_fop = &proc_fdinfo_file_operations;
+ tid_fd_update_inode(task, inode, 0);
d_set_d_op(dentry, &tid_fd_dentry_operations);
- d_add(dentry, inode);
-
- /* Close the race of the process dying before we return the dentry */
- if (tid_fd_revalidate(dentry, 0))
- return 0;
- out:
- return -ENOENT;
+ return d_splice_alias(inode, dentry);
}
static struct dentry *
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 5d709fa8f3a2..7b4d9714f248 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -8,12 +8,14 @@
* Copyright (C) 1997 Theodore Ts'o
*/
+#include <linux/cache.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/namei.h>
#include <linux/slab.h>
#include <linux/printk.h>
#include <linux/mount.h>
@@ -23,11 +25,23 @@
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/uaccess.h>
+#include <linux/seq_file.h>
#include "internal.h"
static DEFINE_RWLOCK(proc_subdir_lock);
+struct kmem_cache *proc_dir_entry_cache __ro_after_init;
+
+void pde_free(struct proc_dir_entry *pde)
+{
+ if (S_ISLNK(pde->mode))
+ kfree(pde->data);
+ if (pde->name != pde->inline_name)
+ kfree(pde->name);
+ kmem_cache_free(proc_dir_entry_cache, pde);
+}
+
static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len)
{
if (len < de->namelen)
@@ -40,8 +54,8 @@ static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int
static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir)
{
- return rb_entry_safe(rb_first_cached(&dir->subdir),
- struct proc_dir_entry, subdir_node);
+ return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry,
+ subdir_node);
}
static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir)
@@ -54,7 +68,7 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
const char *name,
unsigned int len)
{
- struct rb_node *node = dir->subdir.rb_root.rb_node;
+ struct rb_node *node = dir->subdir.rb_node;
while (node) {
struct proc_dir_entry *de = rb_entry(node,
@@ -75,9 +89,8 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
static bool pde_subdir_insert(struct proc_dir_entry *dir,
struct proc_dir_entry *de)
{
- struct rb_root_cached *root = &dir->subdir;
- struct rb_node **new = &root->rb_root.rb_node, *parent = NULL;
- bool leftmost = true;
+ struct rb_root *root = &dir->subdir;
+ struct rb_node **new = &root->rb_node, *parent = NULL;
/* Figure out where to put new node */
while (*new) {
@@ -89,16 +102,15 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir,
parent = *new;
if (result < 0)
new = &(*new)->rb_left;
- else if (result > 0) {
+ else if (result > 0)
new = &(*new)->rb_right;
- leftmost = false;
- } else
+ else
return false;
}
/* Add new node and rebalance tree. */
rb_link_node(&de->subdir_node, parent, new);
- rb_insert_color_cached(&de->subdir_node, root, leftmost);
+ rb_insert_color(&de->subdir_node, root);
return true;
}
@@ -207,6 +219,26 @@ void proc_free_inum(unsigned int inum)
ida_simple_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
}
+static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ if (atomic_read(&PDE(d_inode(dentry))->in_use) < 0)
+ return 0; /* revalidate */
+ return 1;
+}
+
+static int proc_misc_d_delete(const struct dentry *dentry)
+{
+ return atomic_read(&PDE(d_inode(dentry))->in_use) < 0;
+}
+
+static const struct dentry_operations proc_misc_dentry_ops = {
+ .d_revalidate = proc_misc_d_revalidate,
+ .d_delete = proc_misc_d_delete,
+};
+
/*
* Don't create negative dentries here, return -ENOENT by hand
* instead.
@@ -224,9 +256,8 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry,
inode = proc_get_inode(dir->i_sb, de);
if (!inode)
return ERR_PTR(-ENOMEM);
- d_set_d_op(dentry, &simple_dentry_operations);
- d_add(dentry, inode);
- return NULL;
+ d_set_d_op(dentry, &proc_misc_dentry_ops);
+ return d_splice_alias(inode, dentry);
}
read_unlock(&proc_subdir_lock);
return ERR_PTR(-ENOENT);
@@ -315,13 +346,12 @@ static const struct inode_operations proc_dir_inode_operations = {
.setattr = proc_notify_change,
};
-static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
+/* returns the registered entry, or frees dp and returns NULL on failure */
+struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
+ struct proc_dir_entry *dp)
{
- int ret;
-
- ret = proc_alloc_inum(&dp->low_ino);
- if (ret)
- return ret;
+ if (proc_alloc_inum(&dp->low_ino))
+ goto out_free_entry;
write_lock(&proc_subdir_lock);
dp->parent = dir;
@@ -329,12 +359,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
WARN(1, "proc_dir_entry '%s/%s' already registered\n",
dir->name, dp->name);
write_unlock(&proc_subdir_lock);
- proc_free_inum(dp->low_ino);
- return -EEXIST;
+ goto out_free_inum;
}
write_unlock(&proc_subdir_lock);
- return 0;
+ return dp;
+out_free_inum:
+ proc_free_inum(dp->low_ino);
+out_free_entry:
+ pde_free(dp);
+ return NULL;
}
static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
@@ -354,6 +388,14 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
WARN(1, "name len %u\n", qstr.len);
return NULL;
}
+ if (qstr.len == 1 && fn[0] == '.') {
+ WARN(1, "name '.'\n");
+ return NULL;
+ }
+ if (qstr.len == 2 && fn[0] == '.' && fn[1] == '.') {
+ WARN(1, "name '..'\n");
+ return NULL;
+ }
if (*parent == &proc_root && name_to_int(&qstr) != ~0U) {
WARN(1, "create '/proc/%s' by hand\n", qstr.name);
return NULL;
@@ -363,16 +405,26 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
return NULL;
}
- ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL);
+ ent = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL);
if (!ent)
goto out;
+ if (qstr.len + 1 <= sizeof(ent->inline_name)) {
+ ent->name = ent->inline_name;
+ } else {
+ ent->name = kmalloc(qstr.len + 1, GFP_KERNEL);
+ if (!ent->name) {
+ pde_free(ent);
+ return NULL;
+ }
+ }
+
memcpy(ent->name, fn, qstr.len + 1);
ent->namelen = qstr.len;
ent->mode = mode;
ent->nlink = nlink;
- ent->subdir = RB_ROOT_CACHED;
- atomic_set(&ent->count, 1);
+ ent->subdir = RB_ROOT;
+ refcount_set(&ent->refcnt, 1);
spin_lock_init(&ent->pde_unload_lock);
INIT_LIST_HEAD(&ent->pde_openers);
proc_set_user(ent, (*parent)->uid, (*parent)->gid);
@@ -394,13 +446,9 @@ struct proc_dir_entry *proc_symlink(const char *name,
if (ent->data) {
strcpy((char*)ent->data,dest);
ent->proc_iops = &proc_link_inode_operations;
- if (proc_register(parent, ent) < 0) {
- kfree(ent->data);
- kfree(ent);
- ent = NULL;
- }
+ ent = proc_register(parent, ent);
} else {
- kfree(ent);
+ pde_free(ent);
ent = NULL;
}
}
@@ -422,11 +470,9 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
ent->proc_fops = &proc_dir_operations;
ent->proc_iops = &proc_dir_inode_operations;
parent->nlink++;
- if (proc_register(parent, ent) < 0) {
- kfree(ent);
+ ent = proc_register(parent, ent);
+ if (!ent)
parent->nlink--;
- ent = NULL;
- }
}
return ent;
}
@@ -457,47 +503,47 @@ struct proc_dir_entry *proc_create_mount_point(const char *name)
ent->proc_fops = NULL;
ent->proc_iops = NULL;
parent->nlink++;
- if (proc_register(parent, ent) < 0) {
- kfree(ent);
+ ent = proc_register(parent, ent);
+ if (!ent)
parent->nlink--;
- ent = NULL;
- }
}
return ent;
}
EXPORT_SYMBOL(proc_create_mount_point);
-struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
- struct proc_dir_entry *parent,
- const struct file_operations *proc_fops,
- void *data)
+struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
+ struct proc_dir_entry **parent, void *data)
{
- struct proc_dir_entry *pde;
+ struct proc_dir_entry *p;
+
if ((mode & S_IFMT) == 0)
mode |= S_IFREG;
-
- if (!S_ISREG(mode)) {
- WARN_ON(1); /* use proc_mkdir() */
+ if ((mode & S_IALLUGO) == 0)
+ mode |= S_IRUGO;
+ if (WARN_ON_ONCE(!S_ISREG(mode)))
return NULL;
+
+ p = __proc_create(parent, name, mode, 1);
+ if (p) {
+ p->proc_iops = &proc_file_inode_operations;
+ p->data = data;
}
+ return p;
+}
+
+struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
+ struct proc_dir_entry *parent,
+ const struct file_operations *proc_fops, void *data)
+{
+ struct proc_dir_entry *p;
BUG_ON(proc_fops == NULL);
- if ((mode & S_IALLUGO) == 0)
- mode |= S_IRUGO;
- pde = __proc_create(&parent, name, mode, 1);
- if (!pde)
- goto out;
- pde->proc_fops = proc_fops;
- pde->data = data;
- pde->proc_iops = &proc_file_inode_operations;
- if (proc_register(parent, pde) < 0)
- goto out_free;
- return pde;
-out_free:
- kfree(pde);
-out:
- return NULL;
+ p = proc_create_reg(name, mode, &parent, data);
+ if (!p)
+ return NULL;
+ p->proc_fops = proc_fops;
+ return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_data);
@@ -509,6 +555,67 @@ struct proc_dir_entry *proc_create(const char *name, umode_t mode,
}
EXPORT_SYMBOL(proc_create);
+static int proc_seq_open(struct inode *inode, struct file *file)
+{
+ struct proc_dir_entry *de = PDE(inode);
+
+ if (de->state_size)
+ return seq_open_private(file, de->seq_ops, de->state_size);
+ return seq_open(file, de->seq_ops);
+}
+
+static const struct file_operations proc_seq_fops = {
+ .open = proc_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode,
+ struct proc_dir_entry *parent, const struct seq_operations *ops,
+ unsigned int state_size, void *data)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create_reg(name, mode, &parent, data);
+ if (!p)
+ return NULL;
+ p->proc_fops = &proc_seq_fops;
+ p->seq_ops = ops;
+ p->state_size = state_size;
+ return proc_register(parent, p);
+}
+EXPORT_SYMBOL(proc_create_seq_private);
+
+static int proc_single_open(struct inode *inode, struct file *file)
+{
+ struct proc_dir_entry *de = PDE(inode);
+
+ return single_open(file, de->single_show, de->data);
+}
+
+static const struct file_operations proc_single_fops = {
+ .open = proc_single_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode,
+ struct proc_dir_entry *parent,
+ int (*show)(struct seq_file *, void *), void *data)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create_reg(name, mode, &parent, data);
+ if (!p)
+ return NULL;
+ p->proc_fops = &proc_single_fops;
+ p->single_show = show;
+ return proc_register(parent, p);
+}
+EXPORT_SYMBOL(proc_create_single_data);
+
void proc_set_size(struct proc_dir_entry *de, loff_t size)
{
de->size = size;
@@ -522,19 +629,12 @@ void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid)
}
EXPORT_SYMBOL(proc_set_user);
-static void free_proc_entry(struct proc_dir_entry *de)
-{
- proc_free_inum(de->low_ino);
-
- if (S_ISLNK(de->mode))
- kfree(de->data);
- kfree(de);
-}
-
void pde_put(struct proc_dir_entry *pde)
{
- if (atomic_dec_and_test(&pde->count))
- free_proc_entry(pde);
+ if (refcount_dec_and_test(&pde->refcnt)) {
+ proc_free_inum(pde->low_ino);
+ pde_free(pde);
+ }
}
/*
@@ -555,7 +655,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
de = pde_subdir_find(parent, fn, len);
if (de)
- rb_erase_cached(&de->subdir_node, &parent->subdir);
+ rb_erase(&de->subdir_node, &parent->subdir);
write_unlock(&proc_subdir_lock);
if (!de) {
WARN(1, "name '%s'\n", name);
@@ -592,13 +692,13 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
write_unlock(&proc_subdir_lock);
return -ENOENT;
}
- rb_erase_cached(&root->subdir_node, &parent->subdir);
+ rb_erase(&root->subdir_node, &parent->subdir);
de = root;
while (1) {
next = pde_subdir_first(de);
if (next) {
- rb_erase_cached(&next->subdir_node, &de->subdir);
+ rb_erase(&next->subdir_node, &de->subdir);
de = next;
continue;
}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 6e8724958116..2cf3b74391ca 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -54,6 +54,7 @@ static void proc_evict_inode(struct inode *inode)
}
static struct kmem_cache *proc_inode_cachep __ro_after_init;
+static struct kmem_cache *pde_opener_cache __ro_after_init;
static struct inode *proc_alloc_inode(struct super_block *sb)
{
@@ -92,7 +93,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-void __init proc_init_inodecache(void)
+void __init proc_init_kmemcache(void)
{
proc_inode_cachep = kmem_cache_create("proc_inode_cache",
sizeof(struct proc_inode),
@@ -100,6 +101,13 @@ void __init proc_init_inodecache(void)
SLAB_MEM_SPREAD|SLAB_ACCOUNT|
SLAB_PANIC),
init_once);
+ pde_opener_cache =
+ kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0,
+ SLAB_ACCOUNT|SLAB_PANIC, NULL);
+ proc_dir_entry_cache = kmem_cache_create_usercopy(
+ "proc_dir_entry", sizeof(struct proc_dir_entry), 0, SLAB_PANIC,
+ offsetof(struct proc_dir_entry, inline_name),
+ sizeof_field(struct proc_dir_entry, inline_name), NULL);
}
static int proc_show_options(struct seq_file *seq, struct dentry *root)
@@ -138,7 +146,7 @@ static void unuse_pde(struct proc_dir_entry *pde)
complete(pde->pde_unload_completion);
}
-/* pde is locked */
+/* pde is locked on entry, unlocked on exit */
static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
{
/*
@@ -157,9 +165,10 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
pdeo->c = &c;
spin_unlock(&pde->pde_unload_lock);
wait_for_completion(&c);
- spin_lock(&pde->pde_unload_lock);
} else {
struct file *file;
+ struct completion *c;
+
pdeo->closing = true;
spin_unlock(&pde->pde_unload_lock);
file = pdeo->file;
@@ -167,9 +176,11 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
spin_lock(&pde->pde_unload_lock);
/* After ->release. */
list_del(&pdeo->lh);
- if (unlikely(pdeo->c))
- complete(pdeo->c);
- kfree(pdeo);
+ c = pdeo->c;
+ spin_unlock(&pde->pde_unload_lock);
+ if (unlikely(c))
+ complete(c);
+ kmem_cache_free(pde_opener_cache, pdeo);
}
}
@@ -188,6 +199,7 @@ void proc_entry_rundown(struct proc_dir_entry *de)
struct pde_opener *pdeo;
pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
close_pdeo(de, pdeo);
+ spin_lock(&de->pde_unload_lock);
}
spin_unlock(&de->pde_unload_lock);
}
@@ -338,31 +350,36 @@ static int proc_reg_open(struct inode *inode, struct file *file)
*
* Save every "struct file" with custom ->release hook.
*/
- pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL);
- if (!pdeo)
- return -ENOMEM;
-
- if (!use_pde(pde)) {
- kfree(pdeo);
+ if (!use_pde(pde))
return -ENOENT;
- }
- open = pde->proc_fops->open;
+
release = pde->proc_fops->release;
+ if (release) {
+ pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL);
+ if (!pdeo) {
+ rv = -ENOMEM;
+ goto out_unuse;
+ }
+ }
+ open = pde->proc_fops->open;
if (open)
rv = open(inode, file);
- if (rv == 0 && release) {
- /* To know what to release. */
- pdeo->file = file;
- pdeo->closing = false;
- pdeo->c = NULL;
- spin_lock(&pde->pde_unload_lock);
- list_add(&pdeo->lh, &pde->pde_openers);
- spin_unlock(&pde->pde_unload_lock);
- } else
- kfree(pdeo);
+ if (release) {
+ if (rv == 0) {
+ /* To know what to release. */
+ pdeo->file = file;
+ pdeo->closing = false;
+ pdeo->c = NULL;
+ spin_lock(&pde->pde_unload_lock);
+ list_add(&pdeo->lh, &pde->pde_openers);
+ spin_unlock(&pde->pde_unload_lock);
+ } else
+ kmem_cache_free(pde_opener_cache, pdeo);
+ }
+out_unuse:
unuse_pde(pde);
return rv;
}
@@ -375,7 +392,7 @@ static int proc_reg_release(struct inode *inode, struct file *file)
list_for_each_entry(pdeo, &pde->pde_openers, lh) {
if (pdeo->file == file) {
close_pdeo(pde, pdeo);
- break;
+ return 0;
}
}
spin_unlock(&pde->pde_unload_lock);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index d697c8ab0a14..93eb1906c28d 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -11,6 +11,7 @@
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
+#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/binfmts.h>
@@ -36,27 +37,41 @@ struct proc_dir_entry {
* negative -> it's going away RSN
*/
atomic_t in_use;
- atomic_t count; /* use count */
+ refcount_t refcnt;
struct list_head pde_openers; /* who did ->open, but not ->release */
/* protects ->pde_openers and all struct pde_opener instances */
spinlock_t pde_unload_lock;
struct completion *pde_unload_completion;
const struct inode_operations *proc_iops;
const struct file_operations *proc_fops;
+ union {
+ const struct seq_operations *seq_ops;
+ int (*single_show)(struct seq_file *, void *);
+ };
void *data;
+ unsigned int state_size;
unsigned int low_ino;
nlink_t nlink;
kuid_t uid;
kgid_t gid;
loff_t size;
struct proc_dir_entry *parent;
- struct rb_root_cached subdir;
+ struct rb_root subdir;
struct rb_node subdir_node;
+ char *name;
umode_t mode;
u8 namelen;
- char name[];
+#ifdef CONFIG_64BIT
+#define SIZEOF_PDE_INLINE_NAME (192-155)
+#else
+#define SIZEOF_PDE_INLINE_NAME (128-95)
+#endif
+ char inline_name[SIZEOF_PDE_INLINE_NAME];
} __randomize_layout;
+extern struct kmem_cache *proc_dir_entry_cache;
+void pde_free(struct proc_dir_entry *pde);
+
union proc_op {
int (*proc_get_link)(struct dentry *, struct path *);
int (*proc_show)(struct seq_file *m,
@@ -121,6 +136,8 @@ unsigned name_to_int(const struct qstr *qstr);
*/
extern const struct file_operations proc_tid_children_operations;
+extern void proc_task_name(struct seq_file *m, struct task_struct *p,
+ bool escape);
extern int proc_tid_stat(struct seq_file *, struct pid_namespace *,
struct pid *, struct task_struct *);
extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *,
@@ -137,14 +154,14 @@ extern const struct dentry_operations pid_dentry_operations;
extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
extern int proc_setattr(struct dentry *, struct iattr *);
extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
-extern int pid_revalidate(struct dentry *, unsigned int);
+extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *);
extern int proc_pid_readdir(struct file *, struct dir_context *);
extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
extern loff_t mem_lseek(struct file *, loff_t, int);
/* Lookups */
-typedef int instantiate_t(struct inode *, struct dentry *,
+typedef struct dentry *instantiate_t(struct dentry *,
struct task_struct *, const void *);
extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int,
instantiate_t, struct task_struct *, const void *);
@@ -152,6 +169,10 @@ extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, i
/*
* generic.c
*/
+struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
+ struct proc_dir_entry **parent, void *data);
+struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
+ struct proc_dir_entry *dp);
extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *);
extern int proc_readdir(struct file *, struct dir_context *);
@@ -159,7 +180,7 @@ int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *
static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
{
- atomic_inc(&pde->count);
+ refcount_inc(&pde->refcnt);
return pde;
}
extern void pde_put(struct proc_dir_entry *);
@@ -177,12 +198,12 @@ struct pde_opener {
struct list_head lh;
bool closing;
struct completion *c;
-};
+} __randomize_layout;
extern const struct inode_operations proc_link_inode_operations;
extern const struct inode_operations proc_pid_link_inode_operations;
-extern void proc_init_inodecache(void);
+void proc_init_kmemcache(void);
void set_proc_pid_nlink(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
extern int proc_fill_super(struct super_block *, void *data, int flags);
diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c
index 6a6bee9c603c..cb0edc7cbf09 100644
--- a/fs/proc/interrupts.c
+++ b/fs/proc/interrupts.c
@@ -34,21 +34,9 @@ static const struct seq_operations int_seq_ops = {
.show = show_interrupts
};
-static int interrupts_open(struct inode *inode, struct file *filp)
-{
- return seq_open(filp, &int_seq_ops);
-}
-
-static const struct file_operations proc_interrupts_operations = {
- .open = interrupts_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
static int __init proc_interrupts_init(void)
{
- proc_create("interrupts", 0, NULL, &proc_interrupts_operations);
+ proc_create_seq("interrupts", 0, NULL, &int_seq_ops);
return 0;
}
fs_initcall(proc_interrupts_init);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index d1e82761de81..e64ecb9f2720 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -209,25 +209,34 @@ kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg)
{
struct list_head *head = (struct list_head *)arg;
struct kcore_list *ent;
+ struct page *p;
+
+ if (!pfn_valid(pfn))
+ return 1;
+
+ p = pfn_to_page(pfn);
+ if (!memmap_valid_within(pfn, p, page_zone(p)))
+ return 1;
ent = kmalloc(sizeof(*ent), GFP_KERNEL);
if (!ent)
return -ENOMEM;
- ent->addr = (unsigned long)__va((pfn << PAGE_SHIFT));
+ ent->addr = (unsigned long)page_to_virt(p);
ent->size = nr_pages << PAGE_SHIFT;
- /* Sanity check: Can happen in 32bit arch...maybe */
- if (ent->addr < (unsigned long) __va(0))
+ if (!virt_addr_valid(ent->addr))
goto free_out;
/* cut not-mapped area. ....from ppc-32 code. */
if (ULONG_MAX - ent->addr < ent->size)
ent->size = ULONG_MAX - ent->addr;
- /* cut when vmalloc() area is higher than direct-map area */
- if (VMALLOC_START > (unsigned long)__va(0)) {
- if (ent->addr > VMALLOC_START)
- goto free_out;
+ /*
+ * We've already checked virt_addr_valid so we know this address
+ * is a valid pointer, therefore we can check against it to determine
+ * if we need to trim
+ */
+ if (VMALLOC_START > ent->addr) {
if (VMALLOC_START - ent->addr < ent->size)
ent->size = VMALLOC_START - ent->addr;
}
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index a000d7547479..d06694757201 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -24,25 +24,13 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
nr_running(), nr_threads,
- idr_get_cursor(&task_active_pid_ns(current)->idr));
+ idr_get_cursor(&task_active_pid_ns(current)->idr) - 1);
return 0;
}
-static int loadavg_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, loadavg_proc_show, NULL);
-}
-
-static const struct file_operations loadavg_proc_fops = {
- .open = loadavg_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int __init proc_loadavg_init(void)
{
- proc_create("loadavg", 0, NULL, &loadavg_proc_fops);
+ proc_create_single("loadavg", 0, NULL, loadavg_proc_show);
return 0;
}
fs_initcall(proc_loadavg_init);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 6bb20f864259..2fb04846ed11 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -26,20 +26,7 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
static void show_val_kb(struct seq_file *m, const char *s, unsigned long num)
{
- char v[32];
- static const char blanks[7] = {' ', ' ', ' ', ' ',' ', ' ', ' '};
- int len;
-
- len = num_to_str(v, sizeof(v), num << (PAGE_SHIFT - 10));
-
- seq_write(m, s, 16);
-
- if (len > 0) {
- if (len < 8)
- seq_write(m, blanks, 8 - len);
-
- seq_write(m, v, len);
- }
+ seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8);
seq_write(m, " kB\n", 4);
}
@@ -162,21 +149,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
return 0;
}
-static int meminfo_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, meminfo_proc_show, NULL);
-}
-
-static const struct file_operations meminfo_proc_fops = {
- .open = meminfo_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int __init proc_meminfo_init(void)
{
- proc_create("meminfo", 0, NULL, &meminfo_proc_fops);
+ proc_create_single("meminfo", 0, NULL, meminfo_proc_show);
return 0;
}
fs_initcall(proc_meminfo_init);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 59b17e509f46..dd2b35f78b09 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -87,28 +87,24 @@ static const struct inode_operations proc_ns_link_inode_operations = {
.setattr = proc_setattr,
};
-static int proc_ns_instantiate(struct inode *dir,
- struct dentry *dentry, struct task_struct *task, const void *ptr)
+static struct dentry *proc_ns_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
{
const struct proc_ns_operations *ns_ops = ptr;
struct inode *inode;
struct proc_inode *ei;
- inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK | S_IRWXUGO);
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO);
if (!inode)
- goto out;
+ return ERR_PTR(-ENOENT);
ei = PROC_I(inode);
inode->i_op = &proc_ns_link_inode_operations;
ei->ns_ops = ns_ops;
+ pid_update_inode(task, inode);
d_set_d_op(dentry, &pid_dentry_operations);
- d_add(dentry, inode);
- /* Close the race of the process dying before we return the dentry */
- if (pid_revalidate(dentry, 0))
- return 0;
-out:
- return -ENOENT;
+ return d_splice_alias(inode, dentry);
}
static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
@@ -147,12 +143,10 @@ const struct file_operations proc_ns_dir_operations = {
static struct dentry *proc_ns_dir_lookup(struct inode *dir,
struct dentry *dentry, unsigned int flags)
{
- int error;
struct task_struct *task = get_proc_task(dir);
const struct proc_ns_operations **entry, **last;
unsigned int len = dentry->d_name.len;
-
- error = -ENOENT;
+ struct dentry *res = ERR_PTR(-ENOENT);
if (!task)
goto out_no_task;
@@ -167,11 +161,11 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
if (entry == last)
goto out;
- error = proc_ns_instantiate(dir, dentry, task, *entry);
+ res = proc_ns_instantiate(dentry, task, *entry);
out:
put_task_struct(task);
out_no_task:
- return ERR_PTR(error);
+ return res;
}
const struct inode_operations proc_ns_dir_inode_operations = {
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 75634379f82e..3b63be64e436 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -113,21 +113,9 @@ static const struct seq_operations proc_nommu_region_list_seqop = {
.show = nommu_region_list_show
};
-static int proc_nommu_region_list_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &proc_nommu_region_list_seqop);
-}
-
-static const struct file_operations proc_nommu_region_list_operations = {
- .open = proc_nommu_region_list_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
static int __init proc_nommu_init(void)
{
- proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations);
+ proc_create_seq("maps", S_IRUGO, NULL, &proc_nommu_region_list_seqop);
return 0;
}
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 68c06ae7888c..7d94fa005b0d 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -38,20 +38,20 @@ static struct net *get_proc_net(const struct inode *inode)
return maybe_get_net(PDE_NET(PDE(inode)));
}
-int seq_open_net(struct inode *ino, struct file *f,
- const struct seq_operations *ops, int size)
+static int seq_open_net(struct inode *inode, struct file *file)
{
- struct net *net;
+ unsigned int state_size = PDE(inode)->state_size;
struct seq_net_private *p;
+ struct net *net;
- BUG_ON(size < sizeof(*p));
+ WARN_ON_ONCE(state_size < sizeof(*p));
- net = get_proc_net(ino);
- if (net == NULL)
+ net = get_proc_net(inode);
+ if (!net)
return -ENXIO;
- p = __seq_open_private(f, ops, size);
- if (p == NULL) {
+ p = __seq_open_private(file, PDE(inode)->seq_ops, state_size);
+ if (!p) {
put_net(net);
return -ENOMEM;
}
@@ -60,51 +60,83 @@ int seq_open_net(struct inode *ino, struct file *f,
#endif
return 0;
}
-EXPORT_SYMBOL_GPL(seq_open_net);
-int single_open_net(struct inode *inode, struct file *file,
- int (*show)(struct seq_file *, void *))
+static int seq_release_net(struct inode *ino, struct file *f)
{
- int err;
- struct net *net;
-
- err = -ENXIO;
- net = get_proc_net(inode);
- if (net == NULL)
- goto err_net;
-
- err = single_open(file, show, net);
- if (err < 0)
- goto err_open;
+ struct seq_file *seq = f->private_data;
+ put_net(seq_file_net(seq));
+ seq_release_private(ino, f);
return 0;
+}
-err_open:
- put_net(net);
-err_net:
- return err;
+static const struct file_operations proc_net_seq_fops = {
+ .open = seq_open_net,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
+ struct proc_dir_entry *parent, const struct seq_operations *ops,
+ unsigned int state_size, void *data)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create_reg(name, mode, &parent, data);
+ if (!p)
+ return NULL;
+ p->proc_fops = &proc_net_seq_fops;
+ p->seq_ops = ops;
+ p->state_size = state_size;
+ return proc_register(parent, p);
}
-EXPORT_SYMBOL_GPL(single_open_net);
+EXPORT_SYMBOL_GPL(proc_create_net_data);
-int seq_release_net(struct inode *ino, struct file *f)
+static int single_open_net(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
+ struct proc_dir_entry *de = PDE(inode);
+ struct net *net;
+ int err;
- seq = f->private_data;
+ net = get_proc_net(inode);
+ if (!net)
+ return -ENXIO;
- put_net(seq_file_net(seq));
- seq_release_private(ino, f);
- return 0;
+ err = single_open(file, de->single_show, net);
+ if (err)
+ put_net(net);
+ return err;
}
-EXPORT_SYMBOL_GPL(seq_release_net);
-int single_release_net(struct inode *ino, struct file *f)
+static int single_release_net(struct inode *ino, struct file *f)
{
struct seq_file *seq = f->private_data;
put_net(seq->private);
return single_release(ino, f);
}
-EXPORT_SYMBOL_GPL(single_release_net);
+
+static const struct file_operations proc_net_single_fops = {
+ .open = single_open_net,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+
+struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode,
+ struct proc_dir_entry *parent,
+ int (*show)(struct seq_file *, void *), void *data)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create_reg(name, mode, &parent, data);
+ if (!p)
+ return NULL;
+ p->proc_fops = &proc_net_single_fops;
+ p->single_show = show;
+ return proc_register(parent, p);
+}
+EXPORT_SYMBOL_GPL(proc_create_net_single);
static struct net *get_proc_task_net(struct inode *dir)
{
@@ -192,15 +224,16 @@ static __net_init int proc_net_ns_init(struct net *net)
int err;
err = -ENOMEM;
- netd = kzalloc(sizeof(*netd) + 4, GFP_KERNEL);
+ netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL);
if (!netd)
goto out;
- netd->subdir = RB_ROOT_CACHED;
+ netd->subdir = RB_ROOT;
netd->data = net;
netd->nlink = 2;
netd->namelen = 3;
netd->parent = &proc_root;
+ netd->name = netd->inline_name;
memcpy(netd->name, "net", 4);
uid = make_kuid(net->user_ns, 0);
@@ -223,7 +256,7 @@ static __net_init int proc_net_ns_init(struct net *net)
return 0;
free_net:
- kfree(netd);
+ pde_free(netd);
out:
return err;
}
@@ -231,7 +264,7 @@ out:
static __net_exit void proc_net_ns_exit(struct net *net)
{
remove_proc_entry("stat", net->proc_net);
- kfree(net->proc_net);
+ pde_free(net->proc_net);
}
static struct pernet_operations __net_initdata proc_net_ns_ops = {
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index c41ab261397d..4d765e5e91ed 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -554,9 +554,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
if (!inode)
goto out;
- err = NULL;
d_set_d_op(dentry, &proc_sys_dentry_operations);
- d_add(dentry, inode);
+ err = d_splice_alias(inode, dentry);
out:
if (h)
@@ -684,6 +683,7 @@ static bool proc_sys_fill_cache(struct file *file,
if (IS_ERR(child))
return false;
if (d_in_lookup(child)) {
+ struct dentry *res;
inode = proc_sys_make_inode(dir->d_sb, head, table);
if (!inode) {
d_lookup_done(child);
@@ -691,7 +691,16 @@ static bool proc_sys_fill_cache(struct file *file,
return false;
}
d_set_d_op(child, &proc_sys_dentry_operations);
- d_add(child, inode);
+ res = d_splice_alias(inode, child);
+ d_lookup_done(child);
+ if (unlikely(res)) {
+ if (IS_ERR(res)) {
+ dput(child);
+ return false;
+ }
+ dput(child);
+ child = res;
+ }
}
}
inode = d_inode(child);
@@ -707,14 +716,14 @@ static bool proc_sys_link_fill_cache(struct file *file,
struct ctl_table *table)
{
bool ret = true;
+
head = sysctl_head_grab(head);
+ if (IS_ERR(head))
+ return false;
- if (S_ISLNK(table->mode)) {
- /* It is not an error if we can not follow the link ignore it */
- int err = sysctl_follow_link(&head, &table);
- if (err)
- goto out;
- }
+ /* It is not an error if we can not follow the link ignore it */
+ if (sysctl_follow_link(&head, &table))
+ goto out;
ret = proc_sys_fill_cache(file, ctx, head, table);
out:
@@ -1086,7 +1095,7 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table)
if ((table->proc_handler == proc_douintvec) ||
(table->proc_handler == proc_douintvec_minmax)) {
if (table->maxlen != sizeof(unsigned int))
- err |= sysctl_err(path, table, "array now allowed");
+ err |= sysctl_err(path, table, "array not allowed");
}
return err;
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index d0cf1c50bb6c..c69ff191e5d8 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -126,18 +126,6 @@ static const struct seq_operations tty_drivers_op = {
.show = show_tty_driver
};
-static int tty_drivers_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &tty_drivers_op);
-}
-
-static const struct file_operations proc_tty_drivers_operations = {
- .open = tty_drivers_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
/*
* This function is called by tty_register_driver() to handle
* registering the driver's /proc handler into /proc/tty/driver/<foo>
@@ -147,11 +135,11 @@ void proc_tty_register_driver(struct tty_driver *driver)
struct proc_dir_entry *ent;
if (!driver->driver_name || driver->proc_entry ||
- !driver->ops->proc_fops)
+ !driver->ops->proc_show)
return;
- ent = proc_create_data(driver->driver_name, 0, proc_tty_driver,
- driver->ops->proc_fops, driver);
+ ent = proc_create_single_data(driver->driver_name, 0, proc_tty_driver,
+ driver->ops->proc_show, driver);
driver->proc_entry = ent;
}
@@ -186,6 +174,6 @@ void __init proc_tty_init(void)
* entry.
*/
proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL);
- proc_create("tty/ldiscs", 0, NULL, &tty_ldiscs_proc_fops);
- proc_create("tty/drivers", 0, NULL, &proc_tty_drivers_operations);
+ proc_create_seq("tty/ldiscs", 0, NULL, &tty_ldiscs_seq_ops);
+ proc_create_seq("tty/drivers", 0, NULL, &tty_drivers_op);
}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index ede8e64974be..61b7340b357a 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -123,23 +123,13 @@ static struct file_system_type proc_fs_type = {
void __init proc_root_init(void)
{
- int err;
-
- proc_init_inodecache();
+ proc_init_kmemcache();
set_proc_pid_nlink();
- err = register_filesystem(&proc_fs_type);
- if (err)
- return;
-
proc_self_init();
proc_thread_self_init();
proc_symlink("mounts", NULL, "self/mounts");
proc_net_init();
-
-#ifdef CONFIG_SYSVIPC
- proc_mkdir("sysvipc", NULL);
-#endif
proc_mkdir("fs", NULL);
proc_mkdir("driver", NULL);
proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */
@@ -150,6 +140,8 @@ void __init proc_root_init(void)
proc_tty_init();
proc_mkdir("bus", NULL);
proc_sys_init();
+
+ register_filesystem(&proc_fs_type);
}
static int proc_root_getattr(const struct path *path, struct kstat *stat,
@@ -207,12 +199,13 @@ struct proc_dir_entry proc_root = {
.namelen = 5,
.mode = S_IFDIR | S_IRUGO | S_IXUGO,
.nlink = 2,
- .count = ATOMIC_INIT(1),
+ .refcnt = REFCOUNT_INIT(1),
.proc_iops = &proc_root_inode_operations,
.proc_fops = &proc_root_operations,
.parent = &proc_root,
- .subdir = RB_ROOT_CACHED,
- .name = "/proc",
+ .subdir = RB_ROOT,
+ .name = proc_root.inline_name,
+ .inline_name = "/proc",
};
int pid_ns_prepare_proc(struct pid_namespace *ns)
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 4d7d061696b3..127265e5c55f 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -12,7 +12,7 @@ static const char *proc_self_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- struct pid_namespace *ns = inode->i_sb->s_fs_info;
+ struct pid_namespace *ns = proc_pid_ns(inode);
pid_t tgid = task_tgid_nr_ns(current, ns);
char *name;
@@ -36,7 +36,7 @@ static unsigned self_inum __ro_after_init;
int proc_setup_self(struct super_block *s)
{
struct inode *root_inode = d_inode(s->s_root);
- struct pid_namespace *ns = s->s_fs_info;
+ struct pid_namespace *ns = proc_pid_ns(root_inode);
struct dentry *self;
inode_lock(root_inode);
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 24072cc06e65..12901dcf57e2 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -25,21 +25,9 @@ static int show_softirqs(struct seq_file *p, void *v)
return 0;
}
-static int softirqs_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_softirqs, NULL);
-}
-
-static const struct file_operations proc_softirqs_operations = {
- .open = softirqs_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int __init proc_softirqs_init(void)
{
- proc_create("softirqs", 0, NULL, &proc_softirqs_operations);
+ proc_create_single("softirqs", 0, NULL, show_softirqs);
return 0;
}
fs_initcall(proc_softirqs_init);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ec6d2983a5cb..a20c6e495bb2 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -24,6 +24,8 @@
#include <asm/tlbflush.h>
#include "internal.h"
+#define SEQ_PUT_DEC(str, val) \
+ seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
unsigned long text, lib, swap, anon, file, shmem;
@@ -53,39 +55,28 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
lib = (mm->exec_vm << PAGE_SHIFT) - text;
swap = get_mm_counter(mm, MM_SWAPENTS);
- seq_printf(m,
- "VmPeak:\t%8lu kB\n"
- "VmSize:\t%8lu kB\n"
- "VmLck:\t%8lu kB\n"
- "VmPin:\t%8lu kB\n"
- "VmHWM:\t%8lu kB\n"
- "VmRSS:\t%8lu kB\n"
- "RssAnon:\t%8lu kB\n"
- "RssFile:\t%8lu kB\n"
- "RssShmem:\t%8lu kB\n"
- "VmData:\t%8lu kB\n"
- "VmStk:\t%8lu kB\n"
- "VmExe:\t%8lu kB\n"
- "VmLib:\t%8lu kB\n"
- "VmPTE:\t%8lu kB\n"
- "VmSwap:\t%8lu kB\n",
- hiwater_vm << (PAGE_SHIFT-10),
- total_vm << (PAGE_SHIFT-10),
- mm->locked_vm << (PAGE_SHIFT-10),
- mm->pinned_vm << (PAGE_SHIFT-10),
- hiwater_rss << (PAGE_SHIFT-10),
- total_rss << (PAGE_SHIFT-10),
- anon << (PAGE_SHIFT-10),
- file << (PAGE_SHIFT-10),
- shmem << (PAGE_SHIFT-10),
- mm->data_vm << (PAGE_SHIFT-10),
- mm->stack_vm << (PAGE_SHIFT-10),
- text >> 10,
- lib >> 10,
- mm_pgtables_bytes(mm) >> 10,
- swap << (PAGE_SHIFT-10));
+ SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
+ SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
+ SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
+ SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm);
+ SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
+ SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
+ SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
+ SEQ_PUT_DEC(" kB\nRssFile:\t", file);
+ SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem);
+ SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm);
+ SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm);
+ seq_put_decimal_ull_width(m,
+ " kB\nVmExe:\t", text >> 10, 8);
+ seq_put_decimal_ull_width(m,
+ " kB\nVmLib:\t", lib >> 10, 8);
+ seq_put_decimal_ull_width(m,
+ " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
+ SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
+ seq_puts(m, " kB\n");
hugetlb_report_usage(m, mm);
}
+#undef SEQ_PUT_DEC
unsigned long task_vsize(struct mm_struct *mm)
{
@@ -287,15 +278,18 @@ static void show_vma_header_prefix(struct seq_file *m,
dev_t dev, unsigned long ino)
{
seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
- seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
- start,
- end,
- flags & VM_READ ? 'r' : '-',
- flags & VM_WRITE ? 'w' : '-',
- flags & VM_EXEC ? 'x' : '-',
- flags & VM_MAYSHARE ? 's' : 'p',
- pgoff,
- MAJOR(dev), MINOR(dev), ino);
+ seq_put_hex_ll(m, NULL, start, 8);
+ seq_put_hex_ll(m, "-", end, 8);
+ seq_putc(m, ' ');
+ seq_putc(m, flags & VM_READ ? 'r' : '-');
+ seq_putc(m, flags & VM_WRITE ? 'w' : '-');
+ seq_putc(m, flags & VM_EXEC ? 'x' : '-');
+ seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p');
+ seq_put_hex_ll(m, " ", pgoff, 8);
+ seq_put_hex_ll(m, " ", MAJOR(dev), 2);
+ seq_put_hex_ll(m, ":", MINOR(dev), 2);
+ seq_put_decimal_ull(m, " ", ino);
+ seq_putc(m, ' ');
}
static void
@@ -694,8 +688,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
if (!mnemonics[i][0])
continue;
if (vma->vm_flags & (1UL << i)) {
- seq_printf(m, "%c%c ",
- mnemonics[i][0], mnemonics[i][1]);
+ seq_putc(m, mnemonics[i][0]);
+ seq_putc(m, mnemonics[i][1]);
+ seq_putc(m, ' ');
}
}
seq_putc(m, '\n');
@@ -736,6 +731,8 @@ void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
{
}
+#define SEQ_PUT_DEC(str, val) \
+ seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
static int show_smap(struct seq_file *m, void *v, int is_pid)
{
struct proc_maps_private *priv = m->private;
@@ -809,51 +806,34 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
ret = SEQ_SKIP;
}
- if (!rollup_mode)
- seq_printf(m,
- "Size: %8lu kB\n"
- "KernelPageSize: %8lu kB\n"
- "MMUPageSize: %8lu kB\n",
- (vma->vm_end - vma->vm_start) >> 10,
- vma_kernel_pagesize(vma) >> 10,
- vma_mmu_pagesize(vma) >> 10);
-
-
- if (!rollup_mode || last_vma)
- seq_printf(m,
- "Rss: %8lu kB\n"
- "Pss: %8lu kB\n"
- "Shared_Clean: %8lu kB\n"
- "Shared_Dirty: %8lu kB\n"
- "Private_Clean: %8lu kB\n"
- "Private_Dirty: %8lu kB\n"
- "Referenced: %8lu kB\n"
- "Anonymous: %8lu kB\n"
- "LazyFree: %8lu kB\n"
- "AnonHugePages: %8lu kB\n"
- "ShmemPmdMapped: %8lu kB\n"
- "Shared_Hugetlb: %8lu kB\n"
- "Private_Hugetlb: %7lu kB\n"
- "Swap: %8lu kB\n"
- "SwapPss: %8lu kB\n"
- "Locked: %8lu kB\n",
- mss->resident >> 10,
- (unsigned long)(mss->pss >> (10 + PSS_SHIFT)),
- mss->shared_clean >> 10,
- mss->shared_dirty >> 10,
- mss->private_clean >> 10,
- mss->private_dirty >> 10,
- mss->referenced >> 10,
- mss->anonymous >> 10,
- mss->lazyfree >> 10,
- mss->anonymous_thp >> 10,
- mss->shmem_thp >> 10,
- mss->shared_hugetlb >> 10,
- mss->private_hugetlb >> 10,
- mss->swap >> 10,
- (unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)),
- (unsigned long)(mss->pss >> (10 + PSS_SHIFT)));
+ if (!rollup_mode) {
+ SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start);
+ SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma));
+ SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma));
+ seq_puts(m, " kB\n");
+ }
+ if (!rollup_mode || last_vma) {
+ SEQ_PUT_DEC("Rss: ", mss->resident);
+ SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
+ SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean);
+ SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty);
+ SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean);
+ SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty);
+ SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced);
+ SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous);
+ SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
+ SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
+ SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
+ SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
+ seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
+ mss->private_hugetlb >> 10, 7);
+ SEQ_PUT_DEC(" kB\nSwap: ", mss->swap);
+ SEQ_PUT_DEC(" kB\nSwapPss: ",
+ mss->swap_pss >> PSS_SHIFT);
+ SEQ_PUT_DEC(" kB\nLocked: ", mss->pss >> PSS_SHIFT);
+ seq_puts(m, " kB\n");
+ }
if (!rollup_mode) {
arch_show_smap(m, vma);
show_smap_vma_flags(m, vma);
@@ -861,6 +841,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
m_cache_vma(m, vma);
return ret;
}
+#undef SEQ_PUT_DEC
static int show_pid_smap(struct seq_file *m, void *v)
{
@@ -956,7 +937,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
/*
* The soft-dirty tracker uses #PF-s to catch writes
* to pages, so write-protect the pte as well. See the
- * Documentation/vm/soft-dirty.txt for full description
+ * Documentation/admin-guide/mm/soft-dirty.rst for full description
* of how soft-dirty works.
*/
pte_t ptent = *pte;
@@ -1329,9 +1310,11 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
else if (is_swap_pmd(pmd)) {
swp_entry_t entry = pmd_to_swp_entry(pmd);
+ unsigned long offset = swp_offset(entry);
+ offset += (addr & ~PMD_MASK) >> PAGE_SHIFT;
frame = swp_type(entry) |
- (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
+ (offset << MAX_SWAPFILES_SHIFT);
flags |= PM_SWAP;
if (pmd_swp_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
@@ -1351,6 +1334,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
break;
if (pm->show_pfn && (flags & PM_PRESENT))
frame++;
+ else if (flags & PM_SWAP)
+ frame += (1 << MAX_SWAPFILES_SHIFT);
}
spin_unlock(ptl);
return err;
@@ -1436,7 +1421,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
* Bits 0-54 page frame number (PFN) if present
* Bits 0-4 swap type if swapped
* Bits 5-54 swap offset if swapped
- * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
+ * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
* Bit 56 page exclusively mapped
* Bits 57-60 zero
* Bit 61 page is file-page or shared-anon
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index 9d2efaca499f..b905010ca9eb 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -12,7 +12,7 @@ static const char *proc_thread_self_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- struct pid_namespace *ns = inode->i_sb->s_fs_info;
+ struct pid_namespace *ns = proc_pid_ns(inode);
pid_t tgid = task_tgid_nr_ns(current, ns);
pid_t pid = task_pid_nr_ns(current, ns);
char *name;
@@ -36,7 +36,7 @@ static unsigned thread_self_inum __ro_after_init;
int proc_setup_thread_self(struct super_block *s)
{
struct inode *root_inode = d_inode(s->s_root);
- struct pid_namespace *ns = s->s_fs_info;
+ struct pid_namespace *ns = proc_pid_ns(root_inode);
struct dentry *thread_self;
inode_lock(root_inode);
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 95a708d83721..3bd12f955867 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -30,21 +30,9 @@ static int uptime_proc_show(struct seq_file *m, void *v)
return 0;
}
-static int uptime_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, uptime_proc_show, NULL);
-}
-
-static const struct file_operations uptime_proc_fops = {
- .open = uptime_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int __init proc_uptime_init(void)
{
- proc_create("uptime", 0, NULL, &uptime_proc_fops);
+ proc_create_single("uptime", 0, NULL, uptime_proc_show);
return 0;
}
fs_initcall(proc_uptime_init);
diff --git a/fs/proc/version.c b/fs/proc/version.c
index 94901e8e700d..b449f186577f 100644
--- a/fs/proc/version.c
+++ b/fs/proc/version.c
@@ -15,21 +15,9 @@ static int version_proc_show(struct seq_file *m, void *v)
return 0;
}
-static int version_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, version_proc_show, NULL);
-}
-
-static const struct file_operations version_proc_fops = {
- .open = version_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int __init proc_version_init(void)
{
- proc_create("version", 0, NULL, &version_proc_fops);
+ proc_create_single("version", 0, NULL, version_proc_show);
return 0;
}
fs_initcall(proc_version_init);
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index b42e5bd6d8ff..09c19ef91526 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -1,5 +1,6 @@
config PSTORE
tristate "Persistent store support"
+ select CRYPTO if PSTORE_COMPRESS
default n
help
This option enables generic access to platform level
@@ -12,35 +13,89 @@ config PSTORE
If you don't have a platform persistent store driver,
say N.
-choice
- prompt "Choose compression algorithm"
- depends on PSTORE
- default PSTORE_ZLIB_COMPRESS
- help
- This option chooses compression algorithm.
-
-config PSTORE_ZLIB_COMPRESS
- bool "ZLIB"
- select ZLIB_DEFLATE
- select ZLIB_INFLATE
- help
- This option enables ZLIB compression algorithm support.
+config PSTORE_DEFLATE_COMPRESS
+ tristate "DEFLATE (ZLIB) compression"
+ default y
+ depends on PSTORE
+ select CRYPTO_DEFLATE
+ help
+ This option enables DEFLATE (also known as ZLIB) compression
+ algorithm support.
config PSTORE_LZO_COMPRESS
- bool "LZO"
- select LZO_COMPRESS
- select LZO_DECOMPRESS
- help
- This option enables LZO compression algorithm support.
+ tristate "LZO compression"
+ depends on PSTORE
+ select CRYPTO_LZO
+ help
+ This option enables LZO compression algorithm support.
config PSTORE_LZ4_COMPRESS
- bool "LZ4"
- select LZ4_COMPRESS
- select LZ4_DECOMPRESS
- help
- This option enables LZ4 compression algorithm support.
+ tristate "LZ4 compression"
+ depends on PSTORE
+ select CRYPTO_LZ4
+ help
+ This option enables LZ4 compression algorithm support.
+
+config PSTORE_LZ4HC_COMPRESS
+ tristate "LZ4HC compression"
+ depends on PSTORE
+ select CRYPTO_LZ4HC
+ help
+ This option enables LZ4HC (high compression) mode algorithm.
+
+config PSTORE_842_COMPRESS
+ bool "842 compression"
+ depends on PSTORE
+ select CRYPTO_842
+ help
+ This option enables 842 compression algorithm support.
+
+config PSTORE_COMPRESS
+ def_bool y
+ depends on PSTORE
+ depends on PSTORE_DEFLATE_COMPRESS || PSTORE_LZO_COMPRESS || \
+ PSTORE_LZ4_COMPRESS || PSTORE_LZ4HC_COMPRESS || \
+ PSTORE_842_COMPRESS
+
+choice
+ prompt "Default pstore compression algorithm"
+ depends on PSTORE_COMPRESS
+ help
+ This option chooses the default active compression algorithm.
+ This change be changed at boot with "pstore.compress=..." on
+ the kernel command line.
+
+ Currently, pstore has support for 5 compression algorithms:
+ deflate, lzo, lz4, lz4hc and 842.
+
+ The default compression algorithm is deflate.
+
+ config PSTORE_DEFLATE_COMPRESS_DEFAULT
+ bool "deflate" if PSTORE_DEFLATE_COMPRESS
+
+ config PSTORE_LZO_COMPRESS_DEFAULT
+ bool "lzo" if PSTORE_LZO_COMPRESS
+
+ config PSTORE_LZ4_COMPRESS_DEFAULT
+ bool "lz4" if PSTORE_LZ4_COMPRESS
+
+ config PSTORE_LZ4HC_COMPRESS_DEFAULT
+ bool "lz4hc" if PSTORE_LZ4HC_COMPRESS
+
+ config PSTORE_842_COMPRESS_DEFAULT
+ bool "842" if PSTORE_842_COMPRESS
+
endchoice
+config PSTORE_COMPRESS_DEFAULT
+ string
+ depends on PSTORE_COMPRESS
+ default "deflate" if PSTORE_DEFLATE_COMPRESS_DEFAULT
+ default "lzo" if PSTORE_LZO_COMPRESS_DEFAULT
+ default "lz4" if PSTORE_LZ4_COMPRESS_DEFAULT
+ default "lz4hc" if PSTORE_LZ4HC_COMPRESS_DEFAULT
+ default "842" if PSTORE_842_COMPRESS_DEFAULT
+
config PSTORE_CONSOLE
bool "Log kernel console messages"
depends on PSTORE
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index d814723fb27d..5fcb845b9fec 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -486,6 +486,8 @@ static int __init init_pstore_fs(void)
{
int err;
+ pstore_choose_compression();
+
/* Create a convenient mount point for people to access pstore */
err = sysfs_create_mount_point(fs_kobj, "pstore");
if (err)
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index c029314478fa..fb767e28aeb2 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -37,4 +37,7 @@ extern bool pstore_is_mounted(void);
extern void pstore_record_init(struct pstore_record *record,
struct pstore_info *psi);
+/* Called during module_init() */
+extern void __init pstore_choose_compression(void);
+
#endif
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index c3129b131e4d..dc720573fd53 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -28,15 +28,13 @@
#include <linux/console.h>
#include <linux/module.h>
#include <linux/pstore.h>
-#ifdef CONFIG_PSTORE_ZLIB_COMPRESS
-#include <linux/zlib.h>
-#endif
-#ifdef CONFIG_PSTORE_LZO_COMPRESS
+#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS)
#include <linux/lzo.h>
#endif
-#ifdef CONFIG_PSTORE_LZ4_COMPRESS
+#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) || IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS)
#include <linux/lz4.h>
#endif
+#include <linux/crypto.h>
#include <linux/string.h>
#include <linux/timer.h>
#include <linux/slab.h>
@@ -74,23 +72,18 @@ static DEFINE_SPINLOCK(pstore_lock);
struct pstore_info *psinfo;
static char *backend;
-
-/* Compression parameters */
-#ifdef CONFIG_PSTORE_ZLIB_COMPRESS
-#define COMPR_LEVEL 6
-#define WINDOW_BITS 12
-#define MEM_LEVEL 4
-static struct z_stream_s stream;
+static char *compress =
+#ifdef CONFIG_PSTORE_COMPRESS_DEFAULT
+ CONFIG_PSTORE_COMPRESS_DEFAULT;
#else
-static unsigned char *workspace;
+ NULL;
#endif
-struct pstore_zbackend {
- int (*compress)(const void *in, void *out, size_t inlen, size_t outlen);
- int (*decompress)(void *in, void *out, size_t inlen, size_t outlen);
- void (*allocate)(void);
- void (*free)(void);
+/* Compression parameters */
+static struct crypto_comp *tfm;
+struct pstore_zbackend {
+ int (*zbufsize)(size_t size);
const char *name;
};
@@ -149,77 +142,12 @@ bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
}
EXPORT_SYMBOL_GPL(pstore_cannot_block_path);
-#ifdef CONFIG_PSTORE_ZLIB_COMPRESS
-/* Derived from logfs_compress() */
-static int compress_zlib(const void *in, void *out, size_t inlen, size_t outlen)
-{
- int err, ret;
-
- ret = -EIO;
- err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS,
- MEM_LEVEL, Z_DEFAULT_STRATEGY);
- if (err != Z_OK)
- goto error;
-
- stream.next_in = in;
- stream.avail_in = inlen;
- stream.total_in = 0;
- stream.next_out = out;
- stream.avail_out = outlen;
- stream.total_out = 0;
-
- err = zlib_deflate(&stream, Z_FINISH);
- if (err != Z_STREAM_END)
- goto error;
-
- err = zlib_deflateEnd(&stream);
- if (err != Z_OK)
- goto error;
-
- if (stream.total_out >= stream.total_in)
- goto error;
-
- ret = stream.total_out;
-error:
- return ret;
-}
-
-/* Derived from logfs_uncompress */
-static int decompress_zlib(void *in, void *out, size_t inlen, size_t outlen)
+#if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS)
+static int zbufsize_deflate(size_t size)
{
- int err, ret;
-
- ret = -EIO;
- err = zlib_inflateInit2(&stream, WINDOW_BITS);
- if (err != Z_OK)
- goto error;
-
- stream.next_in = in;
- stream.avail_in = inlen;
- stream.total_in = 0;
- stream.next_out = out;
- stream.avail_out = outlen;
- stream.total_out = 0;
-
- err = zlib_inflate(&stream, Z_FINISH);
- if (err != Z_STREAM_END)
- goto error;
-
- err = zlib_inflateEnd(&stream);
- if (err != Z_OK)
- goto error;
-
- ret = stream.total_out;
-error:
- return ret;
-}
-
-static void allocate_zlib(void)
-{
- size_t size;
size_t cmpr;
- switch (psinfo->bufsize) {
+ switch (size) {
/* buffer range for efivars */
case 1000 ... 2000:
cmpr = 56;
@@ -239,212 +167,131 @@ static void allocate_zlib(void)
break;
}
- big_oops_buf_sz = (psinfo->bufsize * 100) / cmpr;
- big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
- if (big_oops_buf) {
- size = max(zlib_deflate_workspacesize(WINDOW_BITS, MEM_LEVEL),
- zlib_inflate_workspacesize());
- stream.workspace = kmalloc(size, GFP_KERNEL);
- if (!stream.workspace) {
- pr_err("No memory for compression workspace; skipping compression\n");
- kfree(big_oops_buf);
- big_oops_buf = NULL;
- }
- } else {
- pr_err("No memory for uncompressed data; skipping compression\n");
- stream.workspace = NULL;
- }
-
+ return (size * 100) / cmpr;
}
-
-static void free_zlib(void)
-{
- kfree(stream.workspace);
- stream.workspace = NULL;
- kfree(big_oops_buf);
- big_oops_buf = NULL;
- big_oops_buf_sz = 0;
-}
-
-static const struct pstore_zbackend backend_zlib = {
- .compress = compress_zlib,
- .decompress = decompress_zlib,
- .allocate = allocate_zlib,
- .free = free_zlib,
- .name = "zlib",
-};
#endif
-#ifdef CONFIG_PSTORE_LZO_COMPRESS
-static int compress_lzo(const void *in, void *out, size_t inlen, size_t outlen)
+#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS)
+static int zbufsize_lzo(size_t size)
{
- int ret;
-
- ret = lzo1x_1_compress(in, inlen, out, &outlen, workspace);
- if (ret != LZO_E_OK) {
- pr_err("lzo_compress error, ret = %d!\n", ret);
- return -EIO;
- }
-
- return outlen;
+ return lzo1x_worst_compress(size);
}
+#endif
-static int decompress_lzo(void *in, void *out, size_t inlen, size_t outlen)
+#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) || IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS)
+static int zbufsize_lz4(size_t size)
{
- int ret;
-
- ret = lzo1x_decompress_safe(in, inlen, out, &outlen);
- if (ret != LZO_E_OK) {
- pr_err("lzo_decompress error, ret = %d!\n", ret);
- return -EIO;
- }
-
- return outlen;
+ return LZ4_compressBound(size);
}
+#endif
-static void allocate_lzo(void)
+#if IS_ENABLED(CONFIG_PSTORE_842_COMPRESS)
+static int zbufsize_842(size_t size)
{
- big_oops_buf_sz = lzo1x_worst_compress(psinfo->bufsize);
- big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
- if (big_oops_buf) {
- workspace = kmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
- if (!workspace) {
- pr_err("No memory for compression workspace; skipping compression\n");
- kfree(big_oops_buf);
- big_oops_buf = NULL;
- }
- } else {
- pr_err("No memory for uncompressed data; skipping compression\n");
- workspace = NULL;
- }
+ return size;
}
+#endif
-static void free_lzo(void)
-{
- kfree(workspace);
- kfree(big_oops_buf);
- big_oops_buf = NULL;
- big_oops_buf_sz = 0;
-}
+static const struct pstore_zbackend *zbackend __ro_after_init;
-static const struct pstore_zbackend backend_lzo = {
- .compress = compress_lzo,
- .decompress = decompress_lzo,
- .allocate = allocate_lzo,
- .free = free_lzo,
- .name = "lzo",
-};
+static const struct pstore_zbackend zbackends[] = {
+#if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS)
+ {
+ .zbufsize = zbufsize_deflate,
+ .name = "deflate",
+ },
+#endif
+#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS)
+ {
+ .zbufsize = zbufsize_lzo,
+ .name = "lzo",
+ },
+#endif
+#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS)
+ {
+ .zbufsize = zbufsize_lz4,
+ .name = "lz4",
+ },
#endif
+#if IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS)
+ {
+ .zbufsize = zbufsize_lz4,
+ .name = "lz4hc",
+ },
+#endif
+#if IS_ENABLED(CONFIG_PSTORE_842_COMPRESS)
+ {
+ .zbufsize = zbufsize_842,
+ .name = "842",
+ },
+#endif
+ { }
+};
-#ifdef CONFIG_PSTORE_LZ4_COMPRESS
-static int compress_lz4(const void *in, void *out, size_t inlen, size_t outlen)
+static int pstore_compress(const void *in, void *out,
+ unsigned int inlen, unsigned int outlen)
{
int ret;
- ret = LZ4_compress_default(in, out, inlen, outlen, workspace);
- if (!ret) {
- pr_err("LZ4_compress_default error; compression failed!\n");
- return -EIO;
+ ret = crypto_comp_compress(tfm, in, inlen, out, &outlen);
+ if (ret) {
+ pr_err("crypto_comp_compress failed, ret = %d!\n", ret);
+ return ret;
}
- return ret;
+ return outlen;
}
-static int decompress_lz4(void *in, void *out, size_t inlen, size_t outlen)
+static int pstore_decompress(void *in, void *out,
+ unsigned int inlen, unsigned int outlen)
{
int ret;
- ret = LZ4_decompress_safe(in, out, inlen, outlen);
- if (ret < 0) {
- /*
- * LZ4_decompress_safe will return an error code
- * (< 0) if decompression failed
- */
- pr_err("LZ4_decompress_safe error, ret = %d!\n", ret);
- return -EIO;
+ ret = crypto_comp_decompress(tfm, in, inlen, out, &outlen);
+ if (ret) {
+ pr_err("crypto_comp_decompress failed, ret = %d!\n", ret);
+ return ret;
}
- return ret;
-}
-
-static void allocate_lz4(void)
-{
- big_oops_buf_sz = LZ4_compressBound(psinfo->bufsize);
- big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
- if (big_oops_buf) {
- workspace = kmalloc(LZ4_MEM_COMPRESS, GFP_KERNEL);
- if (!workspace) {
- pr_err("No memory for compression workspace; skipping compression\n");
- kfree(big_oops_buf);
- big_oops_buf = NULL;
- }
- } else {
- pr_err("No memory for uncompressed data; skipping compression\n");
- workspace = NULL;
- }
+ return outlen;
}
-static void free_lz4(void)
+static void allocate_buf_for_compression(void)
{
- kfree(workspace);
- kfree(big_oops_buf);
- big_oops_buf = NULL;
- big_oops_buf_sz = 0;
-}
-
-static const struct pstore_zbackend backend_lz4 = {
- .compress = compress_lz4,
- .decompress = decompress_lz4,
- .allocate = allocate_lz4,
- .free = free_lz4,
- .name = "lz4",
-};
-#endif
-
-static const struct pstore_zbackend *zbackend =
-#if defined(CONFIG_PSTORE_ZLIB_COMPRESS)
- &backend_zlib;
-#elif defined(CONFIG_PSTORE_LZO_COMPRESS)
- &backend_lzo;
-#elif defined(CONFIG_PSTORE_LZ4_COMPRESS)
- &backend_lz4;
-#else
- NULL;
-#endif
+ if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !zbackend)
+ return;
-static int pstore_compress(const void *in, void *out,
- size_t inlen, size_t outlen)
-{
- if (zbackend)
- return zbackend->compress(in, out, inlen, outlen);
- else
- return -EIO;
-}
+ if (!crypto_has_comp(zbackend->name, 0, 0)) {
+ pr_err("No %s compression\n", zbackend->name);
+ return;
+ }
-static int pstore_decompress(void *in, void *out, size_t inlen, size_t outlen)
-{
- if (zbackend)
- return zbackend->decompress(in, out, inlen, outlen);
- else
- return -EIO;
-}
+ big_oops_buf_sz = zbackend->zbufsize(psinfo->bufsize);
+ if (big_oops_buf_sz <= 0)
+ return;
-static void allocate_buf_for_compression(void)
-{
- if (zbackend) {
- pr_info("using %s compression\n", zbackend->name);
- zbackend->allocate();
- } else {
+ big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
+ if (!big_oops_buf) {
pr_err("allocate compression buffer error!\n");
+ return;
+ }
+
+ tfm = crypto_alloc_comp(zbackend->name, 0, 0);
+ if (IS_ERR_OR_NULL(tfm)) {
+ kfree(big_oops_buf);
+ big_oops_buf = NULL;
+ pr_err("crypto_alloc_comp() failed!\n");
+ return;
}
}
static void free_buf_for_compression(void)
{
- if (zbackend)
- zbackend->free();
- else
- pr_err("free compression buffer error!\n");
+ if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && !IS_ERR_OR_NULL(tfm))
+ crypto_free_comp(tfm);
+ kfree(big_oops_buf);
+ big_oops_buf = NULL;
+ big_oops_buf_sz = 0;
}
/*
@@ -901,5 +748,24 @@ static void pstore_timefunc(struct timer_list *unused)
jiffies + msecs_to_jiffies(pstore_update_ms));
}
+void __init pstore_choose_compression(void)
+{
+ const struct pstore_zbackend *step;
+
+ if (!compress)
+ return;
+
+ for (step = zbackends; step->name; step++) {
+ if (!strcmp(compress, step->name)) {
+ zbackend = step;
+ pr_info("using %s compression\n", zbackend->name);
+ return;
+ }
+ }
+}
+
+module_param(compress, charp, 0444);
+MODULE_PARM_DESC(compress, "Pstore compression to use");
+
module_param(backend, charp, 0444);
MODULE_PARM_DESC(backend, "Pstore backend to use");
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 7125b398d312..49b2bc114868 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -938,7 +938,7 @@ static int __init ramoops_init(void)
ramoops_register_dummy();
return platform_driver_register(&ramoops_driver);
}
-postcore_initcall(ramoops_init);
+late_initcall(ramoops_init);
static void __exit ramoops_exit(void)
{
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index e11672aa4575..951a14edcf51 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -98,24 +98,23 @@ static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
uint8_t *data, size_t len, uint8_t *ecc)
{
int i;
- uint16_t par[prz->ecc_info.ecc_size];
/* Initialize the parity buffer */
- memset(par, 0, sizeof(par));
- encode_rs8(prz->rs_decoder, data, len, par, 0);
+ memset(prz->ecc_info.par, 0,
+ prz->ecc_info.ecc_size * sizeof(prz->ecc_info.par[0]));
+ encode_rs8(prz->rs_decoder, data, len, prz->ecc_info.par, 0);
for (i = 0; i < prz->ecc_info.ecc_size; i++)
- ecc[i] = par[i];
+ ecc[i] = prz->ecc_info.par[i];
}
static int persistent_ram_decode_rs8(struct persistent_ram_zone *prz,
void *data, size_t len, uint8_t *ecc)
{
int i;
- uint16_t par[prz->ecc_info.ecc_size];
for (i = 0; i < prz->ecc_info.ecc_size; i++)
- par[i] = ecc[i];
- return decode_rs8(prz->rs_decoder, data, par, len,
+ prz->ecc_info.par[i] = ecc[i];
+ return decode_rs8(prz->rs_decoder, data, prz->ecc_info.par, len,
NULL, 0, NULL, 0, NULL);
}
@@ -228,6 +227,15 @@ static int persistent_ram_init_ecc(struct persistent_ram_zone *prz,
return -EINVAL;
}
+ /* allocate workspace instead of using stack VLA */
+ prz->ecc_info.par = kmalloc_array(prz->ecc_info.ecc_size,
+ sizeof(*prz->ecc_info.par),
+ GFP_KERNEL);
+ if (!prz->ecc_info.par) {
+ pr_err("cannot allocate ECC parity workspace\n");
+ return -ENOMEM;
+ }
+
prz->corrected_bytes = 0;
prz->bad_blocks = 0;
@@ -514,6 +522,13 @@ void persistent_ram_free(struct persistent_ram_zone *prz)
}
prz->vaddr = NULL;
}
+ if (prz->rs_decoder) {
+ free_rs(prz->rs_decoder);
+ prz->rs_decoder = NULL;
+ }
+ kfree(prz->ecc_info.par);
+ prz->ecc_info.par = NULL;
+
persistent_ram_free_old(prz);
kfree(prz);
}
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index eca27878079d..8d72221735d7 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -114,13 +114,9 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, unsigned i
brelse(bh);
foundinode = qnx4_iget(dir->i_sb, ino);
- if (IS_ERR(foundinode)) {
+ if (IS_ERR(foundinode))
QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n",
PTR_ERR(foundinode)));
- return ERR_CAST(foundinode);
- }
out:
- d_add(dentry, foundinode);
-
- return NULL;
+ return d_splice_alias(foundinode, dentry);
}
diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c
index 72c2770830be..e2e98e653b8d 100644
--- a/fs/qnx6/namei.c
+++ b/fs/qnx6/namei.c
@@ -29,15 +29,11 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry,
if (ino) {
foundinode = qnx6_iget(dir->i_sb, ino);
qnx6_put_page(page);
- if (IS_ERR(foundinode)) {
+ if (IS_ERR(foundinode))
pr_debug("lookup->iget -> error %ld\n",
PTR_ERR(foundinode));
- return ERR_CAST(foundinode);
- }
} else {
pr_debug("%s(): not found %s\n", __func__, name);
- return NULL;
}
- d_add(dentry, foundinode);
- return NULL;
+ return d_splice_alias(foundinode, dentry);
}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 020c597ef9b6..d88231e3b2be 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2966,7 +2966,7 @@ static int __init dquot_init(void)
NULL);
order = 0;
- dquot_hash = (struct hlist_head *)__get_free_pages(GFP_ATOMIC, order);
+ dquot_hash = (struct hlist_head *)__get_free_pages(GFP_KERNEL, order);
if (!dquot_hash)
panic("Cannot create dquot hash table");
diff --git a/fs/read_write.c b/fs/read_write.c
index c4eabbfc90df..e83bd9744b5d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -2023,7 +2023,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
ret = mnt_want_write_file(dst_file);
if (ret) {
info->status = ret;
- goto next_loop;
+ goto next_fdput;
}
dst_off = info->dest_offset;
@@ -2058,9 +2058,9 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
next_file:
mnt_drop_write_file(dst_file);
-next_loop:
+next_fdput:
fdput(dst_fd);
-
+next_loop:
if (fatal_signal_pending(current))
goto out;
}
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 70057359fbaf..23148c3ed675 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2643,7 +2643,7 @@ static int journal_init_dev(struct super_block *super,
if (IS_ERR(journal->j_dev_bd)) {
result = PTR_ERR(journal->j_dev_bd);
journal->j_dev_bd = NULL;
- reiserfs_warning(super,
+ reiserfs_warning(super, "sh-457",
"journal_init_dev: Cannot open '%s': %i",
jdev_name, result);
return result;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index bd39a998843d..5089dac02660 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -687,8 +687,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod
reiserfs_update_inode_transaction(inode);
reiserfs_update_inode_transaction(dir);
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
retval = journal_end(&th);
out_failed:
@@ -771,8 +770,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode
goto out_failed;
}
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
retval = journal_end(&th);
out_failed:
@@ -871,8 +869,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
/* the above add_entry did not update dir's stat data */
reiserfs_update_sd(&th, dir);
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
retval = journal_end(&th);
out_failed:
reiserfs_write_unlock(dir->i_sb);
@@ -1187,8 +1184,7 @@ static int reiserfs_symlink(struct inode *parent_dir,
goto out_failed;
}
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
retval = journal_end(&th);
out_failed:
reiserfs_write_unlock(parent_dir->i_sb);
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index fe999157dd97..e39b3910d24d 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -389,27 +389,13 @@ static int show_journal(struct seq_file *m, void *unused)
return 0;
}
-static int r_open(struct inode *inode, struct file *file)
-{
- return single_open(file, PDE_DATA(inode),
- proc_get_parent_data(inode));
-}
-
-static const struct file_operations r_file_operations = {
- .open = r_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static struct proc_dir_entry *proc_info_root = NULL;
static const char proc_info_root_name[] = "fs/reiserfs";
static void add_file(struct super_block *sb, char *name,
int (*func) (struct seq_file *, void *))
{
- proc_create_data(name, 0, REISERFS_SB(sb)->procdir,
- &r_file_operations, func);
+ proc_create_single_data(name, 0, REISERFS_SB(sb)->procdir, func, sb);
}
int reiserfs_proc_info_init(struct super_block *sb)
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 8f06fd1f3d69..6ccb51993a76 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -213,7 +213,7 @@ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
unsigned long offset, maxoff;
- struct inode *inode;
+ struct inode *inode = NULL;
struct romfs_inode ri;
const char *name; /* got from dentry */
int len, ret;
@@ -233,7 +233,7 @@ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
for (;;) {
if (!offset || offset >= maxoff)
- goto out0;
+ break;
ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri));
if (ret < 0)
@@ -244,37 +244,19 @@ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
len);
if (ret < 0)
goto error;
- if (ret == 1)
+ if (ret == 1) {
+ /* Hard link handling */
+ if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
+ offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
+ inode = romfs_iget(dir->i_sb, offset);
break;
+ }
/* next entry */
offset = be32_to_cpu(ri.next) & ROMFH_MASK;
}
- /* Hard link handling */
- if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
- offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
-
- inode = romfs_iget(dir->i_sb, offset);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- goto error;
- }
- goto outi;
-
- /*
- * it's a bit funky, _lookup needs to return an error code
- * (negative) or a NULL, both as a dentry. ENOENT should not
- * be returned, instead we need to create a negative dentry by
- * d_add(dentry, NULL); and return 0 as no error.
- * (Although as I see, it only matters on writable file
- * systems).
- */
-out0:
- inode = NULL;
-outi:
- d_add(dentry, inode);
- ret = 0;
+ return d_splice_alias(inode, dentry);
error:
return ERR_PTR(ret);
}
diff --git a/fs/select.c b/fs/select.c
index ba879c51288f..bc3cc0f98896 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -34,6 +34,29 @@
#include <linux/uaccess.h>
+__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
+{
+ if (file->f_op->poll) {
+ return file->f_op->poll(file, pt);
+ } else if (file_has_poll_mask(file)) {
+ unsigned int events = poll_requested_events(pt);
+ struct wait_queue_head *head;
+
+ if (pt && pt->_qproc) {
+ head = file->f_op->get_poll_head(file, events);
+ if (!head)
+ return DEFAULT_POLLMASK;
+ if (IS_ERR(head))
+ return EPOLLERR;
+ pt->_qproc(file, head, pt);
+ }
+
+ return file->f_op->poll_mask(file, events);
+ } else {
+ return DEFAULT_POLLMASK;
+ }
+}
+EXPORT_SYMBOL_GPL(vfs_poll);
/*
* Estimate expected accuracy in ns from a timeval.
@@ -233,7 +256,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
add_wait_queue(wait_address, &entry->wait);
}
-int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+static int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
ktime_t *expires, unsigned long slack)
{
int rc = -EINTR;
@@ -258,7 +281,6 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
return rc;
}
-EXPORT_SYMBOL(poll_schedule_timeout);
/**
* poll_select_set_timeout - helper function to setup the timeout value
@@ -503,14 +525,10 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
continue;
f = fdget(i);
if (f.file) {
- const struct file_operations *f_op;
- f_op = f.file->f_op;
- mask = DEFAULT_POLLMASK;
- if (f_op->poll) {
- wait_key_set(wait, in, out,
- bit, busy_flag);
- mask = (*f_op->poll)(f.file, wait);
- }
+ wait_key_set(wait, in, out, bit,
+ busy_flag);
+ mask = vfs_poll(f.file, wait);
+
fdput(f);
if ((mask & POLLIN_SET) && (in & bit)) {
res_in |= bit;
@@ -813,34 +831,29 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
bool *can_busy_poll,
__poll_t busy_flag)
{
- __poll_t mask;
- int fd;
-
- mask = 0;
- fd = pollfd->fd;
- if (fd >= 0) {
- struct fd f = fdget(fd);
- mask = EPOLLNVAL;
- if (f.file) {
- /* userland u16 ->events contains POLL... bitmap */
- __poll_t filter = demangle_poll(pollfd->events) |
- EPOLLERR | EPOLLHUP;
- mask = DEFAULT_POLLMASK;
- if (f.file->f_op->poll) {
- pwait->_key = filter;
- pwait->_key |= busy_flag;
- mask = f.file->f_op->poll(f.file, pwait);
- if (mask & busy_flag)
- *can_busy_poll = true;
- }
- /* Mask out unneeded events. */
- mask &= filter;
- fdput(f);
- }
- }
+ int fd = pollfd->fd;
+ __poll_t mask = 0, filter;
+ struct fd f;
+
+ if (fd < 0)
+ goto out;
+ mask = EPOLLNVAL;
+ f = fdget(fd);
+ if (!f.file)
+ goto out;
+
+ /* userland u16 ->events contains POLL... bitmap */
+ filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
+ pwait->_key = filter | busy_flag;
+ mask = vfs_poll(f.file, pwait);
+ if (mask & busy_flag)
+ *can_busy_poll = true;
+ mask &= filter; /* Mask out unneeded events. */
+ fdput(f);
+
+out:
/* ... and so does ->revents */
pollfd->revents = mangle_poll(mask);
-
return mask;
}
diff --git a/fs/seq_file.c b/fs/seq_file.c
index eea09f6d8830..4cc090b50cc5 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -6,6 +6,7 @@
* initial implementation -- AV, Oct 2001.
*/
+#include <linux/cache.h>
#include <linux/fs.h>
#include <linux/export.h>
#include <linux/seq_file.h>
@@ -19,6 +20,8 @@
#include <linux/uaccess.h>
#include <asm/page.h>
+static struct kmem_cache *seq_file_cache __ro_after_init;
+
static void seq_set_overflow(struct seq_file *m)
{
m->count = m->size;
@@ -26,7 +29,7 @@ static void seq_set_overflow(struct seq_file *m)
static void *seq_buf_alloc(unsigned long size)
{
- return kvmalloc(size, GFP_KERNEL);
+ return kvmalloc(size, GFP_KERNEL_ACCOUNT);
}
/**
@@ -51,7 +54,7 @@ int seq_open(struct file *file, const struct seq_operations *op)
WARN_ON(file->private_data);
- p = kzalloc(sizeof(*p), GFP_KERNEL);
+ p = kmem_cache_zalloc(seq_file_cache, GFP_KERNEL);
if (!p)
return -ENOMEM;
@@ -366,7 +369,7 @@ int seq_release(struct inode *inode, struct file *file)
{
struct seq_file *m = file->private_data;
kvfree(m->buf);
- kfree(m);
+ kmem_cache_free(seq_file_cache, m);
return 0;
}
EXPORT_SYMBOL(seq_release);
@@ -563,7 +566,7 @@ static void single_stop(struct seq_file *p, void *v)
int single_open(struct file *file, int (*show)(struct seq_file *, void *),
void *data)
{
- struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL);
+ struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_ACCOUNT);
int res = -ENOMEM;
if (op) {
@@ -625,7 +628,7 @@ void *__seq_open_private(struct file *f, const struct seq_operations *ops,
void *private;
struct seq_file *seq;
- private = kzalloc(psize, GFP_KERNEL);
+ private = kzalloc(psize, GFP_KERNEL_ACCOUNT);
if (private == NULL)
goto out;
@@ -673,37 +676,40 @@ void seq_puts(struct seq_file *m, const char *s)
}
EXPORT_SYMBOL(seq_puts);
-/*
+/**
* A helper routine for putting decimal numbers without rich format of printf().
* only 'unsigned long long' is supported.
- * This routine will put strlen(delimiter) + number into seq_file.
+ * @m: seq_file identifying the buffer to which data should be written
+ * @delimiter: a string which is printed before the number
+ * @num: the number
+ * @width: a minimum field width
+ *
+ * This routine will put strlen(delimiter) + number into seq_filed.
* This routine is very quick when you show lots of numbers.
* In usual cases, it will be better to use seq_printf(). It's easier to read.
*/
-void seq_put_decimal_ull(struct seq_file *m, const char *delimiter,
- unsigned long long num)
+void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter,
+ unsigned long long num, unsigned int width)
{
int len;
if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */
goto overflow;
- len = strlen(delimiter);
- if (m->count + len >= m->size)
- goto overflow;
+ if (delimiter && delimiter[0]) {
+ if (delimiter[1] == 0)
+ seq_putc(m, delimiter[0]);
+ else
+ seq_puts(m, delimiter);
+ }
- memcpy(m->buf + m->count, delimiter, len);
- m->count += len;
+ if (!width)
+ width = 1;
- if (m->count + 1 >= m->size)
+ if (m->count + width >= m->size)
goto overflow;
- if (num < 10) {
- m->buf[m->count++] = num + '0';
- return;
- }
-
- len = num_to_str(m->buf + m->count, m->size - m->count, num);
+ len = num_to_str(m->buf + m->count, m->size - m->count, num, width);
if (!len)
goto overflow;
@@ -713,8 +719,60 @@ void seq_put_decimal_ull(struct seq_file *m, const char *delimiter,
overflow:
seq_set_overflow(m);
}
+
+void seq_put_decimal_ull(struct seq_file *m, const char *delimiter,
+ unsigned long long num)
+{
+ return seq_put_decimal_ull_width(m, delimiter, num, 0);
+}
EXPORT_SYMBOL(seq_put_decimal_ull);
+/**
+ * seq_put_hex_ll - put a number in hexadecimal notation
+ * @m: seq_file identifying the buffer to which data should be written
+ * @delimiter: a string which is printed before the number
+ * @v: the number
+ * @width: a minimum field width
+ *
+ * seq_put_hex_ll(m, "", v, 8) is equal to seq_printf(m, "%08llx", v)
+ *
+ * This routine is very quick when you show lots of numbers.
+ * In usual cases, it will be better to use seq_printf(). It's easier to read.
+ */
+void seq_put_hex_ll(struct seq_file *m, const char *delimiter,
+ unsigned long long v, unsigned int width)
+{
+ unsigned int len;
+ int i;
+
+ if (delimiter && delimiter[0]) {
+ if (delimiter[1] == 0)
+ seq_putc(m, delimiter[0]);
+ else
+ seq_puts(m, delimiter);
+ }
+
+ /* If x is 0, the result of __builtin_clzll is undefined */
+ if (v == 0)
+ len = 1;
+ else
+ len = (sizeof(v) * 8 - __builtin_clzll(v) + 3) / 4;
+
+ if (len < width)
+ len = width;
+
+ if (m->count + len > m->size) {
+ seq_set_overflow(m);
+ return;
+ }
+
+ for (i = len - 1; i >= 0; i--) {
+ m->buf[m->count + i] = hex_asc[0xf & v];
+ v = v >> 4;
+ }
+ m->count += len;
+}
+
void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num)
{
int len;
@@ -722,12 +780,12 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num
if (m->count + 3 >= m->size) /* we'll write 2 bytes at least */
goto overflow;
- len = strlen(delimiter);
- if (m->count + len >= m->size)
- goto overflow;
-
- memcpy(m->buf + m->count, delimiter, len);
- m->count += len;
+ if (delimiter && delimiter[0]) {
+ if (delimiter[1] == 0)
+ seq_putc(m, delimiter[0]);
+ else
+ seq_puts(m, delimiter);
+ }
if (m->count + 2 >= m->size)
goto overflow;
@@ -742,7 +800,7 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num
return;
}
- len = num_to_str(m->buf + m->count, m->size - m->count, num);
+ len = num_to_str(m->buf + m->count, m->size - m->count, num, 0);
if (!len)
goto overflow;
@@ -782,8 +840,14 @@ EXPORT_SYMBOL(seq_write);
void seq_pad(struct seq_file *m, char c)
{
int size = m->pad_until - m->count;
- if (size > 0)
- seq_printf(m, "%*s", size, "");
+ if (size > 0) {
+ if (size + m->count > m->size) {
+ seq_set_overflow(m);
+ return;
+ }
+ memset(m->buf + m->count, ' ', size);
+ m->count += size;
+ }
if (c)
seq_putc(m, c);
}
@@ -1040,3 +1104,8 @@ seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head,
return NULL;
}
EXPORT_SYMBOL(seq_hlist_next_percpu);
+
+void __init seq_file_init(void)
+{
+ seq_file_cache = KMEM_CACHE(seq_file, SLAB_ACCOUNT|SLAB_PANIC);
+}
diff --git a/fs/signalfd.c b/fs/signalfd.c
index d2187a813376..cbb42f77a2bd 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -81,83 +81,86 @@ static __poll_t signalfd_poll(struct file *file, poll_table *wait)
static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
siginfo_t const *kinfo)
{
- long err;
+ struct signalfd_siginfo new;
BUILD_BUG_ON(sizeof(struct signalfd_siginfo) != 128);
/*
* Unused members should be zero ...
*/
- err = __clear_user(uinfo, sizeof(*uinfo));
+ memset(&new, 0, sizeof(new));
/*
* If you change siginfo_t structure, please be sure
* this code is fixed accordingly.
*/
- err |= __put_user(kinfo->si_signo, &uinfo->ssi_signo);
- err |= __put_user(kinfo->si_errno, &uinfo->ssi_errno);
- err |= __put_user(kinfo->si_code, &uinfo->ssi_code);
+ new.ssi_signo = kinfo->si_signo;
+ new.ssi_errno = kinfo->si_errno;
+ new.ssi_code = kinfo->si_code;
switch (siginfo_layout(kinfo->si_signo, kinfo->si_code)) {
case SIL_KILL:
- err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
- err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
+ new.ssi_pid = kinfo->si_pid;
+ new.ssi_uid = kinfo->si_uid;
break;
case SIL_TIMER:
- err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid);
- err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun);
- err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
- err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
+ new.ssi_tid = kinfo->si_tid;
+ new.ssi_overrun = kinfo->si_overrun;
+ new.ssi_ptr = (long) kinfo->si_ptr;
+ new.ssi_int = kinfo->si_int;
break;
case SIL_POLL:
- err |= __put_user(kinfo->si_band, &uinfo->ssi_band);
- err |= __put_user(kinfo->si_fd, &uinfo->ssi_fd);
+ new.ssi_band = kinfo->si_band;
+ new.ssi_fd = kinfo->si_fd;
break;
- case SIL_FAULT:
- err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr);
-#ifdef __ARCH_SI_TRAPNO
- err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
-#endif
-#ifdef BUS_MCEERR_AO
+ case SIL_FAULT_BNDERR:
+ case SIL_FAULT_PKUERR:
/*
- * Other callers might not initialize the si_lsb field,
- * so check explicitly for the right codes here.
+ * Fall through to the SIL_FAULT case. Both SIL_FAULT_BNDERR
+ * and SIL_FAULT_PKUERR are only generated by faults that
+ * deliver them synchronously to userspace. In case someone
+ * injects one of these signals and signalfd catches it treat
+ * it as SIL_FAULT.
*/
- if (kinfo->si_signo == SIGBUS &&
- kinfo->si_code == BUS_MCEERR_AO)
- err |= __put_user((short) kinfo->si_addr_lsb,
- &uinfo->ssi_addr_lsb);
+ case SIL_FAULT:
+ new.ssi_addr = (long) kinfo->si_addr;
+#ifdef __ARCH_SI_TRAPNO
+ new.ssi_trapno = kinfo->si_trapno;
#endif
-#ifdef BUS_MCEERR_AR
- /*
- * Other callers might not initialize the si_lsb field,
- * so check explicitly for the right codes here.
- */
- if (kinfo->si_signo == SIGBUS &&
- kinfo->si_code == BUS_MCEERR_AR)
- err |= __put_user((short) kinfo->si_addr_lsb,
- &uinfo->ssi_addr_lsb);
+ break;
+ case SIL_FAULT_MCEERR:
+ new.ssi_addr = (long) kinfo->si_addr;
+#ifdef __ARCH_SI_TRAPNO
+ new.ssi_trapno = kinfo->si_trapno;
#endif
+ new.ssi_addr_lsb = (short) kinfo->si_addr_lsb;
break;
case SIL_CHLD:
- err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
- err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
- err |= __put_user(kinfo->si_status, &uinfo->ssi_status);
- err |= __put_user(kinfo->si_utime, &uinfo->ssi_utime);
- err |= __put_user(kinfo->si_stime, &uinfo->ssi_stime);
+ new.ssi_pid = kinfo->si_pid;
+ new.ssi_uid = kinfo->si_uid;
+ new.ssi_status = kinfo->si_status;
+ new.ssi_utime = kinfo->si_utime;
+ new.ssi_stime = kinfo->si_stime;
break;
case SIL_RT:
- default:
/*
* This case catches also the signals queued by sigqueue().
*/
- err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
- err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
- err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
- err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
+ new.ssi_pid = kinfo->si_pid;
+ new.ssi_uid = kinfo->si_uid;
+ new.ssi_ptr = (long) kinfo->si_ptr;
+ new.ssi_int = kinfo->si_int;
+ break;
+ case SIL_SYS:
+ new.ssi_call_addr = (long) kinfo->si_call_addr;
+ new.ssi_syscall = kinfo->si_syscall;
+ new.ssi_arch = kinfo->si_arch;
break;
}
- return err ? -EFAULT: sizeof(*uinfo);
+ if (copy_to_user(uinfo, &new, sizeof(struct signalfd_siginfo)))
+ return -EFAULT;
+
+ return sizeof(*uinfo);
}
static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, siginfo_t *info,
diff --git a/fs/super.c b/fs/super.c
index 672538ca9831..50728d9c1a05 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -37,6 +37,7 @@
#include <linux/user_namespace.h>
#include "internal.h"
+static int thaw_super_locked(struct super_block *sb);
static LIST_HEAD(super_blocks);
static DEFINE_SPINLOCK(sb_lock);
@@ -120,13 +121,23 @@ static unsigned long super_cache_count(struct shrinker *shrink,
sb = container_of(shrink, struct super_block, s_shrink);
/*
- * Don't call trylock_super as it is a potential
- * scalability bottleneck. The counts could get updated
- * between super_cache_count and super_cache_scan anyway.
- * Call to super_cache_count with shrinker_rwsem held
- * ensures the safety of call to list_lru_shrink_count() and
- * s_op->nr_cached_objects().
+ * We don't call trylock_super() here as it is a scalability bottleneck,
+ * so we're exposed to partial setup state. The shrinker rwsem does not
+ * protect filesystem operations backing list_lru_shrink_count() or
+ * s_op->nr_cached_objects(). Counts can change between
+ * super_cache_count and super_cache_scan, so we really don't need locks
+ * here.
+ *
+ * However, if we are currently mounting the superblock, the underlying
+ * filesystem might be in a state of partial construction and hence it
+ * is dangerous to access it. trylock_super() uses a SB_BORN check to
+ * avoid this situation, so do the same here. The memory barrier is
+ * matched with the one in mount_fs() as we don't hold locks here.
*/
+ if (!(sb->s_flags & SB_BORN))
+ return 0;
+ smp_rmb();
+
if (sb->s_op && sb->s_op->nr_cached_objects)
total_objects = sb->s_op->nr_cached_objects(sb, sc);
@@ -166,6 +177,7 @@ static void destroy_unused_super(struct super_block *s)
security_sb_free(s);
put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
+ free_prealloced_shrinker(&s->s_shrink);
/* no delays needed */
destroy_super_work(&s->destroy_work);
}
@@ -251,6 +263,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
s->s_shrink.count_objects = super_cache_count;
s->s_shrink.batch = 1024;
s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
+ if (prealloc_shrinker(&s->s_shrink))
+ goto fail;
return s;
fail:
@@ -517,11 +531,7 @@ retry:
hlist_add_head(&s->s_instances, &type->fs_supers);
spin_unlock(&sb_lock);
get_filesystem(type);
- err = register_shrinker(&s->s_shrink);
- if (err) {
- deactivate_locked_super(s);
- s = ERR_PTR(err);
- }
+ register_shrinker_prepared(&s->s_shrink);
return s;
}
@@ -574,6 +584,28 @@ void drop_super_exclusive(struct super_block *sb)
}
EXPORT_SYMBOL(drop_super_exclusive);
+static void __iterate_supers(void (*f)(struct super_block *))
+{
+ struct super_block *sb, *p = NULL;
+
+ spin_lock(&sb_lock);
+ list_for_each_entry(sb, &super_blocks, s_list) {
+ if (hlist_unhashed(&sb->s_instances))
+ continue;
+ sb->s_count++;
+ spin_unlock(&sb_lock);
+
+ f(sb);
+
+ spin_lock(&sb_lock);
+ if (p)
+ __put_super(p);
+ p = sb;
+ }
+ if (p)
+ __put_super(p);
+ spin_unlock(&sb_lock);
+}
/**
* iterate_supers - call function for all active superblocks
* @f: function to call
@@ -881,33 +913,22 @@ cancel_readonly:
return retval;
}
-static void do_emergency_remount(struct work_struct *work)
+static void do_emergency_remount_callback(struct super_block *sb)
{
- struct super_block *sb, *p = NULL;
-
- spin_lock(&sb_lock);
- list_for_each_entry(sb, &super_blocks, s_list) {
- if (hlist_unhashed(&sb->s_instances))
- continue;
- sb->s_count++;
- spin_unlock(&sb_lock);
- down_write(&sb->s_umount);
- if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) &&
- !sb_rdonly(sb)) {
- /*
- * What lock protects sb->s_flags??
- */
- do_remount_sb(sb, SB_RDONLY, NULL, 1);
- }
- up_write(&sb->s_umount);
- spin_lock(&sb_lock);
- if (p)
- __put_super(p);
- p = sb;
+ down_write(&sb->s_umount);
+ if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) &&
+ !sb_rdonly(sb)) {
+ /*
+ * What lock protects sb->s_flags??
+ */
+ do_remount_sb(sb, SB_RDONLY, NULL, 1);
}
- if (p)
- __put_super(p);
- spin_unlock(&sb_lock);
+ up_write(&sb->s_umount);
+}
+
+static void do_emergency_remount(struct work_struct *work)
+{
+ __iterate_supers(do_emergency_remount_callback);
kfree(work);
printk("Emergency Remount complete\n");
}
@@ -923,6 +944,40 @@ void emergency_remount(void)
}
}
+static void do_thaw_all_callback(struct super_block *sb)
+{
+ down_write(&sb->s_umount);
+ if (sb->s_root && sb->s_flags & SB_BORN) {
+ emergency_thaw_bdev(sb);
+ thaw_super_locked(sb);
+ } else {
+ up_write(&sb->s_umount);
+ }
+}
+
+static void do_thaw_all(struct work_struct *work)
+{
+ __iterate_supers(do_thaw_all_callback);
+ kfree(work);
+ printk(KERN_WARNING "Emergency Thaw complete\n");
+}
+
+/**
+ * emergency_thaw_all -- forcibly thaw every frozen filesystem
+ *
+ * Used for emergency unfreeze of all filesystems via SysRq
+ */
+void emergency_thaw_all(void)
+{
+ struct work_struct *work;
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work) {
+ INIT_WORK(work, do_thaw_all);
+ schedule_work(work);
+ }
+}
+
/*
* Unnamed block devices are dummy devices used by virtual
* filesystems which don't use real block-devices. -- jrs
@@ -1227,6 +1282,14 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
sb = root->d_sb;
BUG_ON(!sb);
WARN_ON(!sb->s_bdi);
+
+ /*
+ * Write barrier is for super_cache_count(). We place it before setting
+ * SB_BORN as the data dependency between the two functions is the
+ * superblock structure contents that we just set up, not the SB_BORN
+ * flag.
+ */
+ smp_wmb();
sb->s_flags |= SB_BORN;
error = security_sb_kern_mount(sb, flags, secdata);
@@ -1492,11 +1555,10 @@ EXPORT_SYMBOL(freeze_super);
*
* Unlocks the filesystem and marks it writeable again after freeze_super().
*/
-int thaw_super(struct super_block *sb)
+static int thaw_super_locked(struct super_block *sb)
{
int error;
- down_write(&sb->s_umount);
if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) {
up_write(&sb->s_umount);
return -EINVAL;
@@ -1527,4 +1589,10 @@ out:
deactivate_locked_super(sb);
return 0;
}
+
+int thaw_super(struct super_block *sb)
+{
+ down_write(&sb->s_umount);
+ return thaw_super_locked(sb);
+}
EXPORT_SYMBOL(thaw_super);
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index b428d317ae92..92682fcc41f6 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -25,7 +25,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
{
struct dentry *root;
void *ns;
- bool new_sb;
+ bool new_sb = false;
if (!(flags & SB_KERNMOUNT)) {
if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
@@ -35,9 +35,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
root = kernfs_mount_ns(fs_type, flags, sysfs_root,
SYSFS_MAGIC, &new_sb, ns);
- if (IS_ERR(root) || !new_sb)
+ if (!new_sb)
kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
- else if (new_sb)
+ else if (!IS_ERR(root))
root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
return root;
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 250b0755b908..4d5d20491ffd 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -51,14 +51,9 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, un
if (dentry->d_name.len > SYSV_NAMELEN)
return ERR_PTR(-ENAMETOOLONG);
ino = sysv_inode_by_name(dentry);
-
- if (ino) {
+ if (ino)
inode = sysv_iget(dir->i_sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
- }
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
}
static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode, dev_t rdev)
diff --git a/fs/timerfd.c b/fs/timerfd.c
index cdad49da3ff7..d84a2bee4f82 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -226,21 +226,20 @@ static int timerfd_release(struct inode *inode, struct file *file)
kfree_rcu(ctx, rcu);
return 0;
}
-
-static __poll_t timerfd_poll(struct file *file, poll_table *wait)
+
+static struct wait_queue_head *timerfd_get_poll_head(struct file *file,
+ __poll_t eventmask)
{
struct timerfd_ctx *ctx = file->private_data;
- __poll_t events = 0;
- unsigned long flags;
- poll_wait(file, &ctx->wqh, wait);
+ return &ctx->wqh;
+}
- spin_lock_irqsave(&ctx->wqh.lock, flags);
- if (ctx->ticks)
- events |= EPOLLIN;
- spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+static __poll_t timerfd_poll_mask(struct file *file, __poll_t eventmask)
+{
+ struct timerfd_ctx *ctx = file->private_data;
- return events;
+ return ctx->ticks ? EPOLLIN : 0;
}
static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
@@ -364,7 +363,8 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg
static const struct file_operations timerfd_fops = {
.release = timerfd_release,
- .poll = timerfd_poll,
+ .get_poll_head = timerfd_get_poll_head,
+ .poll_mask = timerfd_poll_mask,
.read = timerfd_read,
.llseek = noop_llseek,
.show_fdinfo = timerfd_show,
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 616a688f5d8f..55c508fe8131 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -24,14 +24,6 @@ static bool ubifs_crypt_empty_dir(struct inode *inode)
return ubifs_check_dir_empty(inode) == 0;
}
-static unsigned int ubifs_crypt_max_namelen(struct inode *inode)
-{
- if (S_ISLNK(inode->i_mode))
- return UBIFS_MAX_INO_DATA;
- else
- return UBIFS_MAX_NLEN;
-}
-
int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn,
unsigned int in_len, unsigned int *out_len, int block)
{
@@ -89,5 +81,5 @@ const struct fscrypt_operations ubifs_crypt_operations = {
.get_context = ubifs_crypt_get_context,
.set_context = ubifs_crypt_set_context,
.empty_dir = ubifs_crypt_empty_dir,
- .max_namelen = ubifs_crypt_max_namelen,
+ .max_namelen = UBIFS_MAX_NLEN,
};
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 9d7fb88e172e..4e267cc21c77 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -214,7 +214,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
int err;
union ubifs_key key;
struct inode *inode = NULL;
- struct ubifs_dent_node *dent;
+ struct ubifs_dent_node *dent = NULL;
struct ubifs_info *c = dir->i_sb->s_fs_info;
struct fscrypt_name nm;
@@ -229,14 +229,14 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
return ERR_PTR(err);
if (fname_len(&nm) > UBIFS_MAX_NLEN) {
- err = -ENAMETOOLONG;
- goto out_fname;
+ inode = ERR_PTR(-ENAMETOOLONG);
+ goto done;
}
dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
if (!dent) {
- err = -ENOMEM;
- goto out_fname;
+ inode = ERR_PTR(-ENOMEM);
+ goto done;
}
if (nm.hash) {
@@ -250,16 +250,16 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
}
if (err) {
- if (err == -ENOENT) {
+ if (err == -ENOENT)
dbg_gen("not found");
- goto done;
- }
- goto out_dent;
+ else
+ inode = ERR_PTR(err);
+ goto done;
}
if (dbg_check_name(c, dent, &nm)) {
- err = -EINVAL;
- goto out_dent;
+ inode = ERR_PTR(-EINVAL);
+ goto done;
}
inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum));
@@ -272,7 +272,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
ubifs_err(c, "dead directory entry '%pd', error %d",
dentry, err);
ubifs_ro_mode(c, err);
- goto out_dent;
+ goto done;
}
if (ubifs_crypt_is_encrypted(dir) &&
@@ -280,27 +280,14 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
!fscrypt_has_permitted_context(dir, inode)) {
ubifs_warn(c, "Inconsistent encryption contexts: %lu/%lu",
dir->i_ino, inode->i_ino);
- err = -EPERM;
- goto out_inode;
+ iput(inode);
+ inode = ERR_PTR(-EPERM);
}
done:
kfree(dent);
fscrypt_free_filename(&nm);
- /*
- * Note, d_splice_alias() would be required instead if we supported
- * NFS.
- */
- d_add(dentry, inode);
- return NULL;
-
-out_inode:
- iput(inode);
-out_dent:
- kfree(dent);
-out_fname:
- fscrypt_free_filename(&nm);
- return ERR_PTR(err);
+ return d_splice_alias(inode, dentry);
}
static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index cf348ba99238..1acb2ff505e6 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1256,7 +1256,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
* Inode length changed, so we have to make sure
* @I_DIRTY_DATASYNC is set.
*/
- __mark_inode_dirty(inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
else
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 2dcf3d473fec..9571616b5dda 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -632,7 +632,7 @@ static int scan_for_idx_cb(struct ubifs_info *c,
*/
static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c)
{
- struct ubifs_lprops *lprops;
+ const struct ubifs_lprops *lprops;
struct scan_data data;
int err;
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 6c3a1abd0e22..f5a46844340c 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -244,7 +244,6 @@ static void remove_from_lpt_heap(struct ubifs_info *c,
/**
* lpt_heap_replace - replace lprops in a category heap.
* @c: UBIFS file-system description object
- * @old_lprops: LEB properties to replace
* @new_lprops: LEB properties with which to replace
* @cat: LEB category
*
@@ -254,7 +253,6 @@ static void remove_from_lpt_heap(struct ubifs_info *c,
* lprops. This function does that.
*/
static void lpt_heap_replace(struct ubifs_info *c,
- struct ubifs_lprops *old_lprops,
struct ubifs_lprops *new_lprops, int cat)
{
struct ubifs_lpt_heap *heap;
@@ -362,7 +360,7 @@ void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
case LPROPS_DIRTY:
case LPROPS_DIRTY_IDX:
case LPROPS_FREE:
- lpt_heap_replace(c, old_lprops, new_lprops, cat);
+ lpt_heap_replace(c, new_lprops, cat);
break;
case LPROPS_UNCAT:
case LPROPS_EMPTY:
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index aab87340d3de..16f03d9929e5 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -175,7 +175,6 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
int lnum, int offs)
{
- lnum = lnum;
dbg_scan("stop scanning LEB %d at offset %d", lnum, offs);
ubifs_assert(offs % c->min_io_size == 0);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index b16ef162344a..6c397a389105 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1737,8 +1737,11 @@ static void ubifs_remount_ro(struct ubifs_info *c)
dbg_save_space_info(c);
- for (i = 0; i < c->jhead_cnt; i++)
- ubifs_wbuf_sync(&c->jheads[i].wbuf);
+ for (i = 0; i < c->jhead_cnt; i++) {
+ err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+ if (err)
+ ubifs_ro_mode(c, err);
+ }
c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
@@ -1804,8 +1807,11 @@ static void ubifs_put_super(struct super_block *sb)
int err;
/* Synchronize write-buffers */
- for (i = 0; i < c->jhead_cnt; i++)
- ubifs_wbuf_sync(&c->jheads[i].wbuf);
+ for (i = 0; i < c->jhead_cnt; i++) {
+ err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+ if (err)
+ ubifs_ro_mode(c, err);
+ }
/*
* We are being cleanly unmounted which means the
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 0458dd47e105..c586026508db 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -622,8 +622,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
if (fibh.sbh != fibh.ebh)
brelse(fibh.ebh);
brelse(fibh.sbh);
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
return 0;
}
@@ -733,8 +732,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
inc_nlink(dir);
dir->i_ctime = dir->i_mtime = current_time(dir);
mark_inode_dirty(dir);
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
if (fibh.sbh != fibh.ebh)
brelse(fibh.ebh);
brelse(fibh.sbh);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index f897e55f2cd0..16a8ad21b77e 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -28,6 +28,9 @@
#include "udf_sb.h"
+#define SURROGATE_MASK 0xfffff800
+#define SURROGATE_PAIR 0x0000d800
+
static int udf_uni2char_utf8(wchar_t uni,
unsigned char *out,
int boundlen)
@@ -37,6 +40,9 @@ static int udf_uni2char_utf8(wchar_t uni,
if (boundlen <= 0)
return -ENAMETOOLONG;
+ if ((uni & SURROGATE_MASK) == SURROGATE_PAIR)
+ return -EINVAL;
+
if (uni < 0x80) {
out[u_len++] = (unsigned char)uni;
} else if (uni < 0x800) {
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 32545cd00ceb..d5f43ba76c59 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -39,8 +39,7 @@ static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode)
{
int err = ufs_add_link(dentry, inode);
if (!err) {
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
return 0;
}
inode_dec_link_count(inode);
@@ -193,8 +192,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
if (err)
goto out_fail;
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
+ d_instantiate_new(dentry, inode);
return 0;
out_fail:
diff --git a/fs/xattr.c b/fs/xattr.c
index 61cd28ba25f3..f9cb1db187b7 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -229,7 +229,7 @@ out:
}
EXPORT_SYMBOL_GPL(vfs_setxattr);
-ssize_t
+static ssize_t
xattr_getsecurity(struct inode *inode, const char *name, void *value,
size_t size)
{
@@ -254,7 +254,6 @@ out:
out_noalloc:
return len;
}
-EXPORT_SYMBOL_GPL(xattr_getsecurity);
/*
* vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr
@@ -354,7 +353,6 @@ vfs_listxattr(struct dentry *dentry, char *list, size_t size)
if (error)
return error;
if (inode->i_op->listxattr && (inode->i_opflags & IOP_XATTR)) {
- error = -EOPNOTSUPP;
error = inode->i_op->listxattr(dentry, list, size);
} else {
error = security_inode_listsecurity(inode, list, size);
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 46bcf0e649f5..457ac9f97377 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -85,6 +85,24 @@ config XFS_ONLINE_SCRUB
If unsure, say N.
+config XFS_ONLINE_REPAIR
+ bool "XFS online metadata repair support"
+ default n
+ depends on XFS_FS && XFS_ONLINE_SCRUB
+ help
+ If you say Y here you will be able to repair metadata on a
+ mounted XFS filesystem. This feature is intended to reduce
+ filesystem downtime by fixing minor problems before they cause the
+ filesystem to go down. However, it requires that the filesystem be
+ formatted with secondary metadata, such as reverse mappings and inode
+ parent pointers.
+
+ This feature is considered EXPERIMENTAL. Use with caution!
+
+ See the xfs_scrub man page in section 8 for additional information.
+
+ If unsure, say N.
+
config XFS_WARN
bool "XFS Verbose Warnings"
depends on XFS_FS && !XFS_DEBUG
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7ceb41a9786a..e8d67a443bd7 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -28,6 +28,7 @@ xfs-y += xfs_trace.o
# build the libxfs code first
xfs-y += $(addprefix libxfs/, \
+ xfs_ag.o \
xfs_alloc.o \
xfs_alloc_btree.o \
xfs_attr.o \
@@ -163,4 +164,12 @@ xfs-y += $(addprefix scrub/, \
xfs-$(CONFIG_XFS_RT) += scrub/rtbitmap.o
xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o
+
+# online repair
+ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
+xfs-y += $(addprefix scrub/, \
+ agheader_repair.o \
+ repair.o \
+ )
+endif
endif
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
new file mode 100644
index 000000000000..9345802c99f7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -0,0 +1,464 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2018 Red Hat, Inc.
+ * All rights reserved.
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+
+static struct xfs_buf *
+xfs_get_aghdr_buf(
+ struct xfs_mount *mp,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ int flags,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf *bp;
+
+ bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
+ if (!bp)
+ return NULL;
+
+ xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+ bp->b_bn = blkno;
+ bp->b_maps[0].bm_bn = blkno;
+ bp->b_ops = ops;
+
+ return bp;
+}
+
+/*
+ * Generic btree root block init function
+ */
+static void
+xfs_btroot_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno, 0);
+}
+
+/*
+ * Alloc btree root block init functions
+ */
+static void
+xfs_bnoroot_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_alloc_rec *arec;
+
+ xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno, 0);
+ arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
+ arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
+ arec->ar_blockcount = cpu_to_be32(id->agsize -
+ be32_to_cpu(arec->ar_startblock));
+}
+
+static void
+xfs_cntroot_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_alloc_rec *arec;
+
+ xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno, 0);
+ arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
+ arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
+ arec->ar_blockcount = cpu_to_be32(id->agsize -
+ be32_to_cpu(arec->ar_startblock));
+}
+
+/*
+ * Reverse map root block init
+ */
+static void
+xfs_rmaproot_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_rmap_rec *rrec;
+
+ xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno, 0);
+
+ /*
+ * mark the AG header regions as static metadata The BNO
+ * btree block is the first block after the headers, so
+ * it's location defines the size of region the static
+ * metadata consumes.
+ *
+ * Note: unlike mkfs, we never have to account for log
+ * space when growing the data regions
+ */
+ rrec = XFS_RMAP_REC_ADDR(block, 1);
+ rrec->rm_startblock = 0;
+ rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp));
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS);
+ rrec->rm_offset = 0;
+
+ /* account freespace btree root blocks */
+ rrec = XFS_RMAP_REC_ADDR(block, 2);
+ rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp));
+ rrec->rm_blockcount = cpu_to_be32(2);
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
+ rrec->rm_offset = 0;
+
+ /* account inode btree root blocks */
+ rrec = XFS_RMAP_REC_ADDR(block, 3);
+ rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp));
+ rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) -
+ XFS_IBT_BLOCK(mp));
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT);
+ rrec->rm_offset = 0;
+
+ /* account for rmap btree root */
+ rrec = XFS_RMAP_REC_ADDR(block, 4);
+ rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp));
+ rrec->rm_blockcount = cpu_to_be32(1);
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
+ rrec->rm_offset = 0;
+
+ /* account for refc btree root */
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ rrec = XFS_RMAP_REC_ADDR(block, 5);
+ rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp));
+ rrec->rm_blockcount = cpu_to_be32(1);
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC);
+ rrec->rm_offset = 0;
+ be16_add_cpu(&block->bb_numrecs, 1);
+ }
+}
+
+/*
+ * Initialise new secondary superblocks with the pre-grow geometry, but mark
+ * them as "in progress" so we know they haven't yet been activated. This will
+ * get cleared when the update with the new geometry information is done after
+ * changes to the primary are committed. This isn't strictly necessary, but we
+ * get it for free with the delayed buffer write lists and it means we can tell
+ * if a grow operation didn't complete properly after the fact.
+ */
+static void
+xfs_sbblock_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
+
+ xfs_sb_to_disk(dsb, &mp->m_sb);
+ dsb->sb_inprogress = 1;
+}
+
+static void
+xfs_agfblock_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
+ xfs_extlen_t tmpsize;
+
+ agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
+ agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
+ agf->agf_seqno = cpu_to_be32(id->agno);
+ agf->agf_length = cpu_to_be32(id->agsize);
+ agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
+ agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
+ agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
+ agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ agf->agf_roots[XFS_BTNUM_RMAPi] =
+ cpu_to_be32(XFS_RMAP_BLOCK(mp));
+ agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
+ agf->agf_rmap_blocks = cpu_to_be32(1);
+ }
+
+ agf->agf_flfirst = cpu_to_be32(1);
+ agf->agf_fllast = 0;
+ agf->agf_flcount = 0;
+ tmpsize = id->agsize - mp->m_ag_prealloc_blocks;
+ agf->agf_freeblks = cpu_to_be32(tmpsize);
+ agf->agf_longest = cpu_to_be32(tmpsize);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ agf->agf_refcount_root = cpu_to_be32(
+ xfs_refc_block(mp));
+ agf->agf_refcount_level = cpu_to_be32(1);
+ agf->agf_refcount_blocks = cpu_to_be32(1);
+ }
+}
+
+static void
+xfs_agflblock_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
+ __be32 *agfl_bno;
+ int bucket;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
+ agfl->agfl_seqno = cpu_to_be32(id->agno);
+ uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
+ }
+
+ agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
+ for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++)
+ agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+}
+
+static void
+xfs_agiblock_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
+ int bucket;
+
+ agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
+ agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
+ agi->agi_seqno = cpu_to_be32(id->agno);
+ agi->agi_length = cpu_to_be32(id->agsize);
+ agi->agi_count = 0;
+ agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp));
+ agi->agi_level = cpu_to_be32(1);
+ agi->agi_freecount = 0;
+ agi->agi_newino = cpu_to_be32(NULLAGINO);
+ agi->agi_dirino = cpu_to_be32(NULLAGINO);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
+ if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
+ agi->agi_free_level = cpu_to_be32(1);
+ }
+ for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
+ agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+}
+
+typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp,
+ struct aghdr_init_data *id);
+static int
+xfs_ag_init_hdr(
+ struct xfs_mount *mp,
+ struct aghdr_init_data *id,
+ aghdr_init_work_f work,
+ const struct xfs_buf_ops *ops)
+
+{
+ struct xfs_buf *bp;
+
+ bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, 0, ops);
+ if (!bp)
+ return -ENOMEM;
+
+ (*work)(mp, bp, id);
+
+ xfs_buf_delwri_queue(bp, &id->buffer_list);
+ xfs_buf_relse(bp);
+ return 0;
+}
+
+struct xfs_aghdr_grow_data {
+ xfs_daddr_t daddr;
+ size_t numblks;
+ const struct xfs_buf_ops *ops;
+ aghdr_init_work_f work;
+ xfs_btnum_t type;
+ bool need_init;
+};
+
+/*
+ * Prepare new AG headers to be written to disk. We use uncached buffers here,
+ * as it is assumed these new AG headers are currently beyond the currently
+ * valid filesystem address space. Using cached buffers would trip over EOFS
+ * corruption detection alogrithms in the buffer cache lookup routines.
+ *
+ * This is a non-transactional function, but the prepared buffers are added to a
+ * delayed write buffer list supplied by the caller so they can submit them to
+ * disk and wait on them as required.
+ */
+int
+xfs_ag_init_headers(
+ struct xfs_mount *mp,
+ struct aghdr_init_data *id)
+
+{
+ struct xfs_aghdr_grow_data aghdr_data[] = {
+ { /* SB */
+ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_SB_DADDR),
+ .numblks = XFS_FSS_TO_BB(mp, 1),
+ .ops = &xfs_sb_buf_ops,
+ .work = &xfs_sbblock_init,
+ .need_init = true
+ },
+ { /* AGF */
+ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGF_DADDR(mp)),
+ .numblks = XFS_FSS_TO_BB(mp, 1),
+ .ops = &xfs_agf_buf_ops,
+ .work = &xfs_agfblock_init,
+ .need_init = true
+ },
+ { /* AGFL */
+ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGFL_DADDR(mp)),
+ .numblks = XFS_FSS_TO_BB(mp, 1),
+ .ops = &xfs_agfl_buf_ops,
+ .work = &xfs_agflblock_init,
+ .need_init = true
+ },
+ { /* AGI */
+ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGI_DADDR(mp)),
+ .numblks = XFS_FSS_TO_BB(mp, 1),
+ .ops = &xfs_agi_buf_ops,
+ .work = &xfs_agiblock_init,
+ .need_init = true
+ },
+ { /* BNO root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_allocbt_buf_ops,
+ .work = &xfs_bnoroot_init,
+ .need_init = true
+ },
+ { /* CNT root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_allocbt_buf_ops,
+ .work = &xfs_cntroot_init,
+ .need_init = true
+ },
+ { /* INO root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_IBT_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_inobt_buf_ops,
+ .work = &xfs_btroot_init,
+ .type = XFS_BTNUM_INO,
+ .need_init = true
+ },
+ { /* FINO root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_inobt_buf_ops,
+ .work = &xfs_btroot_init,
+ .type = XFS_BTNUM_FINO,
+ .need_init = xfs_sb_version_hasfinobt(&mp->m_sb)
+ },
+ { /* RMAP root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_rmapbt_buf_ops,
+ .work = &xfs_rmaproot_init,
+ .need_init = xfs_sb_version_hasrmapbt(&mp->m_sb)
+ },
+ { /* REFC root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_refcountbt_buf_ops,
+ .work = &xfs_btroot_init,
+ .type = XFS_BTNUM_REFC,
+ .need_init = xfs_sb_version_hasreflink(&mp->m_sb)
+ },
+ { /* NULL terminating block */
+ .daddr = XFS_BUF_DADDR_NULL,
+ }
+ };
+ struct xfs_aghdr_grow_data *dp;
+ int error = 0;
+
+ /* Account for AG free space in new AG */
+ id->nfree += id->agsize - mp->m_ag_prealloc_blocks;
+ for (dp = &aghdr_data[0]; dp->daddr != XFS_BUF_DADDR_NULL; dp++) {
+ if (!dp->need_init)
+ continue;
+
+ id->daddr = dp->daddr;
+ id->numblks = dp->numblks;
+ id->type = dp->type;
+ error = xfs_ag_init_hdr(mp, id, dp->work, dp->ops);
+ if (error)
+ break;
+ }
+ return error;
+}
+
+/*
+ * Extent the AG indicated by the @id by the length passed in
+ */
+int
+xfs_ag_extend_space(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct aghdr_init_data *id,
+ xfs_extlen_t len)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_buf *bp;
+ struct xfs_agi *agi;
+ struct xfs_agf *agf;
+ int error;
+
+ /*
+ * Change the agi length.
+ */
+ error = xfs_ialloc_read_agi(mp, tp, id->agno, &bp);
+ if (error)
+ return error;
+
+ agi = XFS_BUF_TO_AGI(bp);
+ be32_add_cpu(&agi->agi_length, len);
+ ASSERT(id->agno == mp->m_sb.sb_agcount - 1 ||
+ be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
+ xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
+
+ /*
+ * Change agf length.
+ */
+ error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &bp);
+ if (error)
+ return error;
+
+ agf = XFS_BUF_TO_AGF(bp);
+ be32_add_cpu(&agf->agf_length, len);
+ ASSERT(agf->agf_length == agi->agi_length);
+ xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
+
+ /*
+ * Free the new space.
+ *
+ * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that
+ * this doesn't actually exist in the rmap btree.
+ */
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL);
+ error = xfs_rmap_free(tp, bp, id->agno,
+ be32_to_cpu(agf->agf_length) - len,
+ len, &oinfo);
+ if (error)
+ return error;
+
+ return xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno,
+ be32_to_cpu(agf->agf_length) - len),
+ len, &oinfo, XFS_AG_RESV_NONE);
+}
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
new file mode 100644
index 000000000000..412702e23f61
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2018 Red Hat, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __LIBXFS_AG_H
+#define __LIBXFS_AG_H 1
+
+struct xfs_mount;
+struct xfs_trans;
+
+struct aghdr_init_data {
+ /* per ag data */
+ xfs_agblock_t agno; /* ag to init */
+ xfs_extlen_t agsize; /* new AG size */
+ struct list_head buffer_list; /* buffer writeback list */
+ xfs_rfsblock_t nfree; /* cumulative new free space */
+
+ /* per header data */
+ xfs_daddr_t daddr; /* header location */
+ size_t numblks; /* size of header */
+ xfs_btnum_t type; /* type of btree root block */
+};
+
+int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
+int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct aghdr_init_data *id, xfs_extlen_t len);
+
+#endif /* __LIBXFS_AG_H */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 39387bdd225d..dc9dd3805d97 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -39,6 +39,9 @@
#include "xfs_buf_item.h"
#include "xfs_log.h"
#include "xfs_ag_resv.h"
+#include "xfs_bmap.h"
+
+extern kmem_zone_t *xfs_bmap_free_item_zone;
struct workqueue_struct *xfs_alloc_wq;
@@ -1947,7 +1950,7 @@ void
xfs_alloc_compute_maxlevels(
xfs_mount_t *mp) /* file system mount structure */
{
- mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_alloc_mnr,
+ mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
(mp->m_sb.sb_agblocks + 1) / 2);
}
@@ -1959,7 +1962,6 @@ xfs_alloc_compute_maxlevels(
*/
xfs_extlen_t
xfs_alloc_longest_free_extent(
- struct xfs_mount *mp,
struct xfs_perag *pag,
xfs_extlen_t need,
xfs_extlen_t reserved)
@@ -2038,8 +2040,7 @@ xfs_alloc_space_available(
/* do we have enough contiguous free space for the allocation? */
alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
- longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free,
- reservation);
+ longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
if (longest < alloc_len)
return false;
@@ -2062,6 +2063,30 @@ xfs_alloc_space_available(
return true;
}
+int
+xfs_free_agfl_block(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ struct xfs_buf *agbp,
+ struct xfs_owner_info *oinfo)
+{
+ int error;
+ struct xfs_buf *bp;
+
+ error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
+ XFS_AG_RESV_AGFL);
+ if (error)
+ return error;
+
+ bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno, 0);
+ if (!bp)
+ return -EFSCORRUPTED;
+ xfs_trans_binval(tp, bp);
+
+ return 0;
+}
+
/*
* Check the agfl fields of the agf for inconsistency or corruption. The purpose
* is to detect an agfl header padding mismatch between current and early v5
@@ -2150,6 +2175,40 @@ xfs_agfl_reset(
}
/*
+ * Defer an AGFL block free. This is effectively equivalent to
+ * xfs_bmap_add_free() with some special handling particular to AGFL blocks.
+ *
+ * Deferring AGFL frees helps prevent log reservation overruns due to too many
+ * allocation operations in a transaction. AGFL frees are prone to this problem
+ * because for one they are always freed one at a time. Further, an immediate
+ * AGFL block free can cause a btree join and require another block free before
+ * the real allocation can proceed. Deferring the free disconnects freeing up
+ * the AGFL slot from freeing the block.
+ */
+STATIC void
+xfs_defer_agfl_block(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ xfs_agnumber_t agno,
+ xfs_fsblock_t agbno,
+ struct xfs_owner_info *oinfo)
+{
+ struct xfs_extent_free_item *new; /* new element */
+
+ ASSERT(xfs_bmap_free_item_zone != NULL);
+ ASSERT(oinfo != NULL);
+
+ new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+ new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
+ new->xefi_blockcount = 1;
+ new->xefi_oinfo = *oinfo;
+
+ trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
+
+ xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
+}
+
+/*
* Decide whether to use this allocation group for this allocation.
* If so, fix up the btree freelist's size.
*/
@@ -2249,21 +2308,20 @@ xfs_alloc_fix_freelist(
else
xfs_rmap_ag_owner(&targs.oinfo, XFS_RMAP_OWN_AG);
while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
- struct xfs_buf *bp;
-
error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
if (error)
goto out_agbp_relse;
- error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1,
- &targs.oinfo, XFS_AG_RESV_AGFL);
- if (error)
- goto out_agbp_relse;
- bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
- if (!bp) {
- error = -EFSCORRUPTED;
- goto out_agbp_relse;
+
+ /* defer agfl frees if dfops is provided */
+ if (tp->t_agfl_dfops) {
+ xfs_defer_agfl_block(mp, tp->t_agfl_dfops, args->agno,
+ bno, &targs.oinfo);
+ } else {
+ error = xfs_free_agfl_block(tp, args->agno, bno, agbp,
+ &targs.oinfo);
+ if (error)
+ goto out_agbp_relse;
}
- xfs_trans_binval(tp, bp);
}
targs.tp = tp;
@@ -2951,18 +3009,20 @@ out:
* after fixing up the freelist.
*/
int /* error */
-xfs_free_extent(
+__xfs_free_extent(
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len, /* length of extent */
struct xfs_owner_info *oinfo, /* extent owner */
- enum xfs_ag_resv_type type) /* block reservation type */
+ enum xfs_ag_resv_type type, /* block reservation type */
+ bool skip_discard)
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_buf *agbp;
xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
int error;
+ unsigned int busy_flags = 0;
ASSERT(len != 0);
ASSERT(type != XFS_AG_RESV_AGFL);
@@ -2986,7 +3046,9 @@ xfs_free_extent(
if (error)
goto err;
- xfs_extent_busy_insert(tp, agno, agbno, len, 0);
+ if (skip_discard)
+ busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
+ xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags);
return 0;
err:
@@ -3118,3 +3180,40 @@ xfs_alloc_has_record(
return xfs_btree_has_record(cur, &low, &high, exists);
}
+
+/*
+ * Walk all the blocks in the AGFL. The @walk_fn can return any negative
+ * error code or XFS_BTREE_QUERY_RANGE_ABORT.
+ */
+int
+xfs_agfl_walk(
+ struct xfs_mount *mp,
+ struct xfs_agf *agf,
+ struct xfs_buf *agflbp,
+ xfs_agfl_walk_fn walk_fn,
+ void *priv)
+{
+ __be32 *agfl_bno;
+ unsigned int i;
+ int error;
+
+ agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+ i = be32_to_cpu(agf->agf_flfirst);
+
+ /* Nothing to walk in an empty AGFL. */
+ if (agf->agf_flcount == cpu_to_be32(0))
+ return 0;
+
+ /* Otherwise, walk from first to last, wrapping as needed. */
+ for (;;) {
+ error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
+ if (error)
+ return error;
+ if (i == be32_to_cpu(agf->agf_fllast))
+ break;
+ if (++i == xfs_agfl_size(mp))
+ i = 0;
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index a311a2414a6b..0747adcd57d6 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -116,9 +116,8 @@ xfs_alloc_allow_busy_reuse(int datatype)
unsigned int xfs_alloc_set_aside(struct xfs_mount *mp);
unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp);
-xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
- struct xfs_perag *pag, xfs_extlen_t need,
- xfs_extlen_t reserved);
+xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_perag *pag,
+ xfs_extlen_t need, xfs_extlen_t reserved);
unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
struct xfs_perag *pag);
@@ -192,12 +191,24 @@ xfs_alloc_vextent(
* Free an extent.
*/
int /* error */
-xfs_free_extent(
+__xfs_free_extent(
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len, /* length of extent */
struct xfs_owner_info *oinfo, /* extent owner */
- enum xfs_ag_resv_type type); /* block reservation type */
+ enum xfs_ag_resv_type type, /* block reservation type */
+ bool skip_discard);
+
+static inline int
+xfs_free_extent(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_extlen_t len,
+ struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type)
+{
+ return __xfs_free_extent(tp, bno, len, oinfo, type, false);
+}
int /* error */
xfs_alloc_lookup_le(
@@ -224,6 +235,8 @@ int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, struct xfs_buf **bpp);
+int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t,
+ struct xfs_buf *, struct xfs_owner_info *);
int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
struct xfs_buf **agbp);
@@ -249,4 +262,9 @@ bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
int xfs_alloc_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, bool *exist);
+typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno,
+ void *priv);
+int xfs_agfl_walk(struct xfs_mount *mp, struct xfs_agf *agf,
+ struct xfs_buf *agflbp, xfs_agfl_walk_fn walk_fn, void *priv);
+
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index b451649ba176..18aec7a0e599 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -547,3 +547,12 @@ xfs_allocbt_maxrecs(
return blocklen / sizeof(xfs_alloc_rec_t);
return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
}
+
+/* Calculate the freespace btree size for some records. */
+xfs_extlen_t
+xfs_allocbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp->m_alloc_mnr, len);
+}
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index 45e189e7e81c..2fd54728871c 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -61,5 +61,7 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_buf *,
xfs_agnumber_t, xfs_btnum_t);
extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
+extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index ce4a34a2751d..c3d02a66d39d 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -236,7 +236,7 @@ xfs_attr_set(
args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
args.total = xfs_attr_calc_size(&args, &local);
- error = xfs_qm_dqattach(dp, 0);
+ error = xfs_qm_dqattach(dp);
if (error)
return error;
@@ -427,7 +427,7 @@ xfs_attr_remove(
*/
args.op_flags = XFS_DA_OP_OKNOENT;
- error = xfs_qm_dqattach(dp, 0);
+ error = xfs_qm_dqattach(dp);
if (error)
return error;
@@ -511,7 +511,14 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
if (args->flags & ATTR_CREATE)
return retval;
retval = xfs_attr_shortform_remove(args);
- ASSERT(retval == 0);
+ if (retval)
+ return retval;
+ /*
+ * Since we have removed the old attr, clear ATTR_REPLACE so
+ * that the leaf format add routine won't trip over the attr
+ * not being around.
+ */
+ args->flags &= ~ATTR_REPLACE;
}
if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 21be186067a2..83a6d3c7f872 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -620,7 +620,7 @@ xfs_attr_rmtval_remove(
/*
* If the "remote" value is in the cache, remove it.
*/
- bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
+ bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
if (bp) {
xfs_buf_stale(bp);
xfs_buf_relse(bp);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 3b03d886df66..7b0e2b551e23 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -246,7 +246,7 @@ xfs_bmap_get_bp(
struct xfs_btree_cur *cur,
xfs_fsblock_t bno)
{
- struct xfs_log_item_desc *lidp;
+ struct xfs_log_item *lip;
int i;
if (!cur)
@@ -260,9 +260,9 @@ xfs_bmap_get_bp(
}
/* Chase down all the log items to see if the bp is there */
- list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
- struct xfs_buf_log_item *bip;
- bip = (struct xfs_buf_log_item *)lidp->lid_item;
+ list_for_each_entry(lip, &cur->bc_tp->t_items, li_trans) {
+ struct xfs_buf_log_item *bip = (struct xfs_buf_log_item *)lip;
+
if (bip->bli_item.li_type == XFS_LI_BUF &&
XFS_BUF_ADDR(bip->bli_buf) == bno)
return bip->bli_buf;
@@ -312,8 +312,9 @@ xfs_check_block(
xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
__func__, j, i,
(unsigned long long)be64_to_cpu(*thispa));
- panic("%s: ptrs are equal in node\n",
+ xfs_err(mp, "%s: ptrs are equal in node\n",
__func__);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
}
}
@@ -483,7 +484,8 @@ error0:
error_norelse:
xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
__func__, i);
- panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
+ xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return;
}
@@ -542,12 +544,13 @@ xfs_bmap_validate_ret(
* The list is maintained sorted (by block number).
*/
void
-xfs_bmap_add_free(
+__xfs_bmap_add_free(
struct xfs_mount *mp,
struct xfs_defer_ops *dfops,
xfs_fsblock_t bno,
xfs_filblks_t len,
- struct xfs_owner_info *oinfo)
+ struct xfs_owner_info *oinfo,
+ bool skip_discard)
{
struct xfs_extent_free_item *new; /* new element */
#ifdef DEBUG
@@ -574,6 +577,7 @@ xfs_bmap_add_free(
new->xefi_oinfo = *oinfo;
else
xfs_rmap_skip_owner_update(&new->xefi_oinfo);
+ new->xefi_skip_discard = skip_discard;
trace_xfs_bmap_free_defer(mp, XFS_FSB_TO_AGNO(mp, bno), 0,
XFS_FSB_TO_AGBNO(mp, bno), len);
xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
@@ -725,12 +729,16 @@ xfs_bmap_extents_to_btree(
*logflagsp = 0;
if ((error = xfs_alloc_vextent(&args))) {
xfs_iroot_realloc(ip, -1, whichfork);
+ ASSERT(ifp->if_broot == NULL);
+ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
return error;
}
if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
xfs_iroot_realloc(ip, -1, whichfork);
+ ASSERT(ifp->if_broot == NULL);
+ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
return -ENOSPC;
}
@@ -1997,10 +2005,13 @@ xfs_bmap_add_extent_delay_real(
ASSERT(0);
}
- /* add reverse mapping */
- error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new);
- if (error)
- goto done;
+ /* add reverse mapping unless caller opted out */
+ if (!(bma->flags & XFS_BMAPI_NORMAP)) {
+ error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip,
+ whichfork, new);
+ if (error)
+ goto done;
+ }
/* convert to a btree if necessary */
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
@@ -2664,7 +2675,8 @@ xfs_bmap_add_extent_hole_real(
struct xfs_bmbt_irec *new,
xfs_fsblock_t *first,
struct xfs_defer_ops *dfops,
- int *logflagsp)
+ int *logflagsp,
+ int flags)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
@@ -2841,10 +2853,12 @@ xfs_bmap_add_extent_hole_real(
break;
}
- /* add reverse mapping */
- error = xfs_rmap_map_extent(mp, dfops, ip, whichfork, new);
- if (error)
- goto done;
+ /* add reverse mapping unless caller opted out */
+ if (!(flags & XFS_BMAPI_NORMAP)) {
+ error = xfs_rmap_map_extent(mp, dfops, ip, whichfork, new);
+ if (error)
+ goto done;
+ }
/* convert to a btree if necessary */
if (xfs_bmap_needs_btree(ip, whichfork)) {
@@ -3225,7 +3239,7 @@ xfs_bmap_longest_free_extent(
}
}
- longest = xfs_alloc_longest_free_extent(mp, pag,
+ longest = xfs_alloc_longest_free_extent(pag,
xfs_alloc_min_freelist(mp, pag),
xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
if (*blen < longest)
@@ -4119,7 +4133,8 @@ xfs_bmapi_allocate(
else
error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip,
whichfork, &bma->icur, &bma->cur, &bma->got,
- bma->firstblock, bma->dfops, &bma->logflags);
+ bma->firstblock, bma->dfops, &bma->logflags,
+ bma->flags);
bma->logflags |= tmp_logflags;
if (error)
@@ -4505,30 +4520,37 @@ error0:
return error;
}
-static int
+int
xfs_bmapi_remap(
struct xfs_trans *tp,
struct xfs_inode *ip,
xfs_fileoff_t bno,
xfs_filblks_t len,
xfs_fsblock_t startblock,
- struct xfs_defer_ops *dfops)
+ struct xfs_defer_ops *dfops,
+ int flags)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp;
struct xfs_btree_cur *cur = NULL;
xfs_fsblock_t firstblock = NULLFSBLOCK;
struct xfs_bmbt_irec got;
struct xfs_iext_cursor icur;
+ int whichfork = xfs_bmapi_whichfork(flags);
int logflags = 0, error;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(len > 0);
ASSERT(len <= (xfs_filblks_t)MAXEXTLEN);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC |
+ XFS_BMAPI_NORMAP)));
+ ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) !=
+ (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC));
if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
+ (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
mp, XFS_ERRTAG_BMAPIFORMAT))) {
XFS_ERROR_REPORT("xfs_bmapi_remap", XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
@@ -4538,7 +4560,7 @@ xfs_bmapi_remap(
return -EIO;
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ error = xfs_iread_extents(tp, ip, whichfork);
if (error)
return error;
}
@@ -4553,7 +4575,7 @@ xfs_bmapi_remap(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (ifp->if_flags & XFS_IFBROOT) {
- cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
cur->bc_private.b.firstblock = firstblock;
cur->bc_private.b.dfops = dfops;
cur->bc_private.b.flags = 0;
@@ -4562,18 +4584,21 @@ xfs_bmapi_remap(
got.br_startoff = bno;
got.br_startblock = startblock;
got.br_blockcount = len;
- got.br_state = XFS_EXT_NORM;
+ if (flags & XFS_BMAPI_PREALLOC)
+ got.br_state = XFS_EXT_UNWRITTEN;
+ else
+ got.br_state = XFS_EXT_NORM;
- error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &icur,
- &cur, &got, &firstblock, dfops, &logflags);
+ error = xfs_bmap_add_extent_hole_real(tp, ip, whichfork, &icur,
+ &cur, &got, &firstblock, dfops, &logflags, flags);
if (error)
goto error0;
- if (xfs_bmap_wants_extents(ip, XFS_DATA_FORK)) {
+ if (xfs_bmap_wants_extents(ip, whichfork)) {
int tmp_logflags = 0;
error = xfs_bmap_btree_to_extents(tp, ip, cur,
- &tmp_logflags, XFS_DATA_FORK);
+ &tmp_logflags, whichfork);
logflags |= tmp_logflags;
}
@@ -5100,9 +5125,12 @@ xfs_bmap_del_extent_real(
error = xfs_refcount_decrease_extent(mp, dfops, del);
if (error)
goto done;
- } else
- xfs_bmap_add_free(mp, dfops, del->br_startblock,
- del->br_blockcount, NULL);
+ } else {
+ __xfs_bmap_add_free(mp, dfops, del->br_startblock,
+ del->br_blockcount, NULL,
+ (bflags & XFS_BMAPI_NODISCARD) ||
+ del->br_state == XFS_EXT_UNWRITTEN);
+ }
}
/*
@@ -5667,7 +5695,6 @@ xfs_bmap_collapse_extents(
xfs_fileoff_t *next_fsb,
xfs_fileoff_t offset_shift_fsb,
bool *done,
- xfs_fileoff_t stop_fsb,
xfs_fsblock_t *firstblock,
struct xfs_defer_ops *dfops)
{
@@ -6145,7 +6172,7 @@ xfs_bmap_finish_one(
switch (type) {
case XFS_BMAP_MAP:
error = xfs_bmapi_remap(tp, ip, startoff, *blockcount,
- startblock, dfops);
+ startblock, dfops, 0);
*blockcount = 0;
break;
case XFS_BMAP_UNMAP:
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index f3be6416260b..2c233f9f1a26 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -68,6 +68,7 @@ struct xfs_extent_free_item
xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
struct list_head xefi_list;
struct xfs_owner_info xefi_oinfo; /* extent owner */
+ bool xefi_skip_discard;
};
#define XFS_BMAP_MAX_NMAP 4
@@ -116,6 +117,12 @@ struct xfs_extent_free_item
/* Only convert unwritten extents, don't allocate new blocks */
#define XFS_BMAPI_CONVERT_ONLY 0x800
+/* Skip online discard of freed extents */
+#define XFS_BMAPI_NODISCARD 0x1000
+
+/* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */
+#define XFS_BMAPI_NORMAP 0x2000
+
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@@ -128,7 +135,9 @@ struct xfs_extent_free_item
{ XFS_BMAPI_REMAP, "REMAP" }, \
{ XFS_BMAPI_COWFORK, "COWFORK" }, \
{ XFS_BMAPI_DELALLOC, "DELALLOC" }, \
- { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }
+ { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \
+ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \
+ { XFS_BMAPI_NORMAP, "NORMAP" }
static inline int xfs_bmapi_aflag(int w)
@@ -192,9 +201,9 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
-void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+void __xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
xfs_fsblock_t bno, xfs_filblks_t len,
- struct xfs_owner_info *oinfo);
+ struct xfs_owner_info *oinfo, bool skip_discard);
void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
@@ -228,7 +237,7 @@ void xfs_bmap_del_extent_cow(struct xfs_inode *ip,
uint xfs_default_attroffset(struct xfs_inode *ip);
int xfs_bmap_collapse_extents(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
- bool *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock,
+ bool *done, xfs_fsblock_t *firstblock,
struct xfs_defer_ops *dfops);
int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
@@ -240,6 +249,17 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
int eof);
+static inline void
+xfs_bmap_add_free(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ struct xfs_owner_info *oinfo)
+{
+ __xfs_bmap_add_free(mp, dfops, bno, len, oinfo, false);
+}
+
enum xfs_bmap_intent_type {
XFS_BMAP_MAP = 1,
XFS_BMAP_UNMAP,
@@ -277,4 +297,8 @@ static inline int xfs_bmap_fork_to_state(int whichfork)
xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork,
struct xfs_bmbt_irec *irec);
+int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock,
+ struct xfs_defer_ops *dfops, int flags);
+
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index d89d06bea6e3..ac9d4aeedb09 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -660,3 +660,12 @@ xfs_bmbt_change_owner(
xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
return error;
}
+
+/* Calculate the bmap btree size for some records. */
+unsigned long long
+xfs_bmbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp->m_bmap_dmnr, len);
+}
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index e4505746ccaa..fb3cd2d9e0f8 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -118,4 +118,7 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_inode *, int);
+extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index edc0193358a5..c825c8182b30 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -4531,7 +4531,6 @@ xfs_btree_sblock_verify(
*/
uint
xfs_btree_compute_maxlevels(
- struct xfs_mount *mp,
uint *limits,
unsigned long len)
{
@@ -4837,15 +4836,14 @@ xfs_btree_query_all(
* Calculate the number of blocks needed to store a given number of records
* in a short-format (per-AG metadata) btree.
*/
-xfs_extlen_t
+unsigned long long
xfs_btree_calc_size(
- struct xfs_mount *mp,
uint *limits,
unsigned long long len)
{
int level;
int maxrecs;
- xfs_extlen_t rval;
+ unsigned long long rval;
maxrecs = limits[0];
for (level = 0, rval = 0; len > 1; level++) {
@@ -4921,3 +4919,24 @@ xfs_btree_has_record(
*exists = false;
return error;
}
+
+/* Are there more records in this btree? */
+bool
+xfs_btree_has_more_records(
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+
+ block = xfs_btree_get_block(cur, 0, &bp);
+
+ /* There are still records in this block. */
+ if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block))
+ return true;
+
+ /* There are more record blocks. */
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
+ else
+ return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 58e30c0975c3..d7911efee6dc 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -481,10 +481,8 @@ xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp,
xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp,
unsigned int max_recs);
-uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,
- unsigned long len);
-xfs_extlen_t xfs_btree_calc_size(struct xfs_mount *mp, uint *limits,
- unsigned long long len);
+uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len);
+unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len);
/* return codes */
#define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */
@@ -530,5 +528,6 @@ union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur,
union xfs_btree_key *key);
int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low,
union xfs_btree_irec *high, bool *exists);
+bool xfs_btree_has_more_records(struct xfs_btree_cur *cur);
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 087fea02c389..3daf175e2535 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -220,7 +220,7 @@ xfs_defer_trans_abort(
{
struct xfs_defer_pending *dfp;
- trace_xfs_defer_trans_abort(tp->t_mountp, dop);
+ trace_xfs_defer_trans_abort(tp->t_mountp, dop, _RET_IP_);
/* Abort intent items that don't have a done item. */
list_for_each_entry(dfp, &dop->dop_pending, dfp_list) {
@@ -253,7 +253,7 @@ xfs_defer_trans_roll(
for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++)
xfs_trans_dirty_buf(*tp, dop->dop_bufs[i]);
- trace_xfs_defer_trans_roll((*tp)->t_mountp, dop);
+ trace_xfs_defer_trans_roll((*tp)->t_mountp, dop, _RET_IP_);
/* Roll the transaction. */
error = xfs_trans_roll(tp);
@@ -352,10 +352,21 @@ xfs_defer_finish(
void *state;
int error = 0;
void (*cleanup_fn)(struct xfs_trans *, void *, int);
+ struct xfs_defer_ops *orig_dop;
ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
- trace_xfs_defer_finish((*tp)->t_mountp, dop);
+ trace_xfs_defer_finish((*tp)->t_mountp, dop, _RET_IP_);
+
+ /*
+ * Attach dfops to the transaction during deferred ops processing. This
+ * explicitly causes calls into the allocator to defer AGFL block frees.
+ * Note that this code can go away once all dfops users attach to the
+ * associated tp.
+ */
+ ASSERT(!(*tp)->t_agfl_dfops || ((*tp)->t_agfl_dfops == dop));
+ orig_dop = (*tp)->t_agfl_dfops;
+ (*tp)->t_agfl_dfops = dop;
/* Until we run out of pending work to finish... */
while (xfs_defer_has_unfinished_work(dop)) {
@@ -428,10 +439,11 @@ xfs_defer_finish(
}
out:
+ (*tp)->t_agfl_dfops = orig_dop;
if (error)
trace_xfs_defer_finish_error((*tp)->t_mountp, dop, error);
else
- trace_xfs_defer_finish_done((*tp)->t_mountp, dop);
+ trace_xfs_defer_finish_done((*tp)->t_mountp, dop, _RET_IP_);
return error;
}
@@ -447,7 +459,7 @@ xfs_defer_cancel(
struct list_head *pwi;
struct list_head *n;
- trace_xfs_defer_cancel(NULL, dop);
+ trace_xfs_defer_cancel(NULL, dop, _RET_IP_);
/*
* Free the pending items. Caller should already have arranged
@@ -532,5 +544,5 @@ xfs_defer_init(
*fbp = NULLFSBLOCK;
INIT_LIST_HEAD(&dop->dop_intake);
INIT_LIST_HEAD(&dop->dop_pending);
- trace_xfs_defer_init(NULL, dop);
+ trace_xfs_defer_init(NULL, dop, _RET_IP_);
}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 045beacdd37d..e70725ba1f5f 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -55,6 +55,7 @@ enum xfs_defer_ops_type {
XFS_DEFER_OPS_TYPE_REFCOUNT,
XFS_DEFER_OPS_TYPE_RMAP,
XFS_DEFER_OPS_TYPE_FREE,
+ XFS_DEFER_OPS_TYPE_AGFL_FREE,
XFS_DEFER_OPS_TYPE_MAX,
};
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 8b7a6c3cb599..cce520becee4 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -41,14 +41,18 @@ xfs_calc_dquots_per_chunk(
/*
* Do some primitive error checking on ondisk dquot data structures.
+ *
+ * The xfs_dqblk structure /contains/ the xfs_disk_dquot structure;
+ * we verify them separately because at some points we have only the
+ * smaller xfs_disk_dquot structure available.
*/
+
xfs_failaddr_t
xfs_dquot_verify(
struct xfs_mount *mp,
xfs_disk_dquot_t *ddq,
xfs_dqid_t id,
- uint type, /* used only when IO_dorepair is true */
- uint flags)
+ uint type) /* used only during quotacheck */
{
/*
* We can encounter an uninitialized dquot buffer for 2 reasons:
@@ -70,6 +74,8 @@ xfs_dquot_verify(
if (ddq->d_version != XFS_DQUOT_VERSION)
return __this_address;
+ if (type && ddq->d_flags != type)
+ return __this_address;
if (ddq->d_flags != XFS_DQ_USER &&
ddq->d_flags != XFS_DQ_PROJ &&
ddq->d_flags != XFS_DQ_GROUP)
@@ -99,33 +105,44 @@ xfs_dquot_verify(
return NULL;
}
+xfs_failaddr_t
+xfs_dqblk_verify(
+ struct xfs_mount *mp,
+ struct xfs_dqblk *dqb,
+ xfs_dqid_t id,
+ uint type) /* used only during quotacheck */
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !uuid_equal(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+
+ return xfs_dquot_verify(mp, &dqb->dd_diskdq, id, type);
+}
+
/*
* Do some primitive error checking on ondisk dquot data structures.
*/
int
-xfs_dquot_repair(
+xfs_dqblk_repair(
struct xfs_mount *mp,
- struct xfs_disk_dquot *ddq,
+ struct xfs_dqblk *dqb,
xfs_dqid_t id,
uint type)
{
- struct xfs_dqblk *d = (struct xfs_dqblk *)ddq;
-
-
/*
* Typically, a repair is only requested by quotacheck.
*/
ASSERT(id != -1);
- memset(d, 0, sizeof(xfs_dqblk_t));
+ memset(dqb, 0, sizeof(xfs_dqblk_t));
- d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
- d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
- d->dd_diskdq.d_flags = type;
- d->dd_diskdq.d_id = cpu_to_be32(id);
+ dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+ dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+ dqb->dd_diskdq.d_flags = type;
+ dqb->dd_diskdq.d_id = cpu_to_be32(id);
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
- xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+ uuid_copy(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid);
+ xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
@@ -135,7 +152,8 @@ xfs_dquot_repair(
STATIC bool
xfs_dquot_buf_verify_crc(
struct xfs_mount *mp,
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ bool readahead)
{
struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
int ndquots;
@@ -156,10 +174,12 @@ xfs_dquot_buf_verify_crc(
for (i = 0; i < ndquots; i++, d++) {
if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
- XFS_DQUOT_CRC_OFF))
- return false;
- if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_meta_uuid))
+ XFS_DQUOT_CRC_OFF)) {
+ if (!readahead)
+ xfs_buf_verifier_error(bp, -EFSBADCRC, __func__,
+ d, sizeof(*d), __this_address);
return false;
+ }
}
return true;
}
@@ -167,9 +187,10 @@ xfs_dquot_buf_verify_crc(
STATIC xfs_failaddr_t
xfs_dquot_buf_verify(
struct xfs_mount *mp,
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ bool readahead)
{
- struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
+ struct xfs_dqblk *dqb = bp->b_addr;
xfs_failaddr_t fa;
xfs_dqid_t id = 0;
int ndquots;
@@ -195,14 +216,19 @@ xfs_dquot_buf_verify(
for (i = 0; i < ndquots; i++) {
struct xfs_disk_dquot *ddq;
- ddq = &d[i].dd_diskdq;
+ ddq = &dqb[i].dd_diskdq;
if (i == 0)
id = be32_to_cpu(ddq->d_id);
- fa = xfs_dquot_verify(mp, ddq, id + i, 0, 0);
- if (fa)
+ fa = xfs_dqblk_verify(mp, &dqb[i], id + i, 0);
+ if (fa) {
+ if (!readahead)
+ xfs_buf_verifier_error(bp, -EFSCORRUPTED,
+ __func__, &dqb[i],
+ sizeof(struct xfs_dqblk), fa);
return fa;
+ }
}
return NULL;
@@ -214,7 +240,7 @@ xfs_dquot_buf_verify_struct(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- return xfs_dquot_buf_verify(mp, bp);
+ return xfs_dquot_buf_verify(mp, bp, false);
}
static void
@@ -222,15 +248,10 @@ xfs_dquot_buf_read_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- xfs_failaddr_t fa;
- if (!xfs_dquot_buf_verify_crc(mp, bp))
- xfs_verifier_error(bp, -EFSBADCRC, __this_address);
- else {
- fa = xfs_dquot_buf_verify(mp, bp);
- if (fa)
- xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
- }
+ if (!xfs_dquot_buf_verify_crc(mp, bp, false))
+ return;
+ xfs_dquot_buf_verify(mp, bp, false);
}
/*
@@ -245,8 +266,8 @@ xfs_dquot_buf_readahead_verify(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- if (!xfs_dquot_buf_verify_crc(mp, bp) ||
- xfs_dquot_buf_verify(mp, bp) != NULL) {
+ if (!xfs_dquot_buf_verify_crc(mp, bp, true) ||
+ xfs_dquot_buf_verify(mp, bp, true) != NULL) {
xfs_buf_ioerror(bp, -EIO);
bp->b_flags &= ~XBF_DONE;
}
@@ -262,11 +283,8 @@ xfs_dquot_buf_write_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- xfs_failaddr_t fa;
- fa = xfs_dquot_buf_verify(mp, bp);
- if (fa)
- xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
+ xfs_dquot_buf_verify(mp, bp, false);
}
const struct xfs_buf_ops xfs_dquot_buf_ops = {
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index bc1789d95152..d47b91625945 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -65,7 +65,8 @@
#define XFS_ERRTAG_LOG_BAD_CRC 29
#define XFS_ERRTAG_LOG_ITEM_PIN 30
#define XFS_ERRTAG_BUF_LRU_REF 31
-#define XFS_ERRTAG_MAX 32
+#define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32
+#define XFS_ERRTAG_MAX 33
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -102,5 +103,6 @@
#define XFS_RANDOM_LOG_BAD_CRC 1
#define XFS_RANDOM_LOG_ITEM_PIN 1
#define XFS_RANDOM_BUF_LRU_REF 2
+#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 42956d8d95ed..c1cb29a5f4f6 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -98,6 +98,9 @@ struct xfs_ifork;
XFS_SB_VERSION2_PROJID32BIT | \
XFS_SB_VERSION2_FTYPE)
+/* Maximum size of the xfs filesystem label, no terminating NULL */
+#define XFSLABEL_MAX 12
+
/*
* Superblock - in core version. Must match the ondisk version below.
* Must be padded to 64 bit alignment.
@@ -122,7 +125,7 @@ typedef struct xfs_sb {
uint16_t sb_sectsize; /* volume sector size, bytes */
uint16_t sb_inodesize; /* inode size, bytes */
uint16_t sb_inopblock; /* inodes per block */
- char sb_fname[12]; /* file system name */
+ char sb_fname[XFSLABEL_MAX]; /* file system name */
uint8_t sb_blocklog; /* log2 of sb_blocksize */
uint8_t sb_sectlog; /* log2 of sb_sectsize */
uint8_t sb_inodelog; /* log2 of sb_inodesize */
@@ -213,7 +216,7 @@ typedef struct xfs_dsb {
__be16 sb_sectsize; /* volume sector size, bytes */
__be16 sb_inodesize; /* inode size, bytes */
__be16 sb_inopblock; /* inodes per block */
- char sb_fname[12]; /* file system name */
+ char sb_fname[XFSLABEL_MAX]; /* file system name */
__u8 sb_blocklog; /* log2 of sb_blocksize */
__u8 sb_sectlog; /* log2 of sb_sectsize */
__u8 sb_inodelog; /* log2 of sb_inodesize */
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index faf1a4edd618..dddc75e4f1f6 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -542,13 +542,20 @@ struct xfs_scrub_metadata {
/* o: Metadata object looked funny but isn't corrupt. */
#define XFS_SCRUB_OFLAG_WARNING (1 << 6)
+/*
+ * o: IFLAG_REPAIR was set but metadata object did not need fixing or
+ * optimization and has therefore not been altered.
+ */
+#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7)
+
#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR)
#define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \
XFS_SCRUB_OFLAG_PREEN | \
XFS_SCRUB_OFLAG_XFAIL | \
XFS_SCRUB_OFLAG_XCORRUPT | \
XFS_SCRUB_OFLAG_INCOMPLETE | \
- XFS_SCRUB_OFLAG_WARNING)
+ XFS_SCRUB_OFLAG_WARNING | \
+ XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED)
#define XFS_SCRUB_FLAGS_ALL (XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT)
/*
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 0e2cf5f0be1f..4ca4ff7a757d 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -148,7 +148,7 @@ xfs_inobt_get_rec(
/*
* Insert a single inobt record. Cursor must already point to desired location.
*/
-STATIC int
+int
xfs_inobt_insert_rec(
struct xfs_btree_cur *cur,
uint16_t holemask,
@@ -2406,7 +2406,7 @@ xfs_ialloc_compute_maxlevels(
uint inodes;
inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
- mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_inobt_mnr,
+ mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp->m_inobt_mnr,
inodes);
}
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index c5402bb4ce0c..77fffced8bac 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -176,6 +176,9 @@ int xfs_ialloc_has_inode_record(struct xfs_btree_cur *cur, xfs_agino_t low,
xfs_agino_t high, bool *exists);
int xfs_ialloc_count_inodes(struct xfs_btree_cur *cur, xfs_agino_t *count,
xfs_agino_t *freecount);
+int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask,
+ uint8_t count, int32_t freecount, xfs_inofree_t free,
+ int *stat);
int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
void xfs_ialloc_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index a2dd7f4a2719..b04c55512159 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -296,7 +296,7 @@ xfs_inobt_verify(
case cpu_to_be32(XFS_FIBT_MAGIC):
break;
default:
- return NULL;
+ return __this_address;
}
/* level verification */
@@ -556,7 +556,7 @@ xfs_inobt_max_size(
if (mp->m_inobt_mxr[0] == 0)
return 0;
- return xfs_btree_calc_size(mp, mp->m_inobt_mnr,
+ return xfs_btree_calc_size(mp->m_inobt_mnr,
(uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
XFS_INODES_PER_CHUNK);
}
@@ -608,3 +608,12 @@ xfs_finobt_calc_reserves(
*used += tree_len;
return 0;
}
+
+/* Calculate the inobt btree size for some records. */
+xfs_extlen_t
+xfs_iallocbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp->m_inobt_mnr, len);
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index aa81e2e63f3f..4acdd5458d59 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -74,5 +74,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *,
int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_extlen_t *ask, xfs_extlen_t *used);
+extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index ef68b1de006a..1201107eabc6 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -466,6 +466,8 @@ xfs_dinode_verify(
return __this_address;
if (di_size > XFS_DFORK_DSIZE(dip, mp))
return __this_address;
+ if (dip->di_nextents)
+ return __this_address;
/* fall through */
case XFS_DINODE_FMT_EXTENTS:
case XFS_DINODE_FMT_BTREE:
@@ -484,12 +486,31 @@ xfs_dinode_verify(
if (XFS_DFORK_Q(dip)) {
switch (dip->di_aformat) {
case XFS_DINODE_FMT_LOCAL:
+ if (dip->di_anextents)
+ return __this_address;
+ /* fall through */
case XFS_DINODE_FMT_EXTENTS:
case XFS_DINODE_FMT_BTREE:
break;
default:
return __this_address;
}
+ } else {
+ /*
+ * If there is no fork offset, this may be a freshly-made inode
+ * in a new disk cluster, in which case di_aformat is zeroed.
+ * Otherwise, such an inode must be in EXTENTS format; this goes
+ * for freed inodes as well.
+ */
+ switch (dip->di_aformat) {
+ case 0:
+ case XFS_DINODE_FMT_EXTENTS:
+ break;
+ default:
+ return __this_address;
+ }
+ if (dip->di_anextents)
+ return __this_address;
}
/* only version 3 or greater inodes are extensively verified here */
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index bb1b13a9b5f4..d4af2804b178 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -107,14 +107,12 @@ typedef uint16_t xfs_qwarncnt_t;
* to a single function. None of these XFS_QMOPT_* flags are meant to have
* persistent values (ie. their values can and will change between versions)
*/
-#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */
#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
-#define XFS_QMOPT_DQNEXT 0x0008000 /* return next dquot >= this ID */
/*
* flags to xfs_trans_mod_dquot to indicate which field needs to be
@@ -152,10 +150,11 @@ typedef uint16_t xfs_qwarncnt_t;
#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp,
- struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type,
- uint flags);
+ struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type);
+extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp,
+ struct xfs_dqblk *dqb, xfs_dqid_t id, uint type);
extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
-extern int xfs_dquot_repair(struct xfs_mount *mp, struct xfs_disk_dquot *ddq,
+extern int xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb,
xfs_dqid_t id, uint type);
#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index bee68c23d612..418d53295893 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -88,8 +88,25 @@ xfs_refcount_lookup_ge(
return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
}
+/*
+ * Look up the first record equal to [bno, len] in the btree
+ * given by cur.
+ */
+int
+xfs_refcount_lookup_eq(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ int *stat)
+{
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+ XFS_LOOKUP_LE);
+ cur->bc_rec.rc.rc_startblock = bno;
+ cur->bc_rec.rc.rc_blockcount = 0;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
/* Convert on-disk record to in-core format. */
-static inline void
+void
xfs_refcount_btrec_to_irec(
union xfs_btree_rec *rec,
struct xfs_refcount_irec *irec)
@@ -149,7 +166,7 @@ xfs_refcount_update(
* by [bno, len, refcount].
* This either works (return 0) or gets an EFSCORRUPTED error.
*/
-STATIC int
+int
xfs_refcount_insert(
struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec,
@@ -162,7 +179,10 @@ xfs_refcount_insert(
cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount;
cur->bc_rec.rc.rc_refcount = irec->rc_refcount;
error = xfs_btree_insert(cur, i);
+ if (error)
+ goto out_error;
XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error);
+
out_error:
if (error)
trace_xfs_refcount_insert_error(cur->bc_mp,
@@ -351,7 +371,6 @@ xfs_refcount_merge_center_extents(
struct xfs_refcount_irec *center,
struct xfs_refcount_irec *right,
unsigned long long extlen,
- xfs_agblock_t *agbno,
xfs_extlen_t *aglen)
{
int error;
@@ -471,7 +490,6 @@ xfs_refcount_merge_right_extent(
struct xfs_btree_cur *cur,
struct xfs_refcount_irec *right,
struct xfs_refcount_irec *cright,
- xfs_agblock_t *agbno,
xfs_extlen_t *aglen)
{
int error;
@@ -749,7 +767,7 @@ xfs_refcount_merge_extents(
ulen < MAXREFCEXTLEN) {
*shape_changed = true;
return xfs_refcount_merge_center_extents(cur, &left, &cleft,
- &right, ulen, agbno, aglen);
+ &right, ulen, aglen);
}
/* Try to merge left and cleft. */
@@ -778,7 +796,7 @@ xfs_refcount_merge_extents(
ulen < MAXREFCEXTLEN) {
*shape_changed = true;
return xfs_refcount_merge_right_extent(cur, &right, &cright,
- agbno, aglen);
+ aglen);
}
return error;
@@ -1356,9 +1374,7 @@ xfs_refcount_adjust_cow_extents(
struct xfs_btree_cur *cur,
xfs_agblock_t agbno,
xfs_extlen_t aglen,
- enum xfs_refc_adjust_op adj,
- struct xfs_defer_ops *dfops,
- struct xfs_owner_info *oinfo)
+ enum xfs_refc_adjust_op adj)
{
struct xfs_refcount_irec ext, tmp;
int error;
@@ -1437,8 +1453,7 @@ xfs_refcount_adjust_cow(
struct xfs_btree_cur *cur,
xfs_agblock_t agbno,
xfs_extlen_t aglen,
- enum xfs_refc_adjust_op adj,
- struct xfs_defer_ops *dfops)
+ enum xfs_refc_adjust_op adj)
{
bool shape_changed;
int error;
@@ -1465,8 +1480,7 @@ xfs_refcount_adjust_cow(
goto out_error;
/* Now that we've taken care of the ends, adjust the middle extents */
- error = xfs_refcount_adjust_cow_extents(cur, agbno, aglen, adj,
- dfops, NULL);
+ error = xfs_refcount_adjust_cow_extents(cur, agbno, aglen, adj);
if (error)
goto out_error;
@@ -1493,7 +1507,7 @@ __xfs_refcount_cow_alloc(
/* Add refcount btree reservation */
return xfs_refcount_adjust_cow(rcur, agbno, aglen,
- XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops);
+ XFS_REFCOUNT_ADJUST_COW_ALLOC);
}
/*
@@ -1511,7 +1525,7 @@ __xfs_refcount_cow_free(
/* Remove refcount btree reservation */
return xfs_refcount_adjust_cow(rcur, agbno, aglen,
- XFS_REFCOUNT_ADJUST_COW_FREE, dfops);
+ XFS_REFCOUNT_ADJUST_COW_FREE);
}
/* Record a CoW staging extent in the refcount btree. */
@@ -1568,7 +1582,7 @@ struct xfs_refcount_recovery {
/* Stuff an extent on the recovery list. */
STATIC int
xfs_refcount_recover_extent(
- struct xfs_btree_cur *cur,
+ struct xfs_btree_cur *cur,
union xfs_btree_rec *rec,
void *priv)
{
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 2a731ac68fe4..a92ad9078bc1 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -24,6 +24,8 @@ extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur,
xfs_agblock_t bno, int *stat);
extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur,
xfs_agblock_t bno, int *stat);
+extern int xfs_refcount_lookup_eq(struct xfs_btree_cur *cur,
+ xfs_agblock_t bno, int *stat);
extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec, int *stat);
@@ -85,5 +87,10 @@ static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
extern int xfs_refcount_has_record(struct xfs_btree_cur *cur,
xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
+union xfs_btree_rec;
+extern void xfs_refcount_btrec_to_irec(union xfs_btree_rec *rec,
+ struct xfs_refcount_irec *irec);
+extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec, int *stat);
#endif /* __XFS_REFCOUNT_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 265fdcefcbae..375abfeb6267 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -373,7 +373,6 @@ xfs_refcountbt_init_cursor(
*/
int
xfs_refcountbt_maxrecs(
- struct xfs_mount *mp,
int blocklen,
bool leaf)
{
@@ -390,7 +389,7 @@ void
xfs_refcountbt_compute_maxlevels(
struct xfs_mount *mp)
{
- mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(mp,
+ mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(
mp->m_refc_mnr, mp->m_sb.sb_agblocks);
}
@@ -400,7 +399,7 @@ xfs_refcountbt_calc_size(
struct xfs_mount *mp,
unsigned long long len)
{
- return xfs_btree_calc_size(mp, mp->m_refc_mnr, len);
+ return xfs_btree_calc_size(mp->m_refc_mnr, len);
}
/*
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
index 9db008b955b7..2bc4694ef146 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.h
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -60,8 +60,7 @@ struct xfs_mount;
extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno,
struct xfs_defer_ops *dfops);
-extern int xfs_refcountbt_maxrecs(struct xfs_mount *mp, int blocklen,
- bool leaf);
+extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf);
extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 79822cf6ebe3..c0644f1be8a8 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -376,7 +376,6 @@ xfs_rmap_free_check_owner(
struct xfs_mount *mp,
uint64_t ltoff,
struct xfs_rmap_irec *rec,
- xfs_fsblock_t bno,
xfs_filblks_t len,
uint64_t owner,
uint64_t offset,
@@ -519,7 +518,7 @@ xfs_rmap_unmap(
bno + len, out_error);
/* Check owner information. */
- error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, bno, len, owner,
+ error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, len, owner,
offset, flags);
if (error)
goto out_error;
@@ -1375,6 +1374,8 @@ xfs_rmap_convert_shared(
*/
error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags,
&PREV, &i);
+ if (error)
+ goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
ASSERT(PREV.rm_offset <= offset);
@@ -2031,6 +2032,34 @@ out_error:
return error;
}
+/* Insert a raw rmap into the rmapbt. */
+int
+xfs_rmap_map_raw(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rmap)
+{
+ struct xfs_owner_info oinfo;
+
+ oinfo.oi_owner = rmap->rm_owner;
+ oinfo.oi_offset = rmap->rm_offset;
+ oinfo.oi_flags = 0;
+ if (rmap->rm_flags & XFS_RMAP_ATTR_FORK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+ if (rmap->rm_flags & XFS_RMAP_BMBT_BLOCK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
+
+ if (rmap->rm_flags || XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
+ return xfs_rmap_map(cur, rmap->rm_startblock,
+ rmap->rm_blockcount,
+ rmap->rm_flags & XFS_RMAP_UNWRITTEN,
+ &oinfo);
+
+ return xfs_rmap_map_shared(cur, rmap->rm_startblock,
+ rmap->rm_blockcount,
+ rmap->rm_flags & XFS_RMAP_UNWRITTEN,
+ &oinfo);
+}
+
struct xfs_rmap_query_range_info {
xfs_rmap_query_range_fn fn;
void *priv;
@@ -2454,3 +2483,56 @@ xfs_rmap_record_exists(
irec.rm_startblock + irec.rm_blockcount >= bno + len);
return 0;
}
+
+struct xfs_rmap_key_state {
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
+ bool has_rmap;
+};
+
+/* For each rmap given, figure out if it doesn't match the key we want. */
+STATIC int
+xfs_rmap_has_other_keys_helper(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_rmap_key_state *rks = priv;
+
+ if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset &&
+ ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags)
+ return 0;
+ rks->has_rmap = true;
+ return XFS_BTREE_QUERY_RANGE_ABORT;
+}
+
+/*
+ * Given an extent and some owner info, can we find records overlapping
+ * the extent whose owner info does not match the given owner?
+ */
+int
+xfs_rmap_has_other_keys(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ struct xfs_owner_info *oinfo,
+ bool *has_rmap)
+{
+ struct xfs_rmap_irec low = {0};
+ struct xfs_rmap_irec high;
+ struct xfs_rmap_key_state rks;
+ int error;
+
+ xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags);
+ rks.has_rmap = false;
+
+ low.rm_startblock = bno;
+ memset(&high, 0xFF, sizeof(high));
+ high.rm_startblock = bno + len - 1;
+
+ error = xfs_rmap_query_range(cur, &low, &high,
+ xfs_rmap_has_other_keys_helper, &rks);
+ *has_rmap = rks.has_rmap;
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 380e53be98d5..43e506f67680 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -238,5 +238,9 @@ int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno,
int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, struct xfs_owner_info *oinfo,
bool *has_rmap);
+int xfs_rmap_has_other_keys(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ xfs_extlen_t len, struct xfs_owner_info *oinfo,
+ bool *has_rmap);
+int xfs_rmap_map_raw(struct xfs_btree_cur *cur, struct xfs_rmap_irec *rmap);
#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 8b0d0de1cd11..d756e0b84abf 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -499,7 +499,6 @@ xfs_rmapbt_init_cursor(
*/
int
xfs_rmapbt_maxrecs(
- struct xfs_mount *mp,
int blocklen,
int leaf)
{
@@ -534,7 +533,7 @@ xfs_rmapbt_compute_maxlevels(
if (xfs_sb_version_hasreflink(&mp->m_sb))
mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
else
- mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp,
+ mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
}
@@ -544,7 +543,7 @@ xfs_rmapbt_calc_size(
struct xfs_mount *mp,
unsigned long long len)
{
- return xfs_btree_calc_size(mp, mp->m_rmap_mnr, len);
+ return xfs_btree_calc_size(mp->m_rmap_mnr, len);
}
/*
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index 19c08e933049..d68d96eed7ea 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -55,7 +55,7 @@ struct xfs_mount;
struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *bp,
xfs_agnumber_t agno);
-int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf);
+int xfs_rmapbt_maxrecs(int blocklen, int leaf);
extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 106be2d0bb88..369eeb7a52ec 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -90,6 +90,9 @@ xfs_rtbuf_get(
if (error)
return error;
+ if (nmap == 0 || !xfs_bmap_is_real_extent(&map))
+ return -EFSCORRUPTED;
+
ASSERT(map.br_startblock != NULLFSBLOCK);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map.br_startblock),
@@ -1033,14 +1036,17 @@ xfs_rtalloc_query_range(
int is_free;
int error = 0;
- if (low_rec->ar_startblock > high_rec->ar_startblock)
+ if (low_rec->ar_startext > high_rec->ar_startext)
return -EINVAL;
- else if (low_rec->ar_startblock == high_rec->ar_startblock)
+ if (low_rec->ar_startext >= mp->m_sb.sb_rextents ||
+ low_rec->ar_startext == high_rec->ar_startext)
return 0;
+ if (high_rec->ar_startext >= mp->m_sb.sb_rextents)
+ high_rec->ar_startext = mp->m_sb.sb_rextents - 1;
/* Iterate the bitmap, looking for discrepancies. */
- rtstart = low_rec->ar_startblock;
- rem = high_rec->ar_startblock - rtstart;
+ rtstart = low_rec->ar_startext;
+ rem = high_rec->ar_startext - rtstart;
while (rem) {
/* Is the first block free? */
error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend,
@@ -1050,13 +1056,13 @@ xfs_rtalloc_query_range(
/* How long does the extent go for? */
error = xfs_rtfind_forw(mp, tp, rtstart,
- high_rec->ar_startblock - 1, &rtend);
+ high_rec->ar_startext - 1, &rtend);
if (error)
break;
if (is_free) {
- rec.ar_startblock = rtstart;
- rec.ar_blockcount = rtend - rtstart + 1;
+ rec.ar_startext = rtstart;
+ rec.ar_extcount = rtend - rtstart + 1;
error = fn(tp, &rec, priv);
if (error)
@@ -1079,9 +1085,9 @@ xfs_rtalloc_query_all(
{
struct xfs_rtalloc_rec keys[2];
- keys[0].ar_startblock = 0;
- keys[1].ar_startblock = tp->t_mountp->m_sb.sb_rblocks;
- keys[0].ar_blockcount = keys[1].ar_blockcount = 0;
+ keys[0].ar_startext = 0;
+ keys[1].ar_startext = tp->t_mountp->m_sb.sb_rextents - 1;
+ keys[0].ar_extcount = keys[1].ar_extcount = 0;
return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv);
}
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 53433cc024fd..d485e14313c6 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -756,15 +756,13 @@ xfs_sb_mount_common(
mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
- mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, 1);
- mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, 0);
+ mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 1);
+ mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 0);
mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2;
mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2;
- mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize,
- true);
- mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize,
- false);
+ mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, true);
+ mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, false);
mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2;
mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2;
@@ -890,6 +888,109 @@ xfs_sync_sb(
return xfs_trans_commit(tp);
}
+/*
+ * Update all the secondary superblocks to match the new state of the primary.
+ * Because we are completely overwriting all the existing fields in the
+ * secondary superblock buffers, there is no need to read them in from disk.
+ * Just get a new buffer, stamp it and write it.
+ *
+ * The sb buffers need to be cached here so that we serialise against other
+ * operations that access the secondary superblocks, but we don't want to keep
+ * them in memory once it is written so we mark it as a one-shot buffer.
+ */
+int
+xfs_update_secondary_sbs(
+ struct xfs_mount *mp)
+{
+ xfs_agnumber_t agno;
+ int saved_error = 0;
+ int error = 0;
+ LIST_HEAD (buffer_list);
+
+ /* update secondary superblocks. */
+ for (agno = 1; agno < mp->m_sb.sb_agcount; agno++) {
+ struct xfs_buf *bp;
+
+ bp = xfs_buf_get(mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, agno, XFS_SB_DADDR),
+ XFS_FSS_TO_BB(mp, 1), 0);
+ /*
+ * If we get an error reading or writing alternate superblocks,
+ * continue. xfs_repair chooses the "best" superblock based
+ * on most matches; if we break early, we'll leave more
+ * superblocks un-updated than updated, and xfs_repair may
+ * pick them over the properly-updated primary.
+ */
+ if (!bp) {
+ xfs_warn(mp,
+ "error allocating secondary superblock for ag %d",
+ agno);
+ if (!saved_error)
+ saved_error = -ENOMEM;
+ continue;
+ }
+
+ bp->b_ops = &xfs_sb_buf_ops;
+ xfs_buf_oneshot(bp);
+ xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+ xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+ xfs_buf_delwri_queue(bp, &buffer_list);
+ xfs_buf_relse(bp);
+
+ /* don't hold too many buffers at once */
+ if (agno % 16)
+ continue;
+
+ error = xfs_buf_delwri_submit(&buffer_list);
+ if (error) {
+ xfs_warn(mp,
+ "write error %d updating a secondary superblock near ag %d",
+ error, agno);
+ if (!saved_error)
+ saved_error = error;
+ continue;
+ }
+ }
+ error = xfs_buf_delwri_submit(&buffer_list);
+ if (error) {
+ xfs_warn(mp,
+ "write error %d updating a secondary superblock near ag %d",
+ error, agno);
+ }
+
+ return saved_error ? saved_error : error;
+}
+
+/*
+ * Same behavior as xfs_sync_sb, except that it is always synchronous and it
+ * also writes the superblock buffer to disk sector 0 immediately.
+ */
+int
+xfs_sync_sb_buf(
+ struct xfs_mount *mp)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_log_sb(tp);
+ xfs_trans_bhold(tp, mp->m_sb_bp);
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out;
+ /*
+ * write out the sb buffer to get the changes to disk
+ */
+ error = xfs_bwrite(mp->m_sb_bp);
+out:
+ xfs_buf_relse(mp->m_sb_bp);
+ return error;
+}
+
int
xfs_fs_geometry(
struct xfs_sb *sbp,
@@ -974,3 +1075,47 @@ xfs_fs_geometry(
return 0;
}
+
+/* Read a secondary superblock. */
+int
+xfs_sb_read_secondary(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf **bpp)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ ASSERT(agno != 0 && agno != NULLAGNUMBER);
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops);
+ if (error)
+ return error;
+ xfs_buf_set_ref(bp, XFS_SSB_REF);
+ *bpp = bp;
+ return 0;
+}
+
+/* Get an uninitialised secondary superblock buffer. */
+int
+xfs_sb_get_secondary(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf **bpp)
+{
+ struct xfs_buf *bp;
+
+ ASSERT(agno != 0 && agno != NULLAGNUMBER);
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0);
+ if (!bp)
+ return -ENOMEM;
+ bp->b_ops = &xfs_sb_buf_ops;
+ xfs_buf_oneshot(bp);
+ *bpp = bp;
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 63dcd2a1a657..244e0162c49e 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -18,6 +18,13 @@
#ifndef __XFS_SB_H__
#define __XFS_SB_H__
+struct xfs_mount;
+struct xfs_sb;
+struct xfs_dsb;
+struct xfs_trans;
+struct xfs_fsop_geom;
+struct xfs_perag;
+
/*
* perag get/put wrappers for ref counting
*/
@@ -29,13 +36,22 @@ extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
extern void xfs_log_sb(struct xfs_trans *tp);
extern int xfs_sync_sb(struct xfs_mount *mp, bool wait);
+extern int xfs_sync_sb_buf(struct xfs_mount *mp);
extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from);
extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from);
extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp);
+extern int xfs_update_secondary_sbs(struct xfs_mount *mp);
+
#define XFS_FS_GEOM_MAX_STRUCT_VER (4)
extern int xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo,
int struct_version);
+extern int xfs_sb_read_secondary(struct xfs_mount *mp,
+ struct xfs_trans *tp, xfs_agnumber_t agno,
+ struct xfs_buf **bpp);
+extern int xfs_sb_get_secondary(struct xfs_mount *mp,
+ struct xfs_trans *tp, xfs_agnumber_t agno,
+ struct xfs_buf **bpp);
#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index d0b84da0cb1e..ae99c260adb1 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -57,21 +57,6 @@ extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
extern const struct xfs_buf_ops xfs_rtbuf_ops;
-/*
- * This structure is used to track log items associated with
- * a transaction. It points to the log item and keeps some
- * flags to track the state of the log item. It also tracks
- * the amount of space needed to log the item it describes
- * once we get to commit processing (see xfs_trans_commit()).
- */
-struct xfs_log_item_desc {
- struct xfs_log_item *lid_item;
- struct list_head lid_trans;
- unsigned char lid_flags;
-};
-
-#define XFS_LID_DIRTY 0x1
-
/* log size calculation functions */
int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
int xfs_log_calc_minimum_size(struct xfs_mount *);
@@ -127,6 +112,7 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp,
#define XFS_ATTR_BTREE_REF 1
#define XFS_DQUOT_REF 1
#define XFS_REFC_BTREE_REF 1
+#define XFS_SSB_REF 0
/*
* Flags for xfs_trans_ichgtime().
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 5f17641f040f..3bccdf73e141 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -734,8 +734,7 @@ xfs_calc_clear_agi_bucket_reservation(
* the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
*/
STATIC uint
-xfs_calc_qm_setqlim_reservation(
- struct xfs_mount *mp)
+xfs_calc_qm_setqlim_reservation(void)
{
return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
}
@@ -772,8 +771,7 @@ xfs_calc_qm_quotaoff_reservation(
* the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
*/
STATIC uint
-xfs_calc_qm_quotaoff_end_reservation(
- struct xfs_mount *mp)
+xfs_calc_qm_quotaoff_end_reservation(void)
{
return sizeof(struct xfs_qoff_logitem) * 2;
}
@@ -877,14 +875,14 @@ xfs_trans_resv_calc(
* The following transactions are logged in logical format with
* a default log count.
*/
- resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp);
+ resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation();
resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp);
resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
resp->tr_qm_equotaoff.tr_logres =
- xfs_calc_qm_quotaoff_end_reservation(mp);
+ xfs_calc_qm_quotaoff_end_reservation();
resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 3c560695c546..ea18449bd732 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -30,7 +30,7 @@ typedef int64_t xfs_fsize_t; /* bytes in a file */
typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */
-typedef int32_t xfs_rtword_t; /* word type for bitmap manipulations */
+typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */
typedef int64_t xfs_lsn_t; /* log sequence number */
typedef int32_t xfs_tid_t; /* transaction identifier */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 018aabbd9394..1f71793f7db4 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -38,68 +38,6 @@
#include "scrub/common.h"
#include "scrub/trace.h"
-/*
- * Walk all the blocks in the AGFL. The fn function can return any negative
- * error code or XFS_BTREE_QUERY_RANGE_ABORT.
- */
-int
-xfs_scrub_walk_agfl(
- struct xfs_scrub_context *sc,
- int (*fn)(struct xfs_scrub_context *,
- xfs_agblock_t bno, void *),
- void *priv)
-{
- struct xfs_agf *agf;
- __be32 *agfl_bno;
- struct xfs_mount *mp = sc->mp;
- unsigned int flfirst;
- unsigned int fllast;
- int i;
- int error;
-
- agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, sc->sa.agfl_bp);
- flfirst = be32_to_cpu(agf->agf_flfirst);
- fllast = be32_to_cpu(agf->agf_fllast);
-
- /* Nothing to walk in an empty AGFL. */
- if (agf->agf_flcount == cpu_to_be32(0))
- return 0;
-
- /* first to last is a consecutive list. */
- if (fllast >= flfirst) {
- for (i = flfirst; i <= fllast; i++) {
- error = fn(sc, be32_to_cpu(agfl_bno[i]), priv);
- if (error)
- return error;
- if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return error;
- }
-
- return 0;
- }
-
- /* first to the end */
- for (i = flfirst; i < xfs_agfl_size(mp); i++) {
- error = fn(sc, be32_to_cpu(agfl_bno[i]), priv);
- if (error)
- return error;
- if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return error;
- }
-
- /* the start to last. */
- for (i = 0; i <= fllast; i++) {
- error = fn(sc, be32_to_cpu(agfl_bno[i]), priv);
- if (error)
- return error;
- if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return error;
- }
-
- return 0;
-}
-
/* Superblock */
/* Cross-reference with the other btrees. */
@@ -157,9 +95,7 @@ xfs_scrub_superblock(
if (agno == 0)
return 0;
- error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops);
+ error = xfs_sb_read_secondary(mp, sc->tp, agno, &bp);
/*
* The superblock verifier can return several different error codes
* if it thinks the superblock doesn't look right. For a mount these
@@ -680,6 +616,7 @@ struct xfs_scrub_agfl_info {
unsigned int sz_entries;
unsigned int nr_entries;
xfs_agblock_t *entries;
+ struct xfs_scrub_context *sc;
};
/* Cross-reference with the other btrees. */
@@ -701,12 +638,12 @@ xfs_scrub_agfl_block_xref(
/* Scrub an AGFL block. */
STATIC int
xfs_scrub_agfl_block(
- struct xfs_scrub_context *sc,
+ struct xfs_mount *mp,
xfs_agblock_t agbno,
void *priv)
{
- struct xfs_mount *mp = sc->mp;
struct xfs_scrub_agfl_info *sai = priv;
+ struct xfs_scrub_context *sc = sai->sc;
xfs_agnumber_t agno = sc->sa.agno;
if (xfs_verify_agbno(mp, agno, agbno) &&
@@ -717,6 +654,9 @@ xfs_scrub_agfl_block(
xfs_scrub_agfl_block_xref(sc, agbno, priv);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return XFS_BTREE_QUERY_RANGE_ABORT;
+
return 0;
}
@@ -796,8 +736,10 @@ xfs_scrub_agfl(
goto out;
}
memset(&sai, 0, sizeof(sai));
+ sai.sc = sc;
sai.sz_entries = agflcount;
- sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, KM_NOFS);
+ sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount,
+ KM_MAYFAIL);
if (!sai.entries) {
error = -ENOMEM;
goto out;
@@ -805,7 +747,12 @@ xfs_scrub_agfl(
/* Check the blocks in the AGFL. */
xfs_rmap_ag_owner(&sai.oinfo, XFS_RMAP_OWN_AG);
- error = xfs_scrub_walk_agfl(sc, xfs_scrub_agfl_block, &sai);
+ error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
+ sc->sa.agfl_bp, xfs_scrub_agfl_block, &sai);
+ if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+ error = 0;
+ goto out_free;
+ }
if (error)
goto out_free;
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
new file mode 100644
index 000000000000..8b91e9ebe1e7
--- /dev/null
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/* Superblock */
+
+/* Repair the superblock. */
+int
+xfs_repair_superblock(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+ xfs_agnumber_t agno;
+ int error;
+
+ /* Don't try to repair AG 0's sb; let xfs_repair deal with it. */
+ agno = sc->sm->sm_agno;
+ if (agno == 0)
+ return -EOPNOTSUPP;
+
+ error = xfs_sb_get_secondary(mp, sc->tp, agno, &bp);
+ if (error)
+ return error;
+
+ /* Copy AG 0's superblock to this one. */
+ xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+ xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+
+ /* Write this to disk. */
+ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
+ xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
+ return error;
+}
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 517c079d3f68..941a0a55224e 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -70,7 +70,7 @@ xfs_scrub_allocbt_xref_other(
pcur = &sc->sa.cnt_cur;
else
pcur = &sc->sa.bno_cur;
- if (!*pcur)
+ if (!*pcur || xfs_scrub_skip_xref(sc->sm))
return;
error = xfs_alloc_lookup_le(*pcur, agbno, len, &has_otherrec);
@@ -172,7 +172,7 @@ xfs_scrub_xref_is_used_space(
bool is_freesp;
int error;
- if (!sc->sa.bno_cur)
+ if (!sc->sa.bno_cur || xfs_scrub_skip_xref(sc->sm))
return;
error = xfs_alloc_has_record(sc->sa.bno_cur, agbno, len, &is_freesp);
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 127575f0abfb..84b6d6b66578 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -126,8 +126,9 @@ xfs_scrub_xattr_listent(
if (args.valuelen != valuelen)
xfs_scrub_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK,
args.blkno);
-
fail_xref:
+ if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ context->seen_enough = 1;
return;
}
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 639d14b51e90..eeadb33a701c 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -51,7 +51,6 @@ xfs_scrub_setup_inode_bmap(
struct xfs_scrub_context *sc,
struct xfs_inode *ip)
{
- struct xfs_mount *mp = sc->mp;
int error;
error = xfs_scrub_get_inode(sc, ip);
@@ -75,7 +74,7 @@ xfs_scrub_setup_inode_bmap(
}
/* Got the inode, lock it and we're ready to go. */
- error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+ error = xfs_scrub_trans_alloc(sc, 0);
if (error)
goto out;
sc->ilock_flags |= XFS_ILOCK_EXCL;
@@ -175,7 +174,7 @@ xfs_scrub_bmap_xref_rmap(
unsigned long long rmap_end;
uint64_t owner;
- if (!info->sc->sa.rmap_cur)
+ if (!info->sc->sa.rmap_cur || xfs_scrub_skip_xref(info->sc->sm))
return;
if (info->whichfork == XFS_COW_FORK)
@@ -684,7 +683,8 @@ xfs_scrub_bmap(
info.lastoff = 0;
ifp = XFS_IFORK_PTR(ip, whichfork);
for_each_xfs_iext(ifp, &icur, &irec) {
- if (xfs_scrub_should_terminate(sc, &error))
+ if (xfs_scrub_should_terminate(sc, &error) ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
break;
if (isnullstartblock(irec.br_startblock))
continue;
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 54218168c8f9..2d29dceaa00e 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -442,7 +442,7 @@ xfs_scrub_btree_check_owner(
*/
if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
co = kmem_alloc(sizeof(struct check_owner),
- KM_MAYFAIL | KM_NOFS);
+ KM_MAYFAIL);
if (!co)
return -ENOMEM;
co->level = level;
@@ -455,6 +455,44 @@ xfs_scrub_btree_check_owner(
}
/*
+ * Check that this btree block has at least minrecs records or is one of the
+ * special blocks that don't require that.
+ */
+STATIC void
+xfs_scrub_btree_check_minrecs(
+ struct xfs_scrub_btree *bs,
+ int level,
+ struct xfs_btree_block *block)
+{
+ unsigned int numrecs;
+ int ok_level;
+
+ numrecs = be16_to_cpu(block->bb_numrecs);
+
+ /* More records than minrecs means the block is ok. */
+ if (numrecs >= bs->cur->bc_ops->get_minrecs(bs->cur, level))
+ return;
+
+ /*
+ * Certain btree blocks /can/ have fewer than minrecs records. Any
+ * level greater than or equal to the level of the highest dedicated
+ * btree block are allowed to violate this constraint.
+ *
+ * For a btree rooted in a block, the btree root can have fewer than
+ * minrecs records. If the btree is rooted in an inode and does not
+ * store records in the root, the direct children of the root and the
+ * root itself can have fewer than minrecs records.
+ */
+ ok_level = bs->cur->bc_nlevels - 1;
+ if (bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ ok_level--;
+ if (level >= ok_level)
+ return;
+
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, level);
+}
+
+/*
* Grab and scrub a btree block given a btree pointer. Returns block
* and buffer pointers (if applicable) if they're ok to use.
*/
@@ -491,6 +529,8 @@ xfs_scrub_btree_get_block(
if (*pbp)
xfs_scrub_buffer_recheck(bs->sc, *pbp);
+ xfs_scrub_btree_check_minrecs(bs, level, *pblock);
+
/*
* Check the block's owner; this function absorbs error codes
* for us.
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 8ed91d5c868d..41198a5f872c 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -44,11 +44,14 @@
#include "xfs_rmap_btree.h"
#include "xfs_log.h"
#include "xfs_trans_priv.h"
+#include "xfs_attr.h"
+#include "xfs_reflink.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/btree.h"
+#include "scrub/repair.h"
/* Common code for the metadata scrubbers. */
@@ -539,6 +542,10 @@ xfs_scrub_ag_free(
xfs_trans_brelse(sc->tp, sa->agi_bp);
sa->agi_bp = NULL;
}
+ if (sa->pag) {
+ xfs_perag_put(sa->pag);
+ sa->pag = NULL;
+ }
sa->agno = NULLAGNUMBER;
}
@@ -566,15 +573,53 @@ xfs_scrub_ag_init(
return xfs_scrub_ag_btcur_init(sc, sa);
}
+/*
+ * Grab the per-ag structure if we haven't already gotten it. Teardown of the
+ * xfs_scrub_ag will release it for us.
+ */
+void
+xfs_scrub_perag_get(
+ struct xfs_mount *mp,
+ struct xfs_scrub_ag *sa)
+{
+ if (!sa->pag)
+ sa->pag = xfs_perag_get(mp, sa->agno);
+}
+
/* Per-scrubber setup functions */
+/*
+ * Grab an empty transaction so that we can re-grab locked buffers if
+ * one of our btrees turns out to be cyclic.
+ *
+ * If we're going to repair something, we need to ask for the largest possible
+ * log reservation so that we can handle the worst case scenario for metadata
+ * updates while rebuilding a metadata item. We also need to reserve as many
+ * blocks in the head transaction as we think we're going to need to rebuild
+ * the metadata object.
+ */
+int
+xfs_scrub_trans_alloc(
+ struct xfs_scrub_context *sc,
+ uint resblks)
+{
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
+ resblks, 0, 0, &sc->tp);
+
+ return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+}
+
/* Set us up with a transaction and an empty context. */
int
xfs_scrub_setup_fs(
struct xfs_scrub_context *sc,
struct xfs_inode *ip)
{
- return xfs_scrub_trans_alloc(sc->sm, sc->mp, &sc->tp);
+ uint resblks;
+
+ resblks = xfs_repair_calc_ag_resblks(sc);
+ return xfs_scrub_trans_alloc(sc, resblks);
}
/* Set us up with AG headers and btree cursors. */
@@ -695,7 +740,6 @@ xfs_scrub_setup_inode_contents(
struct xfs_inode *ip,
unsigned int resblks)
{
- struct xfs_mount *mp = sc->mp;
int error;
error = xfs_scrub_get_inode(sc, ip);
@@ -705,7 +749,7 @@ xfs_scrub_setup_inode_contents(
/* Got the inode, lock it and we're ready to go. */
sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
xfs_ilock(sc->ip, sc->ilock_flags);
- error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+ error = xfs_scrub_trans_alloc(sc, resblks);
if (error)
goto out;
sc->ilock_flags |= XFS_ILOCK_EXCL;
@@ -727,6 +771,10 @@ xfs_scrub_should_check_xref(
int *error,
struct xfs_btree_cur **curpp)
{
+ /* No point in xref if we already know we're corrupt. */
+ if (xfs_scrub_skip_xref(sc->sm))
+ return false;
+
if (*error == 0)
return true;
@@ -773,3 +821,80 @@ xfs_scrub_buffer_recheck(
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
trace_xfs_scrub_block_error(sc, bp->b_bn, fa);
}
+
+/*
+ * Scrub the attr/data forks of a metadata inode. The metadata inode must be
+ * pointed to by sc->ip and the ILOCK must be held.
+ */
+int
+xfs_scrub_metadata_inode_forks(
+ struct xfs_scrub_context *sc)
+{
+ __u32 smtype;
+ bool shared;
+ int error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /* Metadata inodes don't live on the rt device. */
+ if (sc->ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
+ return 0;
+ }
+
+ /* They should never participate in reflink. */
+ if (xfs_is_reflink_inode(sc->ip)) {
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
+ return 0;
+ }
+
+ /* They also should never have extended attributes. */
+ if (xfs_inode_hasattr(sc->ip)) {
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
+ return 0;
+ }
+
+ /* Invoke the data fork scrubber. */
+ smtype = sc->sm->sm_type;
+ sc->sm->sm_type = XFS_SCRUB_TYPE_BMBTD;
+ error = xfs_scrub_bmap_data(sc);
+ sc->sm->sm_type = smtype;
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
+ /* Look for incorrect shared blocks. */
+ if (xfs_sb_version_hasreflink(&sc->mp->m_sb)) {
+ error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
+ &shared);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0,
+ &error))
+ return error;
+ if (shared)
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
+ }
+
+ return error;
+}
+
+/*
+ * Try to lock an inode in violation of the usual locking order rules. For
+ * example, trying to get the IOLOCK while in transaction context, or just
+ * plain breaking AG-order or inode-order inode locking rules. Either way,
+ * the only way to avoid an ABBA deadlock is to use trylock and back off if
+ * we can't.
+ */
+int
+xfs_scrub_ilock_inverted(
+ struct xfs_inode *ip,
+ uint lock_mode)
+{
+ int i;
+
+ for (i = 0; i < 20; i++) {
+ if (xfs_ilock_nowait(ip, lock_mode))
+ return 0;
+ delay(1);
+ }
+ return -EDEADLOCK;
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index deaf60400981..76bb2d1d808c 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -38,19 +38,7 @@ xfs_scrub_should_terminate(
return false;
}
-/*
- * Grab an empty transaction so that we can re-grab locked buffers if
- * one of our btrees turns out to be cyclic.
- */
-static inline int
-xfs_scrub_trans_alloc(
- struct xfs_scrub_metadata *sm,
- struct xfs_mount *mp,
- struct xfs_trans **tpp)
-{
- return xfs_trans_alloc_empty(mp, tpp);
-}
-
+int xfs_scrub_trans_alloc(struct xfs_scrub_context *sc, uint resblks);
bool xfs_scrub_process_error(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
xfs_agblock_t bno, int *error);
bool xfs_scrub_fblock_process_error(struct xfs_scrub_context *sc, int whichfork,
@@ -135,16 +123,13 @@ xfs_scrub_setup_quota(struct xfs_scrub_context *sc, struct xfs_inode *ip)
void xfs_scrub_ag_free(struct xfs_scrub_context *sc, struct xfs_scrub_ag *sa);
int xfs_scrub_ag_init(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
struct xfs_scrub_ag *sa);
+void xfs_scrub_perag_get(struct xfs_mount *mp, struct xfs_scrub_ag *sa);
int xfs_scrub_ag_read_headers(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
struct xfs_buf **agi, struct xfs_buf **agf,
struct xfs_buf **agfl);
void xfs_scrub_ag_btcur_free(struct xfs_scrub_ag *sa);
int xfs_scrub_ag_btcur_init(struct xfs_scrub_context *sc,
struct xfs_scrub_ag *sa);
-int xfs_scrub_walk_agfl(struct xfs_scrub_context *sc,
- int (*fn)(struct xfs_scrub_context *, xfs_agblock_t bno,
- void *),
- void *priv);
int xfs_scrub_count_rmap_ownedby_ag(struct xfs_scrub_context *sc,
struct xfs_btree_cur *cur,
struct xfs_owner_info *oinfo,
@@ -157,4 +142,17 @@ int xfs_scrub_setup_inode_contents(struct xfs_scrub_context *sc,
struct xfs_inode *ip, unsigned int resblks);
void xfs_scrub_buffer_recheck(struct xfs_scrub_context *sc, struct xfs_buf *bp);
+/*
+ * Don't bother cross-referencing if we already found corruption or cross
+ * referencing discrepancies.
+ */
+static inline bool xfs_scrub_skip_xref(struct xfs_scrub_metadata *sm)
+{
+ return sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT);
+}
+
+int xfs_scrub_metadata_inode_forks(struct xfs_scrub_context *sc);
+int xfs_scrub_ilock_inverted(struct xfs_inode *ip, uint lock_mode);
+
#endif /* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 38f29806eb54..1a4309b3e786 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -172,7 +172,7 @@ xfs_scrub_dir_actor(
error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL);
if (!xfs_scrub_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset,
&error))
- goto fail_xref;
+ goto out;
if (lookup_ino != ino) {
xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
goto out;
@@ -183,8 +183,13 @@ xfs_scrub_dir_actor(
if (error)
goto out;
out:
- return error;
-fail_xref:
+ /*
+ * A negative error code returned here is supposed to cause the
+ * dir_emit caller (xfs_readdir) to abort the directory iteration
+ * and return zero to xfs_scrub_directory.
+ */
+ if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return -EFSCORRUPTED;
return error;
}
@@ -240,6 +245,9 @@ xfs_scrub_dir_rec(
}
xfs_scrub_buffer_recheck(ds->sc, bp);
+ if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out_relse;
+
dent = (struct xfs_dir2_data_entry *)(((char *)bp->b_addr) + off);
/* Make sure we got a real directory entry. */
@@ -357,6 +365,9 @@ xfs_scrub_directory_data_bestfree(
/* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out_buf;
+
/* Do the bestfrees correspond to actual free space? */
bf = d_ops->data_bestfree_p(bp->b_addr);
smallest_bestfree = UINT_MAX;
@@ -413,14 +424,18 @@ xfs_scrub_directory_data_bestfree(
/* Spot check this free entry */
tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup));
- if (tag != ((char *)dup - (char *)bp->b_addr))
+ if (tag != ((char *)dup - (char *)bp->b_addr)) {
xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out_buf;
+ }
/*
* Either this entry is a bestfree or it's smaller than
* any of the bestfrees.
*/
xfs_scrub_directory_check_free_entry(sc, lblk, bf, dup);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out_buf;
/* Move on. */
newlen = be16_to_cpu(dup->length);
@@ -546,6 +561,8 @@ xfs_scrub_directory_leaf1_bestfree(
}
if (leafhdr.stale != stale)
xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
/* Check all the bestfree entries. */
for (i = 0; i < bestcount; i++, bestp++) {
@@ -556,9 +573,11 @@ xfs_scrub_directory_leaf1_bestfree(
i * args->geo->fsbcount, -1, &dbp);
if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk,
&error))
- continue;
+ break;
xfs_scrub_directory_check_freesp(sc, lblk, dbp, best);
xfs_trans_brelse(sc->tp, dbp);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
}
out:
return error;
@@ -607,7 +626,7 @@ xfs_scrub_directory_free_bestfree(
-1, &dbp);
if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk,
&error))
- continue;
+ break;
xfs_scrub_directory_check_freesp(sc, lblk, dbp, best);
xfs_trans_brelse(sc->tp, dbp);
}
@@ -656,7 +675,7 @@ xfs_scrub_directory_blocks(
/* Iterate all the data extents in the directory... */
found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
- while (found) {
+ while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
/* Block directories only have a single block at offset 0. */
if (is_block &&
(got.br_startoff > 0 ||
@@ -719,7 +738,7 @@ xfs_scrub_directory_blocks(
/* Scan for free blocks */
lblk = free_lblk;
found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
- while (found) {
+ while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
/*
* Dirs can't have blocks mapped above 2^32.
* Single-block dirs shouldn't even be here.
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 106ca4bd753f..00a834d3b56d 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -387,7 +387,8 @@ xfs_scrub_iallocbt_xref_rmap_btreeblks(
int error;
if (!sc->sa.ino_cur || !sc->sa.rmap_cur ||
- (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur))
+ (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur) ||
+ xfs_scrub_skip_xref(sc->sm))
return;
/* Check that we saw as many inobt blocks as the rmap says. */
@@ -424,7 +425,7 @@ xfs_scrub_iallocbt_xref_rmap_inodes(
xfs_filblks_t blocks;
int error;
- if (!sc->sa.rmap_cur)
+ if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm))
return;
/* Check that we saw as many inode blocks as the rmap knows about. */
@@ -496,7 +497,7 @@ xfs_scrub_xref_inode_check(
bool has_inodes;
int error;
- if (!(*icur))
+ if (!(*icur) || xfs_scrub_skip_xref(sc->sm))
return;
error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &has_inodes);
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index df14930e4fc5..0c696f7018de 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -55,7 +55,6 @@ xfs_scrub_setup_inode(
struct xfs_scrub_context *sc,
struct xfs_inode *ip)
{
- struct xfs_mount *mp = sc->mp;
int error;
/*
@@ -68,7 +67,7 @@ xfs_scrub_setup_inode(
break;
case -EFSCORRUPTED:
case -EFSBADCRC:
- return xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+ return xfs_scrub_trans_alloc(sc, 0);
default:
return error;
}
@@ -76,7 +75,7 @@ xfs_scrub_setup_inode(
/* Got the inode, lock it and we're ready to go. */
sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
xfs_ilock(sc->ip, sc->ilock_flags);
- error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+ error = xfs_scrub_trans_alloc(sc, 0);
if (error)
goto out;
sc->ilock_flags |= XFS_ILOCK_EXCL;
@@ -449,7 +448,7 @@ xfs_scrub_inode_xref_finobt(
int has_record;
int error;
- if (!sc->sa.fino_cur)
+ if (!sc->sa.fino_cur || xfs_scrub_skip_xref(sc->sm))
return;
agino = XFS_INO_TO_AGINO(sc->mp, ino);
@@ -492,6 +491,9 @@ xfs_scrub_inode_xref_bmap(
xfs_filblks_t acount;
int error;
+ if (xfs_scrub_skip_xref(sc->sm))
+ return;
+
/* Walk all the extents to check nextents/naextents/nblocks. */
error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
&nextents, &count);
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 1fb88c18d455..77c6b22c6bfd 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -147,6 +147,9 @@ xfs_scrub_parent_validate(
*try_again = false;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
/* '..' must not point to ourselves. */
if (sc->ip->i_ino == dnum) {
xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
@@ -211,7 +214,9 @@ xfs_scrub_parent_validate(
*/
xfs_iunlock(sc->ip, sc->ilock_flags);
sc->ilock_flags = 0;
- xfs_ilock(dp, XFS_IOLOCK_SHARED);
+ error = xfs_scrub_ilock_inverted(dp, XFS_IOLOCK_SHARED);
+ if (error)
+ goto out_rele;
/* Go looking for our dentry. */
error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink);
@@ -220,8 +225,10 @@ xfs_scrub_parent_validate(
/* Drop the parent lock, relock this inode. */
xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+ error = xfs_scrub_ilock_inverted(sc->ip, XFS_IOLOCK_EXCL);
+ if (error)
+ goto out_rele;
sc->ilock_flags = XFS_IOLOCK_EXCL;
- xfs_ilock(sc->ip, sc->ilock_flags);
/*
* If we're an unlinked directory, the parent /won't/ have a link
@@ -323,5 +330,13 @@ xfs_scrub_parent(
if (try_again && tries == 20)
xfs_scrub_set_incomplete(sc);
out:
+ /*
+ * If we failed to lock the parent inode even after a retry, just mark
+ * this scrub incomplete and return.
+ */
+ if (sc->try_harder && error == -EDEADLOCK) {
+ error = 0;
+ xfs_scrub_set_incomplete(sc);
+ }
return error;
}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 6ba465e6c885..15ae4d23d6ac 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -66,25 +66,43 @@ xfs_scrub_setup_quota(
struct xfs_inode *ip)
{
uint dqtype;
+ int error;
+
+ if (!XFS_IS_QUOTA_RUNNING(sc->mp) || !XFS_IS_QUOTA_ON(sc->mp))
+ return -ENOENT;
dqtype = xfs_scrub_quota_to_dqtype(sc);
if (dqtype == 0)
return -EINVAL;
+ sc->has_quotaofflock = true;
+ mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
if (!xfs_this_quota_on(sc->mp, dqtype))
return -ENOENT;
+ error = xfs_scrub_setup_fs(sc, ip);
+ if (error)
+ return error;
+ sc->ip = xfs_quota_inode(sc->mp, dqtype);
+ xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+ sc->ilock_flags = XFS_ILOCK_EXCL;
return 0;
}
/* Quotas. */
+struct xfs_scrub_quota_info {
+ struct xfs_scrub_context *sc;
+ xfs_dqid_t last_id;
+};
+
/* Scrub the fields in an individual quota item. */
-STATIC void
+STATIC int
xfs_scrub_quota_item(
- struct xfs_scrub_context *sc,
- uint dqtype,
struct xfs_dquot *dq,
- xfs_dqid_t id)
+ uint dqtype,
+ void *priv)
{
+ struct xfs_scrub_quota_info *sqi = priv;
+ struct xfs_scrub_context *sc = sqi->sc;
struct xfs_mount *mp = sc->mp;
struct xfs_disk_dquot *d = &dq->q_core;
struct xfs_quotainfo *qi = mp->m_quotainfo;
@@ -99,17 +117,18 @@ xfs_scrub_quota_item(
unsigned long long icount;
unsigned long long rcount;
xfs_ino_t fs_icount;
-
- offset = id / qi->qi_dqperchunk;
+ xfs_dqid_t id = be32_to_cpu(d->d_id);
/*
- * We fed $id and DQNEXT into the xfs_qm_dqget call, which means
- * that the actual dquot we got must either have the same id or
- * the next higher id.
+ * Except for the root dquot, the actual dquot we got must either have
+ * the same or higher id as we saw before.
*/
- if (id > be32_to_cpu(d->d_id))
+ offset = id / qi->qi_dqperchunk;
+ if (id && id <= sqi->last_id)
xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ sqi->last_id = id;
+
/* Did we get the dquot type we wanted? */
if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES))
xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
@@ -183,115 +202,85 @@ xfs_scrub_quota_item(
xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
if (id != 0 && rhard != 0 && rcount > rhard)
xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+
+ return 0;
}
-/* Scrub all of a quota type's items. */
-int
-xfs_scrub_quota(
+/* Check the quota's data fork. */
+STATIC int
+xfs_scrub_quota_data_fork(
struct xfs_scrub_context *sc)
{
struct xfs_bmbt_irec irec = { 0 };
- struct xfs_mount *mp = sc->mp;
- struct xfs_inode *ip;
- struct xfs_quotainfo *qi = mp->m_quotainfo;
- struct xfs_dquot *dq;
+ struct xfs_iext_cursor icur;
+ struct xfs_quotainfo *qi = sc->mp->m_quotainfo;
+ struct xfs_ifork *ifp;
xfs_fileoff_t max_dqid_off;
- xfs_fileoff_t off = 0;
- xfs_dqid_t id = 0;
- uint dqtype;
- int nimaps;
int error = 0;
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
- return -ENOENT;
-
- mutex_lock(&qi->qi_quotaofflock);
- dqtype = xfs_scrub_quota_to_dqtype(sc);
- if (!xfs_this_quota_on(sc->mp, dqtype)) {
- error = -ENOENT;
- goto out_unlock_quota;
- }
-
- /* Attach to the quota inode and set sc->ip so that reporting works. */
- ip = xfs_quota_inode(sc->mp, dqtype);
- sc->ip = ip;
+ /* Invoke the fork scrubber. */
+ error = xfs_scrub_metadata_inode_forks(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
- /* Look for problem extents. */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
- xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
- goto out_unlock_inode;
- }
+ /* Check for data fork problems that apply only to quota files. */
max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk;
- while (1) {
+ ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
+ for_each_xfs_iext(ifp, &icur, &irec) {
if (xfs_scrub_should_terminate(sc, &error))
break;
-
- off = irec.br_startoff + irec.br_blockcount;
- nimaps = 1;
- error = xfs_bmapi_read(ip, off, -1, &irec, &nimaps,
- XFS_BMAPI_ENTIRE);
- if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, off,
- &error))
- goto out_unlock_inode;
- if (!nimaps)
- break;
- if (irec.br_startblock == HOLESTARTBLOCK)
- continue;
-
- /* Check the extent record doesn't point to crap. */
- if (irec.br_startblock + irec.br_blockcount <=
- irec.br_startblock)
- xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
- irec.br_startoff);
- if (!xfs_verify_fsbno(mp, irec.br_startblock) ||
- !xfs_verify_fsbno(mp, irec.br_startblock +
- irec.br_blockcount - 1))
- xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
- irec.br_startoff);
-
/*
- * Unwritten extents or blocks mapped above the highest
+ * delalloc extents or blocks mapped above the highest
* quota id shouldn't happen.
*/
if (isnullstartblock(irec.br_startblock) ||
irec.br_startoff > max_dqid_off ||
- irec.br_startoff + irec.br_blockcount > max_dqid_off + 1)
- xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
+ irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ irec.br_startoff);
+ break;
+ }
}
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- goto out;
- /* Check all the quota items. */
- while (id < ((xfs_dqid_t)-1ULL)) {
- if (xfs_scrub_should_terminate(sc, &error))
- break;
+ return error;
+}
- error = xfs_qm_dqget(mp, NULL, id, dqtype, XFS_QMOPT_DQNEXT,
- &dq);
- if (error == -ENOENT)
- break;
- if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK,
- id * qi->qi_dqperchunk, &error))
- break;
+/* Scrub all of a quota type's items. */
+int
+xfs_scrub_quota(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_scrub_quota_info sqi;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ uint dqtype;
+ int error = 0;
- xfs_scrub_quota_item(sc, dqtype, dq, id);
+ dqtype = xfs_scrub_quota_to_dqtype(sc);
- id = be32_to_cpu(dq->q_core.d_id) + 1;
- xfs_qm_dqput(dq);
- if (!id)
- break;
- }
+ /* Look for problem extents. */
+ error = xfs_scrub_quota_data_fork(sc);
+ if (error)
+ goto out;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /*
+ * Check all the quota items. Now that we've checked the quota inode
+ * data fork we have to drop ILOCK_EXCL to use the regular dquot
+ * functions.
+ */
+ xfs_iunlock(sc->ip, sc->ilock_flags);
+ sc->ilock_flags = 0;
+ sqi.sc = sc;
+ sqi.last_id = 0;
+ error = xfs_qm_dqiterate(mp, dqtype, xfs_scrub_quota_item, &sqi);
+ sc->ilock_flags = XFS_ILOCK_EXCL;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK,
+ sqi.last_id * qi->qi_dqperchunk, &error))
+ goto out;
out:
- /* We set sc->ip earlier, so make sure we clear it now. */
- sc->ip = NULL;
-out_unlock_quota:
- mutex_unlock(&qi->qi_quotaofflock);
return error;
-
-out_unlock_inode:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- goto out;
}
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 400f1561cd3d..324a5f159145 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -150,7 +150,7 @@ xfs_scrub_refcountbt_rmap_check(
* so we don't need insertion sort here.
*/
frag = kmem_alloc(sizeof(struct xfs_scrub_refcnt_frag),
- KM_MAYFAIL | KM_NOFS);
+ KM_MAYFAIL);
if (!frag)
return -ENOMEM;
memcpy(&frag->rm, rec, sizeof(frag->rm));
@@ -310,7 +310,7 @@ xfs_scrub_refcountbt_xref_rmap(
struct xfs_scrub_refcnt_frag *n;
int error;
- if (!sc->sa.rmap_cur)
+ if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm))
return;
/* Cross-reference with the rmapbt to confirm the refcount. */
@@ -404,7 +404,7 @@ xfs_scrub_refcount_xref_rmap(
xfs_filblks_t blocks;
int error;
- if (!sc->sa.rmap_cur)
+ if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm))
return;
/* Check that we saw as many refcbt blocks as the rmap knows about. */
@@ -460,7 +460,7 @@ xfs_scrub_xref_is_cow_staging(
int has_refcount;
int error;
- if (!sc->sa.refc_cur)
+ if (!sc->sa.refc_cur || xfs_scrub_skip_xref(sc->sm))
return;
/* Find the CoW staging extent. */
@@ -504,7 +504,7 @@ xfs_scrub_xref_is_not_shared(
bool shared;
int error;
- if (!sc->sa.refc_cur)
+ if (!sc->sa.refc_cur || xfs_scrub_skip_xref(sc->sm))
return;
error = xfs_refcount_has_record(sc->sa.refc_cur, agbno, len, &shared);
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
new file mode 100644
index 000000000000..e3e8fba1c99c
--- /dev/null
+++ b/fs/xfs/scrub/repair.c
@@ -0,0 +1,1089 @@
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_extent_busy.h"
+#include "xfs_ag_resv.h"
+#include "xfs_trans_space.h"
+#include "xfs_quota.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/*
+ * Attempt to repair some metadata, if the metadata is corrupt and userspace
+ * told us to fix it. This function returns -EAGAIN to mean "re-run scrub",
+ * and will set *fixed to true if it thinks it repaired anything.
+ */
+int
+xfs_repair_attempt(
+ struct xfs_inode *ip,
+ struct xfs_scrub_context *sc,
+ bool *fixed)
+{
+ int error = 0;
+
+ trace_xfs_repair_attempt(ip, sc->sm, error);
+
+ xfs_scrub_ag_btcur_free(&sc->sa);
+
+ /* Repair whatever's broken. */
+ ASSERT(sc->ops->repair);
+ error = sc->ops->repair(sc);
+ trace_xfs_repair_done(ip, sc->sm, error);
+ switch (error) {
+ case 0:
+ /*
+ * Repair succeeded. Commit the fixes and perform a second
+ * scrub so that we can tell userspace if we fixed the problem.
+ */
+ sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+ *fixed = true;
+ return -EAGAIN;
+ case -EDEADLOCK:
+ case -EAGAIN:
+ /* Tell the caller to try again having grabbed all the locks. */
+ if (!sc->try_harder) {
+ sc->try_harder = true;
+ return -EAGAIN;
+ }
+ /*
+ * We tried harder but still couldn't grab all the resources
+ * we needed to fix it. The corruption has not been fixed,
+ * so report back to userspace.
+ */
+ return -EFSCORRUPTED;
+ default:
+ return error;
+ }
+}
+
+/*
+ * Complain about unfixable problems in the filesystem. We don't log
+ * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
+ * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
+ * administrator isn't running xfs_scrub in no-repairs mode.
+ *
+ * Use this helper function because _ratelimited silently declares a static
+ * structure to track rate limiting information.
+ */
+void
+xfs_repair_failure(
+ struct xfs_mount *mp)
+{
+ xfs_alert_ratelimited(mp,
+"Corruption not fixed during online repair. Unmount and run xfs_repair.");
+}
+
+/*
+ * Repair probe -- userspace uses this to probe if we're willing to repair a
+ * given mountpoint.
+ */
+int
+xfs_repair_probe(
+ struct xfs_scrub_context *sc)
+{
+ int error = 0;
+
+ if (xfs_scrub_should_terminate(sc, &error))
+ return error;
+
+ return 0;
+}
+
+/*
+ * Roll a transaction, keeping the AG headers locked and reinitializing
+ * the btree cursors.
+ */
+int
+xfs_repair_roll_ag_trans(
+ struct xfs_scrub_context *sc)
+{
+ int error;
+
+ /* Keep the AG header buffers locked so we can keep going. */
+ xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
+ xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
+ xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
+
+ /* Roll the transaction. */
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ goto out_release;
+
+ /* Join AG headers to the new transaction. */
+ xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
+ xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
+ xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
+
+ return 0;
+
+out_release:
+ /*
+ * Rolling failed, so release the hold on the buffers. The
+ * buffers will be released during teardown on our way out
+ * of the kernel.
+ */
+ xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
+ xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
+ xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
+
+ return error;
+}
+
+/*
+ * Does the given AG have enough space to rebuild a btree? Neither AG
+ * reservation can be critical, and we must have enough space (factoring
+ * in AG reservations) to construct a whole btree.
+ */
+bool
+xfs_repair_ag_has_space(
+ struct xfs_perag *pag,
+ xfs_extlen_t nr_blocks,
+ enum xfs_ag_resv_type type)
+{
+ return !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
+ !xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
+ pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
+}
+
+/*
+ * Figure out how many blocks to reserve for an AG repair. We calculate the
+ * worst case estimate for the number of blocks we'd need to rebuild one of
+ * any type of per-AG btree.
+ */
+xfs_extlen_t
+xfs_repair_calc_ag_resblks(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_scrub_metadata *sm = sc->sm;
+ struct xfs_perag *pag;
+ struct xfs_buf *bp;
+ xfs_agino_t icount = 0;
+ xfs_extlen_t aglen = 0;
+ xfs_extlen_t usedlen;
+ xfs_extlen_t freelen;
+ xfs_extlen_t bnobt_sz;
+ xfs_extlen_t inobt_sz;
+ xfs_extlen_t rmapbt_sz;
+ xfs_extlen_t refcbt_sz;
+ int error;
+
+ if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+ return 0;
+
+ /* Use in-core counters if possible. */
+ pag = xfs_perag_get(mp, sm->sm_agno);
+ if (pag->pagi_init)
+ icount = pag->pagi_count;
+
+ /*
+ * Otherwise try to get the actual counters from disk; if not, make
+ * some worst case assumptions.
+ */
+ if (icount == 0) {
+ error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp);
+ if (error) {
+ icount = mp->m_sb.sb_agblocks / mp->m_sb.sb_inopblock;
+ } else {
+ icount = pag->pagi_count;
+ xfs_buf_relse(bp);
+ }
+ }
+
+ /* Now grab the block counters from the AGF. */
+ error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
+ if (error) {
+ aglen = mp->m_sb.sb_agblocks;
+ freelen = aglen;
+ usedlen = aglen;
+ } else {
+ aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length);
+ freelen = pag->pagf_freeblks;
+ usedlen = aglen - freelen;
+ xfs_buf_relse(bp);
+ }
+ xfs_perag_put(pag);
+
+ trace_xfs_repair_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
+ freelen, usedlen);
+
+ /*
+ * Figure out how many blocks we'd need worst case to rebuild
+ * each type of btree. Note that we can only rebuild the
+ * bnobt/cntbt or inobt/finobt as pairs.
+ */
+ bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
+ if (xfs_sb_version_hassparseinodes(&mp->m_sb))
+ inobt_sz = xfs_iallocbt_calc_size(mp, icount /
+ XFS_INODES_PER_HOLEMASK_BIT);
+ else
+ inobt_sz = xfs_iallocbt_calc_size(mp, icount /
+ XFS_INODES_PER_CHUNK);
+ if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ inobt_sz *= 2;
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
+ else
+ refcbt_sz = 0;
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ /*
+ * Guess how many blocks we need to rebuild the rmapbt.
+ * For non-reflink filesystems we can't have more records than
+ * used blocks. However, with reflink it's possible to have
+ * more than one rmap record per AG block. We don't know how
+ * many rmaps there could be in the AG, so we start off with
+ * what we hope is an generous over-estimation.
+ */
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ rmapbt_sz = xfs_rmapbt_calc_size(mp,
+ (unsigned long long)aglen * 2);
+ else
+ rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
+ } else {
+ rmapbt_sz = 0;
+ }
+
+ trace_xfs_repair_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
+ inobt_sz, rmapbt_sz, refcbt_sz);
+
+ return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
+}
+
+/* Allocate a block in an AG. */
+int
+xfs_repair_alloc_ag_block(
+ struct xfs_scrub_context *sc,
+ struct xfs_owner_info *oinfo,
+ xfs_fsblock_t *fsbno,
+ enum xfs_ag_resv_type resv)
+{
+ struct xfs_alloc_arg args = {0};
+ xfs_agblock_t bno;
+ int error;
+
+ switch (resv) {
+ case XFS_AG_RESV_AGFL:
+ case XFS_AG_RESV_RMAPBT:
+ error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1);
+ if (error)
+ return error;
+ if (bno == NULLAGBLOCK)
+ return -ENOSPC;
+ xfs_extent_busy_reuse(sc->mp, sc->sa.agno, bno,
+ 1, false);
+ *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno);
+ if (resv == XFS_AG_RESV_RMAPBT)
+ xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.agno);
+ return 0;
+ default:
+ break;
+ }
+
+ args.tp = sc->tp;
+ args.mp = sc->mp;
+ args.oinfo = *oinfo;
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0);
+ args.minlen = 1;
+ args.maxlen = 1;
+ args.prod = 1;
+ args.type = XFS_ALLOCTYPE_THIS_AG;
+ args.resv = resv;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ return error;
+ if (args.fsbno == NULLFSBLOCK)
+ return -ENOSPC;
+ ASSERT(args.len == 1);
+ *fsbno = args.fsbno;
+
+ return 0;
+}
+
+/* Initialize a new AG btree root block with zero entries. */
+int
+xfs_repair_init_btblock(
+ struct xfs_scrub_context *sc,
+ xfs_fsblock_t fsb,
+ struct xfs_buf **bpp,
+ xfs_btnum_t btnum,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_trans *tp = sc->tp;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+
+ trace_xfs_repair_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
+ XFS_FSB_TO_AGBNO(mp, fsb), btnum);
+
+ ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno);
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb),
+ XFS_FSB_TO_BB(mp, 1), 0);
+ xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+ xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno, 0);
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
+ xfs_trans_log_buf(tp, bp, 0, bp->b_length);
+ bp->b_ops = ops;
+ *bpp = bp;
+
+ return 0;
+}
+
+/*
+ * Reconstructing per-AG Btrees
+ *
+ * When a space btree is corrupt, we don't bother trying to fix it. Instead,
+ * we scan secondary space metadata to derive the records that should be in
+ * the damaged btree, initialize a fresh btree root, and insert the records.
+ * Note that for rebuilding the rmapbt we scan all the primary data to
+ * generate the new records.
+ *
+ * However, that leaves the matter of removing all the metadata describing the
+ * old broken structure. For primary metadata we use the rmap data to collect
+ * every extent with a matching rmap owner (exlist); we then iterate all other
+ * metadata structures with the same rmap owner to collect the extents that
+ * cannot be removed (sublist). We then subtract sublist from exlist to
+ * derive the blocks that were used by the old btree. These blocks can be
+ * reaped.
+ *
+ * For rmapbt reconstructions we must use different tactics for extent
+ * collection. First we iterate all primary metadata (this excludes the old
+ * rmapbt, obviously) to generate new rmap records. The gaps in the rmap
+ * records are collected as exlist. The bnobt records are collected as
+ * sublist. As with the other btrees we subtract sublist from exlist, and the
+ * result (since the rmapbt lives in the free space) are the blocks from the
+ * old rmapbt.
+ */
+
+/* Collect a dead btree extent for later disposal. */
+int
+xfs_repair_collect_btree_extent(
+ struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist,
+ xfs_fsblock_t fsbno,
+ xfs_extlen_t len)
+{
+ struct xfs_repair_extent *rex;
+
+ trace_xfs_repair_collect_btree_extent(sc->mp,
+ XFS_FSB_TO_AGNO(sc->mp, fsbno),
+ XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
+
+ rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL);
+ if (!rex)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&rex->list);
+ rex->fsbno = fsbno;
+ rex->len = len;
+ list_add_tail(&rex->list, &exlist->list);
+
+ return 0;
+}
+
+/*
+ * An error happened during the rebuild so the transaction will be cancelled.
+ * The fs will shut down, and the administrator has to unmount and run repair.
+ * Therefore, free all the memory associated with the list so we can die.
+ */
+void
+xfs_repair_cancel_btree_extents(
+ struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist)
+{
+ struct xfs_repair_extent *rex;
+ struct xfs_repair_extent *n;
+
+ for_each_xfs_repair_extent_safe(rex, n, exlist) {
+ list_del(&rex->list);
+ kmem_free(rex);
+ }
+}
+
+/* Compare two btree extents. */
+static int
+xfs_repair_btree_extent_cmp(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_repair_extent *ap;
+ struct xfs_repair_extent *bp;
+
+ ap = container_of(a, struct xfs_repair_extent, list);
+ bp = container_of(b, struct xfs_repair_extent, list);
+
+ if (ap->fsbno > bp->fsbno)
+ return 1;
+ if (ap->fsbno < bp->fsbno)
+ return -1;
+ return 0;
+}
+
+/*
+ * Remove all the blocks mentioned in @sublist from the extents in @exlist.
+ *
+ * The intent is that callers will iterate the rmapbt for all of its records
+ * for a given owner to generate @exlist; and iterate all the blocks of the
+ * metadata structures that are not being rebuilt and have the same rmapbt
+ * owner to generate @sublist. This routine subtracts all the extents
+ * mentioned in sublist from all the extents linked in @exlist, which leaves
+ * @exlist as the list of blocks that are not accounted for, which we assume
+ * are the dead blocks of the old metadata structure. The blocks mentioned in
+ * @exlist can be reaped.
+ */
+#define LEFT_ALIGNED (1 << 0)
+#define RIGHT_ALIGNED (1 << 1)
+int
+xfs_repair_subtract_extents(
+ struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist,
+ struct xfs_repair_extent_list *sublist)
+{
+ struct list_head *lp;
+ struct xfs_repair_extent *ex;
+ struct xfs_repair_extent *newex;
+ struct xfs_repair_extent *subex;
+ xfs_fsblock_t sub_fsb;
+ xfs_extlen_t sub_len;
+ int state;
+ int error = 0;
+
+ if (list_empty(&exlist->list) || list_empty(&sublist->list))
+ return 0;
+ ASSERT(!list_empty(&sublist->list));
+
+ list_sort(NULL, &exlist->list, xfs_repair_btree_extent_cmp);
+ list_sort(NULL, &sublist->list, xfs_repair_btree_extent_cmp);
+
+ /*
+ * Now that we've sorted both lists, we iterate exlist once, rolling
+ * forward through sublist and/or exlist as necessary until we find an
+ * overlap or reach the end of either list. We do not reset lp to the
+ * head of exlist nor do we reset subex to the head of sublist. The
+ * list traversal is similar to merge sort, but we're deleting
+ * instead. In this manner we avoid O(n^2) operations.
+ */
+ subex = list_first_entry(&sublist->list, struct xfs_repair_extent,
+ list);
+ lp = exlist->list.next;
+ while (lp != &exlist->list) {
+ ex = list_entry(lp, struct xfs_repair_extent, list);
+
+ /*
+ * Advance subex and/or ex until we find a pair that
+ * intersect or we run out of extents.
+ */
+ while (subex->fsbno + subex->len <= ex->fsbno) {
+ if (list_is_last(&subex->list, &sublist->list))
+ goto out;
+ subex = list_next_entry(subex, list);
+ }
+ if (subex->fsbno >= ex->fsbno + ex->len) {
+ lp = lp->next;
+ continue;
+ }
+
+ /* trim subex to fit the extent we have */
+ sub_fsb = subex->fsbno;
+ sub_len = subex->len;
+ if (subex->fsbno < ex->fsbno) {
+ sub_len -= ex->fsbno - subex->fsbno;
+ sub_fsb = ex->fsbno;
+ }
+ if (sub_len > ex->len)
+ sub_len = ex->len;
+
+ state = 0;
+ if (sub_fsb == ex->fsbno)
+ state |= LEFT_ALIGNED;
+ if (sub_fsb + sub_len == ex->fsbno + ex->len)
+ state |= RIGHT_ALIGNED;
+ switch (state) {
+ case LEFT_ALIGNED:
+ /* Coincides with only the left. */
+ ex->fsbno += sub_len;
+ ex->len -= sub_len;
+ break;
+ case RIGHT_ALIGNED:
+ /* Coincides with only the right. */
+ ex->len -= sub_len;
+ lp = lp->next;
+ break;
+ case LEFT_ALIGNED | RIGHT_ALIGNED:
+ /* Total overlap, just delete ex. */
+ lp = lp->next;
+ list_del(&ex->list);
+ kmem_free(ex);
+ break;
+ case 0:
+ /*
+ * Deleting from the middle: add the new right extent
+ * and then shrink the left extent.
+ */
+ newex = kmem_alloc(sizeof(struct xfs_repair_extent),
+ KM_MAYFAIL);
+ if (!newex) {
+ error = -ENOMEM;
+ goto out;
+ }
+ INIT_LIST_HEAD(&newex->list);
+ newex->fsbno = sub_fsb + sub_len;
+ newex->len = ex->fsbno + ex->len - newex->fsbno;
+ list_add(&newex->list, &ex->list);
+ ex->len = sub_fsb - ex->fsbno;
+ lp = lp->next;
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+ }
+
+out:
+ return error;
+}
+#undef LEFT_ALIGNED
+#undef RIGHT_ALIGNED
+
+/*
+ * Disposal of Blocks from Old per-AG Btrees
+ *
+ * Now that we've constructed a new btree to replace the damaged one, we want
+ * to dispose of the blocks that (we think) the old btree was using.
+ * Previously, we used the rmapbt to collect the extents (exlist) with the
+ * rmap owner corresponding to the tree we rebuilt, collected extents for any
+ * blocks with the same rmap owner that are owned by another data structure
+ * (sublist), and subtracted sublist from exlist. In theory the extents
+ * remaining in exlist are the old btree's blocks.
+ *
+ * Unfortunately, it's possible that the btree was crosslinked with other
+ * blocks on disk. The rmap data can tell us if there are multiple owners, so
+ * if the rmapbt says there is an owner of this block other than @oinfo, then
+ * the block is crosslinked. Remove the reverse mapping and continue.
+ *
+ * If there is one rmap record, we can free the block, which removes the
+ * reverse mapping but doesn't add the block to the free space. Our repair
+ * strategy is to hope the other metadata objects crosslinked on this block
+ * will be rebuilt (atop different blocks), thereby removing all the cross
+ * links.
+ *
+ * If there are no rmap records at all, we also free the block. If the btree
+ * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
+ * supposed to be a rmap record and everything is ok. For other btrees there
+ * had to have been an rmap entry for the block to have ended up on @exlist,
+ * so if it's gone now there's something wrong and the fs will shut down.
+ *
+ * Note: If there are multiple rmap records with only the same rmap owner as
+ * the btree we're trying to rebuild and the block is indeed owned by another
+ * data structure with the same rmap owner, then the block will be in sublist
+ * and therefore doesn't need disposal. If there are multiple rmap records
+ * with only the same rmap owner but the block is not owned by something with
+ * the same rmap owner, the block will be freed.
+ *
+ * The caller is responsible for locking the AG headers for the entire rebuild
+ * operation so that nothing else can sneak in and change the AG state while
+ * we're not looking. We also assume that the caller already invalidated any
+ * buffers associated with @exlist.
+ */
+
+/*
+ * Invalidate buffers for per-AG btree blocks we're dumping. This function
+ * is not intended for use with file data repairs; we have bunmapi for that.
+ */
+int
+xfs_repair_invalidate_blocks(
+ struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist)
+{
+ struct xfs_repair_extent *rex;
+ struct xfs_repair_extent *n;
+ struct xfs_buf *bp;
+ xfs_fsblock_t fsbno;
+ xfs_agblock_t i;
+
+ /*
+ * For each block in each extent, see if there's an incore buffer for
+ * exactly that block; if so, invalidate it. The buffer cache only
+ * lets us look for one buffer at a time, so we have to look one block
+ * at a time. Avoid invalidating AG headers and post-EOFS blocks
+ * because we never own those; and if we can't TRYLOCK the buffer we
+ * assume it's owned by someone else.
+ */
+ for_each_xfs_repair_extent_safe(rex, n, exlist) {
+ for (fsbno = rex->fsbno, i = rex->len; i > 0; fsbno++, i--) {
+ /* Skip AG headers and post-EOFS blocks */
+ if (!xfs_verify_fsbno(sc->mp, fsbno))
+ continue;
+ bp = xfs_buf_incore(sc->mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(sc->mp, fsbno),
+ XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK);
+ if (bp) {
+ xfs_trans_bjoin(sc->tp, bp);
+ xfs_trans_binval(sc->tp, bp);
+ }
+ }
+ }
+
+ return 0;
+}
+
+/* Ensure the freelist is the correct size. */
+int
+xfs_repair_fix_freelist(
+ struct xfs_scrub_context *sc,
+ bool can_shrink)
+{
+ struct xfs_alloc_arg args = {0};
+
+ args.mp = sc->mp;
+ args.tp = sc->tp;
+ args.agno = sc->sa.agno;
+ args.alignment = 1;
+ args.pag = sc->sa.pag;
+
+ return xfs_alloc_fix_freelist(&args,
+ can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
+}
+
+/*
+ * Put a block back on the AGFL.
+ */
+STATIC int
+xfs_repair_put_freelist(
+ struct xfs_scrub_context *sc,
+ xfs_agblock_t agbno)
+{
+ struct xfs_owner_info oinfo;
+ int error;
+
+ /* Make sure there's space on the freelist. */
+ error = xfs_repair_fix_freelist(sc, true);
+ if (error)
+ return error;
+
+ /*
+ * Since we're "freeing" a lost block onto the AGFL, we have to
+ * create an rmap for the block prior to merging it or else other
+ * parts will break.
+ */
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+ error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
+ &oinfo);
+ if (error)
+ return error;
+
+ /* Put the block on the AGFL. */
+ error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
+ agbno, 0);
+ if (error)
+ return error;
+ xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1,
+ XFS_EXTENT_BUSY_SKIP_DISCARD);
+
+ return 0;
+}
+
+/* Dispose of a single metadata block. */
+STATIC int
+xfs_repair_dispose_btree_block(
+ struct xfs_scrub_context *sc,
+ xfs_fsblock_t fsbno,
+ struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type resv)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agf_bp = NULL;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ bool has_other_rmap;
+ int error;
+
+ agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
+ agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
+
+ /*
+ * If we are repairing per-inode metadata, we need to read in the AGF
+ * buffer. Otherwise, we're repairing a per-AG structure, so reuse
+ * the AGF buffer that the setup functions already grabbed.
+ */
+ if (sc->ip) {
+ error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp);
+ if (error)
+ return error;
+ if (!agf_bp)
+ return -ENOMEM;
+ } else {
+ agf_bp = sc->sa.agf_bp;
+ }
+ cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno);
+
+ /* Can we find any other rmappings? */
+ error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
+ if (error)
+ goto out_cur;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+ /*
+ * If there are other rmappings, this block is cross linked and must
+ * not be freed. Remove the reverse mapping and move on. Otherwise,
+ * we were the only owner of the block, so free the extent, which will
+ * also remove the rmap.
+ *
+ * XXX: XFS doesn't support detecting the case where a single block
+ * metadata structure is crosslinked with a multi-block structure
+ * because the buffer cache doesn't detect aliasing problems, so we
+ * can't fix 100% of crosslinking problems (yet). The verifiers will
+ * blow on writeout, the filesystem will shut down, and the admin gets
+ * to run xfs_repair.
+ */
+ if (has_other_rmap)
+ error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1, oinfo);
+ else if (resv == XFS_AG_RESV_AGFL)
+ error = xfs_repair_put_freelist(sc, agbno);
+ else
+ error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
+ if (agf_bp != sc->sa.agf_bp)
+ xfs_trans_brelse(sc->tp, agf_bp);
+ if (error)
+ return error;
+
+ if (sc->ip)
+ return xfs_trans_roll_inode(&sc->tp, sc->ip);
+ return xfs_repair_roll_ag_trans(sc);
+
+out_cur:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ if (agf_bp != sc->sa.agf_bp)
+ xfs_trans_brelse(sc->tp, agf_bp);
+ return error;
+}
+
+/* Dispose of btree blocks from an old per-AG btree. */
+int
+xfs_repair_reap_btree_extents(
+ struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist,
+ struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type)
+{
+ struct xfs_repair_extent *rex;
+ struct xfs_repair_extent *n;
+ int error = 0;
+
+ ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb));
+
+ /* Dispose of every block from the old btree. */
+ for_each_xfs_repair_extent_safe(rex, n, exlist) {
+ ASSERT(sc->ip != NULL ||
+ XFS_FSB_TO_AGNO(sc->mp, rex->fsbno) == sc->sa.agno);
+
+ trace_xfs_repair_dispose_btree_extent(sc->mp,
+ XFS_FSB_TO_AGNO(sc->mp, rex->fsbno),
+ XFS_FSB_TO_AGBNO(sc->mp, rex->fsbno), rex->len);
+
+ for (; rex->len > 0; rex->len--, rex->fsbno++) {
+ error = xfs_repair_dispose_btree_block(sc, rex->fsbno,
+ oinfo, type);
+ if (error)
+ goto out;
+ }
+ list_del(&rex->list);
+ kmem_free(rex);
+ }
+
+out:
+ xfs_repair_cancel_btree_extents(sc, exlist);
+ return error;
+}
+
+/*
+ * Finding per-AG Btree Roots for AGF/AGI Reconstruction
+ *
+ * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild
+ * the AG headers by using the rmap data to rummage through the AG looking for
+ * btree roots. This is not guaranteed to work if the AG is heavily damaged
+ * or the rmap data are corrupt.
+ *
+ * Callers of xfs_repair_find_ag_btree_roots must lock the AGF and AGFL
+ * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the
+ * AGI is being rebuilt. It must maintain these locks until it's safe for
+ * other threads to change the btrees' shapes. The caller provides
+ * information about the btrees to look for by passing in an array of
+ * xfs_repair_find_ag_btree with the (rmap owner, buf_ops, magic) fields set.
+ * The (root, height) fields will be set on return if anything is found. The
+ * last element of the array should have a NULL buf_ops to mark the end of the
+ * array.
+ *
+ * For every rmapbt record matching any of the rmap owners in btree_info,
+ * read each block referenced by the rmap record. If the block is a btree
+ * block from this filesystem matching any of the magic numbers and has a
+ * level higher than what we've already seen, remember the block and the
+ * height of the tree required to have such a block. When the call completes,
+ * we return the highest block we've found for each btree description; those
+ * should be the roots.
+ */
+
+struct xfs_repair_findroot {
+ struct xfs_scrub_context *sc;
+ struct xfs_buf *agfl_bp;
+ struct xfs_agf *agf;
+ struct xfs_repair_find_ag_btree *btree_info;
+};
+
+/* See if our block is in the AGFL. */
+STATIC int
+xfs_repair_findroot_agfl_walk(
+ struct xfs_mount *mp,
+ xfs_agblock_t bno,
+ void *priv)
+{
+ xfs_agblock_t *agbno = priv;
+
+ return (*agbno == bno) ? XFS_BTREE_QUERY_RANGE_ABORT : 0;
+}
+
+/* Does this block match the btree information passed in? */
+STATIC int
+xfs_repair_findroot_block(
+ struct xfs_repair_findroot *ri,
+ struct xfs_repair_find_ag_btree *fab,
+ uint64_t owner,
+ xfs_agblock_t agbno,
+ bool *found_it)
+{
+ struct xfs_mount *mp = ri->sc->mp;
+ struct xfs_buf *bp;
+ struct xfs_btree_block *btblock;
+ xfs_daddr_t daddr;
+ int error;
+
+ daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno);
+
+ /*
+ * Blocks in the AGFL have stale contents that might just happen to
+ * have a matching magic and uuid. We don't want to pull these blocks
+ * in as part of a tree root, so we have to filter out the AGFL stuff
+ * here. If the AGFL looks insane we'll just refuse to repair.
+ */
+ if (owner == XFS_RMAP_OWN_AG) {
+ error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
+ xfs_repair_findroot_agfl_walk, &agbno);
+ if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ return 0;
+ if (error)
+ return error;
+ }
+
+ error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
+ mp->m_bsize, 0, &bp, NULL);
+ if (error)
+ return error;
+
+ /*
+ * Does this look like a block matching our fs and higher than any
+ * other block we've found so far? If so, reattach buffer verifiers
+ * so the AIL won't complain if the buffer is also dirty.
+ */
+ btblock = XFS_BUF_TO_BLOCK(bp);
+ if (be32_to_cpu(btblock->bb_magic) != fab->magic)
+ goto out;
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
+ goto out;
+ bp->b_ops = fab->buf_ops;
+
+ /* Ignore this block if it's lower in the tree than we've seen. */
+ if (fab->root != NULLAGBLOCK &&
+ xfs_btree_get_level(btblock) < fab->height)
+ goto out;
+
+ /* Make sure we pass the verifiers. */
+ bp->b_ops->verify_read(bp);
+ if (bp->b_error)
+ goto out;
+ fab->root = agbno;
+ fab->height = xfs_btree_get_level(btblock) + 1;
+ *found_it = true;
+
+ trace_xfs_repair_findroot_block(mp, ri->sc->sa.agno, agbno,
+ be32_to_cpu(btblock->bb_magic), fab->height - 1);
+out:
+ xfs_trans_brelse(ri->sc->tp, bp);
+ return error;
+}
+
+/*
+ * Do any of the blocks in this rmap record match one of the btrees we're
+ * looking for?
+ */
+STATIC int
+xfs_repair_findroot_rmap(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_repair_findroot *ri = priv;
+ struct xfs_repair_find_ag_btree *fab;
+ xfs_agblock_t b;
+ bool found_it;
+ int error = 0;
+
+ /* Ignore anything that isn't AG metadata. */
+ if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
+ return 0;
+
+ /* Otherwise scan each block + btree type. */
+ for (b = 0; b < rec->rm_blockcount; b++) {
+ found_it = false;
+ for (fab = ri->btree_info; fab->buf_ops; fab++) {
+ if (rec->rm_owner != fab->rmap_owner)
+ continue;
+ error = xfs_repair_findroot_block(ri, fab,
+ rec->rm_owner, rec->rm_startblock + b,
+ &found_it);
+ if (error)
+ return error;
+ if (found_it)
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/* Find the roots of the per-AG btrees described in btree_info. */
+int
+xfs_repair_find_ag_btree_roots(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *agf_bp,
+ struct xfs_repair_find_ag_btree *btree_info,
+ struct xfs_buf *agfl_bp)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_repair_findroot ri;
+ struct xfs_repair_find_ag_btree *fab;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ ASSERT(xfs_buf_islocked(agf_bp));
+ ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp));
+
+ ri.sc = sc;
+ ri.btree_info = btree_info;
+ ri.agf = XFS_BUF_TO_AGF(agf_bp);
+ ri.agfl_bp = agfl_bp;
+ for (fab = btree_info; fab->buf_ops; fab++) {
+ ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
+ ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner));
+ fab->root = NULLAGBLOCK;
+ fab->height = 0;
+ }
+
+ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
+ error = xfs_rmap_query_all(cur, xfs_repair_findroot_rmap, &ri);
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+
+ return error;
+}
+
+/* Force a quotacheck the next time we mount. */
+void
+xfs_repair_force_quotacheck(
+ struct xfs_scrub_context *sc,
+ uint dqtype)
+{
+ uint flag;
+
+ flag = xfs_quota_chkd_flag(dqtype);
+ if (!(flag & sc->mp->m_qflags))
+ return;
+
+ sc->mp->m_qflags &= ~flag;
+ spin_lock(&sc->mp->m_sb_lock);
+ sc->mp->m_sb.sb_qflags &= ~flag;
+ spin_unlock(&sc->mp->m_sb_lock);
+ xfs_log_sb(sc->tp);
+}
+
+/*
+ * Attach dquots to this inode, or schedule quotacheck to fix them.
+ *
+ * This function ensures that the appropriate dquots are attached to an inode.
+ * We cannot allow the dquot code to allocate an on-disk dquot block here
+ * because we're already in transaction context with the inode locked. The
+ * on-disk dquot should already exist anyway. If the quota code signals
+ * corruption or missing quota information, schedule quotacheck, which will
+ * repair corruptions in the quota metadata.
+ */
+int
+xfs_repair_ino_dqattach(
+ struct xfs_scrub_context *sc)
+{
+ int error;
+
+ error = xfs_qm_dqattach_locked(sc->ip, false);
+ switch (error) {
+ case -EFSBADCRC:
+ case -EFSCORRUPTED:
+ case -ENOENT:
+ xfs_err_ratelimited(sc->mp,
+"inode %llu repair encountered quota error %d, quotacheck forced.",
+ (unsigned long long)sc->ip->i_ino, error);
+ if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
+ xfs_repair_force_quotacheck(sc, XFS_DQ_USER);
+ if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
+ xfs_repair_force_quotacheck(sc, XFS_DQ_GROUP);
+ if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
+ xfs_repair_force_quotacheck(sc, XFS_DQ_PROJ);
+ /* fall through */
+ case -ESRCH:
+ error = 0;
+ break;
+ default:
+ break;
+ }
+
+ return error;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
new file mode 100644
index 000000000000..f2b0895294db
--- /dev/null
+++ b/fs/xfs/scrub/repair.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_SCRUB_REPAIR_H__
+#define __XFS_SCRUB_REPAIR_H__
+
+static inline int xfs_repair_notsupported(struct xfs_scrub_context *sc)
+{
+ return -EOPNOTSUPP;
+}
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+
+/* Repair helpers */
+
+int xfs_repair_attempt(struct xfs_inode *ip, struct xfs_scrub_context *sc,
+ bool *fixed);
+void xfs_repair_failure(struct xfs_mount *mp);
+int xfs_repair_roll_ag_trans(struct xfs_scrub_context *sc);
+bool xfs_repair_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
+ enum xfs_ag_resv_type type);
+xfs_extlen_t xfs_repair_calc_ag_resblks(struct xfs_scrub_context *sc);
+int xfs_repair_alloc_ag_block(struct xfs_scrub_context *sc,
+ struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno,
+ enum xfs_ag_resv_type resv);
+int xfs_repair_init_btblock(struct xfs_scrub_context *sc, xfs_fsblock_t fsb,
+ struct xfs_buf **bpp, xfs_btnum_t btnum,
+ const struct xfs_buf_ops *ops);
+
+struct xfs_repair_extent {
+ struct list_head list;
+ xfs_fsblock_t fsbno;
+ xfs_extlen_t len;
+};
+
+struct xfs_repair_extent_list {
+ struct list_head list;
+};
+
+static inline void
+xfs_repair_init_extent_list(
+ struct xfs_repair_extent_list *exlist)
+{
+ INIT_LIST_HEAD(&exlist->list);
+}
+
+#define for_each_xfs_repair_extent_safe(rbe, n, exlist) \
+ list_for_each_entry_safe((rbe), (n), &(exlist)->list, list)
+int xfs_repair_collect_btree_extent(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *btlist, xfs_fsblock_t fsbno,
+ xfs_extlen_t len);
+void xfs_repair_cancel_btree_extents(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *btlist);
+int xfs_repair_subtract_extents(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist,
+ struct xfs_repair_extent_list *sublist);
+int xfs_repair_fix_freelist(struct xfs_scrub_context *sc, bool can_shrink);
+int xfs_repair_invalidate_blocks(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *btlist);
+int xfs_repair_reap_btree_extents(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist,
+ struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
+
+struct xfs_repair_find_ag_btree {
+ /* in: rmap owner of the btree we're looking for */
+ uint64_t rmap_owner;
+
+ /* in: buffer ops */
+ const struct xfs_buf_ops *buf_ops;
+
+ /* in: magic number of the btree */
+ uint32_t magic;
+
+ /* out: the highest btree block found and the tree height */
+ xfs_agblock_t root;
+ unsigned int height;
+};
+
+int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc,
+ struct xfs_buf *agf_bp,
+ struct xfs_repair_find_ag_btree *btree_info,
+ struct xfs_buf *agfl_bp);
+void xfs_repair_force_quotacheck(struct xfs_scrub_context *sc, uint dqtype);
+int xfs_repair_ino_dqattach(struct xfs_scrub_context *sc);
+
+/* Metadata repairers */
+
+int xfs_repair_probe(struct xfs_scrub_context *sc);
+int xfs_repair_superblock(struct xfs_scrub_context *sc);
+
+#else
+
+static inline int xfs_repair_attempt(
+ struct xfs_inode *ip,
+ struct xfs_scrub_context *sc,
+ bool *fixed)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void xfs_repair_failure(struct xfs_mount *mp) {}
+
+static inline xfs_extlen_t
+xfs_repair_calc_ag_resblks(
+ struct xfs_scrub_context *sc)
+{
+ ASSERT(!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR));
+ return 0;
+}
+
+#define xfs_repair_probe xfs_repair_notsupported
+#define xfs_repair_superblock xfs_repair_notsupported
+
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+#endif /* __XFS_SCRUB_REPAIR_H__ */
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index 8f2a7c3ff455..b376a9a77c04 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -66,7 +66,7 @@ xfs_scrub_rmapbt_xref_refc(
bool is_unwritten;
int error;
- if (!sc->sa.refc_cur)
+ if (!sc->sa.refc_cur || xfs_scrub_skip_xref(sc->sm))
return;
non_inode = XFS_RMAP_NON_INODE_OWNER(irec->rm_owner);
@@ -207,7 +207,7 @@ xfs_scrub_xref_check_owner(
bool has_rmap;
int error;
- if (!sc->sa.rmap_cur)
+ if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm))
return;
error = xfs_rmap_record_exists(sc->sa.rmap_cur, bno, len, oinfo,
@@ -250,7 +250,7 @@ xfs_scrub_xref_has_no_owner(
bool has_rmap;
int error;
- if (!sc->sa.rmap_cur)
+ if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm))
return;
error = xfs_rmap_has_record(sc->sa.rmap_cur, bno, len, &has_rmap);
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 39c41dfe08ee..40f462a11ea5 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -66,11 +66,15 @@ xfs_scrub_rtbitmap_rec(
void *priv)
{
struct xfs_scrub_context *sc = priv;
+ xfs_rtblock_t startblock;
+ xfs_rtblock_t blockcount;
- if (rec->ar_startblock + rec->ar_blockcount <= rec->ar_startblock ||
- !xfs_verify_rtbno(sc->mp, rec->ar_startblock) ||
- !xfs_verify_rtbno(sc->mp, rec->ar_startblock +
- rec->ar_blockcount - 1))
+ startblock = rec->ar_startext * tp->t_mountp->m_sb.sb_rextsize;
+ blockcount = rec->ar_extcount * tp->t_mountp->m_sb.sb_rextsize;
+
+ if (startblock + blockcount <= startblock ||
+ !xfs_verify_rtbno(sc->mp, startblock) ||
+ !xfs_verify_rtbno(sc->mp, startblock + blockcount - 1))
xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
return 0;
}
@@ -82,6 +86,11 @@ xfs_scrub_rtbitmap(
{
int error;
+ /* Invoke the fork scrubber. */
+ error = xfs_scrub_metadata_inode_forks(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
error = xfs_rtalloc_query_all(sc->tp, xfs_scrub_rtbitmap_rec, sc);
if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
goto out;
@@ -95,8 +104,35 @@ int
xfs_scrub_rtsummary(
struct xfs_scrub_context *sc)
{
+ struct xfs_inode *rsumip = sc->mp->m_rsumip;
+ struct xfs_inode *old_ip = sc->ip;
+ uint old_ilock_flags = sc->ilock_flags;
+ int error = 0;
+
+ /*
+ * We ILOCK'd the rt bitmap ip in the setup routine, now lock the
+ * rt summary ip in compliance with the rt inode locking rules.
+ *
+ * Since we switch sc->ip to rsumip we have to save the old ilock
+ * flags so that we don't mix up the inode state that @sc tracks.
+ */
+ sc->ip = rsumip;
+ sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+
+ /* Invoke the fork scrubber. */
+ error = xfs_scrub_metadata_inode_forks(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ goto out;
+
/* XXX: implement this some day */
- return -ENOENT;
+ xfs_scrub_set_incomplete(sc);
+out:
+ /* Switch back to the rtbitmap inode and lock flags. */
+ xfs_iunlock(sc->ip, sc->ilock_flags);
+ sc->ilock_flags = old_ilock_flags;
+ sc->ip = old_ip;
+ return error;
}
@@ -107,11 +143,23 @@ xfs_scrub_xref_is_used_rt_space(
xfs_rtblock_t fsbno,
xfs_extlen_t len)
{
+ xfs_rtblock_t startext;
+ xfs_rtblock_t endext;
+ xfs_rtblock_t extcount;
bool is_free;
int error;
+ if (xfs_scrub_skip_xref(sc->sm))
+ return;
+
+ startext = fsbno;
+ endext = fsbno + len - 1;
+ do_div(startext, sc->mp->m_sb.sb_rextsize);
+ if (do_div(endext, sc->mp->m_sb.sb_rextsize))
+ endext++;
+ extcount = endext - startext;
xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
- error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, fsbno, len,
+ error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount,
&is_free);
if (!xfs_scrub_should_check_xref(sc, &error, NULL))
goto out_unlock;
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 26c75967a072..36db098ba583 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -42,11 +42,18 @@
#include "xfs_refcount_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_log.h"
+#include "xfs_trans_priv.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/btree.h"
+#include "scrub/repair.h"
/*
* Online Scrub and Repair
@@ -120,6 +127,24 @@
* XCORRUPT flag; btree query function errors are noted by setting the
* XFAIL flag and deleting the cursor to prevent further attempts to
* cross-reference with a defective btree.
+ *
+ * If a piece of metadata proves corrupt or suboptimal, the userspace
+ * program can ask the kernel to apply some tender loving care (TLC) to
+ * the metadata object by setting the REPAIR flag and re-calling the
+ * scrub ioctl. "Corruption" is defined by metadata violating the
+ * on-disk specification; operations cannot continue if the violation is
+ * left untreated. It is possible for XFS to continue if an object is
+ * "suboptimal", however performance may be degraded. Repairs are
+ * usually performed by rebuilding the metadata entirely out of
+ * redundant metadata. Optimizing, on the other hand, can sometimes be
+ * done without rebuilding entire structures.
+ *
+ * Generally speaking, the repair code has the following code structure:
+ * Lock -> scrub -> repair -> commit -> re-lock -> re-scrub -> unlock.
+ * The first check helps us figure out if we need to rebuild or simply
+ * optimize the structure so that the rebuild knows what to do. The
+ * second check evaluates the completeness of the repair; that is what
+ * is reported to userspace.
*/
/*
@@ -155,7 +180,10 @@ xfs_scrub_teardown(
{
xfs_scrub_ag_free(sc, &sc->sa);
if (sc->tp) {
- xfs_trans_cancel(sc->tp);
+ if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+ error = xfs_trans_commit(sc->tp);
+ else
+ xfs_trans_cancel(sc->tp);
sc->tp = NULL;
}
if (sc->ip) {
@@ -166,6 +194,8 @@ xfs_scrub_teardown(
iput(VFS_I(sc->ip));
sc->ip = NULL;
}
+ if (sc->has_quotaofflock)
+ mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
if (sc->buf) {
kmem_free(sc->buf);
sc->buf = NULL;
@@ -180,126 +210,150 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
.type = ST_NONE,
.setup = xfs_scrub_setup_fs,
.scrub = xfs_scrub_probe,
+ .repair = xfs_repair_probe,
},
[XFS_SCRUB_TYPE_SB] = { /* superblock */
.type = ST_PERAG,
.setup = xfs_scrub_setup_fs,
.scrub = xfs_scrub_superblock,
+ .repair = xfs_repair_superblock,
},
[XFS_SCRUB_TYPE_AGF] = { /* agf */
.type = ST_PERAG,
.setup = xfs_scrub_setup_fs,
.scrub = xfs_scrub_agf,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_AGFL]= { /* agfl */
.type = ST_PERAG,
.setup = xfs_scrub_setup_fs,
.scrub = xfs_scrub_agfl,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_AGI] = { /* agi */
.type = ST_PERAG,
.setup = xfs_scrub_setup_fs,
.scrub = xfs_scrub_agi,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */
.type = ST_PERAG,
.setup = xfs_scrub_setup_ag_allocbt,
.scrub = xfs_scrub_bnobt,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */
.type = ST_PERAG,
.setup = xfs_scrub_setup_ag_allocbt,
.scrub = xfs_scrub_cntbt,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_INOBT] = { /* inobt */
.type = ST_PERAG,
.setup = xfs_scrub_setup_ag_iallocbt,
.scrub = xfs_scrub_inobt,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_FINOBT] = { /* finobt */
.type = ST_PERAG,
.setup = xfs_scrub_setup_ag_iallocbt,
.scrub = xfs_scrub_finobt,
.has = xfs_sb_version_hasfinobt,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */
.type = ST_PERAG,
.setup = xfs_scrub_setup_ag_rmapbt,
.scrub = xfs_scrub_rmapbt,
.has = xfs_sb_version_hasrmapbt,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */
.type = ST_PERAG,
.setup = xfs_scrub_setup_ag_refcountbt,
.scrub = xfs_scrub_refcountbt,
.has = xfs_sb_version_hasreflink,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_INODE] = { /* inode record */
.type = ST_INODE,
.setup = xfs_scrub_setup_inode,
.scrub = xfs_scrub_inode,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */
.type = ST_INODE,
.setup = xfs_scrub_setup_inode_bmap,
.scrub = xfs_scrub_bmap_data,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */
.type = ST_INODE,
.setup = xfs_scrub_setup_inode_bmap,
.scrub = xfs_scrub_bmap_attr,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */
.type = ST_INODE,
.setup = xfs_scrub_setup_inode_bmap,
.scrub = xfs_scrub_bmap_cow,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_DIR] = { /* directory */
.type = ST_INODE,
.setup = xfs_scrub_setup_directory,
.scrub = xfs_scrub_directory,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */
.type = ST_INODE,
.setup = xfs_scrub_setup_xattr,
.scrub = xfs_scrub_xattr,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */
.type = ST_INODE,
.setup = xfs_scrub_setup_symlink,
.scrub = xfs_scrub_symlink,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */
.type = ST_INODE,
.setup = xfs_scrub_setup_parent,
.scrub = xfs_scrub_parent,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
.type = ST_FS,
.setup = xfs_scrub_setup_rt,
.scrub = xfs_scrub_rtbitmap,
.has = xfs_sb_version_hasrealtime,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
.type = ST_FS,
.setup = xfs_scrub_setup_rt,
.scrub = xfs_scrub_rtsummary,
.has = xfs_sb_version_hasrealtime,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */
.type = ST_FS,
.setup = xfs_scrub_setup_quota,
.scrub = xfs_scrub_quota,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */
.type = ST_FS,
.setup = xfs_scrub_setup_quota,
.scrub = xfs_scrub_quota,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */
.type = ST_FS,
.setup = xfs_scrub_setup_quota,
.scrub = xfs_scrub_quota,
+ .repair = xfs_repair_notsupported,
},
};
@@ -379,15 +433,54 @@ xfs_scrub_validate_inputs(
if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
goto out;
- /* We don't know how to repair anything yet. */
- if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
- goto out;
+ /*
+ * We only want to repair read-write v5+ filesystems. Defer the check
+ * for ops->repair until after our scrub confirms that we need to
+ * perform repairs so that we avoid failing due to not supporting
+ * repairing an object that doesn't need repairs.
+ */
+ if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
+ error = -EOPNOTSUPP;
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ goto out;
+
+ error = -EROFS;
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ goto out;
+ }
error = 0;
out:
return error;
}
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+static inline void xfs_scrub_postmortem(struct xfs_scrub_context *sc)
+{
+ /*
+ * Userspace asked us to repair something, we repaired it, rescanned
+ * it, and the rescan says it's still broken. Scream about this in
+ * the system logs.
+ */
+ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
+ (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT)))
+ xfs_repair_failure(sc->mp);
+}
+#else
+static inline void xfs_scrub_postmortem(struct xfs_scrub_context *sc)
+{
+ /*
+ * Userspace asked us to scrub something, it's broken, and we have no
+ * way of fixing it. Scream in the logs.
+ */
+ if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT))
+ xfs_alert_ratelimited(sc->mp,
+ "Corruption detected during scrub.");
+}
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
/* Dispatch metadata scrubbing. */
int
xfs_scrub_metadata(
@@ -397,6 +490,7 @@ xfs_scrub_metadata(
struct xfs_scrub_context sc;
struct xfs_mount *mp = ip->i_mount;
bool try_harder = false;
+ bool already_fixed = false;
int error = 0;
BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
@@ -446,10 +540,44 @@ retry_op:
} else if (error)
goto out_teardown;
- if (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
- XFS_SCRUB_OFLAG_XCORRUPT))
- xfs_alert_ratelimited(mp, "Corruption detected during scrub.");
+ if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !already_fixed) {
+ bool needs_fix;
+
+ /* Let debug users force us into the repair routines. */
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
+ sc.sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+
+ needs_fix = (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT |
+ XFS_SCRUB_OFLAG_PREEN));
+ /*
+ * If userspace asked for a repair but it wasn't necessary,
+ * report that back to userspace.
+ */
+ if (!needs_fix) {
+ sc.sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
+ goto out_nofix;
+ }
+
+ /*
+ * If it's broken, userspace wants us to fix it, and we haven't
+ * already tried to fix it, then attempt a repair.
+ */
+ error = xfs_repair_attempt(ip, &sc, &already_fixed);
+ if (error == -EAGAIN) {
+ if (sc.try_harder)
+ try_harder = true;
+ error = xfs_scrub_teardown(&sc, ip, 0);
+ if (error) {
+ xfs_repair_failure(mp);
+ goto out;
+ }
+ goto retry_op;
+ }
+ }
+out_nofix:
+ xfs_scrub_postmortem(&sc);
out_teardown:
error = xfs_scrub_teardown(&sc, ip, error);
out:
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 0d92af86f67a..636424d5e2ee 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -38,6 +38,9 @@ struct xfs_scrub_meta_ops {
/* Examine metadata for errors. */
int (*scrub)(struct xfs_scrub_context *);
+ /* Repair or optimize the metadata. */
+ int (*repair)(struct xfs_scrub_context *);
+
/* Decide if we even have this piece of metadata. */
bool (*has)(struct xfs_sb *);
@@ -48,6 +51,7 @@ struct xfs_scrub_meta_ops {
/* Buffer pointers and btree cursors for an entire AG. */
struct xfs_scrub_ag {
xfs_agnumber_t agno;
+ struct xfs_perag *pag;
/* AG btree roots */
struct xfs_buf *agf_bp;
@@ -73,6 +77,7 @@ struct xfs_scrub_context {
void *buf;
uint ilock_flags;
bool try_harder;
+ bool has_quotaofflock;
/* State tracking for single-AG operations. */
struct xfs_scrub_ag sa;
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 5d2b1c241be5..794d56bb1af8 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -69,6 +69,8 @@ DEFINE_EVENT(xfs_scrub_class, name, \
DEFINE_SCRUB_EVENT(xfs_scrub_start);
DEFINE_SCRUB_EVENT(xfs_scrub_done);
DEFINE_SCRUB_EVENT(xfs_scrub_deadlock_retry);
+DEFINE_SCRUB_EVENT(xfs_repair_attempt);
+DEFINE_SCRUB_EVENT(xfs_repair_done);
TRACE_EVENT(xfs_scrub_op_error,
TP_PROTO(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
@@ -492,6 +494,262 @@ TRACE_EVENT(xfs_scrub_xref_error,
__entry->ret_ip)
);
+/* repair tracepoints */
+#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
+
+DECLARE_EVENT_CLASS(xfs_repair_extent_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(mp, agno, agbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len)
+);
+#define DEFINE_REPAIR_EXTENT_EVENT(name) \
+DEFINE_EVENT(xfs_repair_extent_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len), \
+ TP_ARGS(mp, agno, agbno, len))
+DEFINE_REPAIR_EXTENT_EVENT(xfs_repair_dispose_btree_extent);
+DEFINE_REPAIR_EXTENT_EVENT(xfs_repair_collect_btree_extent);
+DEFINE_REPAIR_EXTENT_EVENT(xfs_repair_agfl_insert);
+
+DECLARE_EVENT_CLASS(xfs_repair_rmap_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len,
+ uint64_t owner, uint64_t offset, unsigned int flags),
+ TP_ARGS(mp, agno, agbno, len, owner, offset, flags),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->owner = owner;
+ __entry->offset = offset;
+ __entry->flags = flags;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+);
+#define DEFINE_REPAIR_RMAP_EVENT(name) \
+DEFINE_EVENT(xfs_repair_rmap_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len, \
+ uint64_t owner, uint64_t offset, unsigned int flags), \
+ TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
+DEFINE_REPAIR_RMAP_EVENT(xfs_repair_alloc_extent_fn);
+DEFINE_REPAIR_RMAP_EVENT(xfs_repair_ialloc_extent_fn);
+DEFINE_REPAIR_RMAP_EVENT(xfs_repair_rmap_extent_fn);
+DEFINE_REPAIR_RMAP_EVENT(xfs_repair_bmap_extent_fn);
+
+TRACE_EVENT(xfs_repair_refcount_extent_fn,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ struct xfs_refcount_irec *irec),
+ TP_ARGS(mp, agno, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, startblock)
+ __field(xfs_extlen_t, blockcount)
+ __field(xfs_nlink_t, refcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startblock = irec->rc_startblock;
+ __entry->blockcount = irec->rc_blockcount;
+ __entry->refcount = irec->rc_refcount;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startblock,
+ __entry->blockcount,
+ __entry->refcount)
+)
+
+TRACE_EVENT(xfs_repair_init_btblock,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+ xfs_btnum_t btnum),
+ TP_ARGS(mp, agno, agbno, btnum),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(uint32_t, btnum)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->btnum = btnum;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u btnum %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->btnum)
+)
+TRACE_EVENT(xfs_repair_findroot_block,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+ uint32_t magic, uint16_t level),
+ TP_ARGS(mp, agno, agbno, magic, level),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(uint32_t, magic)
+ __field(uint16_t, level)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->magic = magic;
+ __entry->level = level;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u magic 0x%x level %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->magic,
+ __entry->level)
+)
+TRACE_EVENT(xfs_repair_calc_ag_resblks,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t icount, xfs_agblock_t aglen, xfs_agblock_t freelen,
+ xfs_agblock_t usedlen),
+ TP_ARGS(mp, agno, icount, aglen, freelen, usedlen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, icount)
+ __field(xfs_agblock_t, aglen)
+ __field(xfs_agblock_t, freelen)
+ __field(xfs_agblock_t, usedlen)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->icount = icount;
+ __entry->aglen = aglen;
+ __entry->freelen = freelen;
+ __entry->usedlen = usedlen;
+ ),
+ TP_printk("dev %d:%d agno %d icount %u aglen %u freelen %u usedlen %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->icount,
+ __entry->aglen,
+ __entry->freelen,
+ __entry->usedlen)
+)
+TRACE_EVENT(xfs_repair_calc_ag_resblks_btsize,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t bnobt_sz, xfs_agblock_t inobt_sz,
+ xfs_agblock_t rmapbt_sz, xfs_agblock_t refcbt_sz),
+ TP_ARGS(mp, agno, bnobt_sz, inobt_sz, rmapbt_sz, refcbt_sz),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bnobt_sz)
+ __field(xfs_agblock_t, inobt_sz)
+ __field(xfs_agblock_t, rmapbt_sz)
+ __field(xfs_agblock_t, refcbt_sz)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->bnobt_sz = bnobt_sz;
+ __entry->inobt_sz = inobt_sz;
+ __entry->rmapbt_sz = rmapbt_sz;
+ __entry->refcbt_sz = refcbt_sz;
+ ),
+ TP_printk("dev %d:%d agno %d bno %u ino %u rmap %u refcount %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bnobt_sz,
+ __entry->inobt_sz,
+ __entry->rmapbt_sz,
+ __entry->refcbt_sz)
+)
+TRACE_EVENT(xfs_repair_reset_counters,
+ TP_PROTO(struct xfs_mount *mp),
+ TP_ARGS(mp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ ),
+ TP_printk("dev %d:%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev))
+)
+
+TRACE_EVENT(xfs_repair_ialloc_insert,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t startino, uint16_t holemask, uint8_t count,
+ uint8_t freecount, uint64_t freemask),
+ TP_ARGS(mp, agno, startino, holemask, count, freecount, freemask),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, startino)
+ __field(uint16_t, holemask)
+ __field(uint8_t, count)
+ __field(uint8_t, freecount)
+ __field(uint64_t, freemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startino = startino;
+ __entry->holemask = holemask;
+ __entry->count = count;
+ __entry->freecount = freecount;
+ __entry->freemask = freemask;
+ ),
+ TP_printk("dev %d:%d agno %d startino %u holemask 0x%x count %u freecount %u freemask 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startino,
+ __entry->holemask,
+ __entry->count,
+ __entry->freecount,
+ __entry->freemask)
+)
+
+#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
+
#endif /* _TRACE_XFS_SCRUB_TRACE_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 19eadc807056..ca6903726689 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -594,7 +594,7 @@ xfs_alloc_ioend(
struct xfs_ioend *ioend;
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
+ bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
xfs_init_bio_from_bh(bio, bh);
ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
@@ -1195,16 +1195,22 @@ xfs_vm_writepages(
int ret;
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
- if (dax_mapping(mapping))
- return dax_writeback_mapping_range(mapping,
- xfs_find_bdev_for_inode(mapping->host), wbc);
-
ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
if (wpc.ioend)
ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
return ret;
}
+STATIC int
+xfs_dax_writepages(
+ struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+ return dax_writeback_mapping_range(mapping,
+ xfs_find_bdev_for_inode(mapping->host), wbc);
+}
+
/*
* Called to move a page into cleanable state - and from there
* to be released. The page should already be clean. We always
@@ -1367,30 +1373,18 @@ out_unlock:
return error;
}
-STATIC ssize_t
-xfs_vm_direct_IO(
- struct kiocb *iocb,
- struct iov_iter *iter)
-{
- /*
- * We just need the method present so that open/fcntl allow direct I/O.
- */
- return -EINVAL;
-}
-
STATIC sector_t
xfs_vm_bmap(
struct address_space *mapping,
sector_t block)
{
- struct inode *inode = (struct inode *)mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(mapping->host);
- trace_xfs_vm_bmap(XFS_I(inode));
+ trace_xfs_vm_bmap(ip);
/*
* The swap code (ab-)uses ->bmap to get a block mapping and then
- * bypasseѕ the file system for actual I/O. We really can't allow
+ * bypasses the file system for actual I/O. We really can't allow
* that on reflinks inodes, so we have to skip out here. And yes,
* 0 is the magic code for a bmap error.
*
@@ -1399,9 +1393,7 @@ xfs_vm_bmap(
*/
if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
return 0;
-
- filemap_write_and_wait(mapping);
- return generic_block_bmap(mapping, block, xfs_get_blocks);
+ return iomap_bmap(mapping, block, &xfs_iomap_ops);
}
STATIC int
@@ -1472,25 +1464,24 @@ xfs_vm_set_page_dirty(
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
- if (newly_dirty) {
- /* sigh - __set_page_dirty() is static, so copy it here, too */
- unsigned long flags;
-
- spin_lock_irqsave(&mapping->tree_lock, flags);
- if (page->mapping) { /* Race with truncate? */
- WARN_ON_ONCE(!PageUptodate(page));
- account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->page_tree,
- page_index(page), PAGECACHE_TAG_DIRTY);
- }
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
- }
+ if (newly_dirty)
+ __set_page_dirty(page, mapping, 1);
unlock_page_memcg(page);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
return newly_dirty;
}
+static int
+xfs_iomap_swapfile_activate(
+ struct swap_info_struct *sis,
+ struct file *swap_file,
+ sector_t *span)
+{
+ sis->bdev = xfs_find_bdev_for_inode(file_inode(swap_file));
+ return iomap_swapfile_activate(sis, swap_file, span, &xfs_iomap_ops);
+}
+
const struct address_space_operations xfs_address_space_operations = {
.readpage = xfs_vm_readpage,
.readpages = xfs_vm_readpages,
@@ -1500,8 +1491,17 @@ const struct address_space_operations xfs_address_space_operations = {
.releasepage = xfs_vm_releasepage,
.invalidatepage = xfs_vm_invalidatepage,
.bmap = xfs_vm_bmap,
- .direct_IO = xfs_vm_direct_IO,
+ .direct_IO = noop_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+ .swap_activate = xfs_iomap_swapfile_activate,
+};
+
+const struct address_space_operations xfs_dax_aops = {
+ .writepages = xfs_dax_writepages,
+ .direct_IO = noop_direct_IO,
+ .set_page_dirty = noop_set_page_dirty,
+ .invalidatepage = noop_invalidatepage,
+ .swap_activate = xfs_iomap_swapfile_activate,
};
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 88c85ea63da0..694c85b03813 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -18,7 +18,7 @@
#ifndef __XFS_AOPS_H__
#define __XFS_AOPS_H__
-extern struct bio_set *xfs_ioend_bioset;
+extern struct bio_set xfs_ioend_bioset;
/*
* Types of I/O for bmap clustering and I/O completion tracking.
@@ -54,6 +54,7 @@ struct xfs_ioend {
};
extern const struct address_space_operations xfs_address_space_operations;
+extern const struct address_space_operations xfs_dax_aops;
int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index e5fb008d75e8..618bb71535c8 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -53,6 +53,25 @@ xfs_bui_item_free(
kmem_zone_free(xfs_bui_zone, buip);
}
+/*
+ * Freeing the BUI requires that we remove it from the AIL if it has already
+ * been placed there. However, the BUI may not yet have been placed in the AIL
+ * when called by xfs_bui_release() from BUD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the BUI.
+ */
+void
+xfs_bui_release(
+ struct xfs_bui_log_item *buip)
+{
+ ASSERT(atomic_read(&buip->bui_refcount) > 0);
+ if (atomic_dec_and_test(&buip->bui_refcount)) {
+ xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_bui_item_free(buip);
+ }
+}
+
+
STATIC void
xfs_bui_item_size(
struct xfs_log_item *lip,
@@ -141,8 +160,8 @@ STATIC void
xfs_bui_item_unlock(
struct xfs_log_item *lip)
{
- if (lip->li_flags & XFS_LI_ABORTED)
- xfs_bui_item_free(BUI_ITEM(lip));
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
+ xfs_bui_release(BUI_ITEM(lip));
}
/*
@@ -206,24 +225,6 @@ xfs_bui_init(
return buip;
}
-/*
- * Freeing the BUI requires that we remove it from the AIL if it has already
- * been placed there. However, the BUI may not yet have been placed in the AIL
- * when called by xfs_bui_release() from BUD processing due to the ordering of
- * committed vs unpin operations in bulk insert operations. Hence the reference
- * count to ensure only the last caller frees the BUI.
- */
-void
-xfs_bui_release(
- struct xfs_bui_log_item *buip)
-{
- ASSERT(atomic_read(&buip->bui_refcount) > 0);
- if (atomic_dec_and_test(&buip->bui_refcount)) {
- xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_bui_item_free(buip);
- }
-}
-
static inline struct xfs_bud_log_item *BUD_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_bud_log_item, bud_item);
@@ -304,7 +305,7 @@ xfs_bud_item_unlock(
{
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
- if (lip->li_flags & XFS_LI_ABORTED) {
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
xfs_bui_release(budp->bud_buip);
kmem_zone_free(xfs_bud_zone, budp);
}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 05dee8fdd895..06badcbadeb4 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -848,7 +848,7 @@ xfs_free_eofblocks(
/*
* Attach the dquots to the inode up front.
*/
- error = xfs_qm_dqattach(ip, 0);
+ error = xfs_qm_dqattach(ip);
if (error)
return error;
@@ -871,8 +871,8 @@ xfs_free_eofblocks(
* contents of the file are flushed to disk then the files
* may be full of holes (ie NULL files bug).
*/
- error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
- XFS_ISIZE(ip));
+ error = xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK,
+ XFS_ISIZE(ip), XFS_BMAPI_NODISCARD);
if (error) {
/*
* If we get an error at this point we simply don't
@@ -918,7 +918,7 @@ xfs_alloc_file_space(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- error = xfs_qm_dqattach(ip, 0);
+ error = xfs_qm_dqattach(ip);
if (error)
return error;
@@ -1169,7 +1169,7 @@ xfs_free_file_space(
trace_xfs_free_file_space(ip);
- error = xfs_qm_dqattach(ip, 0);
+ error = xfs_qm_dqattach(ip);
if (error)
return error;
@@ -1326,7 +1326,6 @@ xfs_collapse_file_space(
int error;
struct xfs_defer_ops dfops;
xfs_fsblock_t first_block;
- xfs_fileoff_t stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len);
xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len);
uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
@@ -1361,7 +1360,7 @@ xfs_collapse_file_space(
xfs_defer_init(&dfops, &first_block);
error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb,
- &done, stop_fsb, &first_block, &dfops);
+ &done, &first_block, &dfops);
if (error)
goto out_bmap_cancel;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index ac669a10c62f..5179ab9e3d6a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -549,17 +549,31 @@ xfs_buf_hash_destroy(
}
/*
- * Look up, and creates if absent, a lockable buffer for
- * a given range of an inode. The buffer is returned
- * locked. No I/O is implied by this call.
+ * Look up a buffer in the buffer cache and return it referenced and locked
+ * in @found_bp.
+ *
+ * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the
+ * cache.
+ *
+ * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return
+ * -EAGAIN if we fail to lock it.
+ *
+ * Return values are:
+ * -EFSCORRUPTED if have been supplied with an invalid address
+ * -EAGAIN on trylock failure
+ * -ENOENT if we fail to find a match and @new_bp was NULL
+ * 0, with @found_bp:
+ * - @new_bp if we inserted it into the cache
+ * - the buffer we found and locked.
*/
-xfs_buf_t *
-_xfs_buf_find(
+static int
+xfs_buf_find(
struct xfs_buftarg *btp,
struct xfs_buf_map *map,
int nmaps,
xfs_buf_flags_t flags,
- xfs_buf_t *new_bp)
+ struct xfs_buf *new_bp,
+ struct xfs_buf **found_bp)
{
struct xfs_perag *pag;
xfs_buf_t *bp;
@@ -567,6 +581,8 @@ _xfs_buf_find(
xfs_daddr_t eofs;
int i;
+ *found_bp = NULL;
+
for (i = 0; i < nmaps; i++)
cmap.bm_len += map[i].bm_len;
@@ -580,16 +596,11 @@ _xfs_buf_find(
*/
eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
- /*
- * XXX (dgc): we should really be returning -EFSCORRUPTED here,
- * but none of the higher level infrastructure supports
- * returning a specific error on buffer lookup failures.
- */
xfs_alert(btp->bt_mount,
"%s: daddr 0x%llx out of range, EOFS 0x%llx",
__func__, cmap.bm_bn, eofs);
WARN_ON(1);
- return NULL;
+ return -EFSCORRUPTED;
}
pag = xfs_perag_get(btp->bt_mount,
@@ -604,19 +615,20 @@ _xfs_buf_find(
}
/* No match found */
- if (new_bp) {
- /* the buffer keeps the perag reference until it is freed */
- new_bp->b_pag = pag;
- rhashtable_insert_fast(&pag->pag_buf_hash,
- &new_bp->b_rhash_head,
- xfs_buf_hash_params);
- spin_unlock(&pag->pag_buf_lock);
- } else {
+ if (!new_bp) {
XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
+ return -ENOENT;
}
- return new_bp;
+
+ /* the buffer keeps the perag reference until it is freed */
+ new_bp->b_pag = pag;
+ rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head,
+ xfs_buf_hash_params);
+ spin_unlock(&pag->pag_buf_lock);
+ *found_bp = new_bp;
+ return 0;
found:
spin_unlock(&pag->pag_buf_lock);
@@ -626,7 +638,7 @@ found:
if (flags & XBF_TRYLOCK) {
xfs_buf_rele(bp);
XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
- return NULL;
+ return -EAGAIN;
}
xfs_buf_lock(bp);
XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
@@ -646,6 +658,24 @@ found:
trace_xfs_buf_find(bp, flags, _RET_IP_);
XFS_STATS_INC(btp->bt_mount, xb_get_locked);
+ *found_bp = bp;
+ return 0;
+}
+
+struct xfs_buf *
+xfs_buf_incore(
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ xfs_buf_flags_t flags)
+{
+ struct xfs_buf *bp;
+ int error;
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+
+ error = xfs_buf_find(target, &map, 1, flags, NULL, &bp);
+ if (error)
+ return NULL;
return bp;
}
@@ -665,9 +695,27 @@ xfs_buf_get_map(
struct xfs_buf *new_bp;
int error = 0;
- bp = _xfs_buf_find(target, map, nmaps, flags, NULL);
- if (likely(bp))
+ error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
+
+ switch (error) {
+ case 0:
+ /* cache hit */
goto found;
+ case -EAGAIN:
+ /* cache hit, trylock failure, caller handles failure */
+ ASSERT(flags & XBF_TRYLOCK);
+ return NULL;
+ case -ENOENT:
+ /* cache miss, go for insert */
+ break;
+ case -EFSCORRUPTED:
+ default:
+ /*
+ * None of the higher layers understand failure types
+ * yet, so return NULL to signal a fatal lookup error.
+ */
+ return NULL;
+ }
new_bp = _xfs_buf_alloc(target, map, nmaps, flags);
if (unlikely(!new_bp))
@@ -679,8 +727,8 @@ xfs_buf_get_map(
return NULL;
}
- bp = _xfs_buf_find(target, map, nmaps, flags, new_bp);
- if (!bp) {
+ error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
+ if (error) {
xfs_buf_free(new_bp);
return NULL;
}
@@ -1754,7 +1802,6 @@ xfs_buftarg_shrink_count(
void
xfs_free_buftarg(
- struct xfs_mount *mp,
struct xfs_buftarg *btp)
{
unregister_shrinker(&btp->bt_shrinker);
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 2f4c91452861..f5f2b71c2fde 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -218,20 +218,9 @@ typedef struct xfs_buf {
} xfs_buf_t;
/* Finding and Reading Buffers */
-struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target,
- struct xfs_buf_map *map, int nmaps,
- xfs_buf_flags_t flags, struct xfs_buf *new_bp);
-
-static inline struct xfs_buf *
-xfs_incore(
- struct xfs_buftarg *target,
- xfs_daddr_t blkno,
- size_t numblks,
- xfs_buf_flags_t flags)
-{
- DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
- return _xfs_buf_find(target, &map, 1, flags, NULL);
-}
+struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target,
+ xfs_daddr_t blkno, size_t numblks,
+ xfs_buf_flags_t flags);
struct xfs_buf *_xfs_buf_alloc(struct xfs_buftarg *target,
struct xfs_buf_map *map, int nmaps,
@@ -358,6 +347,18 @@ extern void xfs_buf_terminate(void);
void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref);
+/*
+ * If the buffer is already on the LRU, do nothing. Otherwise set the buffer
+ * up with a reference count of 0 so it will be tossed from the cache when
+ * released.
+ */
+static inline void xfs_buf_oneshot(struct xfs_buf *bp)
+{
+ if (!list_empty(&bp->b_lru) || atomic_read(&bp->b_lru_ref) > 1)
+ return;
+ atomic_set(&bp->b_lru_ref, 0);
+}
+
static inline int xfs_buf_ispinned(struct xfs_buf *bp)
{
return atomic_read(&bp->b_pin_count);
@@ -388,7 +389,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
*/
extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
struct block_device *, struct dax_device *);
-extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
+extern void xfs_free_buftarg(struct xfs_buftarg *);
extern void xfs_wait_buftarg(xfs_buftarg_t *);
extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 82ad270e390e..c2311379d1c3 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -438,7 +438,7 @@ xfs_buf_item_unpin(
* xfs_trans_uncommit() will try to reference the
* buffer which we no longer have a hold on.
*/
- if (lip->li_desc)
+ if (!list_empty(&lip->li_trans))
xfs_trans_del_item(lip);
/*
@@ -568,13 +568,15 @@ xfs_buf_item_unlock(
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
- bool aborted = !!(lip->li_flags & XFS_LI_ABORTED);
+ bool aborted;
bool hold = !!(bip->bli_flags & XFS_BLI_HOLD);
bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY);
#if defined(DEBUG) || defined(XFS_WARN)
bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED);
#endif
+ aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags);
+
/* Clear the buffer's association with this transaction. */
bp->b_transp = NULL;
@@ -743,8 +745,10 @@ xfs_buf_item_init(
* nothing to do here so return.
*/
ASSERT(bp->b_target->bt_mount == mp);
- if (bip != NULL) {
+ if (bip) {
ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
+ ASSERT(!bp->b_transp);
+ ASSERT(bip->bli_buf == bp);
return 0;
}
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index b2cde5426182..7b68e6c9a474 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -50,19 +50,19 @@ xfs_trim_extents(
pag = xfs_perag_get(mp, agno);
- error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
- if (error || !agbp)
- goto out_put_perag;
-
- cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
-
/*
* Force out the log. This means any transactions that might have freed
- * space before we took the AGF buffer lock are now on disk, and the
+ * space before we take the AGF buffer lock are now on disk, and the
* volatile disk cache is flushed.
*/
xfs_log_force(mp, XFS_LOG_SYNC);
+ error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+ if (error || !agbp)
+ goto out_put_perag;
+
+ cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
+
/*
* Look up the longest btree in the AGF and start with it.
*/
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index a7daef9e16bf..2567391489bd 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -288,49 +288,43 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
}
/*
- * Allocate a block and fill it with dquots.
- * This is called when the bmapi finds a hole.
+ * Ensure that the given in-core dquot has a buffer on disk backing it, and
+ * return the buffer. This is called when the bmapi finds a hole.
*/
STATIC int
-xfs_qm_dqalloc(
- xfs_trans_t **tpp,
- xfs_mount_t *mp,
- xfs_dquot_t *dqp,
- xfs_inode_t *quotip,
- xfs_fileoff_t offset_fsb,
- xfs_buf_t **O_bpp)
+xfs_dquot_disk_alloc(
+ struct xfs_trans **tpp,
+ struct xfs_dquot *dqp,
+ struct xfs_buf **bpp)
{
- xfs_fsblock_t firstblock;
- struct xfs_defer_ops dfops;
- xfs_bmbt_irec_t map;
- int nmaps, error;
- xfs_buf_t *bp;
- xfs_trans_t *tp = *tpp;
-
- ASSERT(tp != NULL);
+ struct xfs_bmbt_irec map;
+ struct xfs_defer_ops dfops;
+ struct xfs_mount *mp = (*tpp)->t_mountp;
+ struct xfs_buf *bp;
+ struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags);
+ xfs_fsblock_t firstblock;
+ int nmaps = 1;
+ int error;
trace_xfs_dqalloc(dqp);
- /*
- * Initialize the bmap freelist prior to calling bmapi code.
- */
xfs_defer_init(&dfops, &firstblock);
xfs_ilock(quotip, XFS_ILOCK_EXCL);
- /*
- * Return if this type of quotas is turned off while we didn't
- * have an inode lock
- */
if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
+ /*
+ * Return if this type of quotas is turned off while we didn't
+ * have an inode lock
+ */
xfs_iunlock(quotip, XFS_ILOCK_EXCL);
return -ESRCH;
}
- xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
- nmaps = 1;
- error = xfs_bmapi_write(tp, quotip, offset_fsb,
- XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
- &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp),
- &map, &nmaps, &dfops);
+ /* Create the block mapping. */
+ xfs_trans_ijoin(*tpp, quotip, XFS_ILOCK_EXCL);
+ error = xfs_bmapi_write(*tpp, quotip, dqp->q_fileoffset,
+ XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
+ &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp),
+ &map, &nmaps, &dfops);
if (error)
goto error0;
ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
@@ -344,10 +338,8 @@ xfs_qm_dqalloc(
dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
/* now we can just get the buffer (there's nothing to read yet) */
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
- dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen,
- 0);
+ bp = xfs_trans_get_buf(*tpp, mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, 0);
if (!bp) {
error = -ENOMEM;
goto error1;
@@ -358,37 +350,45 @@ xfs_qm_dqalloc(
* Make a chunk of dquots out of this buffer and log
* the entire thing.
*/
- xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id),
+ xfs_qm_init_dquot_blk(*tpp, mp, be32_to_cpu(dqp->q_core.d_id),
dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
+ xfs_buf_set_ref(bp, XFS_DQUOT_REF);
/*
- * xfs_defer_finish() may commit the current transaction and
- * start a second transaction if the freelist is not empty.
+ * Hold the buffer and join it to the dfops so that we'll still own
+ * the buffer when we return to the caller. The buffer disposal on
+ * error must be paid attention to very carefully, as it has been
+ * broken since commit efa092f3d4c6 "[XFS] Fixes a bug in the quota
+ * code when allocating a new dquot record" in 2005, and the later
+ * conversion to xfs_defer_ops in commit 310a75a3c6c747 failed to keep
+ * the buffer locked across the _defer_finish call. We can now do
+ * this correctly with xfs_defer_bjoin.
*
- * Since we still want to modify this buffer, we need to
- * ensure that the buffer is not released on commit of
- * the first transaction and ensure the buffer is added to the
- * second transaction.
+ * Above, we allocated a disk block for the dquot information and
+ * used get_buf to initialize the dquot. If the _defer_bjoin fails,
+ * the buffer is still locked to *tpp, so we must _bhold_release and
+ * then _trans_brelse the buffer. If the _defer_finish fails, the old
+ * transaction is gone but the new buffer is not joined or held to any
+ * transaction, so we must _buf_relse it.
*
- * If there is only one transaction then don't stop the buffer
- * from being released when it commits later on.
+ * If everything succeeds, the caller of this function is returned a
+ * buffer that is locked and held to the transaction. The caller
+ * is responsible for unlocking any buffer passed back, either
+ * manually or by committing the transaction.
*/
-
- xfs_trans_bhold(tp, bp);
-
+ xfs_trans_bhold(*tpp, bp);
+ error = xfs_defer_bjoin(&dfops, bp);
+ if (error) {
+ xfs_trans_bhold_release(*tpp, bp);
+ xfs_trans_brelse(*tpp, bp);
+ goto error1;
+ }
error = xfs_defer_finish(tpp, &dfops);
- if (error)
+ if (error) {
+ xfs_buf_relse(bp);
goto error1;
-
- /* Transaction was committed? */
- if (*tpp != tp) {
- tp = *tpp;
- xfs_trans_bjoin(tp, bp);
- } else {
- xfs_trans_bhold_release(tp, bp);
}
-
- *O_bpp = bp;
+ *bpp = bp;
return 0;
error1:
@@ -398,32 +398,24 @@ error0:
}
/*
- * Maps a dquot to the buffer containing its on-disk version.
- * This returns a ptr to the buffer containing the on-disk dquot
- * in the bpp param, and a ptr to the on-disk dquot within that buffer
+ * Read in the in-core dquot's on-disk metadata and return the buffer.
+ * Returns ENOENT to signal a hole.
*/
STATIC int
-xfs_qm_dqtobp(
- xfs_trans_t **tpp,
- xfs_dquot_t *dqp,
- xfs_disk_dquot_t **O_ddpp,
- xfs_buf_t **O_bpp,
- uint flags)
+xfs_dquot_disk_read(
+ struct xfs_mount *mp,
+ struct xfs_dquot *dqp,
+ struct xfs_buf **bpp)
{
struct xfs_bmbt_irec map;
- int nmaps = 1, error;
struct xfs_buf *bp;
- struct xfs_inode *quotip;
- struct xfs_mount *mp = dqp->q_mount;
- xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
- struct xfs_trans *tp = (tpp ? *tpp : NULL);
+ struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags);
uint lock_mode;
-
- quotip = xfs_quota_inode(dqp->q_mount, dqp->dq_flags);
- dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
+ int nmaps = 1;
+ int error;
lock_mode = xfs_ilock_data_map_shared(quotip);
- if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
+ if (!xfs_this_quota_on(mp, dqp->dq_flags)) {
/*
* Return if this type of quotas is turned off while we
* didn't have the quota inode lock.
@@ -436,81 +428,48 @@ xfs_qm_dqtobp(
* Find the block map; no allocations yet
*/
error = xfs_bmapi_read(quotip, dqp->q_fileoffset,
- XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
-
+ XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
xfs_iunlock(quotip, lock_mode);
if (error)
return error;
ASSERT(nmaps == 1);
- ASSERT(map.br_blockcount == 1);
+ ASSERT(map.br_blockcount >= 1);
+ ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+ if (map.br_startblock == HOLESTARTBLOCK)
+ return -ENOENT;
+
+ trace_xfs_dqtobp_read(dqp);
/*
- * Offset of dquot in the (fixed sized) dquot chunk.
+ * store the blkno etc so that we don't have to do the
+ * mapping all the time
*/
- dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
- sizeof(xfs_dqblk_t);
-
- ASSERT(map.br_startblock != DELAYSTARTBLOCK);
- if (map.br_startblock == HOLESTARTBLOCK) {
- /*
- * We don't allocate unless we're asked to
- */
- if (!(flags & XFS_QMOPT_DQALLOC))
- return -ENOENT;
-
- ASSERT(tp);
- error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
- dqp->q_fileoffset, &bp);
- if (error)
- return error;
- tp = *tpp;
- } else {
- trace_xfs_dqtobp_read(dqp);
+ dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
- /*
- * store the blkno etc so that we don't have to do the
- * mapping all the time
- */
- dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
-
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen,
- 0, &bp, &xfs_dquot_buf_ops);
- if (error) {
- ASSERT(bp == NULL);
- return error;
- }
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+ &xfs_dquot_buf_ops);
+ if (error) {
+ ASSERT(bp == NULL);
+ return error;
}
ASSERT(xfs_buf_islocked(bp));
- *O_bpp = bp;
- *O_ddpp = bp->b_addr + dqp->q_bufoffset;
+ xfs_buf_set_ref(bp, XFS_DQUOT_REF);
+ *bpp = bp;
return 0;
}
-
-/*
- * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
- * and release the buffer immediately.
- *
- * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if it needed.
- */
-int
-xfs_qm_dqread(
+/* Allocate and initialize everything we need for an incore dquot. */
+STATIC struct xfs_dquot *
+xfs_dquot_alloc(
struct xfs_mount *mp,
xfs_dqid_t id,
- uint type,
- uint flags,
- struct xfs_dquot **O_dqpp)
+ uint type)
{
struct xfs_dquot *dqp;
- struct xfs_disk_dquot *ddqp;
- struct xfs_buf *bp;
- struct xfs_trans *tp = NULL;
- int error;
dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
@@ -520,6 +479,12 @@ xfs_qm_dqread(
INIT_LIST_HEAD(&dqp->q_lru);
mutex_init(&dqp->q_qlock);
init_waitqueue_head(&dqp->q_pinwait);
+ dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
+ /*
+ * Offset of dquot in the (fixed sized) dquot chunk.
+ */
+ dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
+ sizeof(xfs_dqblk_t);
/*
* Because we want to use a counting completion, complete
@@ -548,35 +513,22 @@ xfs_qm_dqread(
break;
}
- XFS_STATS_INC(mp, xs_qm_dquot);
-
- trace_xfs_dqread(dqp);
+ xfs_qm_dquot_logitem_init(dqp);
- if (flags & XFS_QMOPT_DQALLOC) {
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
- XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
- if (error)
- goto error0;
- }
+ XFS_STATS_INC(mp, xs_qm_dquot);
+ return dqp;
+}
- /*
- * get a pointer to the on-disk dquot and the buffer containing it
- * dqp already knows its own type (GROUP/USER).
- */
- error = xfs_qm_dqtobp(&tp, dqp, &ddqp, &bp, flags);
- if (error) {
- /*
- * This can happen if quotas got turned off (ESRCH),
- * or if the dquot didn't exist on disk and we ask to
- * allocate (ENOENT).
- */
- trace_xfs_dqread_fail(dqp);
- goto error1;
- }
+/* Copy the in-core quota fields in from the on-disk buffer. */
+STATIC void
+xfs_dquot_from_disk(
+ struct xfs_dquot *dqp,
+ struct xfs_buf *bp)
+{
+ struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset;
/* copy everything from disk dquot to the incore dquot */
memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
- xfs_qm_dquot_logitem_init(dqp);
/*
* Reservation counters are defined as reservation plus current usage
@@ -588,40 +540,90 @@ xfs_qm_dqread(
/* initialize the dquot speculative prealloc thresholds */
xfs_dquot_set_prealloc_limits(dqp);
+}
- /* Mark the buf so that this will stay incore a little longer */
- xfs_buf_set_ref(bp, XFS_DQUOT_REF);
+/* Allocate and initialize the dquot buffer for this in-core dquot. */
+static int
+xfs_qm_dqread_alloc(
+ struct xfs_mount *mp,
+ struct xfs_dquot *dqp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_trans *tp;
+ struct xfs_buf *bp;
+ int error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
+ XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
+ if (error)
+ goto err;
+
+ error = xfs_dquot_disk_alloc(&tp, dqp, &bp);
+ if (error)
+ goto err_cancel;
+
+ error = xfs_trans_commit(tp);
+ if (error) {
+ /*
+ * Buffer was held to the transaction, so we have to unlock it
+ * manually here because we're not passing it back.
+ */
+ xfs_buf_relse(bp);
+ goto err;
+ }
+ *bpp = bp;
+ return 0;
+
+err_cancel:
+ xfs_trans_cancel(tp);
+err:
+ return error;
+}
+
+/*
+ * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
+ * and release the buffer immediately. If @can_alloc is true, fill any
+ * holes in the on-disk metadata.
+ */
+static int
+xfs_qm_dqread(
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ uint type,
+ bool can_alloc,
+ struct xfs_dquot **dqpp)
+{
+ struct xfs_dquot *dqp;
+ struct xfs_buf *bp;
+ int error;
+
+ dqp = xfs_dquot_alloc(mp, id, type);
+ trace_xfs_dqread(dqp);
+
+ /* Try to read the buffer, allocating if necessary. */
+ error = xfs_dquot_disk_read(mp, dqp, &bp);
+ if (error == -ENOENT && can_alloc)
+ error = xfs_qm_dqread_alloc(mp, dqp, &bp);
+ if (error)
+ goto err;
/*
- * We got the buffer with a xfs_trans_read_buf() (in dqtobp())
- * So we need to release with xfs_trans_brelse().
- * The strategy here is identical to that of inodes; we lock
- * the dquot in xfs_qm_dqget() before making it accessible to
- * others. This is because dquots, like inodes, need a good level of
- * concurrency, and we don't want to take locks on the entire buffers
- * for dquot accesses.
- * Note also that the dquot buffer may even be dirty at this point, if
- * this particular dquot was repaired. We still aren't afraid to
- * brelse it because we have the changes incore.
+ * At this point we should have a clean locked buffer. Copy the data
+ * to the incore dquot and release the buffer since the incore dquot
+ * has its own locking protocol so we needn't tie up the buffer any
+ * further.
*/
ASSERT(xfs_buf_islocked(bp));
- xfs_trans_brelse(tp, bp);
+ xfs_dquot_from_disk(dqp, bp);
- if (tp) {
- error = xfs_trans_commit(tp);
- if (error)
- goto error0;
- }
-
- *O_dqpp = dqp;
+ xfs_buf_relse(bp);
+ *dqpp = dqp;
return error;
-error1:
- if (tp)
- xfs_trans_cancel(tp);
-error0:
+err:
+ trace_xfs_dqread_fail(dqp);
xfs_qm_dqdestroy(dqp);
- *O_dqpp = NULL;
+ *dqpp = NULL;
return error;
}
@@ -679,77 +681,230 @@ xfs_dq_get_next_id(
}
/*
- * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
- * a locked dquot, doing an allocation (if requested) as needed.
- * When both an inode and an id are given, the inode's id takes precedence.
- * That is, if the id changes while we don't hold the ilock inside this
- * function, the new dquot is returned, not necessarily the one requested
- * in the id argument.
+ * Look up the dquot in the in-core cache. If found, the dquot is returned
+ * locked and ready to go.
+ */
+static struct xfs_dquot *
+xfs_qm_dqget_cache_lookup(
+ struct xfs_mount *mp,
+ struct xfs_quotainfo *qi,
+ struct radix_tree_root *tree,
+ xfs_dqid_t id)
+{
+ struct xfs_dquot *dqp;
+
+restart:
+ mutex_lock(&qi->qi_tree_lock);
+ dqp = radix_tree_lookup(tree, id);
+ if (!dqp) {
+ mutex_unlock(&qi->qi_tree_lock);
+ XFS_STATS_INC(mp, xs_qm_dqcachemisses);
+ return NULL;
+ }
+
+ xfs_dqlock(dqp);
+ if (dqp->dq_flags & XFS_DQ_FREEING) {
+ xfs_dqunlock(dqp);
+ mutex_unlock(&qi->qi_tree_lock);
+ trace_xfs_dqget_freeing(dqp);
+ delay(1);
+ goto restart;
+ }
+
+ dqp->q_nrefs++;
+ mutex_unlock(&qi->qi_tree_lock);
+
+ trace_xfs_dqget_hit(dqp);
+ XFS_STATS_INC(mp, xs_qm_dqcachehits);
+ return dqp;
+}
+
+/*
+ * Try to insert a new dquot into the in-core cache. If an error occurs the
+ * caller should throw away the dquot and start over. Otherwise, the dquot
+ * is returned locked (and held by the cache) as if there had been a cache
+ * hit.
+ */
+static int
+xfs_qm_dqget_cache_insert(
+ struct xfs_mount *mp,
+ struct xfs_quotainfo *qi,
+ struct radix_tree_root *tree,
+ xfs_dqid_t id,
+ struct xfs_dquot *dqp)
+{
+ int error;
+
+ mutex_lock(&qi->qi_tree_lock);
+ error = radix_tree_insert(tree, id, dqp);
+ if (unlikely(error)) {
+ /* Duplicate found! Caller must try again. */
+ WARN_ON(error != -EEXIST);
+ mutex_unlock(&qi->qi_tree_lock);
+ trace_xfs_dqget_dup(dqp);
+ return error;
+ }
+
+ /* Return a locked dquot to the caller, with a reference taken. */
+ xfs_dqlock(dqp);
+ dqp->q_nrefs = 1;
+
+ qi->qi_dquots++;
+ mutex_unlock(&qi->qi_tree_lock);
+
+ return 0;
+}
+
+/* Check our input parameters. */
+static int
+xfs_qm_dqget_checks(
+ struct xfs_mount *mp,
+ uint type)
+{
+ if (WARN_ON_ONCE(!XFS_IS_QUOTA_RUNNING(mp)))
+ return -ESRCH;
+
+ switch (type) {
+ case XFS_DQ_USER:
+ if (!XFS_IS_UQUOTA_ON(mp))
+ return -ESRCH;
+ return 0;
+ case XFS_DQ_GROUP:
+ if (!XFS_IS_GQUOTA_ON(mp))
+ return -ESRCH;
+ return 0;
+ case XFS_DQ_PROJ:
+ if (!XFS_IS_PQUOTA_ON(mp))
+ return -ESRCH;
+ return 0;
+ default:
+ WARN_ON_ONCE(0);
+ return -EINVAL;
+ }
+}
+
+/*
+ * Given the file system, id, and type (UDQUOT/GDQUOT), return a a locked
+ * dquot, doing an allocation (if requested) as needed.
*/
int
xfs_qm_dqget(
- xfs_mount_t *mp,
- xfs_inode_t *ip, /* locked inode (optional) */
- xfs_dqid_t id, /* uid/projid/gid depending on type */
- uint type, /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */
- uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
- xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ uint type,
+ bool can_alloc,
+ struct xfs_dquot **O_dqpp)
{
struct xfs_quotainfo *qi = mp->m_quotainfo;
- struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
struct xfs_dquot *dqp;
int error;
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
- if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
- (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) ||
- (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
- return -ESRCH;
+ error = xfs_qm_dqget_checks(mp, type);
+ if (error)
+ return error;
+
+restart:
+ dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id);
+ if (dqp) {
+ *O_dqpp = dqp;
+ return 0;
}
- ASSERT(type == XFS_DQ_USER ||
- type == XFS_DQ_PROJ ||
- type == XFS_DQ_GROUP);
- if (ip) {
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(xfs_inode_dquot(ip, type) == NULL);
+ error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp);
+ if (error)
+ return error;
+
+ error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp);
+ if (error) {
+ /*
+ * Duplicate found. Just throw away the new dquot and start
+ * over.
+ */
+ xfs_qm_dqdestroy(dqp);
+ XFS_STATS_INC(mp, xs_qm_dquot_dups);
+ goto restart;
}
-restart:
- mutex_lock(&qi->qi_tree_lock);
- dqp = radix_tree_lookup(tree, id);
- if (dqp) {
- xfs_dqlock(dqp);
- if (dqp->dq_flags & XFS_DQ_FREEING) {
- xfs_dqunlock(dqp);
- mutex_unlock(&qi->qi_tree_lock);
- trace_xfs_dqget_freeing(dqp);
- delay(1);
- goto restart;
- }
+ trace_xfs_dqget_miss(dqp);
+ *O_dqpp = dqp;
+ return 0;
+}
- /* uninit / unused quota found in radix tree, keep looking */
- if (flags & XFS_QMOPT_DQNEXT) {
- if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
- xfs_dqunlock(dqp);
- mutex_unlock(&qi->qi_tree_lock);
- error = xfs_dq_get_next_id(mp, type, &id);
- if (error)
- return error;
- goto restart;
- }
- }
+/*
+ * Given a dquot id and type, read and initialize a dquot from the on-disk
+ * metadata. This function is only for use during quota initialization so
+ * it ignores the dquot cache assuming that the dquot shrinker isn't set up.
+ * The caller is responsible for _qm_dqdestroy'ing the returned dquot.
+ */
+int
+xfs_qm_dqget_uncached(
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ uint type,
+ struct xfs_dquot **dqpp)
+{
+ int error;
- dqp->q_nrefs++;
- mutex_unlock(&qi->qi_tree_lock);
+ error = xfs_qm_dqget_checks(mp, type);
+ if (error)
+ return error;
+
+ return xfs_qm_dqread(mp, id, type, 0, dqpp);
+}
+
+/* Return the quota id for a given inode and type. */
+xfs_dqid_t
+xfs_qm_id_for_quotatype(
+ struct xfs_inode *ip,
+ uint type)
+{
+ switch (type) {
+ case XFS_DQ_USER:
+ return ip->i_d.di_uid;
+ case XFS_DQ_GROUP:
+ return ip->i_d.di_gid;
+ case XFS_DQ_PROJ:
+ return xfs_get_projid(ip);
+ }
+ ASSERT(0);
+ return 0;
+}
+
+/*
+ * Return the dquot for a given inode and type. If @can_alloc is true, then
+ * allocate blocks if needed. The inode's ILOCK must be held and it must not
+ * have already had an inode attached.
+ */
+int
+xfs_qm_dqget_inode(
+ struct xfs_inode *ip,
+ uint type,
+ bool can_alloc,
+ struct xfs_dquot **O_dqpp)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
+ struct xfs_dquot *dqp;
+ xfs_dqid_t id;
+ int error;
- trace_xfs_dqget_hit(dqp);
- XFS_STATS_INC(mp, xs_qm_dqcachehits);
+ error = xfs_qm_dqget_checks(mp, type);
+ if (error)
+ return error;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(xfs_inode_dquot(ip, type) == NULL);
+
+ id = xfs_qm_id_for_quotatype(ip, type);
+
+restart:
+ dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id);
+ if (dqp) {
*O_dqpp = dqp;
return 0;
}
- mutex_unlock(&qi->qi_tree_lock);
- XFS_STATS_INC(mp, xs_qm_dqcachemisses);
/*
* Dquot cache miss. We don't want to keep the inode lock across
@@ -758,87 +913,81 @@ restart:
* lock here means dealing with a chown that can happen before
* we re-acquire the lock.
*/
- if (ip)
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
- error = xfs_qm_dqread(mp, id, type, flags, &dqp);
-
- if (ip)
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- /* If we are asked to find next active id, keep looking */
- if (error == -ENOENT && (flags & XFS_QMOPT_DQNEXT)) {
- error = xfs_dq_get_next_id(mp, type, &id);
- if (!error)
- goto restart;
- }
-
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
- if (ip) {
- /*
- * A dquot could be attached to this inode by now, since
- * we had dropped the ilock.
- */
- if (xfs_this_quota_on(mp, type)) {
- struct xfs_dquot *dqp1;
-
- dqp1 = xfs_inode_dquot(ip, type);
- if (dqp1) {
- xfs_qm_dqdestroy(dqp);
- dqp = dqp1;
- xfs_dqlock(dqp);
- goto dqret;
- }
- } else {
- /* inode stays locked on return */
+ /*
+ * A dquot could be attached to this inode by now, since we had
+ * dropped the ilock.
+ */
+ if (xfs_this_quota_on(mp, type)) {
+ struct xfs_dquot *dqp1;
+
+ dqp1 = xfs_inode_dquot(ip, type);
+ if (dqp1) {
xfs_qm_dqdestroy(dqp);
- return -ESRCH;
+ dqp = dqp1;
+ xfs_dqlock(dqp);
+ goto dqret;
}
+ } else {
+ /* inode stays locked on return */
+ xfs_qm_dqdestroy(dqp);
+ return -ESRCH;
}
- mutex_lock(&qi->qi_tree_lock);
- error = radix_tree_insert(tree, id, dqp);
- if (unlikely(error)) {
- WARN_ON(error != -EEXIST);
-
+ error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp);
+ if (error) {
/*
* Duplicate found. Just throw away the new dquot and start
* over.
*/
- mutex_unlock(&qi->qi_tree_lock);
- trace_xfs_dqget_dup(dqp);
xfs_qm_dqdestroy(dqp);
XFS_STATS_INC(mp, xs_qm_dquot_dups);
goto restart;
}
- /*
- * We return a locked dquot to the caller, with a reference taken
- */
- xfs_dqlock(dqp);
- dqp->q_nrefs = 1;
+dqret:
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ trace_xfs_dqget_miss(dqp);
+ *O_dqpp = dqp;
+ return 0;
+}
- qi->qi_dquots++;
- mutex_unlock(&qi->qi_tree_lock);
+/*
+ * Starting at @id and progressing upwards, look for an initialized incore
+ * dquot, lock it, and return it.
+ */
+int
+xfs_qm_dqget_next(
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ uint type,
+ struct xfs_dquot **dqpp)
+{
+ struct xfs_dquot *dqp;
+ int error = 0;
- /* If we are asked to find next active id, keep looking */
- if (flags & XFS_QMOPT_DQNEXT) {
- if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
- xfs_qm_dqput(dqp);
- error = xfs_dq_get_next_id(mp, type, &id);
- if (error)
- return error;
- goto restart;
+ *dqpp = NULL;
+ for (; !error; error = xfs_dq_get_next_id(mp, type, &id)) {
+ error = xfs_qm_dqget(mp, id, type, false, &dqp);
+ if (error == -ENOENT)
+ continue;
+ else if (error != 0)
+ break;
+
+ if (!XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+ *dqpp = dqp;
+ return 0;
}
+
+ xfs_qm_dqput(dqp);
}
- dqret:
- ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
- trace_xfs_dqget_miss(dqp);
- *O_dqpp = dqp;
- return 0;
+ return error;
}
/*
@@ -913,9 +1062,9 @@ xfs_qm_dqflush_done(
* since it's cheaper, and then we recheck while
* holding the lock before removing the dquot from the AIL.
*/
- if ((lip->li_flags & XFS_LI_IN_AIL) &&
+ if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) &&
((lip->li_lsn == qip->qli_flush_lsn) ||
- (lip->li_flags & XFS_LI_FAILED))) {
+ test_bit(XFS_LI_FAILED, &lip->li_flags))) {
/* xfs_trans_ail_delete() drops the AIL lock. */
spin_lock(&ailp->ail_lock);
@@ -926,8 +1075,7 @@ xfs_qm_dqflush_done(
* Clear the failed state since we are about to drop the
* flush lock
*/
- if (lip->li_flags & XFS_LI_FAILED)
- xfs_clear_li_failed(lip);
+ xfs_clear_li_failed(lip);
spin_unlock(&ailp->ail_lock);
}
}
@@ -953,6 +1101,7 @@ xfs_qm_dqflush(
{
struct xfs_mount *mp = dqp->q_mount;
struct xfs_buf *bp;
+ struct xfs_dqblk *dqb;
struct xfs_disk_dquot *ddqp;
xfs_failaddr_t fa;
int error;
@@ -996,12 +1145,13 @@ xfs_qm_dqflush(
/*
* Calculate the location of the dquot inside the buffer.
*/
- ddqp = bp->b_addr + dqp->q_bufoffset;
+ dqb = bp->b_addr + dqp->q_bufoffset;
+ ddqp = &dqb->dd_diskdq;
/*
- * A simple sanity check in case we got a corrupted dquot..
+ * A simple sanity check in case we got a corrupted dquot.
*/
- fa = xfs_dquot_verify(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, 0);
+ fa = xfs_dqblk_verify(mp, dqb, be32_to_cpu(ddqp->d_id), 0);
if (fa) {
xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS",
be32_to_cpu(ddqp->d_id), fa);
@@ -1032,8 +1182,6 @@ xfs_qm_dqflush(
* of a dquot without an up-to-date CRC getting to disk.
*/
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp;
-
dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
@@ -1119,3 +1267,35 @@ xfs_qm_exit(void)
kmem_zone_destroy(xfs_qm_dqtrxzone);
kmem_zone_destroy(xfs_qm_dqzone);
}
+
+/*
+ * Iterate every dquot of a particular type. The caller must ensure that the
+ * particular quota type is active. iter_fn can return negative error codes,
+ * or XFS_BTREE_QUERY_RANGE_ABORT to indicate that it wants to stop iterating.
+ */
+int
+xfs_qm_dqiterate(
+ struct xfs_mount *mp,
+ uint dqtype,
+ xfs_qm_dqiterate_fn iter_fn,
+ void *priv)
+{
+ struct xfs_dquot *dq;
+ xfs_dqid_t id = 0;
+ int error;
+
+ do {
+ error = xfs_qm_dqget_next(mp, id, dqtype, &dq);
+ if (error == -ENOENT)
+ return 0;
+ if (error)
+ return error;
+
+ error = iter_fn(dq, dqtype, priv);
+ id = be32_to_cpu(dq->q_core.d_id);
+ xfs_qm_dqput(dq);
+ id++;
+ } while (error == 0 && id != 0);
+
+ return error;
+}
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 2f536f33cd26..bdd6bd921528 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -160,8 +160,6 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp)
#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP)
-extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
- uint, struct xfs_dquot **);
extern void xfs_qm_dqdestroy(xfs_dquot_t *);
extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **);
extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
@@ -169,8 +167,19 @@ extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
xfs_disk_dquot_t *);
extern void xfs_qm_adjust_dqlimits(struct xfs_mount *,
struct xfs_dquot *);
-extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
- xfs_dqid_t, uint, uint, xfs_dquot_t **);
+extern xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip,
+ uint type);
+extern int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id,
+ uint type, bool can_alloc,
+ struct xfs_dquot **dqpp);
+extern int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type,
+ bool can_alloc,
+ struct xfs_dquot **dqpp);
+extern int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id,
+ uint type, struct xfs_dquot **dqpp);
+extern int xfs_qm_dqget_uncached(struct xfs_mount *mp,
+ xfs_dqid_t id, uint type,
+ struct xfs_dquot **dqpp);
extern void xfs_qm_dqput(xfs_dquot_t *);
extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
@@ -185,4 +194,9 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
return dqp;
}
+typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, uint dqtype,
+ void *priv);
+int xfs_qm_dqiterate(struct xfs_mount *mp, uint dqtype,
+ xfs_qm_dqiterate_fn iter_fn, void *priv);
+
#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 4b331e354da7..8eb7415474d6 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -173,7 +173,7 @@ xfs_qm_dquot_logitem_push(
* The buffer containing this item failed to be written back
* previously. Resubmit the buffer for IO
*/
- if (lip->li_flags & XFS_LI_FAILED) {
+ if (test_bit(XFS_LI_FAILED, &lip->li_flags)) {
if (!xfs_buf_trylock(bp))
return XFS_ITEM_LOCKED;
@@ -209,10 +209,7 @@ xfs_qm_dquot_logitem_push(
spin_unlock(&lip->li_ailp->ail_lock);
error = xfs_qm_dqflush(dqp, &bp);
- if (error) {
- xfs_warn(dqp->q_mount, "%s: push error %d on dqp "PTR_FMT,
- __func__, error, dqp);
- } else {
+ if (!error) {
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index a63f5083f497..7975634cb8fe 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -61,6 +61,7 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_LOG_BAD_CRC,
XFS_RANDOM_LOG_ITEM_PIN,
XFS_RANDOM_BUF_LRU_REF,
+ XFS_RANDOM_FORCE_SCRUB_REPAIR,
};
struct xfs_errortag_attr {
@@ -167,6 +168,7 @@ XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES);
XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC);
XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
+XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -201,6 +203,7 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(log_bad_crc),
XFS_ERRORTAG_ATTR_LIST(log_item_pin),
XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
+ XFS_ERRORTAG_ATTR_LIST(force_repair),
NULL,
};
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 64da90655e95..a889b550979a 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -51,6 +51,24 @@ xfs_efi_item_free(
}
/*
+ * Freeing the efi requires that we remove it from the AIL if it has already
+ * been placed there. However, the EFI may not yet have been placed in the AIL
+ * when called by xfs_efi_release() from EFD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the EFI.
+ */
+void
+xfs_efi_release(
+ struct xfs_efi_log_item *efip)
+{
+ ASSERT(atomic_read(&efip->efi_refcount) > 0);
+ if (atomic_dec_and_test(&efip->efi_refcount)) {
+ xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_efi_item_free(efip);
+ }
+}
+
+/*
* This returns the number of iovecs needed to log the given efi item.
* We only need 1 iovec for an efi item. It just logs the efi_log_format
* structure.
@@ -150,8 +168,8 @@ STATIC void
xfs_efi_item_unlock(
struct xfs_log_item *lip)
{
- if (lip->li_flags & XFS_LI_ABORTED)
- xfs_efi_item_free(EFI_ITEM(lip));
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
+ xfs_efi_release(EFI_ITEM(lip));
}
/*
@@ -279,24 +297,6 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
return -EFSCORRUPTED;
}
-/*
- * Freeing the efi requires that we remove it from the AIL if it has already
- * been placed there. However, the EFI may not yet have been placed in the AIL
- * when called by xfs_efi_release() from EFD processing due to the ordering of
- * committed vs unpin operations in bulk insert operations. Hence the reference
- * count to ensure only the last caller frees the EFI.
- */
-void
-xfs_efi_release(
- struct xfs_efi_log_item *efip)
-{
- ASSERT(atomic_read(&efip->efi_refcount) > 0);
- if (atomic_dec_and_test(&efip->efi_refcount)) {
- xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_efi_item_free(efip);
- }
-}
-
static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_efd_log_item, efd_item);
@@ -402,7 +402,7 @@ xfs_efd_item_unlock(
{
struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
- if (lip->li_flags & XFS_LI_ABORTED) {
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
xfs_efi_release(efdp->efd_efip);
xfs_efd_item_free(efdp);
}
@@ -542,7 +542,7 @@ xfs_efi_recover(
for (i = 0; i < efip->efi_format.efi_nextents; i++) {
extp = &efip->efi_format.efi_extents[i];
error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
- extp->ext_len, &oinfo);
+ extp->ext_len, &oinfo, false);
if (error)
goto abort_error;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 299aee4b7b0b..0e3fb8978344 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -414,6 +414,12 @@ xfs_dio_write_end_io(
if (size <= 0)
return size;
+ /*
+ * Capture amount written on completion as we can't reliably account
+ * for it on submission.
+ */
+ XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
+
if (flags & IOMAP_DIO_COW) {
error = xfs_reflink_end_cow(ip, offset, size);
if (error)
@@ -599,7 +605,16 @@ xfs_file_dax_write(
}
out:
xfs_iunlock(ip, iolock);
- return error ? error : ret;
+ if (error)
+ return error;
+
+ if (ret > 0) {
+ XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
+
+ /* Handle various SYNC-type writes */
+ ret = generic_write_sync(iocb, ret);
+ }
+ return ret;
}
STATIC ssize_t
@@ -669,6 +684,12 @@ write_retry:
out:
if (iolock)
xfs_iunlock(ip, iolock);
+
+ if (ret > 0) {
+ XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
+ /* Handle various SYNC-type writes */
+ ret = generic_write_sync(iocb, ret);
+ }
return ret;
}
@@ -693,8 +714,9 @@ xfs_file_write_iter(
return -EIO;
if (IS_DAX(inode))
- ret = xfs_file_dax_write(iocb, from);
- else if (iocb->ki_flags & IOCB_DIRECT) {
+ return xfs_file_dax_write(iocb, from);
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
/*
* Allow a directio write to fall back to a buffered
* write *only* in the case that we're doing a reflink
@@ -702,20 +724,11 @@ xfs_file_write_iter(
* allow an operation to fall back to buffered mode.
*/
ret = xfs_file_dio_aio_write(iocb, from);
- if (ret == -EREMCHG)
- goto buffered;
- } else {
-buffered:
- ret = xfs_file_buffered_aio_write(iocb, from);
+ if (ret != -EREMCHG)
+ return ret;
}
- if (ret > 0) {
- XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
-
- /* Handle various SYNC-type writes */
- ret = generic_write_sync(iocb, ret);
- }
- return ret;
+ return xfs_file_buffered_aio_write(iocb, from);
}
#define XFS_FALLOC_FL_SUPPORTED \
@@ -778,22 +791,26 @@ xfs_file_fallocate(
if (error)
goto out_unlock;
} else if (mode & FALLOC_FL_INSERT_RANGE) {
- unsigned int blksize_mask = i_blocksize(inode) - 1;
+ unsigned int blksize_mask = i_blocksize(inode) - 1;
+ loff_t isize = i_size_read(inode);
- new_size = i_size_read(inode) + len;
if (offset & blksize_mask || len & blksize_mask) {
error = -EINVAL;
goto out_unlock;
}
- /* check the new inode size does not wrap through zero */
- if (new_size > inode->i_sb->s_maxbytes) {
+ /*
+ * New inode size must not exceed ->s_maxbytes, accounting for
+ * possible signed overflow.
+ */
+ if (inode->i_sb->s_maxbytes - isize < len) {
error = -EFBIG;
goto out_unlock;
}
+ new_size = isize + len;
/* Offset should be less than i_size */
- if (offset >= i_size_read(inode)) {
+ if (offset >= isize) {
error = -EINVAL;
goto out_unlock;
}
@@ -876,8 +893,18 @@ xfs_file_dedupe_range(
struct file *dst_file,
u64 dst_loff)
{
+ struct inode *srci = file_inode(src_file);
+ u64 max_dedupe;
int error;
+ /*
+ * Since we have to read all these pages in to compare them, cut
+ * it off at MAX_RW_COUNT/2 rounded down to the nearest block.
+ * That means we won't do more than MAX_RW_COUNT IO per request.
+ */
+ max_dedupe = (MAX_RW_COUNT >> 1) & ~(i_blocksize(srci) - 1);
+ if (len > max_dedupe)
+ len = max_dedupe;
error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff,
len, true);
if (error)
@@ -993,7 +1020,7 @@ xfs_file_llseek(
* page_lock (MM)
* i_lock (XFS - extent map serialisation)
*/
-static int
+static vm_fault_t
__xfs_filemap_fault(
struct vm_fault *vmf,
enum page_entry_size pe_size,
@@ -1001,7 +1028,7 @@ __xfs_filemap_fault(
{
struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
- int ret;
+ vm_fault_t ret;
trace_xfs_filemap_fault(ip, pe_size, write_fault);
@@ -1030,7 +1057,7 @@ __xfs_filemap_fault(
return ret;
}
-static int
+static vm_fault_t
xfs_filemap_fault(
struct vm_fault *vmf)
{
@@ -1040,7 +1067,7 @@ xfs_filemap_fault(
(vmf->flags & FAULT_FLAG_WRITE));
}
-static int
+static vm_fault_t
xfs_filemap_huge_fault(
struct vm_fault *vmf,
enum page_entry_size pe_size)
@@ -1053,7 +1080,7 @@ xfs_filemap_huge_fault(
(vmf->flags & FAULT_FLAG_WRITE));
}
-static int
+static vm_fault_t
xfs_filemap_page_mkwrite(
struct vm_fault *vmf)
{
@@ -1065,7 +1092,7 @@ xfs_filemap_page_mkwrite(
* on write faults. In reality, it needs to serialise against truncate and
* prepare memory for writing so handle is as standard write fault.
*/
-static int
+static vm_fault_t
xfs_filemap_pfn_mkwrite(
struct vm_fault *vmf)
{
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 043ca3808ea2..3f8722e51dbe 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -34,7 +34,6 @@
struct xfs_fstrm_item {
struct xfs_mru_cache_elem mru;
- struct xfs_inode *ip;
xfs_agnumber_t ag; /* AG in use for this directory */
};
@@ -122,14 +121,15 @@ xfs_filestream_put_ag(
static void
xfs_fstrm_free_func(
+ void *data,
struct xfs_mru_cache_elem *mru)
{
+ struct xfs_mount *mp = data;
struct xfs_fstrm_item *item =
container_of(mru, struct xfs_fstrm_item, mru);
- xfs_filestream_put_ag(item->ip->i_mount, item->ag);
-
- trace_xfs_filestream_free(item->ip, item->ag);
+ xfs_filestream_put_ag(mp, item->ag);
+ trace_xfs_filestream_free(mp, mru->key, item->ag);
kmem_free(item);
}
@@ -165,7 +165,7 @@ xfs_filestream_pick_ag(
trylock = XFS_ALLOC_FLAG_TRYLOCK;
for (nscan = 0; 1; nscan++) {
- trace_xfs_filestream_scan(ip, ag);
+ trace_xfs_filestream_scan(mp, ip->i_ino, ag);
pag = xfs_perag_get(mp, ag);
@@ -198,7 +198,7 @@ xfs_filestream_pick_ag(
goto next_ag;
}
- longest = xfs_alloc_longest_free_extent(mp, pag,
+ longest = xfs_alloc_longest_free_extent(pag,
xfs_alloc_min_freelist(mp, pag),
xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
if (((minlen && longest >= minlen) ||
@@ -265,7 +265,6 @@ next_ag:
goto out_put_ag;
item->ag = *agp;
- item->ip = ip;
err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru);
if (err) {
@@ -333,7 +332,7 @@ xfs_filestream_lookup_ag(
ag = container_of(mru, struct xfs_fstrm_item, mru)->ag;
xfs_mru_cache_done(mp->m_filestream);
- trace_xfs_filestream_lookup(ip, ag);
+ trace_xfs_filestream_lookup(mp, ip->i_ino, ag);
goto out;
}
@@ -399,7 +398,7 @@ xfs_filestream_new_ag(
* Only free the item here so we skip over the old AG earlier.
*/
if (mru)
- xfs_fstrm_free_func(mru);
+ xfs_fstrm_free_func(mp, mru);
IRELE(pip);
exit:
@@ -426,8 +425,8 @@ xfs_filestream_mount(
* timer tunable to within about 10 percent. This requires at least 10
* groups.
*/
- return xfs_mru_cache_create(&mp->m_filestream, xfs_fstrm_centisecs * 10,
- 10, xfs_fstrm_free_func);
+ return xfs_mru_cache_create(&mp->m_filestream, mp,
+ xfs_fstrm_centisecs * 10, 10, xfs_fstrm_free_func);
}
void
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 43cfc07996a4..0299febece9c 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -465,10 +465,9 @@ xfs_getfsmap_rtdev_rtbitmap_helper(
struct xfs_rmap_irec irec;
xfs_daddr_t rec_daddr;
- rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock);
-
- irec.rm_startblock = rec->ar_startblock;
- irec.rm_blockcount = rec->ar_blockcount;
+ irec.rm_startblock = rec->ar_startext * mp->m_sb.sb_rextsize;
+ rec_daddr = XFS_FSB_TO_BB(mp, irec.rm_startblock);
+ irec.rm_blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize;
irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */
irec.rm_offset = 0;
irec.rm_flags = 0;
@@ -534,8 +533,11 @@ xfs_getfsmap_rtdev_rtbitmap_query(
xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED);
- alow.ar_startblock = info->low.rm_startblock;
- ahigh.ar_startblock = info->high.rm_startblock;
+ alow.ar_startext = info->low.rm_startblock;
+ ahigh.ar_startext = info->high.rm_startblock;
+ do_div(alow.ar_startext, tp->t_mountp->m_sb.sb_rextsize);
+ if (do_div(ahigh.ar_startext, tp->t_mountp->m_sb.sb_rextsize))
+ ahigh.ar_startext++;
error = xfs_rtalloc_query_range(tp, &alow, &ahigh,
xfs_getfsmap_rtdev_rtbitmap_helper, info);
if (error)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 523792768080..bc7ef18da243 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -24,85 +24,42 @@
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_error.h"
#include "xfs_btree.h"
-#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
-#include "xfs_rmap_btree.h"
-#include "xfs_ialloc.h"
#include "xfs_fsops.h"
-#include "xfs_itable.h"
#include "xfs_trans_space.h"
#include "xfs_rtalloc.h"
#include "xfs_trace.h"
#include "xfs_log.h"
-#include "xfs_filestream.h"
-#include "xfs_rmap.h"
+#include "xfs_ag.h"
#include "xfs_ag_resv.h"
/*
- * File system operations
+ * growfs operations
*/
-
-static struct xfs_buf *
-xfs_growfs_get_hdr_buf(
- struct xfs_mount *mp,
- xfs_daddr_t blkno,
- size_t numblks,
- int flags,
- const struct xfs_buf_ops *ops)
-{
- struct xfs_buf *bp;
-
- bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
- if (!bp)
- return NULL;
-
- xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
- bp->b_bn = blkno;
- bp->b_maps[0].bm_bn = blkno;
- bp->b_ops = ops;
-
- return bp;
-}
-
static int
xfs_growfs_data_private(
xfs_mount_t *mp, /* mount point for filesystem */
xfs_growfs_data_t *in) /* growfs data input struct */
{
- xfs_agf_t *agf;
- struct xfs_agfl *agfl;
- xfs_agi_t *agi;
- xfs_agnumber_t agno;
- xfs_extlen_t agsize;
- xfs_extlen_t tmpsize;
- xfs_alloc_rec_t *arec;
xfs_buf_t *bp;
- int bucket;
- int dpct;
- int error, saved_error = 0;
+ int error;
xfs_agnumber_t nagcount;
xfs_agnumber_t nagimax = 0;
xfs_rfsblock_t nb, nb_mod;
xfs_rfsblock_t new;
- xfs_rfsblock_t nfree;
xfs_agnumber_t oagcount;
- int pct;
xfs_trans_t *tp;
+ LIST_HEAD (buffer_list);
+ struct aghdr_init_data id = {};
nb = in->newblocks;
- pct = in->imaxpct;
- if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
+ if (nb < mp->m_sb.sb_dblocks)
return -EINVAL;
if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
return error;
- dpct = pct - mp->m_sb.sb_imax_pct;
error = xfs_buf_read_uncached(mp->m_ddev_targp,
XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
@@ -135,376 +92,45 @@ xfs_growfs_data_private(
return error;
/*
- * Write new AG headers to disk. Non-transactional, but written
- * synchronously so they are completed prior to the growfs transaction
- * being logged.
+ * Write new AG headers to disk. Non-transactional, but need to be
+ * written and completed prior to the growfs transaction being logged.
+ * To do this, we use a delayed write buffer list and wait for
+ * submission and IO completion of the list as a whole. This allows the
+ * IO subsystem to merge all the AG headers in a single AG into a single
+ * IO and hide most of the latency of the IO from us.
+ *
+ * This also means that if we get an error whilst building the buffer
+ * list to write, we can cancel the entire list without having written
+ * anything.
*/
- nfree = 0;
- for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
- __be32 *agfl_bno;
-
- /*
- * AG freespace header block
- */
- bp = xfs_growfs_get_hdr_buf(mp,
- XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0,
- &xfs_agf_buf_ops);
- if (!bp) {
- error = -ENOMEM;
- goto error0;
- }
-
- agf = XFS_BUF_TO_AGF(bp);
- agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
- agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
- agf->agf_seqno = cpu_to_be32(agno);
- if (agno == nagcount - 1)
- agsize =
- nb -
- (agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
+ INIT_LIST_HEAD(&id.buffer_list);
+ for (id.agno = nagcount - 1;
+ id.agno >= oagcount;
+ id.agno--, new -= id.agsize) {
+
+ if (id.agno == nagcount - 1)
+ id.agsize = nb -
+ (id.agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
else
- agsize = mp->m_sb.sb_agblocks;
- agf->agf_length = cpu_to_be32(agsize);
- agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
- agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
- agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
- agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
- if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
- agf->agf_roots[XFS_BTNUM_RMAPi] =
- cpu_to_be32(XFS_RMAP_BLOCK(mp));
- agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
- agf->agf_rmap_blocks = cpu_to_be32(1);
- }
-
- agf->agf_flfirst = cpu_to_be32(1);
- agf->agf_fllast = 0;
- agf->agf_flcount = 0;
- tmpsize = agsize - mp->m_ag_prealloc_blocks;
- agf->agf_freeblks = cpu_to_be32(tmpsize);
- agf->agf_longest = cpu_to_be32(tmpsize);
- if (xfs_sb_version_hascrc(&mp->m_sb))
- uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
- agf->agf_refcount_root = cpu_to_be32(
- xfs_refc_block(mp));
- agf->agf_refcount_level = cpu_to_be32(1);
- agf->agf_refcount_blocks = cpu_to_be32(1);
- }
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- goto error0;
-
- /*
- * AG freelist header block
- */
- bp = xfs_growfs_get_hdr_buf(mp,
- XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0,
- &xfs_agfl_buf_ops);
- if (!bp) {
- error = -ENOMEM;
- goto error0;
- }
-
- agfl = XFS_BUF_TO_AGFL(bp);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
- agfl->agfl_seqno = cpu_to_be32(agno);
- uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
- }
-
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
- for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++)
- agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- goto error0;
-
- /*
- * AG inode header block
- */
- bp = xfs_growfs_get_hdr_buf(mp,
- XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0,
- &xfs_agi_buf_ops);
- if (!bp) {
- error = -ENOMEM;
- goto error0;
- }
-
- agi = XFS_BUF_TO_AGI(bp);
- agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
- agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
- agi->agi_seqno = cpu_to_be32(agno);
- agi->agi_length = cpu_to_be32(agsize);
- agi->agi_count = 0;
- agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp));
- agi->agi_level = cpu_to_be32(1);
- agi->agi_freecount = 0;
- agi->agi_newino = cpu_to_be32(NULLAGINO);
- agi->agi_dirino = cpu_to_be32(NULLAGINO);
- if (xfs_sb_version_hascrc(&mp->m_sb))
- uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
- if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
- agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
- agi->agi_free_level = cpu_to_be32(1);
- }
- for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
- agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- goto error0;
+ id.agsize = mp->m_sb.sb_agblocks;
- /*
- * BNO btree root block
- */
- bp = xfs_growfs_get_hdr_buf(mp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0,
- &xfs_allocbt_buf_ops);
-
- if (!bp) {
- error = -ENOMEM;
- goto error0;
- }
-
- xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, agno, 0);
-
- arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
- arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
- arec->ar_blockcount = cpu_to_be32(
- agsize - be32_to_cpu(arec->ar_startblock));
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- goto error0;
-
- /*
- * CNT btree root block
- */
- bp = xfs_growfs_get_hdr_buf(mp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0,
- &xfs_allocbt_buf_ops);
- if (!bp) {
- error = -ENOMEM;
- goto error0;
- }
-
- xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, agno, 0);
-
- arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
- arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
- arec->ar_blockcount = cpu_to_be32(
- agsize - be32_to_cpu(arec->ar_startblock));
- nfree += be32_to_cpu(arec->ar_blockcount);
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- goto error0;
-
- /* RMAP btree root block */
- if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
- struct xfs_rmap_rec *rrec;
- struct xfs_btree_block *block;
-
- bp = xfs_growfs_get_hdr_buf(mp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_RMAP_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0,
- &xfs_rmapbt_buf_ops);
- if (!bp) {
- error = -ENOMEM;
- goto error0;
- }
-
- xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 0,
- agno, 0);
- block = XFS_BUF_TO_BLOCK(bp);
-
-
- /*
- * mark the AG header regions as static metadata The BNO
- * btree block is the first block after the headers, so
- * it's location defines the size of region the static
- * metadata consumes.
- *
- * Note: unlike mkfs, we never have to account for log
- * space when growing the data regions
- */
- rrec = XFS_RMAP_REC_ADDR(block, 1);
- rrec->rm_startblock = 0;
- rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp));
- rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS);
- rrec->rm_offset = 0;
- be16_add_cpu(&block->bb_numrecs, 1);
-
- /* account freespace btree root blocks */
- rrec = XFS_RMAP_REC_ADDR(block, 2);
- rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp));
- rrec->rm_blockcount = cpu_to_be32(2);
- rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
- rrec->rm_offset = 0;
- be16_add_cpu(&block->bb_numrecs, 1);
-
- /* account inode btree root blocks */
- rrec = XFS_RMAP_REC_ADDR(block, 3);
- rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp));
- rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) -
- XFS_IBT_BLOCK(mp));
- rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT);
- rrec->rm_offset = 0;
- be16_add_cpu(&block->bb_numrecs, 1);
-
- /* account for rmap btree root */
- rrec = XFS_RMAP_REC_ADDR(block, 4);
- rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp));
- rrec->rm_blockcount = cpu_to_be32(1);
- rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
- rrec->rm_offset = 0;
- be16_add_cpu(&block->bb_numrecs, 1);
-
- /* account for refc btree root */
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
- rrec = XFS_RMAP_REC_ADDR(block, 5);
- rrec->rm_startblock = cpu_to_be32(
- xfs_refc_block(mp));
- rrec->rm_blockcount = cpu_to_be32(1);
- rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC);
- rrec->rm_offset = 0;
- be16_add_cpu(&block->bb_numrecs, 1);
- }
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- goto error0;
- }
-
- /*
- * INO btree root block
- */
- bp = xfs_growfs_get_hdr_buf(mp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0,
- &xfs_inobt_buf_ops);
- if (!bp) {
- error = -ENOMEM;
- goto error0;
- }
-
- xfs_btree_init_block(mp, bp, XFS_BTNUM_INO , 0, 0, agno, 0);
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- goto error0;
-
- /*
- * FINO btree root block
- */
- if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
- bp = xfs_growfs_get_hdr_buf(mp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0,
- &xfs_inobt_buf_ops);
- if (!bp) {
- error = -ENOMEM;
- goto error0;
- }
-
- xfs_btree_init_block(mp, bp, XFS_BTNUM_FINO,
- 0, 0, agno, 0);
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- goto error0;
- }
-
- /*
- * refcount btree root block
- */
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
- bp = xfs_growfs_get_hdr_buf(mp,
- XFS_AGB_TO_DADDR(mp, agno, xfs_refc_block(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0,
- &xfs_refcountbt_buf_ops);
- if (!bp) {
- error = -ENOMEM;
- goto error0;
- }
-
- xfs_btree_init_block(mp, bp, XFS_BTNUM_REFC,
- 0, 0, agno, 0);
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- goto error0;
- }
- }
- xfs_trans_agblocks_delta(tp, nfree);
- /*
- * There are new blocks in the old last a.g.
- */
- if (new) {
- struct xfs_owner_info oinfo;
-
- /*
- * Change the agi length.
- */
- error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
- if (error) {
- goto error0;
- }
- ASSERT(bp);
- agi = XFS_BUF_TO_AGI(bp);
- be32_add_cpu(&agi->agi_length, new);
- ASSERT(nagcount == oagcount ||
- be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
- xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
- /*
- * Change agf length.
- */
- error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp);
+ error = xfs_ag_init_headers(mp, &id);
if (error) {
- goto error0;
+ xfs_buf_delwri_cancel(&id.buffer_list);
+ goto out_trans_cancel;
}
- ASSERT(bp);
- agf = XFS_BUF_TO_AGF(bp);
- be32_add_cpu(&agf->agf_length, new);
- ASSERT(be32_to_cpu(agf->agf_length) ==
- be32_to_cpu(agi->agi_length));
+ }
+ error = xfs_buf_delwri_submit(&id.buffer_list);
+ if (error)
+ goto out_trans_cancel;
- xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
+ xfs_trans_agblocks_delta(tp, id.nfree);
- /*
- * Free the new space.
- *
- * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that
- * this doesn't actually exist in the rmap btree.
- */
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL);
- error = xfs_rmap_free(tp, bp, agno,
- be32_to_cpu(agf->agf_length) - new,
- new, &oinfo);
- if (error)
- goto error0;
- error = xfs_free_extent(tp,
- XFS_AGB_TO_FSB(mp, agno,
- be32_to_cpu(agf->agf_length) - new),
- new, &oinfo, XFS_AG_RESV_NONE);
+ /* If there are new blocks in the old last AG, extend it. */
+ if (new) {
+ error = xfs_ag_extend_space(mp, tp, &id, new);
if (error)
- goto error0;
+ goto out_trans_cancel;
}
/*
@@ -517,10 +143,8 @@ xfs_growfs_data_private(
if (nb > mp->m_sb.sb_dblocks)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
nb - mp->m_sb.sb_dblocks);
- if (nfree)
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree);
- if (dpct)
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
+ if (id.nfree)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree);
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
if (error)
@@ -529,12 +153,6 @@ xfs_growfs_data_private(
/* New allocation groups fully initialized, so update mount struct */
if (nagimax)
mp->m_maxagi = nagimax;
- if (mp->m_sb.sb_imax_pct) {
- uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
- do_div(icount, 100);
- mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
- } else
- mp->m_maxicount = 0;
xfs_set_low_space_thresholds(mp);
mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
@@ -545,73 +163,24 @@ xfs_growfs_data_private(
if (new) {
struct xfs_perag *pag;
- pag = xfs_perag_get(mp, agno);
+ pag = xfs_perag_get(mp, id.agno);
error = xfs_ag_resv_free(pag);
xfs_perag_put(pag);
if (error)
- goto out;
+ return error;
}
- /* Reserve AG metadata blocks. */
+ /*
+ * Reserve AG metadata blocks. ENOSPC here does not mean there was a
+ * growfs failure, just that there still isn't space for new user data
+ * after the grow has been run.
+ */
error = xfs_fs_reserve_ag_blocks(mp);
- if (error && error != -ENOSPC)
- goto out;
-
- /* update secondary superblocks. */
- for (agno = 1; agno < nagcount; agno++) {
+ if (error == -ENOSPC)
error = 0;
- /*
- * new secondary superblocks need to be zeroed, not read from
- * disk as the contents of the new area we are growing into is
- * completely unknown.
- */
- if (agno < oagcount) {
- error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &bp,
- &xfs_sb_buf_ops);
- } else {
- bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
- XFS_FSS_TO_BB(mp, 1), 0);
- if (bp) {
- bp->b_ops = &xfs_sb_buf_ops;
- xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
- } else
- error = -ENOMEM;
- }
-
- /*
- * If we get an error reading or writing alternate superblocks,
- * continue. xfs_repair chooses the "best" superblock based
- * on most matches; if we break early, we'll leave more
- * superblocks un-updated than updated, and xfs_repair may
- * pick them over the properly-updated primary.
- */
- if (error) {
- xfs_warn(mp,
- "error %d reading secondary superblock for ag %d",
- error, agno);
- saved_error = error;
- continue;
- }
- xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error) {
- xfs_warn(mp,
- "write error %d updating secondary superblock for ag %d",
- error, agno);
- saved_error = error;
- continue;
- }
- }
-
- out:
- return saved_error ? saved_error : error;
+ return error;
- error0:
+out_trans_cancel:
xfs_trans_cancel(tp);
return error;
}
@@ -638,25 +207,71 @@ xfs_growfs_log_private(
return -ENOSYS;
}
+static int
+xfs_growfs_imaxpct(
+ struct xfs_mount *mp,
+ __u32 imaxpct)
+{
+ struct xfs_trans *tp;
+ int dpct;
+ int error;
+
+ if (imaxpct > 100)
+ return -EINVAL;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
+ XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ dpct = imaxpct - mp->m_sb.sb_imax_pct;
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
+ xfs_trans_set_sync(tp);
+ return xfs_trans_commit(tp);
+}
+
/*
* protected versions of growfs function acquire and release locks on the mount
* point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG,
* XFS_IOC_FSGROWFSRT
*/
-
-
int
xfs_growfs_data(
- xfs_mount_t *mp,
- xfs_growfs_data_t *in)
+ struct xfs_mount *mp,
+ struct xfs_growfs_data *in)
{
- int error;
+ int error = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!mutex_trylock(&mp->m_growlock))
return -EWOULDBLOCK;
- error = xfs_growfs_data_private(mp, in);
+
+ /* update imaxpct separately to the physical grow of the filesystem */
+ if (in->imaxpct != mp->m_sb.sb_imax_pct) {
+ error = xfs_growfs_imaxpct(mp, in->imaxpct);
+ if (error)
+ goto out_error;
+ }
+
+ if (in->newblocks != mp->m_sb.sb_dblocks) {
+ error = xfs_growfs_data_private(mp, in);
+ if (error)
+ goto out_error;
+ }
+
+ /* Post growfs calculations needed to reflect new state in operations */
+ if (mp->m_sb.sb_imax_pct) {
+ uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
+ do_div(icount, 100);
+ mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
+ } else
+ mp->m_maxicount = 0;
+
+ /* Update secondary superblocks now the physical grow has completed */
+ error = xfs_update_secondary_sbs(mp);
+
+out_error:
/*
* Increment the generation unconditionally, the error could be from
* updating the secondary superblocks, in which case the new size
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 3e1cc3001bcb..fdde17a2333c 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -47,6 +47,7 @@ xfs_param_t xfs_params = {
struct xfs_globals xfs_globals = {
.log_recovery_delay = 0, /* no delay by default */
+ .mount_delay = 0, /* no delay by default */
#ifdef XFS_ASSERT_FATAL
.bug_on_assert = true, /* assert failures BUG() */
#else
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 9a18f69f6e96..164350d91efc 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -107,7 +107,8 @@ xfs_inode_free_callback(
xfs_idestroy_fork(ip, XFS_COW_FORK);
if (ip->i_itemp) {
- ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
+ ASSERT(!test_bit(XFS_LI_IN_AIL,
+ &ip->i_itemp->ili_item.li_flags));
xfs_inode_item_destroy(ip);
ip->i_itemp = NULL;
}
@@ -309,6 +310,46 @@ xfs_reinit_inode(
}
/*
+ * If we are allocating a new inode, then check what was returned is
+ * actually a free, empty inode. If we are not allocating an inode,
+ * then check we didn't find a free inode.
+ *
+ * Returns:
+ * 0 if the inode free state matches the lookup context
+ * -ENOENT if the inode is free and we are not allocating
+ * -EFSCORRUPTED if there is any state mismatch at all
+ */
+static int
+xfs_iget_check_free_state(
+ struct xfs_inode *ip,
+ int flags)
+{
+ if (flags & XFS_IGET_CREATE) {
+ /* should be a free inode */
+ if (VFS_I(ip)->i_mode != 0) {
+ xfs_warn(ip->i_mount,
+"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
+ ip->i_ino, VFS_I(ip)->i_mode);
+ return -EFSCORRUPTED;
+ }
+
+ if (ip->i_d.di_nblocks != 0) {
+ xfs_warn(ip->i_mount,
+"Corruption detected! Free inode 0x%llx has blocks allocated!",
+ ip->i_ino);
+ return -EFSCORRUPTED;
+ }
+ return 0;
+ }
+
+ /* should be an allocated inode */
+ if (VFS_I(ip)->i_mode == 0)
+ return -ENOENT;
+
+ return 0;
+}
+
+/*
* Check the validity of the inode we just found it the cache
*/
static int
@@ -357,12 +398,12 @@ xfs_iget_cache_hit(
}
/*
- * If lookup is racing with unlink return an error immediately.
+ * Check the inode free state is valid. This also detects lookup
+ * racing with unlinks.
*/
- if (VFS_I(ip)->i_mode == 0 && !(flags & XFS_IGET_CREATE)) {
- error = -ENOENT;
+ error = xfs_iget_check_free_state(ip, flags);
+ if (error)
goto out_error;
- }
/*
* If IRECLAIMABLE is set, we've torn down the VFS inode already.
@@ -485,29 +526,12 @@ xfs_iget_cache_miss(
/*
- * If we are allocating a new inode, then check what was returned is
- * actually a free, empty inode. If we are not allocating an inode,
- * the check we didn't find a free inode.
+ * Check the inode free state is valid. This also detects lookup
+ * racing with unlinks.
*/
- if (flags & XFS_IGET_CREATE) {
- if (VFS_I(ip)->i_mode != 0) {
- xfs_warn(mp,
-"Corruption detected! Free inode 0x%llx not marked free on disk",
- ino);
- error = -EFSCORRUPTED;
- goto out_destroy;
- }
- if (ip->i_d.di_nblocks != 0) {
- xfs_warn(mp,
-"Corruption detected! Free inode 0x%llx has blocks allocated!",
- ino);
- error = -EFSCORRUPTED;
- goto out_destroy;
- }
- } else if (VFS_I(ip)->i_mode == 0) {
- error = -ENOENT;
+ error = xfs_iget_check_free_state(ip, flags);
+ if (error)
goto out_destroy;
- }
/*
* Preload the radix tree so we can insert safely under the
@@ -1802,3 +1826,21 @@ xfs_inode_clear_cowblocks_tag(
return __xfs_inode_clear_blocks_tag(ip,
trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG);
}
+
+/* Disable post-EOF and CoW block auto-reclamation. */
+void
+xfs_icache_disable_reclaim(
+ struct xfs_mount *mp)
+{
+ cancel_delayed_work_sync(&mp->m_eofblocks_work);
+ cancel_delayed_work_sync(&mp->m_cowblocks_work);
+}
+
+/* Enable post-EOF and CoW block auto-reclamation. */
+void
+xfs_icache_enable_reclaim(
+ struct xfs_mount *mp)
+{
+ xfs_queue_eofblocks(mp);
+ xfs_queue_cowblocks(mp);
+}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index d4a77588eca1..d69a0f5a6a73 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -131,4 +131,7 @@ xfs_fs_eofblocks_from_user(
int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_ino_t ino, bool *inuse);
+void xfs_icache_disable_reclaim(struct xfs_mount *mp);
+void xfs_icache_enable_reclaim(struct xfs_mount *mp);
+
#endif
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 865ad1373e5e..5da9599156ed 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -91,7 +91,7 @@ xfs_icreate_item_unlock(
{
struct xfs_icreate_item *icp = ICR_ITEM(lip);
- if (icp->ic_item.li_flags & XFS_LI_ABORTED)
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
kmem_zone_free(xfs_icreate_zone, icp);
return;
}
@@ -184,5 +184,5 @@ xfs_icreate_log(
xfs_trans_add_item(tp, &icp->ic_item);
tp->t_flags |= XFS_TRANS_DIRTY;
- icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &icp->ic_item.li_flags);
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3e3aab3888fa..05207a64dd53 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -498,7 +498,7 @@ again:
if (!try_lock) {
for (j = (i - 1); j >= 0 && !try_lock; j--) {
lp = (xfs_log_item_t *)ips[j]->i_itemp;
- if (lp && (lp->li_flags & XFS_LI_IN_AIL))
+ if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
try_lock++;
}
}
@@ -598,7 +598,7 @@ xfs_lock_two_inodes(
* and try again.
*/
lp = (xfs_log_item_t *)ip0->i_itemp;
- if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+ if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
xfs_iunlock(ip0, ip0_mode);
if ((++attempts % 5) == 0)
@@ -791,6 +791,18 @@ xfs_ialloc(
ASSERT(*ialloc_context == NULL);
/*
+ * Protect against obviously corrupt allocation btree records. Later
+ * xfs_iget checks will catch re-allocation of other active in-memory
+ * and on-disk inodes. If we don't catch reallocating the parent inode
+ * here we will deadlock in xfs_iget() so we have to do these checks
+ * first.
+ */
+ if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
+ xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
+ return -EFSCORRUPTED;
+ }
+
+ /*
* Get the in-core inode with the lock held exclusively.
* This is because we're setting fields here we need
* to prevent others from looking at until we're done.
@@ -972,10 +984,8 @@ xfs_dir_ialloc(
xfs_nlink_t nlink,
dev_t rdev,
prid_t prid, /* project id */
- xfs_inode_t **ipp, /* pointer to inode; it will be
+ xfs_inode_t **ipp) /* pointer to inode; it will be
locked. */
- int *committed)
-
{
xfs_trans_t *tp;
xfs_inode_t *ip;
@@ -1050,8 +1060,6 @@ xfs_dir_ialloc(
}
code = xfs_trans_roll(&tp);
- if (committed != NULL)
- *committed = 1;
/*
* Re-attach the quota info that we detached from prev trx.
@@ -1088,9 +1096,6 @@ xfs_dir_ialloc(
}
ASSERT(!ialloc_context && ip);
- } else {
- if (committed != NULL)
- *committed = 0;
}
*ipp = ip;
@@ -1203,6 +1208,7 @@ xfs_create(
unlock_dp_on_error = true;
xfs_defer_init(&dfops, &first_block);
+ tp->t_agfl_dfops = &dfops;
/*
* Reserve disk quota and the inode.
@@ -1217,8 +1223,7 @@ xfs_create(
* entry pointing to them, but a directory also the "." entry
* pointing to itself.
*/
- error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip,
- NULL);
+ error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip);
if (error)
goto out_trans_cancel;
@@ -1309,7 +1314,6 @@ xfs_create(
int
xfs_create_tmpfile(
struct xfs_inode *dp,
- struct dentry *dentry,
umode_t mode,
struct xfs_inode **ipp)
{
@@ -1351,7 +1355,7 @@ xfs_create_tmpfile(
if (error)
goto out_trans_cancel;
- error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip, NULL);
+ error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip);
if (error)
goto out_trans_cancel;
@@ -1420,11 +1424,11 @@ xfs_link(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- error = xfs_qm_dqattach(sip, 0);
+ error = xfs_qm_dqattach(sip);
if (error)
goto std_return;
- error = xfs_qm_dqattach(tdp, 0);
+ error = xfs_qm_dqattach(tdp);
if (error)
goto std_return;
@@ -1460,6 +1464,7 @@ xfs_link(
}
xfs_defer_init(&dfops, &first_block);
+ tp->t_agfl_dfops = &dfops;
/*
* Handle initial link state of O_TMPFILE inode
@@ -1543,11 +1548,12 @@ xfs_itruncate_clear_reflink_flags(
* dirty on error so that transactions can be easily aborted if possible.
*/
int
-xfs_itruncate_extents(
+xfs_itruncate_extents_flags(
struct xfs_trans **tpp,
struct xfs_inode *ip,
int whichfork,
- xfs_fsize_t new_size)
+ xfs_fsize_t new_size,
+ int flags)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp = *tpp;
@@ -1570,6 +1576,8 @@ xfs_itruncate_extents(
trace_xfs_itruncate_extents_start(ip, new_size);
+ flags |= xfs_bmapi_aflag(whichfork);
+
/*
* Since it is possible for space to become allocated beyond
* the end of the file (in a crash where the space is allocated
@@ -1588,12 +1596,9 @@ xfs_itruncate_extents(
unmap_len = last_block - first_unmap_block + 1;
while (!done) {
xfs_defer_init(&dfops, &first_block);
- error = xfs_bunmapi(tp, ip,
- first_unmap_block, unmap_len,
- xfs_bmapi_aflag(whichfork),
- XFS_ITRUNC_MAX_EXTENTS,
- &first_block, &dfops,
- &done);
+ error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags,
+ XFS_ITRUNC_MAX_EXTENTS, &first_block,
+ &dfops, &done);
if (error)
goto out_bmap_cancel;
@@ -1611,13 +1616,15 @@ xfs_itruncate_extents(
goto out;
}
- /* Remove all pending CoW reservations. */
- error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block,
- last_block, true);
- if (error)
- goto out;
+ if (whichfork == XFS_DATA_FORK) {
+ /* Remove all pending CoW reservations. */
+ error = xfs_reflink_cancel_cow_blocks(ip, &tp,
+ first_unmap_block, last_block, true);
+ if (error)
+ goto out;
- xfs_itruncate_clear_reflink_flags(ip);
+ xfs_itruncate_clear_reflink_flags(ip);
+ }
/*
* Always re-log the inode so that our permanent transaction can keep
@@ -1818,6 +1825,7 @@ xfs_inactive_ifree(
xfs_trans_ijoin(tp, ip, 0);
xfs_defer_init(&dfops, &first_block);
+ tp->t_agfl_dfops = &dfops;
error = xfs_ifree(tp, ip, &dfops);
if (error) {
/*
@@ -1918,7 +1926,7 @@ xfs_inactive(
ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
truncate = 1;
- error = xfs_qm_dqattach(ip, 0);
+ error = xfs_qm_dqattach(ip);
if (error)
return;
@@ -2581,11 +2589,11 @@ xfs_remove(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- error = xfs_qm_dqattach(dp, 0);
+ error = xfs_qm_dqattach(dp);
if (error)
goto std_return;
- error = xfs_qm_dqattach(ip, 0);
+ error = xfs_qm_dqattach(ip);
if (error)
goto std_return;
@@ -2654,6 +2662,7 @@ xfs_remove(
goto out_trans_cancel;
xfs_defer_init(&dfops, &first_block);
+ tp->t_agfl_dfops = &dfops;
error = xfs_dir_removename(tp, dp, name, ip->i_ino,
&first_block, &dfops, resblks);
if (error) {
@@ -2903,7 +2912,7 @@ xfs_rename_alloc_whiteout(
struct xfs_inode *tmpfile;
int error;
- error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile);
+ error = xfs_create_tmpfile(dp, S_IFCHR | WHITEOUT_MODE, &tmpfile);
if (error)
return error;
@@ -3021,6 +3030,7 @@ xfs_rename(
}
xfs_defer_init(&dfops, &first_block);
+ tp->t_agfl_dfops = &dfops;
/* RENAME_EXCHANGE is unique from here on. */
if (flags & RENAME_EXCHANGE)
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 132d8aa2afc4..00fee6824745 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -393,8 +393,8 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
struct xfs_inode **ipp, struct xfs_name *ci_name);
int xfs_create(struct xfs_inode *dp, struct xfs_name *name,
umode_t mode, dev_t rdev, struct xfs_inode **ipp);
-int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry,
- umode_t mode, struct xfs_inode **ipp);
+int xfs_create_tmpfile(struct xfs_inode *dp, umode_t mode,
+ struct xfs_inode **ipp);
int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
struct xfs_inode *ip);
int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
@@ -415,8 +415,8 @@ uint xfs_ilock_attr_map_shared(struct xfs_inode *);
uint xfs_ip2xflags(struct xfs_inode *);
int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
struct xfs_defer_ops *);
-int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
- int, xfs_fsize_t);
+int xfs_itruncate_extents_flags(struct xfs_trans **,
+ struct xfs_inode *, int, xfs_fsize_t, int);
void xfs_iext_realloc(xfs_inode_t *, int, int);
void xfs_iunpin_wait(xfs_inode_t *);
@@ -431,7 +431,17 @@ xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip);
int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
xfs_nlink_t, dev_t, prid_t,
- struct xfs_inode **, int *);
+ struct xfs_inode **);
+
+static inline int
+xfs_itruncate_extents(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fsize_t new_size)
+{
+ return xfs_itruncate_extents_flags(tpp, ip, whichfork, new_size, 0);
+}
/* from xfs_file.c */
enum xfs_prealloc_flags {
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 34b91b789702..3e5b8574818e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -518,7 +518,7 @@ xfs_inode_item_push(
* The buffer containing this item failed to be written back
* previously. Resubmit the buffer for IO.
*/
- if (lip->li_flags & XFS_LI_FAILED) {
+ if (test_bit(XFS_LI_FAILED, &lip->li_flags)) {
if (!xfs_buf_trylock(bp))
return XFS_ITEM_LOCKED;
@@ -729,14 +729,14 @@ xfs_iflush_done(
*/
iip = INODE_ITEM(blip);
if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ||
- (blip->li_flags & XFS_LI_FAILED))
+ test_bit(XFS_LI_FAILED, &blip->li_flags))
need_ail++;
}
/* make sure we capture the state of the initial inode. */
iip = INODE_ITEM(lip);
if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) ||
- lip->li_flags & XFS_LI_FAILED)
+ test_bit(XFS_LI_FAILED, &lip->li_flags))
need_ail++;
/*
@@ -803,7 +803,7 @@ xfs_iflush_abort(
xfs_inode_log_item_t *iip = ip->i_itemp;
if (iip) {
- if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
+ if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)) {
xfs_trans_ail_remove(&iip->ili_item,
stale ? SHUTDOWN_LOG_IO_ERROR :
SHUTDOWN_CORRUPT_INCORE);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 89fb1eb80aae..5dd9e22b4a4c 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1103,7 +1103,8 @@ xfs_ioctl_setattr_dax_invalidate(
if (fa->fsx_xflags & FS_XFLAG_DAX) {
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
return -EINVAL;
- if (bdev_dax_supported(sb, sb->s_blocksize) < 0)
+ if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
+ sb->s_blocksize))
return -EINVAL;
}
@@ -1811,6 +1812,88 @@ xfs_ioc_swapext(
return error;
}
+static int
+xfs_ioc_getlabel(
+ struct xfs_mount *mp,
+ char __user *user_label)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+ char label[XFSLABEL_MAX + 1];
+
+ /* Paranoia */
+ BUILD_BUG_ON(sizeof(sbp->sb_fname) > FSLABEL_MAX);
+
+ spin_lock(&mp->m_sb_lock);
+ strncpy(label, sbp->sb_fname, sizeof(sbp->sb_fname));
+ spin_unlock(&mp->m_sb_lock);
+
+ /* xfs on-disk label is 12 chars, be sure we send a null to user */
+ label[XFSLABEL_MAX] = '\0';
+ if (copy_to_user(user_label, label, sizeof(sbp->sb_fname)))
+ return -EFAULT;
+ return 0;
+}
+
+static int
+xfs_ioc_setlabel(
+ struct file *filp,
+ struct xfs_mount *mp,
+ char __user *newlabel)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+ char label[XFSLABEL_MAX + 1];
+ size_t len;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ /*
+ * The generic ioctl allows up to FSLABEL_MAX chars, but XFS is much
+ * smaller, at 12 bytes. We copy one more to be sure we find the
+ * (required) NULL character to test the incoming label length.
+ * NB: The on disk label doesn't need to be null terminated.
+ */
+ if (copy_from_user(label, newlabel, XFSLABEL_MAX + 1))
+ return -EFAULT;
+ len = strnlen(label, XFSLABEL_MAX + 1);
+ if (len > sizeof(sbp->sb_fname))
+ return -EINVAL;
+
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+
+ spin_lock(&mp->m_sb_lock);
+ memset(sbp->sb_fname, 0, sizeof(sbp->sb_fname));
+ strncpy(sbp->sb_fname, label, sizeof(sbp->sb_fname));
+ spin_unlock(&mp->m_sb_lock);
+
+ /*
+ * Now we do several things to satisfy userspace.
+ * In addition to normal logging of the primary superblock, we also
+ * immediately write these changes to sector zero for the primary, then
+ * update all backup supers (as xfs_db does for a label change), then
+ * invalidate the block device page cache. This is so that any prior
+ * buffered reads from userspace (i.e. from blkid) are invalidated,
+ * and userspace will see the newly-written label.
+ */
+ error = xfs_sync_sb_buf(mp);
+ if (error)
+ goto out;
+ /*
+ * growfs also updates backup supers so lock against that.
+ */
+ mutex_lock(&mp->m_growlock);
+ error = xfs_update_secondary_sbs(mp);
+ mutex_unlock(&mp->m_growlock);
+
+ invalidate_bdev(mp->m_ddev_targp->bt_bdev);
+
+out:
+ mnt_drop_write_file(filp);
+ return error;
+}
+
/*
* Note: some of the ioctl's return positive numbers as a
* byte count indicating success, such as readlink_by_handle.
@@ -1834,6 +1917,10 @@ xfs_file_ioctl(
switch (cmd) {
case FITRIM:
return xfs_ioc_trim(mp, arg);
+ case FS_IOC_GETFSLABEL:
+ return xfs_ioc_getlabel(mp, arg);
+ case FS_IOC_SETFSLABEL:
+ return xfs_ioc_setlabel(filp, mp, arg);
case XFS_IOC_ALLOCSP:
case XFS_IOC_FREESP:
case XFS_IOC_RESVSP:
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 046469fcc1b8..c6ce6f9335b6 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -224,7 +224,7 @@ xfs_iomap_write_direct(
* necessary and move on to transaction setup.
*/
xfs_iunlock(ip, lockmode);
- error = xfs_qm_dqattach(ip, 0);
+ error = xfs_qm_dqattach(ip);
if (error)
return error;
@@ -576,7 +576,7 @@ xfs_file_iomap_begin_delay(
goto done;
}
- error = xfs_qm_dqattach_locked(ip, 0);
+ error = xfs_qm_dqattach_locked(ip, false);
if (error)
goto out_unlock;
@@ -692,7 +692,7 @@ xfs_iomap_write_allocate(
/*
* Make sure that the dquots are there.
*/
- error = xfs_qm_dqattach(ip, 0);
+ error = xfs_qm_dqattach(ip);
if (error)
return error;
@@ -946,8 +946,11 @@ error_on_bmapi_transaction:
return error;
}
-static inline bool imap_needs_alloc(struct inode *inode,
- struct xfs_bmbt_irec *imap, int nimaps)
+static inline bool
+imap_needs_alloc(
+ struct inode *inode,
+ struct xfs_bmbt_irec *imap,
+ int nimaps)
{
return !nimaps ||
imap->br_startblock == HOLESTARTBLOCK ||
@@ -955,31 +958,58 @@ static inline bool imap_needs_alloc(struct inode *inode,
(IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN);
}
-static inline bool needs_cow_for_zeroing(struct xfs_bmbt_irec *imap, int nimaps)
+static inline bool
+needs_cow_for_zeroing(
+ struct xfs_bmbt_irec *imap,
+ int nimaps)
{
return nimaps &&
imap->br_startblock != HOLESTARTBLOCK &&
imap->br_state != XFS_EXT_UNWRITTEN;
}
-static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags)
+static int
+xfs_ilock_for_iomap(
+ struct xfs_inode *ip,
+ unsigned flags,
+ unsigned *lockmode)
{
+ unsigned mode = XFS_ILOCK_SHARED;
+
/*
* COW writes may allocate delalloc space or convert unwritten COW
* extents, so we need to make sure to take the lock exclusively here.
*/
- if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO)))
- return true;
+ if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) {
+ /*
+ * FIXME: It could still overwrite on unshared extents and not
+ * need allocation.
+ */
+ if (flags & IOMAP_NOWAIT)
+ return -EAGAIN;
+ mode = XFS_ILOCK_EXCL;
+ }
/*
- * Extents not yet cached requires exclusive access, don't block.
- * This is an opencoded xfs_ilock_data_map_shared() to cater for the
+ * Extents not yet cached requires exclusive access, don't block. This
+ * is an opencoded xfs_ilock_data_map_shared() call but with
* non-blocking behaviour.
*/
- if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
- !(ip->i_df.if_flags & XFS_IFEXTENTS))
- return true;
- return false;
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+ if (flags & IOMAP_NOWAIT)
+ return -EAGAIN;
+ mode = XFS_ILOCK_EXCL;
+ }
+
+ if (flags & IOMAP_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, mode))
+ return -EAGAIN;
+ } else {
+ xfs_ilock(ip, mode);
+ }
+
+ *lockmode = mode;
+ return 0;
}
static int
@@ -1007,19 +1037,15 @@ xfs_file_iomap_begin(
return xfs_file_iomap_begin_delay(inode, offset, length, iomap);
}
- if (need_excl_ilock(ip, flags))
- lockmode = XFS_ILOCK_EXCL;
- else
- lockmode = XFS_ILOCK_SHARED;
-
- if (flags & IOMAP_NOWAIT) {
- if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
- return -EAGAIN;
- if (!xfs_ilock_nowait(ip, lockmode))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, lockmode);
- }
+ /*
+ * Lock the inode in the manner required for the specified operation and
+ * check for as many conditions that would result in blocking as
+ * possible. This removes most of the non-blocking checks from the
+ * mapping code below.
+ */
+ error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ if (error)
+ return error;
ASSERT(offset <= mp->m_super->s_maxbytes);
if (offset > mp->m_super->s_maxbytes - length)
@@ -1040,19 +1066,21 @@ xfs_file_iomap_begin(
goto out_unlock;
}
- if (xfs_is_reflink_inode(ip) &&
- ((flags & IOMAP_WRITE) ||
- ((flags & IOMAP_ZERO) && needs_cow_for_zeroing(&imap, nimaps)))) {
+ /* Non-modifying mapping requested, so we are done */
+ if (!(flags & (IOMAP_WRITE | IOMAP_ZERO)))
+ goto out_found;
+
+ /*
+ * Break shared extents if necessary. Checks for non-blocking IO have
+ * been done up front, so we don't need to do them here.
+ */
+ if (xfs_is_reflink_inode(ip)) {
+ /* if zeroing doesn't need COW allocation, then we are done. */
+ if ((flags & IOMAP_ZERO) &&
+ !needs_cow_for_zeroing(&imap, nimaps))
+ goto out_found;
+
if (flags & IOMAP_DIRECT) {
- /*
- * A reflinked inode will result in CoW alloc.
- * FIXME: It could still overwrite on unshared extents
- * and not need allocation.
- */
- if (flags & IOMAP_NOWAIT) {
- error = -EAGAIN;
- goto out_unlock;
- }
/* may drop and re-acquire the ilock */
error = xfs_reflink_allocate_cow(ip, &imap, &shared,
&lockmode);
@@ -1068,46 +1096,45 @@ xfs_file_iomap_begin(
length = XFS_FSB_TO_B(mp, end_fsb) - offset;
}
- if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) {
- /*
- * If nowait is set bail since we are going to make
- * allocations.
- */
- if (flags & IOMAP_NOWAIT) {
- error = -EAGAIN;
- goto out_unlock;
- }
- /*
- * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
- * pages to keep the chunks of work done where somewhat symmetric
- * with the work writeback does. This is a completely arbitrary
- * number pulled out of thin air as a best guess for initial
- * testing.
- *
- * Note that the values needs to be less than 32-bits wide until
- * the lower level functions are updated.
- */
- length = min_t(loff_t, length, 1024 * PAGE_SIZE);
- /*
- * xfs_iomap_write_direct() expects the shared lock. It
- * is unlocked on return.
- */
- if (lockmode == XFS_ILOCK_EXCL)
- xfs_ilock_demote(ip, lockmode);
- error = xfs_iomap_write_direct(ip, offset, length, &imap,
- nimaps);
- if (error)
- return error;
+ /* Don't need to allocate over holes when doing zeroing operations. */
+ if (flags & IOMAP_ZERO)
+ goto out_found;
- iomap->flags = IOMAP_F_NEW;
- trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
- } else {
- ASSERT(nimaps);
+ if (!imap_needs_alloc(inode, &imap, nimaps))
+ goto out_found;
- xfs_iunlock(ip, lockmode);
- trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+ /* If nowait is set bail since we are going to make allocations. */
+ if (flags & IOMAP_NOWAIT) {
+ error = -EAGAIN;
+ goto out_unlock;
}
+ /*
+ * We cap the maximum length we map to a sane size to keep the chunks
+ * of work done where somewhat symmetric with the work writeback does.
+ * This is a completely arbitrary number pulled out of thin air as a
+ * best guess for initial testing.
+ *
+ * Note that the values needs to be less than 32-bits wide until the
+ * lower level functions are updated.
+ */
+ length = min_t(loff_t, length, 1024 * PAGE_SIZE);
+
+ /*
+ * xfs_iomap_write_direct() expects the shared lock. It is unlocked on
+ * return.
+ */
+ if (lockmode == XFS_ILOCK_EXCL)
+ xfs_ilock_demote(ip, lockmode);
+ error = xfs_iomap_write_direct(ip, offset, length, &imap,
+ nimaps);
+ if (error)
+ return error;
+
+ iomap->flags = IOMAP_F_NEW;
+ trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+
+out_finish:
if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
& ~XFS_ILOG_TIMESTAMP))
iomap->flags |= IOMAP_F_DIRTY;
@@ -1117,6 +1144,13 @@ xfs_file_iomap_begin(
if (shared)
iomap->flags |= IOMAP_F_SHARED;
return 0;
+
+out_found:
+ ASSERT(nimaps);
+ xfs_iunlock(ip, lockmode);
+ trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+ goto out_finish;
+
out_unlock:
xfs_iunlock(ip, lockmode);
return error;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index e0307fbff911..b0eb49bb4918 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -177,7 +177,7 @@ xfs_generic_create(
if (!tmpfile) {
error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
} else {
- error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip);
+ error = xfs_create_tmpfile(XFS_I(dir), mode, &ip);
}
if (unlikely(error))
goto out_free_acl;
@@ -260,6 +260,7 @@ xfs_vn_lookup(
struct dentry *dentry,
unsigned int flags)
{
+ struct inode *inode;
struct xfs_inode *cip;
struct xfs_name name;
int error;
@@ -269,14 +270,13 @@ xfs_vn_lookup(
xfs_dentry_to_name(&name, dentry);
error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
- if (unlikely(error)) {
- if (unlikely(error != -ENOENT))
- return ERR_PTR(error);
- d_add(dentry, NULL);
- return NULL;
- }
-
- return d_splice_alias(VFS_I(cip), dentry);
+ if (likely(!error))
+ inode = VFS_I(cip);
+ else if (likely(error == -ENOENT))
+ inode = NULL;
+ else
+ inode = ERR_PTR(error);
+ return d_splice_alias(inode, dentry);
}
STATIC struct dentry *
@@ -855,7 +855,7 @@ xfs_setattr_size(
/*
* Make sure that the dquots are attached to the inode.
*/
- error = xfs_qm_dqattach(ip, 0);
+ error = xfs_qm_dqattach(ip);
if (error)
return error;
@@ -1195,6 +1195,30 @@ static const struct inode_operations xfs_inline_symlink_inode_operations = {
.update_time = xfs_vn_update_time,
};
+/* Figure out if this file actually supports DAX. */
+static bool
+xfs_inode_supports_dax(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ /* Only supported on non-reflinked files. */
+ if (!S_ISREG(VFS_I(ip)->i_mode) || xfs_is_reflink_inode(ip))
+ return false;
+
+ /* DAX mount option or DAX iflag must be set. */
+ if (!(mp->m_flags & XFS_MOUNT_DAX) &&
+ !(ip->i_d.di_flags2 & XFS_DIFLAG2_DAX))
+ return false;
+
+ /* Block size must match page size */
+ if (mp->m_sb.sb_blocksize != PAGE_SIZE)
+ return false;
+
+ /* Device has to support DAX too. */
+ return xfs_find_daxdev_for_inode(VFS_I(ip)) != NULL;
+}
+
STATIC void
xfs_diflags_to_iflags(
struct inode *inode,
@@ -1213,11 +1237,7 @@ xfs_diflags_to_iflags(
inode->i_flags |= S_SYNC;
if (flags & XFS_DIFLAG_NOATIME)
inode->i_flags |= S_NOATIME;
- if (S_ISREG(inode->i_mode) &&
- ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE &&
- !xfs_is_reflink_inode(ip) &&
- (ip->i_mount->m_flags & XFS_MOUNT_DAX ||
- ip->i_d.di_flags2 & XFS_DIFLAG2_DAX))
+ if (xfs_inode_supports_dax(ip))
inode->i_flags |= S_DAX;
}
@@ -1285,7 +1305,10 @@ xfs_setup_iops(
case S_IFREG:
inode->i_op = &xfs_inode_operations;
inode->i_fop = &xfs_file_operations;
- inode->i_mapping->a_ops = &xfs_address_space_operations;
+ if (IS_DAX(inode))
+ inode->i_mapping->a_ops = &xfs_dax_aops;
+ else
+ inode->i_mapping->a_ops = &xfs_address_space_operations;
break;
case S_IFDIR:
if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b9c9c848146b..c21039f27e39 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -560,7 +560,6 @@ xfs_log_done(
*/
int
xfs_log_notify(
- struct xfs_mount *mp,
struct xlog_in_core *iclog,
xfs_log_callback_t *cb)
{
@@ -1048,6 +1047,7 @@ xfs_log_item_init(
INIT_LIST_HEAD(&item->li_ail);
INIT_LIST_HEAD(&item->li_cil);
INIT_LIST_HEAD(&item->li_bio_list);
+ INIT_LIST_HEAD(&item->li_trans);
}
/*
@@ -2111,10 +2111,10 @@ xlog_print_tic_res(
*/
void
xlog_print_trans(
- struct xfs_trans *tp)
+ struct xfs_trans *tp)
{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_log_item_desc *lidp;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_log_item *lip;
/* dump core transaction and ticket info */
xfs_warn(mp, "transaction summary:");
@@ -2125,15 +2125,14 @@ xlog_print_trans(
xlog_print_tic_res(mp, tp->t_ticket);
/* dump each log item */
- list_for_each_entry(lidp, &tp->t_items, lid_trans) {
- struct xfs_log_item *lip = lidp->lid_item;
+ list_for_each_entry(lip, &tp->t_items, li_trans) {
struct xfs_log_vec *lv = lip->li_lv;
struct xfs_log_iovec *vec;
int i;
xfs_warn(mp, "log item: ");
xfs_warn(mp, " type = 0x%x", lip->li_type);
- xfs_warn(mp, " flags = 0x%x", lip->li_flags);
+ xfs_warn(mp, " flags = 0x%lx", lip->li_flags);
if (!lv)
continue;
xfs_warn(mp, " niovecs = %d", lv->lv_niovecs);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 7e2d62922a16..fa8ad31d587f 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -141,8 +141,7 @@ int xfs_log_mount_cancel(struct xfs_mount *);
xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
void xfs_log_space_wake(struct xfs_mount *mp);
-int xfs_log_notify(struct xfs_mount *mp,
- struct xlog_in_core *iclog,
+int xfs_log_notify(struct xlog_in_core *iclog,
struct xfs_log_callback *callback_entry);
int xfs_log_release_iclog(struct xfs_mount *mp,
struct xlog_in_core *iclog);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index cb376ac8a595..c15687724728 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -141,10 +141,9 @@ xlog_cil_alloc_shadow_bufs(
struct xlog *log,
struct xfs_trans *tp)
{
- struct xfs_log_item_desc *lidp;
+ struct xfs_log_item *lip;
- list_for_each_entry(lidp, &tp->t_items, lid_trans) {
- struct xfs_log_item *lip = lidp->lid_item;
+ list_for_each_entry(lip, &tp->t_items, li_trans) {
struct xfs_log_vec *lv;
int niovecs = 0;
int nbytes = 0;
@@ -152,7 +151,7 @@ xlog_cil_alloc_shadow_bufs(
bool ordered = false;
/* Skip items which aren't dirty in this transaction. */
- if (!(lidp->lid_flags & XFS_LID_DIRTY))
+ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
continue;
/* get number of vecs and size of data to be stored */
@@ -317,7 +316,7 @@ xlog_cil_insert_format_items(
int *diff_len,
int *diff_iovecs)
{
- struct xfs_log_item_desc *lidp;
+ struct xfs_log_item *lip;
/* Bail out if we didn't find a log item. */
@@ -326,15 +325,14 @@ xlog_cil_insert_format_items(
return;
}
- list_for_each_entry(lidp, &tp->t_items, lid_trans) {
- struct xfs_log_item *lip = lidp->lid_item;
+ list_for_each_entry(lip, &tp->t_items, li_trans) {
struct xfs_log_vec *lv;
struct xfs_log_vec *old_lv = NULL;
struct xfs_log_vec *shadow;
bool ordered = false;
/* Skip items which aren't dirty in this transaction. */
- if (!(lidp->lid_flags & XFS_LID_DIRTY))
+ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
continue;
/*
@@ -406,7 +404,7 @@ xlog_cil_insert_items(
{
struct xfs_cil *cil = log->l_cilp;
struct xfs_cil_ctx *ctx = cil->xc_ctx;
- struct xfs_log_item_desc *lidp;
+ struct xfs_log_item *lip;
int len = 0;
int diff_iovecs = 0;
int iclog_space;
@@ -479,11 +477,10 @@ xlog_cil_insert_items(
* We do this here so we only need to take the CIL lock once during
* the transaction commit.
*/
- list_for_each_entry(lidp, &tp->t_items, lid_trans) {
- struct xfs_log_item *lip = lidp->lid_item;
+ list_for_each_entry(lip, &tp->t_items, li_trans) {
/* Skip items which aren't dirty in this transaction. */
- if (!(lidp->lid_flags & XFS_LID_DIRTY))
+ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
continue;
/*
@@ -848,7 +845,7 @@ restart:
/* attach all the transactions w/ busy extents to iclog */
ctx->log_cb.cb_func = xlog_cil_committed;
ctx->log_cb.cb_arg = ctx;
- error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
+ error = xfs_log_notify(commit_iclog, &ctx->log_cb);
if (error)
goto out_abort;
@@ -1013,6 +1010,7 @@ xfs_log_commit_cil(
*commit_lsn = xc_commit_lsn;
xfs_log_done(mp, tp->t_ticket, NULL, regrant);
+ tp->t_ticket = NULL;
xfs_trans_unreserve_and_mod_sb(tp);
/*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 2b2383f1895e..06a09cb948b5 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2702,7 +2702,7 @@ xlog_recover_do_reg_buffer(
goto next;
}
fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr,
- -1, 0, 0);
+ -1, 0);
if (fa) {
xfs_alert(mp,
"dquot corrupt at %pS trying to replay into block 0x%llx",
@@ -3348,7 +3348,7 @@ xlog_recover_dquot_pass2(
*/
dq_f = item->ri_buf[0].i_addr;
ASSERT(dq_f);
- fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0, 0);
+ fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0);
if (fa) {
xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS",
dq_f->qlf_id, fa);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index a901b86772f8..73ed8fec0328 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1072,9 +1072,7 @@ xfs_unmountfs(
uint64_t resblks;
int error;
- cancel_delayed_work_sync(&mp->m_eofblocks_work);
- cancel_delayed_work_sync(&mp->m_cowblocks_work);
-
+ xfs_icache_disable_reclaim(mp);
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
xfs_rtunmount_inodes(mp);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index f8a674d7f092..70eea7ae2876 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -112,6 +112,7 @@ struct xfs_mru_cache {
xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */
struct delayed_work work; /* Workqueue data for reaping. */
unsigned int queued; /* work has been queued */
+ void *data;
};
static struct workqueue_struct *xfs_mru_reap_wq;
@@ -259,7 +260,7 @@ _xfs_mru_cache_clear_reap_list(
list_for_each_entry_safe(elem, next, &tmp, list_node) {
list_del_init(&elem->list_node);
- mru->free_func(elem);
+ mru->free_func(mru->data, elem);
}
spin_lock(&mru->lock);
@@ -326,6 +327,7 @@ xfs_mru_cache_uninit(void)
int
xfs_mru_cache_create(
struct xfs_mru_cache **mrup,
+ void *data,
unsigned int lifetime_ms,
unsigned int grp_count,
xfs_mru_cache_free_func_t free_func)
@@ -369,7 +371,7 @@ xfs_mru_cache_create(
mru->grp_time = grp_time;
mru->free_func = free_func;
-
+ mru->data = data;
*mrup = mru;
exit:
@@ -492,7 +494,7 @@ xfs_mru_cache_delete(
elem = xfs_mru_cache_remove(mru, key);
if (elem)
- mru->free_func(elem);
+ mru->free_func(mru->data, elem);
}
/*
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index fb5245ba5ff7..b3f3fbdfcc47 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -26,13 +26,13 @@ struct xfs_mru_cache_elem {
};
/* Function pointer type for callback to free a client's data pointer. */
-typedef void (*xfs_mru_cache_free_func_t)(struct xfs_mru_cache_elem *elem);
+typedef void (*xfs_mru_cache_free_func_t)(void *, struct xfs_mru_cache_elem *);
int xfs_mru_cache_init(void);
void xfs_mru_cache_uninit(void);
-int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms,
- unsigned int grp_count,
- xfs_mru_cache_free_func_t free_func);
+int xfs_mru_cache_create(struct xfs_mru_cache **mrup, void *data,
+ unsigned int lifetime_ms, unsigned int grp_count,
+ xfs_mru_cache_free_func_t free_func);
void xfs_mru_cache_destroy(struct xfs_mru_cache *mru);
int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
struct xfs_mru_cache_elem *elem);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5b848f4b637f..c3e014bfc848 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -161,10 +161,7 @@ xfs_qm_dqpurge(
* to purge this dquot anyway, so we go ahead regardless.
*/
error = xfs_qm_dqflush(dqp, &bp);
- if (error) {
- xfs_warn(mp, "%s: dquot "PTR_FMT" flush failed",
- __func__, dqp);
- } else {
+ if (!error) {
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
}
@@ -173,7 +170,7 @@ xfs_qm_dqpurge(
ASSERT(atomic_read(&dqp->q_pincount) == 0);
ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
- !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
+ !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags));
xfs_dqfunlock(dqp);
xfs_dqunlock(dqp);
@@ -265,7 +262,7 @@ xfs_qm_dqattach_one(
xfs_inode_t *ip,
xfs_dqid_t id,
uint type,
- uint doalloc,
+ bool doalloc,
xfs_dquot_t **IO_idqpp)
{
xfs_dquot_t *dqp;
@@ -291,7 +288,7 @@ xfs_qm_dqattach_one(
* exist on disk and we didn't ask it to allocate; ESRCH if quotas got
* turned off suddenly.
*/
- error = xfs_qm_dqget(ip->i_mount, ip, id, type, doalloc, &dqp);
+ error = xfs_qm_dqget_inode(ip, type, doalloc, &dqp);
if (error)
return error;
@@ -326,14 +323,14 @@ xfs_qm_need_dqattach(
/*
* Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
* into account.
- * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
+ * If @doalloc is true, the dquot(s) will be allocated if needed.
* Inode may get unlocked and relocked in here, and the caller must deal with
* the consequences.
*/
int
xfs_qm_dqattach_locked(
xfs_inode_t *ip,
- uint flags)
+ bool doalloc)
{
xfs_mount_t *mp = ip->i_mount;
int error = 0;
@@ -345,8 +342,7 @@ xfs_qm_dqattach_locked(
if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
- flags & XFS_QMOPT_DQALLOC,
- &ip->i_udquot);
+ doalloc, &ip->i_udquot);
if (error)
goto done;
ASSERT(ip->i_udquot);
@@ -354,8 +350,7 @@ xfs_qm_dqattach_locked(
if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) {
error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
- flags & XFS_QMOPT_DQALLOC,
- &ip->i_gdquot);
+ doalloc, &ip->i_gdquot);
if (error)
goto done;
ASSERT(ip->i_gdquot);
@@ -363,8 +358,7 @@ xfs_qm_dqattach_locked(
if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) {
error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
- flags & XFS_QMOPT_DQALLOC,
- &ip->i_pdquot);
+ doalloc, &ip->i_pdquot);
if (error)
goto done;
ASSERT(ip->i_pdquot);
@@ -381,8 +375,7 @@ done:
int
xfs_qm_dqattach(
- struct xfs_inode *ip,
- uint flags)
+ struct xfs_inode *ip)
{
int error;
@@ -390,7 +383,7 @@ xfs_qm_dqattach(
return 0;
xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_qm_dqattach_locked(ip, flags);
+ error = xfs_qm_dqattach_locked(ip, false);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
@@ -479,11 +472,8 @@ xfs_qm_dquot_isolate(
spin_unlock(lru_lock);
error = xfs_qm_dqflush(dqp, &bp);
- if (error) {
- xfs_warn(dqp->q_mount, "%s: dquot "PTR_FMT" flush failed",
- __func__, dqp);
+ if (error)
goto out_unlock_dirty;
- }
xfs_buf_delwri_queue(bp, &isol->buffers);
xfs_buf_relse(bp);
@@ -571,27 +561,88 @@ xfs_qm_set_defquota(
{
xfs_dquot_t *dqp;
struct xfs_def_quota *defq;
+ struct xfs_disk_dquot *ddqp;
int error;
- error = xfs_qm_dqread(mp, 0, type, 0, &dqp);
+ error = xfs_qm_dqget_uncached(mp, 0, type, &dqp);
+ if (error)
+ return;
- if (!error) {
- xfs_disk_dquot_t *ddqp = &dqp->q_core;
+ ddqp = &dqp->q_core;
+ defq = xfs_get_defquota(dqp, qinf);
- defq = xfs_get_defquota(dqp, qinf);
+ /*
+ * Timers and warnings have been already set, let's just set the
+ * default limits for this quota type
+ */
+ defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
+ defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
+ defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
+ defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
+ defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
+ defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
+ xfs_qm_dqdestroy(dqp);
+}
- /*
- * Timers and warnings have been already set, let's just set the
- * default limits for this quota type
- */
- defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
- defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
- defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
- defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
- defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
- defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
- xfs_qm_dqdestroy(dqp);
- }
+/* Initialize quota time limits from the root dquot. */
+static void
+xfs_qm_init_timelimits(
+ struct xfs_mount *mp,
+ struct xfs_quotainfo *qinf)
+{
+ struct xfs_disk_dquot *ddqp;
+ struct xfs_dquot *dqp;
+ uint type;
+ int error;
+
+ qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
+ qinf->qi_itimelimit = XFS_QM_ITIMELIMIT;
+ qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT;
+ qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT;
+ qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT;
+ qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
+
+ /*
+ * We try to get the limits from the superuser's limits fields.
+ * This is quite hacky, but it is standard quota practice.
+ *
+ * Since we may not have done a quotacheck by this point, just read
+ * the dquot without attaching it to any hashtables or lists.
+ *
+ * Timers and warnings are globally set by the first timer found in
+ * user/group/proj quota types, otherwise a default value is used.
+ * This should be split into different fields per quota type.
+ */
+ if (XFS_IS_UQUOTA_RUNNING(mp))
+ type = XFS_DQ_USER;
+ else if (XFS_IS_GQUOTA_RUNNING(mp))
+ type = XFS_DQ_GROUP;
+ else
+ type = XFS_DQ_PROJ;
+ error = xfs_qm_dqget_uncached(mp, 0, type, &dqp);
+ if (error)
+ return;
+
+ ddqp = &dqp->q_core;
+ /*
+ * The warnings and timers set the grace period given to
+ * a user or group before he or she can not perform any
+ * more writing. If it is zero, a default is used.
+ */
+ if (ddqp->d_btimer)
+ qinf->qi_btimelimit = be32_to_cpu(ddqp->d_btimer);
+ if (ddqp->d_itimer)
+ qinf->qi_itimelimit = be32_to_cpu(ddqp->d_itimer);
+ if (ddqp->d_rtbtimer)
+ qinf->qi_rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer);
+ if (ddqp->d_bwarns)
+ qinf->qi_bwarnlimit = be16_to_cpu(ddqp->d_bwarns);
+ if (ddqp->d_iwarns)
+ qinf->qi_iwarnlimit = be16_to_cpu(ddqp->d_iwarns);
+ if (ddqp->d_rtbwarns)
+ qinf->qi_rtbwarnlimit = be16_to_cpu(ddqp->d_rtbwarns);
+
+ xfs_qm_dqdestroy(dqp);
}
/*
@@ -600,11 +651,10 @@ xfs_qm_set_defquota(
*/
STATIC int
xfs_qm_init_quotainfo(
- xfs_mount_t *mp)
+ struct xfs_mount *mp)
{
- xfs_quotainfo_t *qinf;
- int error;
- xfs_dquot_t *dqp;
+ struct xfs_quotainfo *qinf;
+ int error;
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -636,52 +686,7 @@ xfs_qm_init_quotainfo(
mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
- /*
- * We try to get the limits from the superuser's limits fields.
- * This is quite hacky, but it is standard quota practice.
- *
- * Since we may not have done a quotacheck by this point, just read
- * the dquot without attaching it to any hashtables or lists.
- *
- * Timers and warnings are globally set by the first timer found in
- * user/group/proj quota types, otherwise a default value is used.
- * This should be split into different fields per quota type.
- */
- error = xfs_qm_dqread(mp, 0,
- XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER :
- (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP :
- XFS_DQ_PROJ),
- 0, &dqp);
-
- if (!error) {
- xfs_disk_dquot_t *ddqp = &dqp->q_core;
-
- /*
- * The warnings and timers set the grace period given to
- * a user or group before he or she can not perform any
- * more writing. If it is zero, a default is used.
- */
- qinf->qi_btimelimit = ddqp->d_btimer ?
- be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT;
- qinf->qi_itimelimit = ddqp->d_itimer ?
- be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT;
- qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ?
- be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT;
- qinf->qi_bwarnlimit = ddqp->d_bwarns ?
- be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT;
- qinf->qi_iwarnlimit = ddqp->d_iwarns ?
- be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT;
- qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ?
- be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT;
- xfs_qm_dqdestroy(dqp);
- } else {
- qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
- qinf->qi_itimelimit = XFS_QM_ITIMELIMIT;
- qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT;
- qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT;
- qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT;
- qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
- }
+ xfs_qm_init_timelimits(mp, qinf);
if (XFS_IS_UQUOTA_RUNNING(mp))
xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf);
@@ -748,7 +753,6 @@ xfs_qm_qino_alloc(
{
xfs_trans_t *tp;
int error;
- int committed;
bool need_alloc = true;
*ip = NULL;
@@ -788,8 +792,7 @@ xfs_qm_qino_alloc(
return error;
if (need_alloc) {
- error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip,
- &committed);
+ error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip);
if (error) {
xfs_trans_cancel(tp);
return error;
@@ -867,9 +870,9 @@ xfs_qm_reset_dqcounts(
* find uninitialised dquot blks. See comment in
* xfs_dquot_verify.
*/
- fa = xfs_dquot_verify(mp, ddq, id + j, type, 0);
+ fa = xfs_dqblk_verify(mp, &dqb[j], id + j, type);
if (fa)
- xfs_dquot_repair(mp, ddq, id + j, type);
+ xfs_dqblk_repair(mp, &dqb[j], id + j, type);
/*
* Reset type in case we are reusing group quota file for
@@ -895,7 +898,7 @@ xfs_qm_reset_dqcounts(
}
STATIC int
-xfs_qm_dqiter_bufs(
+xfs_qm_reset_dqcounts_all(
struct xfs_mount *mp,
xfs_dqid_t firstid,
xfs_fsblock_t bno,
@@ -963,11 +966,11 @@ xfs_qm_dqiter_bufs(
}
/*
- * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a
- * caller supplied function for every chunk of dquots that we find.
+ * Iterate over all allocated dquot blocks in this quota inode, zeroing all
+ * counters for every chunk of dquots that we find.
*/
STATIC int
-xfs_qm_dqiterate(
+xfs_qm_reset_dqcounts_buf(
struct xfs_mount *mp,
struct xfs_inode *qip,
uint flags,
@@ -1043,7 +1046,7 @@ xfs_qm_dqiterate(
* Iterate thru all the blks in the extent and
* reset the counters of all the dquots inside them.
*/
- error = xfs_qm_dqiter_bufs(mp, firstid,
+ error = xfs_qm_reset_dqcounts_all(mp, firstid,
map[i].br_startblock,
map[i].br_blockcount,
flags, buffer_list);
@@ -1068,16 +1071,17 @@ out:
STATIC int
xfs_qm_quotacheck_dqadjust(
struct xfs_inode *ip,
- xfs_dqid_t id,
uint type,
xfs_qcnt_t nblks,
xfs_qcnt_t rtblks)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_dquot *dqp;
+ xfs_dqid_t id;
int error;
- error = xfs_qm_dqget(mp, ip, id, type, XFS_QMOPT_DQALLOC, &dqp);
+ id = xfs_qm_id_for_quotatype(ip, type);
+ error = xfs_qm_dqget(mp, id, type, true, &dqp);
if (error) {
/*
* Shouldn't be able to turn off quotas here.
@@ -1150,13 +1154,10 @@ xfs_qm_dqusage_adjust(
}
/*
- * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget
- * interface expects the inode to be exclusively locked because that's
- * the case in all other instances. It's OK that we do this because
- * quotacheck is done only at mount time.
+ * We don't _need_ to take the ilock EXCL here because quotacheck runs
+ * at mount time and therefore nobody will be racing chown/chproj.
*/
- error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, XFS_ILOCK_EXCL,
- &ip);
+ error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, 0, &ip);
if (error) {
*res = BULKSTAT_RV_NOTHING;
return error;
@@ -1191,33 +1192,31 @@ xfs_qm_dqusage_adjust(
* and quotaoffs don't race. (Quotachecks happen at mount time only).
*/
if (XFS_IS_UQUOTA_ON(mp)) {
- error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid,
- XFS_DQ_USER, nblks, rtblks);
+ error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_USER, nblks,
+ rtblks);
if (error)
goto error0;
}
if (XFS_IS_GQUOTA_ON(mp)) {
- error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid,
- XFS_DQ_GROUP, nblks, rtblks);
+ error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_GROUP, nblks,
+ rtblks);
if (error)
goto error0;
}
if (XFS_IS_PQUOTA_ON(mp)) {
- error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip),
- XFS_DQ_PROJ, nblks, rtblks);
+ error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_PROJ, nblks,
+ rtblks);
if (error)
goto error0;
}
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
IRELE(ip);
*res = BULKSTAT_RV_DIDONE;
return 0;
error0:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
IRELE(ip);
*res = BULKSTAT_RV_GIVEUP;
return error;
@@ -1249,9 +1248,8 @@ xfs_qm_flush_one(
*/
if (!xfs_dqflock_nowait(dqp)) {
/* buf is pinned in-core by delwri list */
- DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen);
- bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL);
+ bp = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, 0);
if (!bp) {
error = -EINVAL;
goto out_unlock;
@@ -1309,7 +1307,7 @@ xfs_qm_quotacheck(
* We don't log our changes till later.
*/
if (uip) {
- error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
+ error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_QMOPT_UQUOTA,
&buffer_list);
if (error)
goto error_return;
@@ -1317,7 +1315,7 @@ xfs_qm_quotacheck(
}
if (gip) {
- error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA,
+ error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_QMOPT_GQUOTA,
&buffer_list);
if (error)
goto error_return;
@@ -1325,7 +1323,7 @@ xfs_qm_quotacheck(
}
if (pip) {
- error = xfs_qm_dqiterate(mp, pip, XFS_QMOPT_PQUOTA,
+ error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_QMOPT_PQUOTA,
&buffer_list);
if (error)
goto error_return;
@@ -1677,7 +1675,7 @@ xfs_qm_vop_dqalloc(
* if necessary. The dquot(s) will not be locked.
*/
if (XFS_NOT_DQATTACHED(mp, ip)) {
- error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC);
+ error = xfs_qm_dqattach_locked(ip, true);
if (error) {
xfs_iunlock(ip, lockflags);
return error;
@@ -1696,10 +1694,7 @@ xfs_qm_vop_dqalloc(
* holding ilock.
*/
xfs_iunlock(ip, lockflags);
- error = xfs_qm_dqget(mp, NULL, uid,
- XFS_DQ_USER,
- XFS_QMOPT_DQALLOC,
- &uq);
+ error = xfs_qm_dqget(mp, uid, XFS_DQ_USER, true, &uq);
if (error) {
ASSERT(error != -ENOENT);
return error;
@@ -1722,10 +1717,7 @@ xfs_qm_vop_dqalloc(
if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
if (ip->i_d.di_gid != gid) {
xfs_iunlock(ip, lockflags);
- error = xfs_qm_dqget(mp, NULL, gid,
- XFS_DQ_GROUP,
- XFS_QMOPT_DQALLOC,
- &gq);
+ error = xfs_qm_dqget(mp, gid, XFS_DQ_GROUP, true, &gq);
if (error) {
ASSERT(error != -ENOENT);
goto error_rele;
@@ -1741,10 +1733,8 @@ xfs_qm_vop_dqalloc(
if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
if (xfs_get_projid(ip) != prid) {
xfs_iunlock(ip, lockflags);
- error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
- XFS_DQ_PROJ,
- XFS_QMOPT_DQALLOC,
- &pq);
+ error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ,
+ true, &pq);
if (error) {
ASSERT(error != -ENOENT);
goto error_rele;
@@ -1935,7 +1925,7 @@ xfs_qm_vop_rename_dqattach(
*/
if (i == 0 || ip != i_tab[i-1]) {
if (XFS_NOT_DQATTACHED(mp, ip)) {
- error = xfs_qm_dqattach(ip, 0);
+ error = xfs_qm_dqattach(ip);
if (error)
return error;
}
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 2975a822e9f0..e3129b280423 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -170,8 +170,10 @@ extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
/* quota ops */
extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
-extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t *,
- uint, struct qc_dqblk *, uint);
+extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
+ uint, struct qc_dqblk *);
+extern int xfs_qm_scall_getquota_next(struct xfs_mount *,
+ xfs_dqid_t *, uint, struct qc_dqblk *);
extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
struct qc_dqblk *);
extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 2be6d2735ca9..36b89e2c5eb9 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -72,7 +72,7 @@ xfs_qm_statvfs(
xfs_mount_t *mp = ip->i_mount;
xfs_dquot_t *dqp;
- if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
+ if (!xfs_qm_dqget(mp, xfs_get_projid(ip), XFS_DQ_PROJ, false, &dqp)) {
xfs_fill_statvfs_from_dquot(statp, dqp);
xfs_qm_dqput(dqp);
}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 9cb5c381b01c..3e05d300b14e 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -425,7 +425,7 @@ xfs_qm_scall_setqlim(
* a reference to the dquot, so it's safe to do this unlock/lock without
* it being reclaimed in the mean time.
*/
- error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp);
+ error = xfs_qm_dqget(mp, id, type, true, &dqp);
if (error) {
ASSERT(error != -ENOENT);
goto out_unlock;
@@ -622,39 +622,14 @@ out:
return error;
}
-
-int
-xfs_qm_scall_getquota(
+/* Fill out the quota context. */
+static void
+xfs_qm_scall_getquota_fill_qc(
struct xfs_mount *mp,
- xfs_dqid_t *id,
uint type,
- struct qc_dqblk *dst,
- uint dqget_flags)
+ const struct xfs_dquot *dqp,
+ struct qc_dqblk *dst)
{
- struct xfs_dquot *dqp;
- int error;
-
- /*
- * Try to get the dquot. We don't want it allocated on disk, so
- * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
- * exist, we'll get ENOENT back.
- */
- error = xfs_qm_dqget(mp, NULL, *id, type, dqget_flags, &dqp);
- if (error)
- return error;
-
- /*
- * If everything's NULL, this dquot doesn't quite exist as far as
- * our utility programs are concerned.
- */
- if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
- error = -ENOENT;
- goto out_put;
- }
-
- /* Fill in the ID we actually read from disk */
- *id = be32_to_cpu(dqp->q_core.d_id);
-
memset(dst, 0, sizeof(*dst));
dst->d_spc_hardlimit =
XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
@@ -696,7 +671,7 @@ xfs_qm_scall_getquota(
if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) ||
(XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) ||
(XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) &&
- *id != 0) {
+ dqp->q_core.d_id != 0) {
if ((dst->d_space > dst->d_spc_softlimit) &&
(dst->d_spc_softlimit > 0)) {
ASSERT(dst->d_spc_timer != 0);
@@ -707,11 +682,69 @@ xfs_qm_scall_getquota(
}
}
#endif
+}
+
+/* Return the quota information for the dquot matching id. */
+int
+xfs_qm_scall_getquota(
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ uint type,
+ struct qc_dqblk *dst)
+{
+ struct xfs_dquot *dqp;
+ int error;
+
+ /*
+ * Try to get the dquot. We don't want it allocated on disk, so don't
+ * set doalloc. If it doesn't exist, we'll get ENOENT back.
+ */
+ error = xfs_qm_dqget(mp, id, type, false, &dqp);
+ if (error)
+ return error;
+
+ /*
+ * If everything's NULL, this dquot doesn't quite exist as far as
+ * our utility programs are concerned.
+ */
+ if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+ error = -ENOENT;
+ goto out_put;
+ }
+
+ xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst);
+
out_put:
xfs_qm_dqput(dqp);
return error;
}
+/*
+ * Return the quota information for the first initialized dquot whose id
+ * is at least as high as id.
+ */
+int
+xfs_qm_scall_getquota_next(
+ struct xfs_mount *mp,
+ xfs_dqid_t *id,
+ uint type,
+ struct qc_dqblk *dst)
+{
+ struct xfs_dquot *dqp;
+ int error;
+
+ error = xfs_qm_dqget_next(mp, *id, type, &dqp);
+ if (error)
+ return error;
+
+ /* Fill in the ID we actually read from disk */
+ *id = be32_to_cpu(dqp->q_core.d_id);
+
+ xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst);
+
+ xfs_qm_dqput(dqp);
+ return error;
+}
STATIC int
xfs_dqrele_inode(
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index ce6506adab7b..3edf52b14919 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -48,6 +48,22 @@ struct xfs_trans;
(XFS_IS_PQUOTA_ON(mp) && \
(mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
+static inline uint
+xfs_quota_chkd_flag(
+ uint dqtype)
+{
+ switch (dqtype) {
+ case XFS_DQ_USER:
+ return XFS_UQUOTA_CHKD;
+ case XFS_DQ_GROUP:
+ return XFS_GQUOTA_CHKD;
+ case XFS_DQ_PROJ:
+ return XFS_PQUOTA_CHKD;
+ default:
+ return 0;
+ }
+}
+
/*
* The structure kept inside the xfs_trans_t keep track of dquot changes
* within a transaction and apply them later.
@@ -90,8 +106,8 @@ extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
struct xfs_dquot *, struct xfs_dquot *,
struct xfs_dquot *, uint);
-extern int xfs_qm_dqattach(struct xfs_inode *, uint);
-extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint);
+extern int xfs_qm_dqattach(struct xfs_inode *);
+extern int xfs_qm_dqattach_locked(struct xfs_inode *ip, bool doalloc);
extern void xfs_qm_dqdetach(struct xfs_inode *);
extern void xfs_qm_dqrele(struct xfs_dquot *);
extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *);
@@ -132,7 +148,7 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
#define xfs_qm_vop_rename_dqattach(it) (0)
#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
#define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0)
-#define xfs_qm_dqattach(ip, fl) (0)
+#define xfs_qm_dqattach(ip) (0)
#define xfs_qm_dqattach_locked(ip, fl) (0)
#define xfs_qm_dqdetach(ip)
#define xfs_qm_dqrele(d)
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index a65108594a07..c93fc913dffb 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -239,8 +239,7 @@ xfs_fs_get_dqblk(
return -ESRCH;
id = from_kqid(&init_user_ns, qid);
- return xfs_qm_scall_getquota(mp, &id,
- xfs_quota_type(qid.type), qdq, 0);
+ return xfs_qm_scall_getquota(mp, id, xfs_quota_type(qid.type), qdq);
}
/* Return quota info for active quota >= this qid */
@@ -260,9 +259,8 @@ xfs_fs_get_nextdqblk(
return -ESRCH;
id = from_kqid(&init_user_ns, *qid);
- ret = xfs_qm_scall_getquota(mp, &id,
- xfs_quota_type(qid->type), qdq,
- XFS_QMOPT_DQNEXT);
+ ret = xfs_qm_scall_getquota_next(mp, &id, xfs_quota_type(qid->type),
+ qdq);
if (ret)
return ret;
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 7a39f40645f7..e5866b714d5f 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -52,6 +52,25 @@ xfs_cui_item_free(
kmem_zone_free(xfs_cui_zone, cuip);
}
+/*
+ * Freeing the CUI requires that we remove it from the AIL if it has already
+ * been placed there. However, the CUI may not yet have been placed in the AIL
+ * when called by xfs_cui_release() from CUD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the CUI.
+ */
+void
+xfs_cui_release(
+ struct xfs_cui_log_item *cuip)
+{
+ ASSERT(atomic_read(&cuip->cui_refcount) > 0);
+ if (atomic_dec_and_test(&cuip->cui_refcount)) {
+ xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_cui_item_free(cuip);
+ }
+}
+
+
STATIC void
xfs_cui_item_size(
struct xfs_log_item *lip,
@@ -140,8 +159,8 @@ STATIC void
xfs_cui_item_unlock(
struct xfs_log_item *lip)
{
- if (lip->li_flags & XFS_LI_ABORTED)
- xfs_cui_item_free(CUI_ITEM(lip));
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
+ xfs_cui_release(CUI_ITEM(lip));
}
/*
@@ -211,24 +230,6 @@ xfs_cui_init(
return cuip;
}
-/*
- * Freeing the CUI requires that we remove it from the AIL if it has already
- * been placed there. However, the CUI may not yet have been placed in the AIL
- * when called by xfs_cui_release() from CUD processing due to the ordering of
- * committed vs unpin operations in bulk insert operations. Hence the reference
- * count to ensure only the last caller frees the CUI.
- */
-void
-xfs_cui_release(
- struct xfs_cui_log_item *cuip)
-{
- ASSERT(atomic_read(&cuip->cui_refcount) > 0);
- if (atomic_dec_and_test(&cuip->cui_refcount)) {
- xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_cui_item_free(cuip);
- }
-}
-
static inline struct xfs_cud_log_item *CUD_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_cud_log_item, cud_item);
@@ -309,7 +310,7 @@ xfs_cud_item_unlock(
{
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
- if (lip->li_flags & XFS_LI_ABORTED) {
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
xfs_cui_release(cudp->cud_cuip);
kmem_zone_free(xfs_cud_zone, cudp);
}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index cdbd342a5249..713e857d9ffa 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -305,7 +305,7 @@ xfs_reflink_reserve_cow(
* Fork all the shared blocks from our write offset until the end of
* the extent.
*/
- error = xfs_qm_dqattach_locked(ip, 0);
+ error = xfs_qm_dqattach_locked(ip, false);
if (error)
return error;
@@ -431,7 +431,7 @@ retry:
if (error)
return error;
- error = xfs_qm_dqattach_locked(ip, 0);
+ error = xfs_qm_dqattach_locked(ip, false);
if (error)
goto out;
goto retry;
@@ -552,6 +552,9 @@ xfs_reflink_trim_irec_to_next_cow(
*
* If cancel_real is true this function cancels all COW fork extents for the
* inode; if cancel_real is false, real extents are not cleared.
+ *
+ * Caller must have already joined the inode to the current transaction. The
+ * inode will be joined to the transaction returned to the caller.
*/
int
xfs_reflink_cancel_cow_blocks(
@@ -592,7 +595,6 @@ xfs_reflink_cancel_cow_blocks(
if (error)
break;
} else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
- xfs_trans_ijoin(*tpp, ip, 0);
xfs_defer_init(&dfops, &firstfsb);
/* Free the CoW orphan record. */
@@ -1359,7 +1361,7 @@ xfs_reflink_remap_range(
goto out_unlock;
/* Attach dquots to dest inode before changing block map */
- ret = xfs_qm_dqattach(dest, 0);
+ ret = xfs_qm_dqattach(dest);
if (ret)
goto out_unlock;
@@ -1551,7 +1553,12 @@ next:
return 0;
}
-/* Clear the inode reflink flag if there are no shared extents. */
+/*
+ * Clear the inode reflink flag if there are no shared extents.
+ *
+ * The caller is responsible for joining the inode to the transaction passed in.
+ * The inode will be joined to the transaction that is returned to the caller.
+ */
int
xfs_reflink_clear_inode_flag(
struct xfs_inode *ip,
@@ -1578,7 +1585,6 @@ xfs_reflink_clear_inode_flag(
trace_xfs_reflink_unset_inode_flag(ip);
ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
xfs_inode_clear_cowblocks_tag(ip);
- xfs_trans_ijoin(*tpp, ip, 0);
xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
return error;
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 49d3124863a8..e5b5b3e7ef82 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -52,6 +52,24 @@ xfs_rui_item_free(
kmem_zone_free(xfs_rui_zone, ruip);
}
+/*
+ * Freeing the RUI requires that we remove it from the AIL if it has already
+ * been placed there. However, the RUI may not yet have been placed in the AIL
+ * when called by xfs_rui_release() from RUD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the RUI.
+ */
+void
+xfs_rui_release(
+ struct xfs_rui_log_item *ruip)
+{
+ ASSERT(atomic_read(&ruip->rui_refcount) > 0);
+ if (atomic_dec_and_test(&ruip->rui_refcount)) {
+ xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_rui_item_free(ruip);
+ }
+}
+
STATIC void
xfs_rui_item_size(
struct xfs_log_item *lip,
@@ -140,8 +158,8 @@ STATIC void
xfs_rui_item_unlock(
struct xfs_log_item *lip)
{
- if (lip->li_flags & XFS_LI_ABORTED)
- xfs_rui_item_free(RUI_ITEM(lip));
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
+ xfs_rui_release(RUI_ITEM(lip));
}
/*
@@ -233,24 +251,6 @@ xfs_rui_copy_format(
return 0;
}
-/*
- * Freeing the RUI requires that we remove it from the AIL if it has already
- * been placed there. However, the RUI may not yet have been placed in the AIL
- * when called by xfs_rui_release() from RUD processing due to the ordering of
- * committed vs unpin operations in bulk insert operations. Hence the reference
- * count to ensure only the last caller frees the RUI.
- */
-void
-xfs_rui_release(
- struct xfs_rui_log_item *ruip)
-{
- ASSERT(atomic_read(&ruip->rui_refcount) > 0);
- if (atomic_dec_and_test(&ruip->rui_refcount)) {
- xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_rui_item_free(ruip);
- }
-}
-
static inline struct xfs_rud_log_item *RUD_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_rud_log_item, rud_item);
@@ -331,7 +331,7 @@ xfs_rud_item_unlock(
{
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
- if (lip->li_flags & XFS_LI_ABORTED) {
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
xfs_rui_release(rudp->rud_ruip);
kmem_zone_free(xfs_rud_zone, rudp);
}
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index dfee3c991155..52632ab727f7 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -23,9 +23,14 @@
struct xfs_mount;
struct xfs_trans;
+/*
+ * XXX: Most of the realtime allocation functions deal in units of realtime
+ * extents, not realtime blocks. This looks funny when paired with the type
+ * name and screams for a larger cleanup.
+ */
struct xfs_rtalloc_rec {
- xfs_rtblock_t ar_startblock;
- xfs_rtblock_t ar_blockcount;
+ xfs_rtblock_t ar_startext;
+ xfs_rtblock_t ar_extcount;
};
typedef int (*xfs_rtalloc_query_range_fn)(
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 056e12b421eb..1cc79907b377 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -113,6 +113,7 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats)
}
}
+#ifdef CONFIG_PROC_FS
/* legacy quota interfaces */
#ifdef CONFIG_XFS_QUOTA
static int xqm_proc_show(struct seq_file *m, void *v)
@@ -124,18 +125,6 @@ static int xqm_proc_show(struct seq_file *m, void *v)
return 0;
}
-static int xqm_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, xqm_proc_show, NULL);
-}
-
-static const struct file_operations xqm_proc_fops = {
- .open = xqm_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/* legacy quota stats interface no 2 */
static int xqmstat_proc_show(struct seq_file *m, void *v)
{
@@ -147,22 +136,8 @@ static int xqmstat_proc_show(struct seq_file *m, void *v)
seq_putc(m, '\n');
return 0;
}
-
-static int xqmstat_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, xqmstat_proc_show, NULL);
-}
-
-static const struct file_operations xqmstat_proc_fops = {
- .owner = THIS_MODULE,
- .open = xqmstat_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
#endif /* CONFIG_XFS_QUOTA */
-#ifdef CONFIG_PROC_FS
int
xfs_init_procfs(void)
{
@@ -174,11 +149,9 @@ xfs_init_procfs(void)
goto out;
#ifdef CONFIG_XFS_QUOTA
- if (!proc_create("fs/xfs/xqmstat", 0, NULL,
- &xqmstat_proc_fops))
+ if (!proc_create_single("fs/xfs/xqmstat", 0, NULL, xqmstat_proc_show))
goto out;
- if (!proc_create("fs/xfs/xqm", 0, NULL,
- &xqm_proc_fops))
+ if (!proc_create_single("fs/xfs/xqm", 0, NULL, xqm_proc_show))
goto out;
#endif
return 0;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 612c1d5348b3..ed67389f4948 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -63,7 +63,7 @@
#include <linux/parser.h>
static const struct super_operations xfs_super_operations;
-struct bio_set *xfs_ioend_bioset;
+struct bio_set xfs_ioend_bioset;
static struct kset *xfs_kset; /* top-level xfs sysfs dir */
#ifdef DEBUG
@@ -722,7 +722,7 @@ xfs_close_devices(
struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
struct dax_device *dax_logdev = mp->m_logdev_targp->bt_daxdev;
- xfs_free_buftarg(mp, mp->m_logdev_targp);
+ xfs_free_buftarg(mp->m_logdev_targp);
xfs_blkdev_put(logdev);
fs_put_dax(dax_logdev);
}
@@ -730,11 +730,11 @@ xfs_close_devices(
struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
struct dax_device *dax_rtdev = mp->m_rtdev_targp->bt_daxdev;
- xfs_free_buftarg(mp, mp->m_rtdev_targp);
+ xfs_free_buftarg(mp->m_rtdev_targp);
xfs_blkdev_put(rtdev);
fs_put_dax(dax_rtdev);
}
- xfs_free_buftarg(mp, mp->m_ddev_targp);
+ xfs_free_buftarg(mp->m_ddev_targp);
fs_put_dax(dax_ddev);
}
@@ -808,9 +808,9 @@ xfs_open_devices(
out_free_rtdev_targ:
if (mp->m_rtdev_targp)
- xfs_free_buftarg(mp, mp->m_rtdev_targp);
+ xfs_free_buftarg(mp->m_rtdev_targp);
out_free_ddev_targ:
- xfs_free_buftarg(mp, mp->m_ddev_targp);
+ xfs_free_buftarg(mp->m_ddev_targp);
out_close_rtdev:
xfs_blkdev_put(rtdev);
fs_put_dax(dax_rtdev);
@@ -1247,7 +1247,6 @@ xfs_quiesce_attr(
STATIC int
xfs_test_remount_options(
struct super_block *sb,
- struct xfs_mount *mp,
char *options)
{
int error = 0;
@@ -1278,7 +1277,7 @@ xfs_fs_remount(
int error;
/* First, check for complete junk; i.e. invalid options */
- error = xfs_test_remount_options(sb, mp, options);
+ error = xfs_test_remount_options(sb, options);
if (error)
return error;
@@ -1373,7 +1372,6 @@ xfs_fs_remount(
*/
xfs_restore_resvblks(mp);
xfs_log_work_queue(mp);
- xfs_queue_eofblocks(mp);
/* Recover any CoW blocks that never got remapped. */
error = xfs_reflink_recover_cow(mp);
@@ -1383,7 +1381,7 @@ xfs_fs_remount(
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return error;
}
- xfs_queue_cowblocks(mp);
+ xfs_icache_enable_reclaim(mp);
/* Create the per-AG metadata reservation pool .*/
error = xfs_fs_reserve_ag_blocks(mp);
@@ -1393,8 +1391,13 @@ xfs_fs_remount(
/* rw -> ro */
if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & SB_RDONLY)) {
+ /*
+ * Cancel background eofb scanning so it cannot race with the
+ * final log force+buftarg wait and deadlock the remount.
+ */
+ xfs_icache_disable_reclaim(mp);
+
/* Get rid of any leftover CoW reservations... */
- cancel_delayed_work_sync(&mp->m_cowblocks_work);
error = xfs_icache_free_cowblocks(mp, NULL);
if (error) {
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -1417,12 +1420,6 @@ xfs_fs_remount(
*/
xfs_save_resvblks(mp);
- /*
- * Cancel background eofb scanning so it cannot race with the
- * final log force+buftarg wait and deadlock the remount.
- */
- cancel_delayed_work_sync(&mp->m_eofblocks_work);
-
xfs_quiesce_attr(mp);
mp->m_flags |= XFS_MOUNT_RDONLY;
}
@@ -1442,6 +1439,7 @@ xfs_fs_freeze(
{
struct xfs_mount *mp = XFS_M(sb);
+ xfs_icache_disable_reclaim(mp);
xfs_save_resvblks(mp);
xfs_quiesce_attr(mp);
return xfs_sync_sb(mp, true);
@@ -1455,6 +1453,7 @@ xfs_fs_unfreeze(
xfs_restore_resvblks(mp);
xfs_log_work_queue(mp);
+ xfs_icache_enable_reclaim(mp);
return 0;
}
@@ -1636,6 +1635,17 @@ xfs_fs_fill_super(
#endif
sb->s_op = &xfs_super_operations;
+ /*
+ * Delay mount work if the debug hook is set. This is debug
+ * instrumention to coordinate simulation of xfs mount failures with
+ * VFS superblock operations
+ */
+ if (xfs_globals.mount_delay) {
+ xfs_notice(mp, "Delaying mount for %d seconds.",
+ xfs_globals.mount_delay);
+ msleep(xfs_globals.mount_delay * 1000);
+ }
+
if (silent)
flags |= XFS_MFSI_QUIET;
@@ -1691,11 +1701,17 @@ xfs_fs_fill_super(
sb->s_flags |= SB_I_VERSION;
if (mp->m_flags & XFS_MOUNT_DAX) {
+ bool rtdev_is_dax = false, datadev_is_dax;
+
xfs_warn(mp,
"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- error = bdev_dax_supported(sb, sb->s_blocksize);
- if (error) {
+ datadev_is_dax = bdev_dax_supported(mp->m_ddev_targp->bt_bdev,
+ sb->s_blocksize);
+ if (mp->m_rtdev_targp)
+ rtdev_is_dax = bdev_dax_supported(
+ mp->m_rtdev_targp->bt_bdev, sb->s_blocksize);
+ if (!rtdev_is_dax && !datadev_is_dax) {
xfs_alert(mp,
"DAX unsupported by block device. Turning off DAX.");
mp->m_flags &= ~XFS_MOUNT_DAX;
@@ -1762,6 +1778,7 @@ xfs_fs_fill_super(
out_close_devices:
xfs_close_devices(mp);
out_free_fsname:
+ sb->s_fs_info = NULL;
xfs_free_fsname(mp);
kfree(mp);
out:
@@ -1779,6 +1796,10 @@ xfs_fs_put_super(
{
struct xfs_mount *mp = XFS_M(sb);
+ /* if ->fill_super failed, we have no mount to tear down */
+ if (!sb->s_fs_info)
+ return;
+
xfs_notice(mp, "Unmounting Filesystem");
xfs_filestream_unmount(mp);
xfs_unmountfs(mp);
@@ -1788,6 +1809,8 @@ xfs_fs_put_super(
xfs_destroy_percpu_counters(mp);
xfs_destroy_mount_workqueues(mp);
xfs_close_devices(mp);
+
+ sb->s_fs_info = NULL;
xfs_free_fsname(mp);
kfree(mp);
}
@@ -1807,6 +1830,9 @@ xfs_fs_nr_cached_objects(
struct super_block *sb,
struct shrink_control *sc)
{
+ /* Paranoia: catch incorrect calls during mount setup or teardown */
+ if (WARN_ON_ONCE(!sb->s_fs_info))
+ return 0;
return xfs_reclaim_inodes_count(XFS_M(sb));
}
@@ -1846,10 +1872,9 @@ MODULE_ALIAS_FS("xfs");
STATIC int __init
xfs_init_zones(void)
{
- xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE,
+ if (bioset_init(&xfs_ioend_bioset, 4 * MAX_BUF_PER_PAGE,
offsetof(struct xfs_ioend, io_inline_bio),
- BIOSET_NEED_BVECS);
- if (!xfs_ioend_bioset)
+ BIOSET_NEED_BVECS))
goto out;
xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
@@ -1881,11 +1906,6 @@ xfs_init_zones(void)
if (!xfs_trans_zone)
goto out_destroy_ifork_zone;
- xfs_log_item_desc_zone =
- kmem_zone_init(sizeof(struct xfs_log_item_desc),
- "xfs_log_item_desc");
- if (!xfs_log_item_desc_zone)
- goto out_destroy_trans_zone;
/*
* The size of the zone allocated buf log item is the maximum
@@ -1895,7 +1915,7 @@ xfs_init_zones(void)
xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item),
"xfs_buf_item");
if (!xfs_buf_item_zone)
- goto out_destroy_log_item_desc_zone;
+ goto out_destroy_trans_zone;
xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
((XFS_EFD_MAX_FAST_EXTENTS - 1) *
@@ -1983,8 +2003,6 @@ xfs_init_zones(void)
kmem_zone_destroy(xfs_efd_zone);
out_destroy_buf_item_zone:
kmem_zone_destroy(xfs_buf_item_zone);
- out_destroy_log_item_desc_zone:
- kmem_zone_destroy(xfs_log_item_desc_zone);
out_destroy_trans_zone:
kmem_zone_destroy(xfs_trans_zone);
out_destroy_ifork_zone:
@@ -1998,7 +2016,7 @@ xfs_init_zones(void)
out_destroy_log_ticket_zone:
kmem_zone_destroy(xfs_log_ticket_zone);
out_free_ioend_bioset:
- bioset_free(xfs_ioend_bioset);
+ bioset_exit(&xfs_ioend_bioset);
out:
return -ENOMEM;
}
@@ -2023,14 +2041,13 @@ xfs_destroy_zones(void)
kmem_zone_destroy(xfs_efi_zone);
kmem_zone_destroy(xfs_efd_zone);
kmem_zone_destroy(xfs_buf_item_zone);
- kmem_zone_destroy(xfs_log_item_desc_zone);
kmem_zone_destroy(xfs_trans_zone);
kmem_zone_destroy(xfs_ifork_zone);
kmem_zone_destroy(xfs_da_state_zone);
kmem_zone_destroy(xfs_btree_cur_zone);
kmem_zone_destroy(xfs_bmap_free_item_zone);
kmem_zone_destroy(xfs_log_ticket_zone);
- bioset_free(xfs_ioend_bioset);
+ bioset_exit(&xfs_ioend_bioset);
}
STATIC int __init
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 2e9e793a8f9d..aed03da637d4 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -259,12 +259,13 @@ xfs_symlink(
* bmapi or the directory create code.
*/
xfs_defer_init(&dfops, &first_block);
+ tp->t_agfl_dfops = &dfops;
/*
* Allocate an inode for the symlink.
*/
error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
- prid, &ip, NULL);
+ prid, &ip);
if (error)
goto out_trans_cancel;
@@ -488,16 +489,11 @@ xfs_inactive_symlink_rmt(
error = xfs_defer_finish(&tp, &dfops);
if (error)
goto error_bmap_cancel;
- /*
- * The first xact was committed, so add the inode to the new one.
- * Mark it dirty so it will be logged and moved forward in the log as
- * part of every commit.
- */
- xfs_trans_ijoin(tp, ip, 0);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
/*
* Commit the transaction containing extent freeing and EFDs.
*/
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
error = xfs_trans_commit(tp);
if (error) {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 82afee005140..b53a33e69932 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -95,6 +95,7 @@ extern xfs_param_t xfs_params;
struct xfs_globals {
int log_recovery_delay; /* log recovery delay (secs) */
+ int mount_delay; /* mount setup delay (secs) */
bool bug_on_assert; /* BUG() the kernel on assert failure */
};
extern struct xfs_globals xfs_globals;
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 8b2ccc234f36..2d5cd2529f8e 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -165,9 +165,40 @@ log_recovery_delay_show(
}
XFS_SYSFS_ATTR_RW(log_recovery_delay);
+STATIC ssize_t
+mount_delay_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val < 0 || val > 60)
+ return -EINVAL;
+
+ xfs_globals.mount_delay = val;
+
+ return count;
+}
+
+STATIC ssize_t
+mount_delay_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.mount_delay);
+}
+XFS_SYSFS_ATTR_RW(mount_delay);
+
static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(bug_on_assert),
ATTR_LIST(log_recovery_delay),
+ ATTR_LIST(mount_delay),
NULL,
};
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a982c0b623d0..9d4c4ca24fe6 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -441,8 +441,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
__field(unsigned, bli_recur)
__field(int, bli_refcount)
__field(unsigned, bli_flags)
- __field(void *, li_desc)
- __field(unsigned, li_flags)
+ __field(unsigned long, li_flags)
),
TP_fast_assign(
__entry->dev = bip->bli_buf->b_target->bt_dev;
@@ -455,12 +454,11 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
__entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
__entry->buf_lockval = bip->bli_buf->b_sema.count;
- __entry->li_desc = bip->bli_item.li_desc;
__entry->li_flags = bip->bli_item.li_flags;
),
TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
"lock %d flags %s recur %d refcount %d bliflags %s "
- "lidesc %p liflags %s",
+ "liflags %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->buf_bno,
__entry->buf_len,
@@ -471,7 +469,6 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
__entry->bli_recur,
__entry->bli_refcount,
__print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS),
- __entry->li_desc,
__print_flags(__entry->li_flags, "|", XFS_LI_FLAGS))
)
@@ -506,8 +503,8 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
DECLARE_EVENT_CLASS(xfs_filestream_class,
- TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno),
- TP_ARGS(ip, agno),
+ TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, xfs_agnumber_t agno),
+ TP_ARGS(mp, ino, agno),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
@@ -515,10 +512,10 @@ DECLARE_EVENT_CLASS(xfs_filestream_class,
__field(int, streams)
),
TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino = ino;
__entry->agno = agno;
- __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno);
+ __entry->streams = xfs_filestream_peek_ag(mp, agno);
),
TP_printk("dev %d:%d ino 0x%llx agno %u streams %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -528,8 +525,8 @@ DECLARE_EVENT_CLASS(xfs_filestream_class,
)
#define DEFINE_FILESTREAM_EVENT(name) \
DEFINE_EVENT(xfs_filestream_class, name, \
- TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), \
- TP_ARGS(ip, agno))
+ TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, xfs_agnumber_t agno), \
+ TP_ARGS(mp, ino, agno))
DEFINE_FILESTREAM_EVENT(xfs_filestream_free);
DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup);
DEFINE_FILESTREAM_EVENT(xfs_filestream_scan);
@@ -1018,7 +1015,7 @@ DECLARE_EVENT_CLASS(xfs_log_item_class,
__field(dev_t, dev)
__field(void *, lip)
__field(uint, type)
- __field(uint, flags)
+ __field(unsigned long, flags)
__field(xfs_lsn_t, lsn)
),
TP_fast_assign(
@@ -1070,7 +1067,7 @@ DECLARE_EVENT_CLASS(xfs_ail_class,
__field(dev_t, dev)
__field(void *, lip)
__field(uint, type)
- __field(uint, flags)
+ __field(unsigned long, flags)
__field(xfs_lsn_t, old_lsn)
__field(xfs_lsn_t, new_lsn)
),
@@ -1750,6 +1747,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__field(int, namelen)
__field(int, valuelen)
__field(xfs_dahash_t, hashval)
+ __field(int, flags)
__field(int, op_flags)
),
TP_fast_assign(
@@ -1760,10 +1758,11 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__entry->namelen = args->namelen;
__entry->valuelen = args->valuelen;
__entry->hashval = args->hashval;
+ __entry->flags = args->flags;
__entry->op_flags = args->op_flags;
),
TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d "
- "hashval 0x%x op_flags %s",
+ "hashval 0x%x flags %s op_flags %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->namelen,
@@ -1771,6 +1770,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__entry->namelen,
__entry->valuelen,
__entry->hashval,
+ __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS),
__print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
)
@@ -2243,30 +2243,35 @@ struct xfs_defer_pending;
struct xfs_defer_ops;
DECLARE_EVENT_CLASS(xfs_defer_class,
- TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop),
- TP_ARGS(mp, dop),
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop,
+ unsigned long caller_ip),
+ TP_ARGS(mp, dop, caller_ip),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(void *, dop)
__field(char, committed)
__field(char, low)
+ __field(unsigned long, caller_ip)
),
TP_fast_assign(
__entry->dev = mp ? mp->m_super->s_dev : 0;
__entry->dop = dop;
__entry->committed = dop->dop_committed;
__entry->low = dop->dop_low;
+ __entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d ops %p committed %d low %d",
+ TP_printk("dev %d:%d ops %p committed %d low %d, caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->dop,
__entry->committed,
- __entry->low)
+ __entry->low,
+ (char *)__entry->caller_ip)
)
#define DEFINE_DEFER_EVENT(name) \
DEFINE_EVENT(xfs_defer_class, name, \
- TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop), \
- TP_ARGS(mp, dop))
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, \
+ unsigned long caller_ip), \
+ TP_ARGS(mp, dop, caller_ip))
DECLARE_EVENT_CLASS(xfs_defer_error_class,
TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, int error),
@@ -2433,6 +2438,8 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred);
+DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer);
+DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred);
/* rmap tracepoints */
DECLARE_EVENT_CLASS(xfs_rmap_class,
@@ -3346,6 +3353,43 @@ TRACE_EVENT(xfs_trans_resv_calc,
__entry->logflags)
);
+DECLARE_EVENT_CLASS(xfs_trans_class,
+ TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip),
+ TP_ARGS(tp, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(uint32_t, tid)
+ __field(uint32_t, flags)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = tp->t_mountp->m_super->s_dev;
+ __entry->tid = 0;
+ if (tp->t_ticket)
+ __entry->tid = tp->t_ticket->t_tid;
+ __entry->flags = tp->t_flags;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d trans %x flags 0x%x caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->tid,
+ __entry->flags,
+ (char *)__entry->caller_ip)
+)
+
+#define DEFINE_TRANS_EVENT(name) \
+DEFINE_EVENT(xfs_trans_class, name, \
+ TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip), \
+ TP_ARGS(tp, caller_ip))
+DEFINE_TRANS_EVENT(xfs_trans_alloc);
+DEFINE_TRANS_EVENT(xfs_trans_cancel);
+DEFINE_TRANS_EVENT(xfs_trans_commit);
+DEFINE_TRANS_EVENT(xfs_trans_dup);
+DEFINE_TRANS_EVENT(xfs_trans_free);
+DEFINE_TRANS_EVENT(xfs_trans_roll);
+DEFINE_TRANS_EVENT(xfs_trans_add_item);
+DEFINE_TRANS_EVENT(xfs_trans_free_items);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index d6d8f9d129a7..fc7ba75b8b69 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -31,9 +31,9 @@
#include "xfs_log.h"
#include "xfs_trace.h"
#include "xfs_error.h"
+#include "xfs_defer.h"
kmem_zone_t *xfs_trans_zone;
-kmem_zone_t *xfs_log_item_desc_zone;
#if defined(CONFIG_TRACEPOINTS)
static void
@@ -79,6 +79,7 @@ xfs_trans_free(
xfs_extent_busy_sort(&tp->t_busy);
xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
+ trace_xfs_trans_free(tp, _RET_IP_);
atomic_dec(&tp->t_mountp->m_active_trans);
if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
sb_end_intwrite(tp->t_mountp->m_super);
@@ -94,11 +95,13 @@ xfs_trans_free(
* blocks. Locks and log items, however, are no inherited. They must
* be added to the new transaction explicitly.
*/
-STATIC xfs_trans_t *
+STATIC struct xfs_trans *
xfs_trans_dup(
- xfs_trans_t *tp)
+ struct xfs_trans *tp)
{
- xfs_trans_t *ntp;
+ struct xfs_trans *ntp;
+
+ trace_xfs_trans_dup(tp, _RET_IP_);
ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
@@ -127,6 +130,7 @@ xfs_trans_dup(
ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
tp->t_rtx_res = tp->t_rtx_res_used;
ntp->t_pflags = tp->t_pflags;
+ ntp->t_agfl_dfops = tp->t_agfl_dfops;
xfs_trans_dup_dqinfo(tp, ntp);
@@ -283,6 +287,8 @@ xfs_trans_alloc(
return error;
}
+ trace_xfs_trans_alloc(tp, _RET_IP_);
+
*tpp = tp;
return 0;
}
@@ -727,73 +733,52 @@ out:
return;
}
-/*
- * Add the given log item to the transaction's list of log items.
- *
- * The log item will now point to its new descriptor with its li_desc field.
- */
+/* Add the given log item to the transaction's list of log items. */
void
xfs_trans_add_item(
struct xfs_trans *tp,
struct xfs_log_item *lip)
{
- struct xfs_log_item_desc *lidp;
-
ASSERT(lip->li_mountp == tp->t_mountp);
ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
+ ASSERT(list_empty(&lip->li_trans));
+ ASSERT(!test_bit(XFS_LI_DIRTY, &lip->li_flags));
- lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS);
-
- lidp->lid_item = lip;
- lidp->lid_flags = 0;
- list_add_tail(&lidp->lid_trans, &tp->t_items);
-
- lip->li_desc = lidp;
-}
-
-STATIC void
-xfs_trans_free_item_desc(
- struct xfs_log_item_desc *lidp)
-{
- list_del_init(&lidp->lid_trans);
- kmem_zone_free(xfs_log_item_desc_zone, lidp);
+ list_add_tail(&lip->li_trans, &tp->t_items);
+ trace_xfs_trans_add_item(tp, _RET_IP_);
}
/*
- * Unlink and free the given descriptor.
+ * Unlink the log item from the transaction. the log item is no longer
+ * considered dirty in this transaction, as the linked transaction has
+ * finished, either by abort or commit completion.
*/
void
xfs_trans_del_item(
struct xfs_log_item *lip)
{
- xfs_trans_free_item_desc(lip->li_desc);
- lip->li_desc = NULL;
+ clear_bit(XFS_LI_DIRTY, &lip->li_flags);
+ list_del_init(&lip->li_trans);
}
-/*
- * Unlock all of the items of a transaction and free all the descriptors
- * of that transaction.
- */
+/* Detach and unlock all of the items in a transaction */
void
xfs_trans_free_items(
struct xfs_trans *tp,
xfs_lsn_t commit_lsn,
bool abort)
{
- struct xfs_log_item_desc *lidp, *next;
-
- list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
- struct xfs_log_item *lip = lidp->lid_item;
+ struct xfs_log_item *lip, *next;
- lip->li_desc = NULL;
+ trace_xfs_trans_free_items(tp, _RET_IP_);
+ list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
+ xfs_trans_del_item(lip);
if (commit_lsn != NULLCOMMITLSN)
lip->li_ops->iop_committing(lip, commit_lsn);
if (abort)
- lip->li_flags |= XFS_LI_ABORTED;
+ set_bit(XFS_LI_ABORTED, &lip->li_flags);
lip->li_ops->iop_unlock(lip);
-
- xfs_trans_free_item_desc(lidp);
}
}
@@ -861,7 +846,7 @@ xfs_trans_committed_bulk(
xfs_lsn_t item_lsn;
if (aborted)
- lip->li_flags |= XFS_LI_ABORTED;
+ set_bit(XFS_LI_ABORTED, &lip->li_flags);
item_lsn = lip->li_ops->iop_committed(lip, commit_lsn);
/* item_lsn of -1 means the item needs no further processing */
@@ -936,6 +921,11 @@ __xfs_trans_commit(
int error = 0;
int sync = tp->t_flags & XFS_TRANS_SYNC;
+ ASSERT(!tp->t_agfl_dfops ||
+ !xfs_defer_has_unfinished_work(tp->t_agfl_dfops) || regrant);
+
+ trace_xfs_trans_commit(tp, _RET_IP_);
+
/*
* If there is nothing to be logged by the transaction,
* then unlock all of the items associated with the
@@ -991,6 +981,7 @@ out_unreserve:
commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant);
if (commit_lsn == -1 && !error)
error = -EIO;
+ tp->t_ticket = NULL;
}
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
@@ -1022,6 +1013,8 @@ xfs_trans_cancel(
struct xfs_mount *mp = tp->t_mountp;
bool dirty = (tp->t_flags & XFS_TRANS_DIRTY);
+ trace_xfs_trans_cancel(tp, _RET_IP_);
+
/*
* See if the caller is relying on us to shut down the
* filesystem. This happens in paths where we detect
@@ -1033,17 +1026,19 @@ xfs_trans_cancel(
}
#ifdef DEBUG
if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) {
- struct xfs_log_item_desc *lidp;
+ struct xfs_log_item *lip;
- list_for_each_entry(lidp, &tp->t_items, lid_trans)
- ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD));
+ list_for_each_entry(lip, &tp->t_items, li_trans)
+ ASSERT(!(lip->li_type == XFS_LI_EFD));
}
#endif
xfs_trans_unreserve_and_mod_sb(tp);
xfs_trans_unreserve_and_mod_dquots(tp);
- if (tp->t_ticket)
+ if (tp->t_ticket) {
xfs_log_done(mp, tp->t_ticket, NULL, false);
+ tp->t_ticket = NULL;
+ }
/* mark this thread as no longer being in a transaction */
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
@@ -1067,6 +1062,8 @@ xfs_trans_roll(
struct xfs_trans_res tres;
int error;
+ trace_xfs_trans_roll(trans, _RET_IP_);
+
/*
* Copy the critical parameters from one trans to the next.
*/
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 9d542dfe0052..29706b8b3bd4 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -27,7 +27,6 @@ struct xfs_efi_log_item;
struct xfs_inode;
struct xfs_item_ops;
struct xfs_log_iovec;
-struct xfs_log_item_desc;
struct xfs_mount;
struct xfs_trans;
struct xfs_trans_res;
@@ -43,12 +42,12 @@ struct xfs_bud_log_item;
typedef struct xfs_log_item {
struct list_head li_ail; /* AIL pointers */
+ struct list_head li_trans; /* transaction list */
xfs_lsn_t li_lsn; /* last on-disk lsn */
- struct xfs_log_item_desc *li_desc; /* ptr to current desc*/
struct xfs_mount *li_mountp; /* ptr to fs mount */
struct xfs_ail *li_ailp; /* ptr to AIL */
uint li_type; /* item type */
- uint li_flags; /* misc flags */
+ unsigned long li_flags; /* misc flags */
struct xfs_buf *li_buf; /* real buffer pointer */
struct list_head li_bio_list; /* buffer item list */
void (*li_cb)(struct xfs_buf *,
@@ -64,14 +63,21 @@ typedef struct xfs_log_item {
xfs_lsn_t li_seq; /* CIL commit seq */
} xfs_log_item_t;
-#define XFS_LI_IN_AIL 0x1
-#define XFS_LI_ABORTED 0x2
-#define XFS_LI_FAILED 0x4
+/*
+ * li_flags use the (set/test/clear)_bit atomic interfaces because updates can
+ * race with each other and we don't want to have to use the AIL lock to
+ * serialise all updates.
+ */
+#define XFS_LI_IN_AIL 0
+#define XFS_LI_ABORTED 1
+#define XFS_LI_FAILED 2
+#define XFS_LI_DIRTY 3 /* log item dirty in transaction */
#define XFS_LI_FLAGS \
- { XFS_LI_IN_AIL, "IN_AIL" }, \
- { XFS_LI_ABORTED, "ABORTED" }, \
- { XFS_LI_FAILED, "FAILED" }
+ { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \
+ { (1 << XFS_LI_ABORTED), "ABORTED" }, \
+ { (1 << XFS_LI_FAILED), "FAILED" }, \
+ { (1 << XFS_LI_DIRTY), "DIRTY" }
struct xfs_item_ops {
void (*iop_size)(xfs_log_item_t *, int *, int *);
@@ -111,6 +117,7 @@ typedef struct xfs_trans {
struct xlog_ticket *t_ticket; /* log mgr ticket */
struct xfs_mount *t_mountp; /* ptr to fs mount struct */
struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
+ struct xfs_defer_ops *t_agfl_dfops; /* optional agfl fixup dfops */
unsigned int t_flags; /* misc flags */
int64_t t_icount_delta; /* superblock icount change */
int64_t t_ifree_delta; /* superblock ifree change */
@@ -228,7 +235,8 @@ struct xfs_efd_log_item *xfs_trans_get_efd(struct xfs_trans *,
uint);
int xfs_trans_free_extent(struct xfs_trans *,
struct xfs_efd_log_item *, xfs_fsblock_t,
- xfs_extlen_t, struct xfs_owner_info *);
+ xfs_extlen_t, struct xfs_owner_info *,
+ bool);
int xfs_trans_commit(struct xfs_trans *);
int xfs_trans_roll(struct xfs_trans **);
int xfs_trans_roll_inode(struct xfs_trans **, struct xfs_inode *);
@@ -242,7 +250,6 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
struct xfs_buf *src_bp);
extern kmem_zone_t *xfs_trans_zone;
-extern kmem_zone_t *xfs_log_item_desc_zone;
/* rmap updates */
enum xfs_rmap_intent_type;
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index d4a2445215e6..41e280ef1483 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -32,30 +32,51 @@
#ifdef DEBUG
/*
* Check that the list is sorted as it should be.
+ *
+ * Called with the ail lock held, but we don't want to assert fail with it
+ * held otherwise we'll lock everything up and won't be able to debug the
+ * cause. Hence we sample and check the state under the AIL lock and return if
+ * everything is fine, otherwise we drop the lock and run the ASSERT checks.
+ * Asserts may not be fatal, so pick the lock back up and continue onwards.
*/
STATIC void
xfs_ail_check(
- struct xfs_ail *ailp,
- xfs_log_item_t *lip)
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
{
- xfs_log_item_t *prev_lip;
+ struct xfs_log_item *prev_lip;
+ struct xfs_log_item *next_lip;
+ xfs_lsn_t prev_lsn = NULLCOMMITLSN;
+ xfs_lsn_t next_lsn = NULLCOMMITLSN;
+ xfs_lsn_t lsn;
+ bool in_ail;
+
if (list_empty(&ailp->ail_head))
return;
/*
- * Check the next and previous entries are valid.
+ * Sample then check the next and previous entries are valid.
*/
- ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
- prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
+ in_ail = test_bit(XFS_LI_IN_AIL, &lip->li_flags);
+ prev_lip = list_entry(lip->li_ail.prev, struct xfs_log_item, li_ail);
if (&prev_lip->li_ail != &ailp->ail_head)
- ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
-
- prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
- if (&prev_lip->li_ail != &ailp->ail_head)
- ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
+ prev_lsn = prev_lip->li_lsn;
+ next_lip = list_entry(lip->li_ail.next, struct xfs_log_item, li_ail);
+ if (&next_lip->li_ail != &ailp->ail_head)
+ next_lsn = next_lip->li_lsn;
+ lsn = lip->li_lsn;
+ if (in_ail &&
+ (prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0) &&
+ (next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0))
+ return;
+ spin_unlock(&ailp->ail_lock);
+ ASSERT(in_ail);
+ ASSERT(prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0);
+ ASSERT(next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0);
+ spin_lock(&ailp->ail_lock);
}
#else /* !DEBUG */
#define xfs_ail_check(a,l)
@@ -684,7 +705,7 @@ xfs_trans_ail_update_bulk(
for (i = 0; i < nr_items; i++) {
struct xfs_log_item *lip = log_items[i];
- if (lip->li_flags & XFS_LI_IN_AIL) {
+ if (test_and_set_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
/* check if we really need to move the item */
if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
continue;
@@ -694,7 +715,6 @@ xfs_trans_ail_update_bulk(
if (mlip == lip)
mlip_changed = 1;
} else {
- lip->li_flags |= XFS_LI_IN_AIL;
trace_xfs_ail_insert(lip, 0, lsn);
}
lip->li_lsn = lsn;
@@ -725,7 +745,7 @@ xfs_ail_delete_one(
trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
xfs_ail_delete(ailp, lip);
xfs_clear_li_failed(lip);
- lip->li_flags &= ~XFS_LI_IN_AIL;
+ clear_bit(XFS_LI_IN_AIL, &lip->li_flags);
lip->li_lsn = 0;
return mlip == lip;
@@ -761,7 +781,7 @@ xfs_trans_ail_delete(
struct xfs_mount *mp = ailp->ail_mount;
bool mlip_changed;
- if (!(lip->li_flags & XFS_LI_IN_AIL)) {
+ if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
spin_unlock(&ailp->ail_lock);
if (!XFS_FORCED_SHUTDOWN(mp)) {
xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
index 14543d93cd4b..230a21df4b12 100644
--- a/fs/xfs/xfs_trans_bmap.c
+++ b/fs/xfs/xfs_trans_bmap.c
@@ -79,7 +79,7 @@ xfs_trans_log_finish_bmap_update(
* 2.) shuts down the filesystem
*/
tp->t_flags |= XFS_TRANS_DIRTY;
- budp->bud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
return error;
}
@@ -158,7 +158,7 @@ xfs_bmap_update_log_item(
bmap = container_of(item, struct xfs_bmap_intent, bi_list);
tp->t_flags |= XFS_TRANS_DIRTY;
- buip->bui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
/*
* atomic_inc_return gives us the value after the increment;
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index a5d9dfc45d98..a8ddb4eed279 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -40,7 +40,7 @@ xfs_trans_buf_item_match(
struct xfs_buf_map *map,
int nmaps)
{
- struct xfs_log_item_desc *lidp;
+ struct xfs_log_item *lip;
struct xfs_buf_log_item *blip;
int len = 0;
int i;
@@ -48,8 +48,8 @@ xfs_trans_buf_item_match(
for (i = 0; i < nmaps; i++)
len += map[i].bm_len;
- list_for_each_entry(lidp, &tp->t_items, lid_trans) {
- blip = (struct xfs_buf_log_item *)lidp->lid_item;
+ list_for_each_entry(lip, &tp->t_items, li_trans) {
+ blip = (struct xfs_buf_log_item *)lip;
if (blip->bli_item.li_type == XFS_LI_BUF &&
blip->bli_buf->b_target == target &&
XFS_BUF_ADDR(blip->bli_buf) == map[0].bm_bn &&
@@ -100,14 +100,10 @@ _xfs_trans_bjoin(
atomic_inc(&bip->bli_refcount);
/*
- * Get a log_item_desc to point at the new item.
+ * Attach the item to the transaction so we can find it in
+ * xfs_trans_get_buf() and friends.
*/
xfs_trans_add_item(tp, &bip->bli_item);
-
- /*
- * Initialize b_fsprivate2 so we can find it with incore_match()
- * in xfs_trans_get_buf() and friends above.
- */
bp->b_transp = tp;
}
@@ -391,7 +387,7 @@ xfs_trans_brelse(
* If the buffer is dirty within this transaction, we can't
* release it until we commit.
*/
- if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY)
+ if (test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags))
return;
/*
@@ -442,7 +438,7 @@ xfs_trans_brelse(
ASSERT(bp->b_pincount == 0);
***/
ASSERT(atomic_read(&bip->bli_refcount) == 0);
- ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
+ ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF));
xfs_buf_item_relse(bp);
}
@@ -542,7 +538,7 @@ xfs_trans_dirty_buf(
bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
tp->t_flags |= XFS_TRANS_DIRTY;
- bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags);
}
/*
@@ -626,7 +622,7 @@ xfs_trans_binval(
ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF));
ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK));
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
- ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY);
+ ASSERT(test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags));
ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
return;
}
@@ -642,7 +638,7 @@ xfs_trans_binval(
memset(bip->bli_formats[i].blf_data_map, 0,
(bip->bli_formats[i].blf_map_size * sizeof(uint)));
}
- bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags);
tp->t_flags |= XFS_TRANS_DIRTY;
}
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index c3d547211d16..c381c02cca45 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -77,7 +77,7 @@ xfs_trans_log_dquot(
ASSERT(XFS_DQ_IS_LOCKED(dqp));
tp->t_flags |= XFS_TRANS_DIRTY;
- dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &dqp->q_logitem.qli_item.li_flags);
}
/*
@@ -879,7 +879,7 @@ xfs_trans_log_quotaoff_item(
xfs_qoff_logitem_t *qlp)
{
tp->t_flags |= XFS_TRANS_DIRTY;
- qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &qlp->qql_item.li_flags);
}
STATIC void
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index ab438647592a..2f44a08bdf65 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -68,7 +68,8 @@ xfs_trans_free_extent(
struct xfs_efd_log_item *efdp,
xfs_fsblock_t start_block,
xfs_extlen_t ext_len,
- struct xfs_owner_info *oinfo)
+ struct xfs_owner_info *oinfo,
+ bool skip_discard)
{
struct xfs_mount *mp = tp->t_mountp;
uint next_extent;
@@ -79,9 +80,8 @@ xfs_trans_free_extent(
trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
- error = xfs_free_extent(tp, start_block, ext_len, oinfo,
- XFS_AG_RESV_NONE);
-
+ error = __xfs_free_extent(tp, start_block, ext_len,
+ oinfo, XFS_AG_RESV_NONE, skip_discard);
/*
* Mark the transaction dirty, even on error. This ensures the
* transaction is aborted, which:
@@ -90,7 +90,7 @@ xfs_trans_free_extent(
* 2.) shuts down the filesystem
*/
tp->t_flags |= XFS_TRANS_DIRTY;
- efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
next_extent = efdp->efd_next_extent;
ASSERT(next_extent < efdp->efd_format.efd_nextents);
@@ -155,7 +155,7 @@ xfs_extent_free_log_item(
free = container_of(item, struct xfs_extent_free_item, xefi_list);
tp->t_flags |= XFS_TRANS_DIRTY;
- efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
/*
* atomic_inc_return gives us the value after the increment;
@@ -195,7 +195,7 @@ xfs_extent_free_finish_item(
error = xfs_trans_free_extent(tp, done_item,
free->xefi_startblock,
free->xefi_blockcount,
- &free->xefi_oinfo);
+ &free->xefi_oinfo, free->xefi_skip_discard);
kmem_free(free);
return error;
}
@@ -231,9 +231,79 @@ static const struct xfs_defer_op_type xfs_extent_free_defer_type = {
.cancel_item = xfs_extent_free_cancel_item,
};
+/*
+ * AGFL blocks are accounted differently in the reserve pools and are not
+ * inserted into the busy extent list.
+ */
+STATIC int
+xfs_agfl_free_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_defer_ops *dop,
+ struct list_head *item,
+ void *done_item,
+ void **state)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_efd_log_item *efdp = done_item;
+ struct xfs_extent_free_item *free;
+ struct xfs_extent *extp;
+ struct xfs_buf *agbp;
+ int error;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ uint next_extent;
+
+ free = container_of(item, struct xfs_extent_free_item, xefi_list);
+ ASSERT(free->xefi_blockcount == 1);
+ agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock);
+ agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock);
+
+ trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount);
+
+ error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+ if (!error)
+ error = xfs_free_agfl_block(tp, agno, agbno, agbp,
+ &free->xefi_oinfo);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the EFI and frees the EFD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
+
+ next_extent = efdp->efd_next_extent;
+ ASSERT(next_extent < efdp->efd_format.efd_nextents);
+ extp = &(efdp->efd_format.efd_extents[next_extent]);
+ extp->ext_start = free->xefi_startblock;
+ extp->ext_len = free->xefi_blockcount;
+ efdp->efd_next_extent++;
+
+ kmem_free(free);
+ return error;
+}
+
+
+/* sub-type with special handling for AGFL deferred frees */
+static const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
+ .type = XFS_DEFER_OPS_TYPE_AGFL_FREE,
+ .max_items = XFS_EFI_MAX_FAST_EXTENTS,
+ .diff_items = xfs_extent_free_diff_items,
+ .create_intent = xfs_extent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+ .log_item = xfs_extent_free_log_item,
+ .create_done = xfs_extent_free_create_done,
+ .finish_item = xfs_agfl_free_finish_item,
+ .cancel_item = xfs_extent_free_cancel_item,
+};
+
/* Register the deferred op type. */
void
xfs_extent_free_init_defer_op(void)
{
xfs_defer_init_op_type(&xfs_extent_free_defer_type);
+ xfs_defer_init_op_type(&xfs_agfl_free_defer_type);
}
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 07cea592dc01..f7bd7960a90f 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -133,14 +133,13 @@ xfs_trans_log_inode(
* set however, then go ahead and bump the i_version counter
* unconditionally.
*/
- if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
+ if (!test_and_set_bit(XFS_LI_DIRTY, &ip->i_itemp->ili_item.li_flags) &&
IS_I_VERSION(VFS_I(ip))) {
if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE))
flags |= XFS_ILOG_CORE;
}
tp->t_flags |= XFS_TRANS_DIRTY;
- ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
/*
* Always OR in the bits from the ili_last_fields field.
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index be24b0c8a332..9717ae74b36d 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -19,7 +19,6 @@
#define __XFS_TRANS_PRIV_H__
struct xfs_log_item;
-struct xfs_log_item_desc;
struct xfs_mount;
struct xfs_trans;
struct xfs_ail;
@@ -119,7 +118,7 @@ xfs_trans_ail_remove(
spin_lock(&ailp->ail_lock);
/* xfs_trans_ail_delete() drops the AIL lock */
- if (lip->li_flags & XFS_LI_IN_AIL)
+ if (test_bit(XFS_LI_IN_AIL, &lip->li_flags))
xfs_trans_ail_delete(ailp, lip, shutdown_type);
else
spin_unlock(&ailp->ail_lock);
@@ -171,11 +170,10 @@ xfs_clear_li_failed(
{
struct xfs_buf *bp = lip->li_buf;
- ASSERT(lip->li_flags & XFS_LI_IN_AIL);
+ ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));
lockdep_assert_held(&lip->li_ailp->ail_lock);
- if (lip->li_flags & XFS_LI_FAILED) {
- lip->li_flags &= ~XFS_LI_FAILED;
+ if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) {
lip->li_buf = NULL;
xfs_buf_rele(bp);
}
@@ -188,9 +186,8 @@ xfs_set_li_failed(
{
lockdep_assert_held(&lip->li_ailp->ail_lock);
- if (!(lip->li_flags & XFS_LI_FAILED)) {
+ if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) {
xfs_buf_hold(bp);
- lip->li_flags |= XFS_LI_FAILED;
lip->li_buf = bp;
}
}
diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c
index 94c1877af834..c7f8e82f5bda 100644
--- a/fs/xfs/xfs_trans_refcount.c
+++ b/fs/xfs/xfs_trans_refcount.c
@@ -77,7 +77,7 @@ xfs_trans_log_finish_refcount_update(
* 2.) shuts down the filesystem
*/
tp->t_flags |= XFS_TRANS_DIRTY;
- cudp->cud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
return error;
}
@@ -154,7 +154,7 @@ xfs_refcount_update_log_item(
refc = container_of(item, struct xfs_refcount_intent, ri_list);
tp->t_flags |= XFS_TRANS_DIRTY;
- cuip->cui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
/*
* atomic_inc_return gives us the value after the increment;
diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c
index 9b577beb43d7..5831ca0c270b 100644
--- a/fs/xfs/xfs_trans_rmap.c
+++ b/fs/xfs/xfs_trans_rmap.c
@@ -117,7 +117,7 @@ xfs_trans_log_finish_rmap_update(
* 2.) shuts down the filesystem
*/
tp->t_flags |= XFS_TRANS_DIRTY;
- rudp->rud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
return error;
}
@@ -175,7 +175,7 @@ xfs_rmap_update_log_item(
rmap = container_of(item, struct xfs_rmap_intent, ri_list);
tp->t_flags |= XFS_TRANS_DIRTY;
- ruip->rui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
/*
* atomic_inc_return gives us the value after the increment;