aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig15
-rw-r--r--fs/Kconfig.binfmt15
-rw-r--r--fs/Makefile1
-rw-r--r--fs/afs/cmservice.c25
-rw-r--r--fs/afs/dir.c10
-rw-r--r--fs/afs/write.c18
-rw-r--r--fs/binfmt_em86.c110
-rw-r--r--fs/block_dev.c8
-rw-r--r--fs/btrfs/backref.c6
-rw-r--r--fs/btrfs/backref.h3
-rw-r--r--fs/btrfs/block-group.c367
-rw-r--r--fs/btrfs/block-group.h6
-rw-r--r--fs/btrfs/compression.c2
-rw-r--r--fs/btrfs/ctree.c67
-rw-r--r--fs/btrfs/delayed-ref.c4
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/extent-tree.c3
-rw-r--r--fs/btrfs/file.c23
-rw-r--r--fs/btrfs/inode.c159
-rw-r--r--fs/btrfs/ordered-data.c2
-rw-r--r--fs/btrfs/ordered-data.h3
-rw-r--r--fs/btrfs/qgroup.c38
-rw-r--r--fs/btrfs/qgroup.h2
-rw-r--r--fs/btrfs/tests/qgroup-tests.c20
-rw-r--r--fs/btrfs/transaction.c15
-rw-r--r--fs/btrfs/transaction.h9
-rw-r--r--fs/btrfs/tree-log.c37
-rw-r--r--fs/btrfs/volumes.c356
-rw-r--r--fs/btrfs/volumes.h5
-rw-r--r--fs/btrfs/zoned.c12
-rw-r--r--fs/ceph/addr.c26
-rw-r--r--fs/ceph/caps.c142
-rw-r--r--fs/ceph/debugfs.c37
-rw-r--r--fs/ceph/dir.c16
-rw-r--r--fs/ceph/export.c1
-rw-r--r--fs/ceph/file.c24
-rw-r--r--fs/ceph/inode.c38
-rw-r--r--fs/ceph/mds_client.c81
-rw-r--r--fs/ceph/mds_client.h6
-rw-r--r--fs/ceph/metric.c167
-rw-r--r--fs/ceph/metric.h89
-rw-r--r--fs/ceph/quota.c9
-rw-r--r--fs/ceph/snap.c79
-rw-r--r--fs/ceph/super.h5
-rw-r--r--fs/cifs/cifs_dfs_ref.c9
-rw-r--r--fs/cifs/cifsfs.c4
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h18
-rw-r--r--fs/cifs/cifspdu.h1
-rw-r--r--fs/cifs/cifssmb.c34
-rw-r--r--fs/cifs/connect.c119
-rw-r--r--fs/cifs/dfs_cache.c229
-rw-r--r--fs/cifs/dfs_cache.h3
-rw-r--r--fs/cifs/dir.c2
-rw-r--r--fs/cifs/dns_resolve.c10
-rw-r--r--fs/cifs/dns_resolve.h2
-rw-r--r--fs/cifs/file.c37
-rw-r--r--fs/cifs/fs_context.c38
-rw-r--r--fs/cifs/fs_context.h1
-rw-r--r--fs/cifs/inode.c19
-rw-r--r--fs/cifs/misc.c52
-rw-r--r--fs/cifs/smb2ops.c54
-rw-r--r--fs/cifs/smb2pdu.c87
-rw-r--r--fs/cifs/smb2pdu.h8
-rw-r--r--fs/cifs/transport.c2
-rw-r--r--fs/configfs/file.c192
-rw-r--r--fs/coredump.c4
-rw-r--r--fs/d_path.c324
-rw-r--r--fs/dax.c2
-rw-r--r--fs/debugfs/file.c38
-rw-r--r--fs/exec.c3
-rw-r--r--fs/exfat/dir.c8
-rw-r--r--fs/exfat/super.c2
-rw-r--r--fs/ext2/dir.c12
-rw-r--r--fs/ext2/ext2.h3
-rw-r--r--fs/ext2/namei.c4
-rw-r--r--fs/ext4/ext4_jbd2.c5
-rw-r--r--fs/ext4/ioctl.c16
-rw-r--r--fs/ext4/mballoc.c9
-rw-r--r--fs/ext4/mmp.c33
-rw-r--r--fs/ext4/namei.c2
-rw-r--r--fs/ext4/resize.c4
-rw-r--r--fs/ext4/super.c26
-rw-r--r--fs/f2fs/checkpoint.c4
-rw-r--r--fs/f2fs/compress.c255
-rw-r--r--fs/f2fs/data.c360
-rw-r--r--fs/f2fs/debug.c13
-rw-r--r--fs/f2fs/dir.c25
-rw-r--r--fs/f2fs/f2fs.h228
-rw-r--r--fs/f2fs/file.c37
-rw-r--r--fs/f2fs/gc.c16
-rw-r--r--fs/f2fs/inline.c4
-rw-r--r--fs/f2fs/inode.c23
-rw-r--r--fs/f2fs/namei.c36
-rw-r--r--fs/f2fs/node.c35
-rw-r--r--fs/f2fs/node.h33
-rw-r--r--fs/f2fs/recovery.c29
-rw-r--r--fs/f2fs/segment.c34
-rw-r--r--fs/f2fs/super.c188
-rw-r--r--fs/f2fs/sysfs.c232
-rw-r--r--fs/fcntl.c2
-rw-r--r--fs/fhandle.c2
-rw-r--r--fs/fs-writeback.c3
-rw-r--r--fs/fs_context.c54
-rw-r--r--fs/fuse/dax.c12
-rw-r--r--fs/fuse/dev.c14
-rw-r--r--fs/fuse/dir.c63
-rw-r--r--fs/fuse/file.c18
-rw-r--r--fs/fuse/fuse_i.h24
-rw-r--r--fs/fuse/inode.c100
-rw-r--r--fs/fuse/readdir.c7
-rw-r--r--fs/fuse/virtio_fs.c4
-rw-r--r--fs/hfs/bfind.c14
-rw-r--r--fs/hfs/bnode.c25
-rw-r--r--fs/hfs/btree.h7
-rw-r--r--fs/hfs/super.c10
-rw-r--r--fs/hfsplus/inode.c5
-rw-r--r--fs/hfsplus/xattr.c1
-rw-r--r--fs/hugetlbfs/inode.c2
-rw-r--r--fs/internal.h3
-rw-r--r--fs/io-wq.c96
-rw-r--r--fs/io_uring.c360
-rw-r--r--fs/iomap/buffered-io.c43
-rw-r--r--fs/iomap/seek.c25
-rw-r--r--fs/jbd2/checkpoint.c4
-rw-r--r--fs/jbd2/journal.c149
-rw-r--r--fs/jfs/inode.c3
-rw-r--r--fs/jfs/jfs_dinode.h14
-rw-r--r--fs/jfs/jfs_dmap.c2
-rw-r--r--fs/jfs/jfs_imap.c8
-rw-r--r--fs/jfs/jfs_incore.h12
-rw-r--r--fs/jfs/jfs_logmgr.c1
-rw-r--r--fs/jfs/jfs_txnmgr.c2
-rw-r--r--fs/jfs/super.c3
-rw-r--r--fs/kernel_read_file.c2
-rw-r--r--fs/kernfs/dir.c86
-rw-r--r--fs/lockd/svc.c43
-rw-r--r--fs/lockd/svcxdr.h151
-rw-r--r--fs/lockd/xdr.c402
-rw-r--r--fs/lockd/xdr4.c403
-rw-r--r--fs/namei.c80
-rw-r--r--fs/namespace.c42
-rw-r--r--fs/nfs/delegation.c94
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/direct.c17
-rw-r--r--fs/nfs/fscache.c18
-rw-r--r--fs/nfs/getroot.c12
-rw-r--r--fs/nfs/inode.c61
-rw-r--r--fs/nfs/nfs3proc.c4
-rw-r--r--fs/nfs/nfs4_fs.h4
-rw-r--r--fs/nfs/nfs4client.c82
-rw-r--r--fs/nfs/nfs4file.c8
-rw-r--r--fs/nfs/nfs4proc.c114
-rw-r--r--fs/nfs/nfs4trace.h6
-rw-r--r--fs/nfs/nfstrace.h8
-rw-r--r--fs/nfs/pagelist.c8
-rw-r--r--fs/nfs/pnfs.c68
-rw-r--r--fs/nfs/pnfs_nfs.c54
-rw-r--r--fs/nfs/read.c20
-rw-r--r--fs/nfs/write.c6
-rw-r--r--fs/nfs_common/grace.c1
-rw-r--r--fs/nfsd/blocklayout.c2
-rw-r--r--fs/nfsd/netns.h6
-rw-r--r--fs/nfsd/nfs3acl.c3
-rw-r--r--fs/nfsd/nfs4callback.c47
-rw-r--r--fs/nfsd/nfs4proc.c154
-rw-r--r--fs/nfsd/nfs4state.c179
-rw-r--r--fs/nfsd/nfsd.h4
-rw-r--r--fs/nfsd/nfsfh.h7
-rw-r--r--fs/nfsd/nfssvc.c3
-rw-r--r--fs/nfsd/trace.h250
-rw-r--r--fs/nfsd/vfs.c26
-rw-r--r--fs/nfsd/xdr4.h1
-rw-r--r--fs/nilfs2/btree.c1
-rw-r--r--fs/notify/fanotify/fanotify_user.c17
-rw-r--r--fs/notify/inotify/inotify_user.c17
-rw-r--r--fs/ntfs/file.c33
-rw-r--r--fs/ocfs2/file.c103
-rw-r--r--fs/open.c17
-rw-r--r--fs/orangefs/inode.c7
-rw-r--r--fs/orangefs/super.c2
-rw-r--r--fs/overlayfs/export.c2
-rw-r--r--fs/overlayfs/file.c47
-rw-r--r--fs/overlayfs/readdir.c5
-rw-r--r--fs/pipe.c32
-rw-r--r--fs/proc/base.c6
-rw-r--r--fs/proc/bootconfig.c2
-rw-r--r--fs/proc/fd.c20
-rw-r--r--fs/proc/kcore.c67
-rw-r--r--fs/proc/proc_sysctl.c2
-rw-r--r--fs/proc/task_mmu.c34
-rw-r--r--fs/reiserfs/stree.c31
-rw-r--r--fs/reiserfs/super.c8
-rw-r--r--fs/seq_file.c46
-rw-r--r--fs/ubifs/debug.c2
-rw-r--r--fs/ubifs/dir.c7
-rw-r--r--fs/ubifs/journal.c3
-rw-r--r--fs/ubifs/master.c2
-rw-r--r--fs/ubifs/replay.c2
-rw-r--r--fs/ubifs/super.c3
-rw-r--r--fs/ubifs/tnc_commit.c2
-rw-r--r--fs/ubifs/ubifs.h4
-rw-r--r--fs/ubifs/xattr.c48
-rw-r--r--fs/userfaultfd.c41
-rw-r--r--fs/vboxsf/dir.c76
-rw-r--r--fs/vboxsf/file.c71
-rw-r--r--fs/vboxsf/vfsmod.h7
-rw-r--r--fs/xfs/libxfs/xfs_ag.c288
-rw-r--r--fs/xfs/libxfs/xfs_ag.h136
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c11
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.h15
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c111
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h2
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c31
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.h9
-rw-r--r--fs/xfs/libxfs/xfs_attr.c910
-rw-r--r--fs/xfs/libxfs/xfs_attr.h403
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c5
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h2
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c167
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.h8
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c3
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h1
-rw-r--r--fs/xfs/libxfs/xfs_btree.c15
-rw-r--r--fs/xfs/libxfs/xfs_btree.h10
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c692
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h43
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c46
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h13
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c30
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h14
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c122
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h9
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c39
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.h7
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c147
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h6
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c46
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.h6
-rw-r--r--fs/xfs/libxfs/xfs_sb.c146
-rw-r--r--fs/xfs/libxfs/xfs_sb.h9
-rw-r--r--fs/xfs/libxfs/xfs_shared.h20
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c10
-rw-r--r--fs/xfs/libxfs/xfs_types.c4
-rw-r--r--fs/xfs/libxfs/xfs_types.h1
-rw-r--r--fs/xfs/scrub/agheader.c1
-rw-r--r--fs/xfs/scrub/agheader_repair.c33
-rw-r--r--fs/xfs/scrub/alloc.c3
-rw-r--r--fs/xfs/scrub/bmap.c21
-rw-r--r--fs/xfs/scrub/common.c15
-rw-r--r--fs/xfs/scrub/fscounters.c42
-rw-r--r--fs/xfs/scrub/health.c2
-rw-r--r--fs/xfs/scrub/ialloc.c9
-rw-r--r--fs/xfs/scrub/inode.c18
-rw-r--r--fs/xfs/scrub/refcount.c3
-rw-r--r--fs/xfs/scrub/repair.c14
-rw-r--r--fs/xfs/scrub/rmap.c3
-rw-r--r--fs/xfs/scrub/trace.c3
-rw-r--r--fs/xfs/xfs_attr_inactive.c2
-rw-r--r--fs/xfs/xfs_bio_io.c35
-rw-r--r--fs/xfs/xfs_bmap_util.c6
-rw-r--r--fs/xfs/xfs_buf.c311
-rw-r--r--fs/xfs/xfs_buf.h3
-rw-r--r--fs/xfs/xfs_buf_item.c97
-rw-r--r--fs/xfs/xfs_buf_item_recover.c15
-rw-r--r--fs/xfs/xfs_discard.c6
-rw-r--r--fs/xfs/xfs_dquot_item.c2
-rw-r--r--fs/xfs/xfs_extent_busy.c33
-rw-r--r--fs/xfs/xfs_extent_busy.h7
-rw-r--r--fs/xfs/xfs_file.c70
-rw-r--r--fs/xfs/xfs_filestream.c2
-rw-r--r--fs/xfs/xfs_fsmap.c80
-rw-r--r--fs/xfs/xfs_fsops.c24
-rw-r--r--fs/xfs/xfs_health.c15
-rw-r--r--fs/xfs/xfs_icache.c1156
-rw-r--r--fs/xfs/xfs_icache.h58
-rw-r--r--fs/xfs/xfs_inode.c247
-rw-r--r--fs/xfs/xfs_inode.h9
-rw-r--r--fs/xfs/xfs_inode_item.c18
-rw-r--r--fs/xfs/xfs_inode_item.h2
-rw-r--r--fs/xfs/xfs_inode_item_recover.c39
-rw-r--r--fs/xfs/xfs_ioctl.c70
-rw-r--r--fs/xfs/xfs_iops.c4
-rw-r--r--fs/xfs/xfs_iwalk.c84
-rw-r--r--fs/xfs/xfs_linux.h2
-rw-r--r--fs/xfs/xfs_log.c492
-rw-r--r--fs/xfs/xfs_log.h5
-rw-r--r--fs/xfs/xfs_log_cil.c149
-rw-r--r--fs/xfs/xfs_log_priv.h49
-rw-r--r--fs/xfs/xfs_log_recover.c61
-rw-r--r--fs/xfs/xfs_mount.c136
-rw-r--r--fs/xfs/xfs_mount.h110
-rw-r--r--fs/xfs/xfs_qm.c10
-rw-r--r--fs/xfs/xfs_qm.h1
-rw-r--r--fs/xfs/xfs_qm_syscalls.c54
-rw-r--r--fs/xfs/xfs_reflink.c13
-rw-r--r--fs/xfs/xfs_rtalloc.c49
-rw-r--r--fs/xfs/xfs_super.c10
-rw-r--r--fs/xfs/xfs_super.h1
-rw-r--r--fs/xfs/xfs_symlink.c9
-rw-r--r--fs/xfs/xfs_trace.c2
-rw-r--r--fs/xfs/xfs_trace.h118
-rw-r--r--fs/xfs/xfs_trans.c6
-rw-r--r--fs/xfs/xfs_trans.h4
-rw-r--r--fs/zonefs/super.c3
305 files changed, 10121 insertions, 6631 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 141a856c50e7..a7749c126b8e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -240,6 +240,21 @@ config HUGETLBFS
config HUGETLB_PAGE
def_bool HUGETLBFS
+config HUGETLB_PAGE_FREE_VMEMMAP
+ def_bool HUGETLB_PAGE
+ depends on X86_64
+ depends on SPARSEMEM_VMEMMAP
+
+config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON
+ bool "Default freeing vmemmap pages of HugeTLB to on"
+ default n
+ depends on HUGETLB_PAGE_FREE_VMEMMAP
+ help
+ When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap
+ pages associated with each HugeTLB page is default off. Say Y here
+ to enable freeing vmemmap pages of HugeTLB by default. It can then
+ be disabled on the command line via hugetlb_free_vmemmap=off.
+
config MEMFD_CREATE
def_bool TMPFS || HUGETLBFS
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 06fb7a93a1bd..4d5ae61580aa 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -168,21 +168,6 @@ config OSF4_COMPAT
with v4 shared libraries freely available from Compaq. If you're
going to use shared libraries from Tru64 version 5.0 or later, say N.
-config BINFMT_EM86
- tristate "Kernel support for Linux/Intel ELF binaries"
- depends on ALPHA
- help
- Say Y here if you want to be able to execute Linux/Intel ELF
- binaries just like native Alpha binaries on your Alpha machine. For
- this to work, you need to have the emulator /usr/bin/em86 in place.
-
- You can get the same functionality by saying N here and saying Y to
- "Kernel support for MISC binaries".
-
- You may answer M to compile the emulation support as a module and
- later load the module when you want to use a Linux/Intel binary. The
- module will be called binfmt_em86. If unsure, say Y.
-
config BINFMT_MISC
tristate "Kernel support for MISC binaries"
help
diff --git a/fs/Makefile b/fs/Makefile
index 9c708e1fbe8f..f98f3e691c37 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -39,7 +39,6 @@ obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/
obj-$(CONFIG_FILE_LOCKING) += locks.o
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
-obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o
obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index d3c6bb22c5f4..a3f5de28be79 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -29,16 +29,11 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *);
static int afs_deliver_yfs_cb_callback(struct afs_call *);
-#define CM_NAME(name) \
- char afs_SRXCB##name##_name[] __tracepoint_string = \
- "CB." #name
-
/*
* CB.CallBack operation type
*/
-static CM_NAME(CallBack);
static const struct afs_call_type afs_SRXCBCallBack = {
- .name = afs_SRXCBCallBack_name,
+ .name = "CB.CallBack",
.deliver = afs_deliver_cb_callback,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_CallBack,
@@ -47,9 +42,8 @@ static const struct afs_call_type afs_SRXCBCallBack = {
/*
* CB.InitCallBackState operation type
*/
-static CM_NAME(InitCallBackState);
static const struct afs_call_type afs_SRXCBInitCallBackState = {
- .name = afs_SRXCBInitCallBackState_name,
+ .name = "CB.InitCallBackState",
.deliver = afs_deliver_cb_init_call_back_state,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_InitCallBackState,
@@ -58,9 +52,8 @@ static const struct afs_call_type afs_SRXCBInitCallBackState = {
/*
* CB.InitCallBackState3 operation type
*/
-static CM_NAME(InitCallBackState3);
static const struct afs_call_type afs_SRXCBInitCallBackState3 = {
- .name = afs_SRXCBInitCallBackState3_name,
+ .name = "CB.InitCallBackState3",
.deliver = afs_deliver_cb_init_call_back_state3,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_InitCallBackState,
@@ -69,9 +62,8 @@ static const struct afs_call_type afs_SRXCBInitCallBackState3 = {
/*
* CB.Probe operation type
*/
-static CM_NAME(Probe);
static const struct afs_call_type afs_SRXCBProbe = {
- .name = afs_SRXCBProbe_name,
+ .name = "CB.Probe",
.deliver = afs_deliver_cb_probe,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_Probe,
@@ -80,9 +72,8 @@ static const struct afs_call_type afs_SRXCBProbe = {
/*
* CB.ProbeUuid operation type
*/
-static CM_NAME(ProbeUuid);
static const struct afs_call_type afs_SRXCBProbeUuid = {
- .name = afs_SRXCBProbeUuid_name,
+ .name = "CB.ProbeUuid",
.deliver = afs_deliver_cb_probe_uuid,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_ProbeUuid,
@@ -91,9 +82,8 @@ static const struct afs_call_type afs_SRXCBProbeUuid = {
/*
* CB.TellMeAboutYourself operation type
*/
-static CM_NAME(TellMeAboutYourself);
static const struct afs_call_type afs_SRXCBTellMeAboutYourself = {
- .name = afs_SRXCBTellMeAboutYourself_name,
+ .name = "CB.TellMeAboutYourself",
.deliver = afs_deliver_cb_tell_me_about_yourself,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_TellMeAboutYourself,
@@ -102,9 +92,8 @@ static const struct afs_call_type afs_SRXCBTellMeAboutYourself = {
/*
* YFS CB.CallBack operation type
*/
-static CM_NAME(YFS_CallBack);
static const struct afs_call_type afs_SRXYFSCB_CallBack = {
- .name = afs_SRXCBYFS_CallBack_name,
+ .name = "YFSCB.CallBack",
.deliver = afs_deliver_yfs_cb_callback,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_CallBack,
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 78719f2f567e..ac829e63c570 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -656,7 +656,6 @@ static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry,
return ret;
}
- ret = -ENOENT;
if (!cookie.found) {
_leave(" = -ENOENT [not found]");
return -ENOENT;
@@ -2020,17 +2019,20 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
if (d_count(new_dentry) > 2) {
/* copy the target dentry's name */
- ret = -ENOMEM;
op->rename.tmp = d_alloc(new_dentry->d_parent,
&new_dentry->d_name);
- if (!op->rename.tmp)
+ if (!op->rename.tmp) {
+ op->error = -ENOMEM;
goto error;
+ }
ret = afs_sillyrename(new_dvnode,
AFS_FS_I(d_inode(new_dentry)),
new_dentry, op->key);
- if (ret)
+ if (ret) {
+ op->error = ret;
goto error;
+ }
op->dentry_2 = op->rename.tmp;
op->rename.rehash = NULL;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3104b62c2082..c0534697268e 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -771,14 +771,20 @@ int afs_writepages(struct address_space *mapping,
if (wbc->range_cyclic) {
start = mapping->writeback_index * PAGE_SIZE;
ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX, &next);
- if (start > 0 && wbc->nr_to_write > 0 && ret == 0)
- ret = afs_writepages_region(mapping, wbc, 0, start,
- &next);
- mapping->writeback_index = next / PAGE_SIZE;
+ if (ret == 0) {
+ mapping->writeback_index = next / PAGE_SIZE;
+ if (start > 0 && wbc->nr_to_write > 0) {
+ ret = afs_writepages_region(mapping, wbc, 0,
+ start, &next);
+ if (ret == 0)
+ mapping->writeback_index =
+ next / PAGE_SIZE;
+ }
+ }
} else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX, &next);
- if (wbc->nr_to_write > 0)
- mapping->writeback_index = next;
+ if (wbc->nr_to_write > 0 && ret == 0)
+ mapping->writeback_index = next / PAGE_SIZE;
} else {
ret = afs_writepages_region(mapping, wbc,
wbc->range_start, wbc->range_end, &next);
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
deleted file mode 100644
index 06b9b9fddf70..000000000000
--- a/fs/binfmt_em86.c
+++ /dev/null
@@ -1,110 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/fs/binfmt_em86.c
- *
- * Based on linux/fs/binfmt_script.c
- * Copyright (C) 1996 Martin von Löwis
- * original #!-checking implemented by tytso.
- *
- * em86 changes Copyright (C) 1997 Jim Paradis
- */
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/binfmts.h>
-#include <linux/elf.h>
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/errno.h>
-
-
-#define EM86_INTERP "/usr/bin/em86"
-#define EM86_I_NAME "em86"
-
-static int load_em86(struct linux_binprm *bprm)
-{
- const char *i_name, *i_arg;
- char *interp;
- struct file * file;
- int retval;
- struct elfhdr elf_ex;
-
- /* Make sure this is a Linux/Intel ELF executable... */
- elf_ex = *((struct elfhdr *)bprm->buf);
-
- if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
- return -ENOEXEC;
-
- /* First of all, some simple consistency checks */
- if ((elf_ex.e_type != ET_EXEC && elf_ex.e_type != ET_DYN) ||
- (!((elf_ex.e_machine == EM_386) || (elf_ex.e_machine == EM_486))) ||
- !bprm->file->f_op->mmap) {
- return -ENOEXEC;
- }
-
- /* Need to be able to load the file after exec */
- if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
- return -ENOENT;
-
- /* Unlike in the script case, we don't have to do any hairy
- * parsing to find our interpreter... it's hardcoded!
- */
- interp = EM86_INTERP;
- i_name = EM86_I_NAME;
- i_arg = NULL; /* We reserve the right to add an arg later */
-
- /*
- * Splice in (1) the interpreter's name for argv[0]
- * (2) (optional) argument to interpreter
- * (3) filename of emulated file (replace argv[0])
- *
- * This is done in reverse order, because of how the
- * user environment and arguments are stored.
- */
- remove_arg_zero(bprm);
- retval = copy_string_kernel(bprm->filename, bprm);
- if (retval < 0) return retval;
- bprm->argc++;
- if (i_arg) {
- retval = copy_string_kernel(i_arg, bprm);
- if (retval < 0) return retval;
- bprm->argc++;
- }
- retval = copy_string_kernel(i_name, bprm);
- if (retval < 0) return retval;
- bprm->argc++;
-
- /*
- * OK, now restart the process with the interpreter's inode.
- * Note that we use open_exec() as the name is now in kernel
- * space, and we don't need to copy it.
- */
- file = open_exec(interp);
- if (IS_ERR(file))
- return PTR_ERR(file);
-
- bprm->interpreter = file;
- return 0;
-}
-
-static struct linux_binfmt em86_format = {
- .module = THIS_MODULE,
- .load_binary = load_em86,
-};
-
-static int __init init_em86_binfmt(void)
-{
- register_binfmt(&em86_format);
- return 0;
-}
-
-static void __exit exit_em86_binfmt(void)
-{
- unregister_binfmt(&em86_format);
-}
-
-core_initcall(init_em86_binfmt);
-module_exit(exit_em86_binfmt);
-MODULE_LICENSE("GPL");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7e83c3e71504..9ef4f1fc2cb0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -812,6 +812,8 @@ static void bdev_free_inode(struct inode *inode)
free_percpu(bdev->bd_stats);
kfree(bdev->bd_meta_info);
+ if (!bdev_is_partition(bdev))
+ kfree(bdev->bd_disk);
kmem_cache_free(bdev_cachep, BDEV_I(inode));
}
@@ -1609,7 +1611,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
* Does not take i_mutex for the write and thus is not for general purpose
* use.
*/
-ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *bd_inode = bdev_file_inode(file);
@@ -1647,9 +1649,8 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
blk_finish_plug(&plug);
return ret;
}
-EXPORT_SYMBOL_GPL(blkdev_write_iter);
-ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
+static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
struct inode *bd_inode = bdev_file_inode(file);
@@ -1671,7 +1672,6 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
iov_iter_reexpand(to, iov_iter_count(to) + shorted);
return ret;
}
-EXPORT_SYMBOL_GPL(blkdev_read_iter);
static int blkdev_writepages(struct address_space *mapping,
struct writeback_control *wbc)
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 7a8a2fc19533..78b202d198b8 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1488,15 +1488,15 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist **roots,
- bool ignore_offset)
+ bool ignore_offset, bool skip_commit_root_sem)
{
int ret;
- if (!trans)
+ if (!trans && !skip_commit_root_sem)
down_read(&fs_info->commit_root_sem);
ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr,
time_seq, roots, ignore_offset);
- if (!trans)
+ if (!trans && !skip_commit_root_sem)
up_read(&fs_info->commit_root_sem);
return ret;
}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 17abde7f794c..ff5f07f9940b 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -47,7 +47,8 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
const u64 *extent_item_pos, bool ignore_offset);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots, bool ignore_offset);
+ u64 time_seq, struct ulist **roots, bool ignore_offset,
+ bool skip_commit_root_sem);
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
u32 name_len, unsigned long name_off,
struct extent_buffer *eb_in, u64 parent,
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 38b127b9edfc..9e7d9d0c763d 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1498,9 +1498,18 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
return;
- mutex_lock(&fs_info->reclaim_bgs_lock);
+ /*
+ * Long running balances can keep us blocked here for eternity, so
+ * simply skip reclaim if we're unable to get the mutex.
+ */
+ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
+ btrfs_exclop_finish(fs_info);
+ return;
+ }
+
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->reclaim_bgs)) {
+ u64 zone_unusable;
int ret = 0;
bg = list_first_entry(&fs_info->reclaim_bgs,
@@ -1534,13 +1543,22 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
goto next;
}
+ /*
+ * Cache the zone_unusable value before turning the block group
+ * to read only. As soon as the blog group is read only it's
+ * zone_unusable value gets moved to the block group's read-only
+ * bytes and isn't available for calculations anymore.
+ */
+ zone_unusable = bg->zone_unusable;
ret = inc_block_group_ro(bg, 0);
up_write(&space_info->groups_sem);
if (ret < 0)
goto next;
- btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used",
- bg->start, div_u64(bg->used * 100, bg->length));
+ btrfs_info(fs_info,
+ "reclaiming chunk %llu with %llu%% used %llu%% unusable",
+ bg->start, div_u64(bg->used * 100, bg->length),
+ div64_u64(zone_unusable * 100, bg->length));
trace_btrfs_reclaim_block_group(bg);
ret = btrfs_relocate_chunk(fs_info, bg->start);
if (ret)
@@ -2197,6 +2215,13 @@ error:
return ret;
}
+/*
+ * This function, insert_block_group_item(), belongs to the phase 2 of chunk
+ * allocation.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
static int insert_block_group_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group)
{
@@ -2219,15 +2244,19 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
}
+/*
+ * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
+ * chunk allocation.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *block_group;
int ret = 0;
- if (!trans->can_flush_pending_bgs)
- return;
-
while (!list_empty(&trans->new_bgs)) {
int index;
@@ -2242,6 +2271,13 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
ret = insert_block_group_item(trans, block_group);
if (ret)
btrfs_abort_transaction(trans, ret);
+ if (!block_group->chunk_item_inserted) {
+ mutex_lock(&fs_info->chunk_mutex);
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
+ mutex_unlock(&fs_info->chunk_mutex);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
ret = btrfs_finish_chunk_alloc(trans, block_group->start,
block_group->length);
if (ret)
@@ -2265,8 +2301,9 @@ next:
btrfs_trans_release_chunk_metadata(trans);
}
-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
- u64 type, u64 chunk_offset, u64 size)
+struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
+ u64 bytes_used, u64 type,
+ u64 chunk_offset, u64 size)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *cache;
@@ -2276,7 +2313,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
if (!cache)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
cache->length = size;
set_free_space_tree_thresholds(cache);
@@ -2290,7 +2327,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
ret = btrfs_load_block_group_zone_info(cache, true);
if (ret) {
btrfs_put_block_group(cache);
- return ret;
+ return ERR_PTR(ret);
}
ret = exclude_super_stripes(cache);
@@ -2298,7 +2335,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
/* We may have excluded something, so call this just in case */
btrfs_free_excluded_extents(cache);
btrfs_put_block_group(cache);
- return ret;
+ return ERR_PTR(ret);
}
add_new_free_space(cache, chunk_offset, chunk_offset + size);
@@ -2325,7 +2362,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
if (ret) {
btrfs_remove_free_space_cache(cache);
btrfs_put_block_group(cache);
- return ret;
+ return ERR_PTR(ret);
}
/*
@@ -2344,7 +2381,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
btrfs_update_delayed_refs_rsv(trans);
set_avail_alloc_bits(fs_info, type);
- return 0;
+ return cache;
}
/*
@@ -3222,11 +3259,203 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
}
+static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
+{
+ struct btrfs_block_group *bg;
+ int ret;
+
+ /*
+ * Check if we have enough space in the system space info because we
+ * will need to update device items in the chunk btree and insert a new
+ * chunk item in the chunk btree as well. This will allocate a new
+ * system block group if needed.
+ */
+ check_system_chunk(trans, flags);
+
+ bg = btrfs_alloc_chunk(trans, flags);
+ if (IS_ERR(bg)) {
+ ret = PTR_ERR(bg);
+ goto out;
+ }
+
+ /*
+ * If this is a system chunk allocation then stop right here and do not
+ * add the chunk item to the chunk btree. This is to prevent a deadlock
+ * because this system chunk allocation can be triggered while COWing
+ * some extent buffer of the chunk btree and while holding a lock on a
+ * parent extent buffer, in which case attempting to insert the chunk
+ * item (or update the device item) would result in a deadlock on that
+ * parent extent buffer. In this case defer the chunk btree updates to
+ * the second phase of chunk allocation and keep our reservation until
+ * the second phase completes.
+ *
+ * This is a rare case and can only be triggered by the very few cases
+ * we have where we need to touch the chunk btree outside chunk allocation
+ * and chunk removal. These cases are basically adding a device, removing
+ * a device or resizing a device.
+ */
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return 0;
+
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
+ /*
+ * Normally we are not expected to fail with -ENOSPC here, since we have
+ * previously reserved space in the system space_info and allocated one
+ * new system chunk if necessary. However there are two exceptions:
+ *
+ * 1) We may have enough free space in the system space_info but all the
+ * existing system block groups have a profile which can not be used
+ * for extent allocation.
+ *
+ * This happens when mounting in degraded mode. For example we have a
+ * RAID1 filesystem with 2 devices, lose one device and mount the fs
+ * using the other device in degraded mode. If we then allocate a chunk,
+ * we may have enough free space in the existing system space_info, but
+ * none of the block groups can be used for extent allocation since they
+ * have a RAID1 profile, and because we are in degraded mode with a
+ * single device, we are forced to allocate a new system chunk with a
+ * SINGLE profile. Making check_system_chunk() iterate over all system
+ * block groups and check if they have a usable profile and enough space
+ * can be slow on very large filesystems, so we tolerate the -ENOSPC and
+ * try again after forcing allocation of a new system chunk. Like this
+ * we avoid paying the cost of that search in normal circumstances, when
+ * we were not mounted in degraded mode;
+ *
+ * 2) We had enough free space info the system space_info, and one suitable
+ * block group to allocate from when we called check_system_chunk()
+ * above. However right after we called it, the only system block group
+ * with enough free space got turned into RO mode by a running scrub,
+ * and in this case we have to allocate a new one and retry. We only
+ * need do this allocate and retry once, since we have a transaction
+ * handle and scrub uses the commit root to search for block groups.
+ */
+ if (ret == -ENOSPC) {
+ const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
+ struct btrfs_block_group *sys_bg;
+
+ sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+ if (IS_ERR(sys_bg)) {
+ ret = PTR_ERR(sys_bg);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ } else if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+out:
+ btrfs_trans_release_chunk_metadata(trans);
+
+ return ret;
+}
+
/*
- * If force is CHUNK_ALLOC_FORCE:
+ * Chunk allocation is done in 2 phases:
+ *
+ * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
+ * the chunk, the chunk mapping, create its block group and add the items
+ * that belong in the chunk btree to it - more specifically, we need to
+ * update device items in the chunk btree and add a new chunk item to it.
+ *
+ * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
+ * group item to the extent btree and the device extent items to the devices
+ * btree.
+ *
+ * This is done to prevent deadlocks. For example when COWing a node from the
+ * extent btree we are holding a write lock on the node's parent and if we
+ * trigger chunk allocation and attempted to insert the new block group item
+ * in the extent btree right way, we could deadlock because the path for the
+ * insertion can include that parent node. At first glance it seems impossible
+ * to trigger chunk allocation after starting a transaction since tasks should
+ * reserve enough transaction units (metadata space), however while that is true
+ * most of the time, chunk allocation may still be triggered for several reasons:
+ *
+ * 1) When reserving metadata, we check if there is enough free space in the
+ * metadata space_info and therefore don't trigger allocation of a new chunk.
+ * However later when the task actually tries to COW an extent buffer from
+ * the extent btree or from the device btree for example, it is forced to
+ * allocate a new block group (chunk) because the only one that had enough
+ * free space was just turned to RO mode by a running scrub for example (or
+ * device replace, block group reclaim thread, etc), so we can not use it
+ * for allocating an extent and end up being forced to allocate a new one;
+ *
+ * 2) Because we only check that the metadata space_info has enough free bytes,
+ * we end up not allocating a new metadata chunk in that case. However if
+ * the filesystem was mounted in degraded mode, none of the existing block
+ * groups might be suitable for extent allocation due to their incompatible
+ * profile (for e.g. mounting a 2 devices filesystem, where all block groups
+ * use a RAID1 profile, in degraded mode using a single device). In this case
+ * when the task attempts to COW some extent buffer of the extent btree for
+ * example, it will trigger allocation of a new metadata block group with a
+ * suitable profile (SINGLE profile in the example of the degraded mount of
+ * the RAID1 filesystem);
+ *
+ * 3) The task has reserved enough transaction units / metadata space, but when
+ * it attempts to COW an extent buffer from the extent or device btree for
+ * example, it does not find any free extent in any metadata block group,
+ * therefore forced to try to allocate a new metadata block group.
+ * This is because some other task allocated all available extents in the
+ * meanwhile - this typically happens with tasks that don't reserve space
+ * properly, either intentionally or as a bug. One example where this is
+ * done intentionally is fsync, as it does not reserve any transaction units
+ * and ends up allocating a variable number of metadata extents for log
+ * tree extent buffers.
+ *
+ * We also need this 2 phases setup when adding a device to a filesystem with
+ * a seed device - we must create new metadata and system chunks without adding
+ * any of the block group items to the chunk, extent and device btrees. If we
+ * did not do it this way, we would get ENOSPC when attempting to update those
+ * btrees, since all the chunks from the seed device are read-only.
+ *
+ * Phase 1 does the updates and insertions to the chunk btree because if we had
+ * it done in phase 2 and have a thundering herd of tasks allocating chunks in
+ * parallel, we risk having too many system chunks allocated by many tasks if
+ * many tasks reach phase 1 without the previous ones completing phase 2. In the
+ * extreme case this leads to exhaustion of the system chunk array in the
+ * superblock. This is easier to trigger if using a btree node/leaf size of 64K
+ * and with RAID filesystems (so we have more device items in the chunk btree).
+ * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
+ * the system chunk array due to concurrent allocations") provides more details.
+ *
+ * For allocation of system chunks, we defer the updates and insertions into the
+ * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because
+ * if the chunk allocation is triggered while COWing an extent buffer of the
+ * chunk btree, we are holding a lock on the parent of that extent buffer and
+ * doing the chunk btree updates and insertions can require locking that parent.
+ * This is for the very few and rare cases where we update the chunk btree that
+ * are not chunk allocation or chunk removal: adding a device, removing a device
+ * or resizing a device.
+ *
+ * The reservation of system space, done through check_system_chunk(), as well
+ * as all the updates and insertions into the chunk btree must be done while
+ * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
+ * an extent buffer from the chunks btree we never trigger allocation of a new
+ * system chunk, which would result in a deadlock (trying to lock twice an
+ * extent buffer of the chunk btree, first time before triggering the chunk
+ * allocation and the second time during chunk allocation while attempting to
+ * update the chunks btree). The system chunk array is also updated while holding
+ * that mutex. The same logic applies to removing chunks - we must reserve system
+ * space, update the chunk btree and the system chunk array in the superblock
+ * while holding fs_info->chunk_mutex.
+ *
+ * This function, btrfs_chunk_alloc(), belongs to phase 1.
+ *
+ * If @force is CHUNK_ALLOC_FORCE:
* - return 1 if it successfully allocates a chunk,
* - return errors including -ENOSPC otherwise.
- * If force is NOT CHUNK_ALLOC_FORCE:
+ * If @force is NOT CHUNK_ALLOC_FORCE:
* - return 0 if it doesn't need to allocate a new chunk,
* - return 1 if it successfully allocates a chunk,
* - return errors including -ENOSPC otherwise.
@@ -3243,6 +3472,13 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
/* Don't re-enter if we're already allocating a chunk */
if (trans->allocating_chunk)
return -ENOSPC;
+ /*
+ * If we are removing a chunk, don't re-enter or we would deadlock.
+ * System space reservation and system chunk allocation is done by the
+ * chunk remove operation (btrfs_remove_chunk()).
+ */
+ if (trans->removing_chunk)
+ return -ENOSPC;
space_info = btrfs_find_space_info(fs_info, flags);
ASSERT(space_info);
@@ -3306,13 +3542,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
force_metadata_allocation(fs_info);
}
- /*
- * Check if we have enough space in SYSTEM chunk because we may need
- * to update devices.
- */
- check_system_chunk(trans, flags);
-
- ret = btrfs_alloc_chunk(trans, flags);
+ ret = do_chunk_alloc(trans, flags);
trans->allocating_chunk = false;
spin_lock(&space_info->lock);
@@ -3331,22 +3561,6 @@ out:
space_info->chunk_alloc = 0;
spin_unlock(&space_info->lock);
mutex_unlock(&fs_info->chunk_mutex);
- /*
- * When we allocate a new chunk we reserve space in the chunk block
- * reserve to make sure we can COW nodes/leafs in the chunk tree or
- * add new nodes/leafs to it if we end up needing to do it when
- * inserting the chunk item and updating device items as part of the
- * second phase of chunk allocation, performed by
- * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
- * large number of new block groups to create in our transaction
- * handle's new_bgs list to avoid exhausting the chunk block reserve
- * in extreme cases - like having a single transaction create many new
- * block groups when starting to write out the free space caches of all
- * the block groups that were made dirty during the lifetime of the
- * transaction.
- */
- if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
- btrfs_create_pending_block_groups(trans);
return ret;
}
@@ -3367,7 +3581,6 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
*/
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
{
- struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *info;
u64 left;
@@ -3382,7 +3595,6 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
lockdep_assert_held(&fs_info->chunk_mutex);
info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
-again:
spin_lock(&info->lock);
left = info->total_bytes - btrfs_space_info_used(info, true);
spin_unlock(&info->lock);
@@ -3401,76 +3613,39 @@ again:
if (left < thresh) {
u64 flags = btrfs_system_alloc_profile(fs_info);
- u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved);
-
- /*
- * If there's not available space for the chunk tree (system
- * space) and there are other tasks that reserved space for
- * creating a new system block group, wait for them to complete
- * the creation of their system block group and release excess
- * reserved space. We do this because:
- *
- * *) We can end up allocating more system chunks than necessary
- * when there are multiple tasks that are concurrently
- * allocating block groups, which can lead to exhaustion of
- * the system array in the superblock;
- *
- * *) If we allocate extra and unnecessary system block groups,
- * despite being empty for a long time, and possibly forever,
- * they end not being added to the list of unused block groups
- * because that typically happens only when deallocating the
- * last extent from a block group - which never happens since
- * we never allocate from them in the first place. The few
- * exceptions are when mounting a filesystem or running scrub,
- * which add unused block groups to the list of unused block
- * groups, to be deleted by the cleaner kthread.
- * And even when they are added to the list of unused block
- * groups, it can take a long time until they get deleted,
- * since the cleaner kthread might be sleeping or busy with
- * other work (deleting subvolumes, running delayed iputs,
- * defrag scheduling, etc);
- *
- * This is rare in practice, but can happen when too many tasks
- * are allocating blocks groups in parallel (via fallocate())
- * and before the one that reserved space for a new system block
- * group finishes the block group creation and releases the space
- * reserved in excess (at btrfs_create_pending_block_groups()),
- * other tasks end up here and see free system space temporarily
- * not enough for updating the chunk tree.
- *
- * We unlock the chunk mutex before waiting for such tasks and
- * lock it again after the wait, otherwise we would deadlock.
- * It is safe to do so because allocating a system chunk is the
- * first thing done while allocating a new block group.
- */
- if (reserved > trans->chunk_bytes_reserved) {
- const u64 min_needed = reserved - thresh;
-
- mutex_unlock(&fs_info->chunk_mutex);
- wait_event(cur_trans->chunk_reserve_wait,
- atomic64_read(&cur_trans->chunk_bytes_reserved) <=
- min_needed);
- mutex_lock(&fs_info->chunk_mutex);
- goto again;
- }
+ struct btrfs_block_group *bg;
/*
* Ignore failure to create system chunk. We might end up not
* needing it, as we might not need to COW all nodes/leafs from
* the paths we visit in the chunk tree (they were already COWed
* or created in the current transaction for example).
+ *
+ * Also, if our caller is allocating a system chunk, do not
+ * attempt to insert the chunk item in the chunk btree, as we
+ * could deadlock on an extent buffer since our caller may be
+ * COWing an extent buffer from the chunk btree.
*/
- ret = btrfs_alloc_chunk(trans, flags);
+ bg = btrfs_alloc_chunk(trans, flags);
+ if (IS_ERR(bg)) {
+ ret = PTR_ERR(bg);
+ } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
+ /*
+ * If we fail to add the chunk item here, we end up
+ * trying again at phase 2 of chunk allocation, at
+ * btrfs_create_pending_block_groups(). So ignore
+ * any error here.
+ */
+ btrfs_chunk_alloc_add_chunk_item(trans, bg);
+ }
}
if (!ret) {
ret = btrfs_block_rsv_add(fs_info->chunk_root,
&fs_info->chunk_block_rsv,
thresh, BTRFS_RESERVE_NO_FLUSH);
- if (!ret) {
- atomic64_add(thresh, &cur_trans->chunk_bytes_reserved);
+ if (!ret)
trans->chunk_bytes_reserved += thresh;
- }
}
}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 7b927425dc71..c72a71efcb18 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -97,6 +97,7 @@ struct btrfs_block_group {
unsigned int removed:1;
unsigned int to_copy:1;
unsigned int relocating_repair:1;
+ unsigned int chunk_item_inserted:1;
int disk_cache_state;
@@ -268,8 +269,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work);
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
- u64 type, u64 chunk_offset, u64 size);
+struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
+ u64 bytes_used, u64 type,
+ u64 chunk_offset, u64 size);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
bool do_chunk_alloc);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9a023ae0f98b..30d82cdf128c 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -352,7 +352,7 @@ static void end_compressed_bio_write(struct bio *bio)
btrfs_record_physical_zoned(inode, cb->start, bio);
btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
cb->start, cb->start + cb->len - 1,
- bio->bi_status == BLK_STS_OK);
+ !cb->errors);
end_compressed_writeback(inode, cb);
/* note, our inode could be gone now */
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4bc3ca2cbd7d..c5c08c87e130 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -364,49 +364,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
return 0;
}
-static struct extent_buffer *alloc_tree_block_no_bg_flush(
- struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 parent_start,
- const struct btrfs_disk_key *disk_key,
- int level,
- u64 hint,
- u64 empty_size,
- enum btrfs_lock_nesting nest)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct extent_buffer *ret;
-
- /*
- * If we are COWing a node/leaf from the extent, chunk, device or free
- * space trees, make sure that we do not finish block group creation of
- * pending block groups. We do this to avoid a deadlock.
- * COWing can result in allocation of a new chunk, and flushing pending
- * block groups (btrfs_create_pending_block_groups()) can be triggered
- * when finishing allocation of a new chunk. Creation of a pending block
- * group modifies the extent, chunk, device and free space trees,
- * therefore we could deadlock with ourselves since we are holding a
- * lock on an extent buffer that btrfs_create_pending_block_groups() may
- * try to COW later.
- * For similar reasons, we also need to delay flushing pending block
- * groups when splitting a leaf or node, from one of those trees, since
- * we are holding a write lock on it and its parent or when inserting a
- * new root node for one of those trees.
- */
- if (root == fs_info->extent_root ||
- root == fs_info->chunk_root ||
- root == fs_info->dev_root ||
- root == fs_info->free_space_root)
- trans->can_flush_pending_bgs = false;
-
- ret = btrfs_alloc_tree_block(trans, root, parent_start,
- root->root_key.objectid, disk_key, level,
- hint, empty_size, nest);
- trans->can_flush_pending_bgs = true;
-
- return ret;
-}
-
/*
* does the dirty work in cow of a single block. The parent block (if
* supplied) is updated to point to the new cow copy. The new buffer is marked
@@ -455,8 +412,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
parent_start = parent->start;
- cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
- level, search_start, empty_size, nest);
+ cow = btrfs_alloc_tree_block(trans, root, parent_start,
+ root->root_key.objectid, &disk_key, level,
+ search_start, empty_size, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -2458,9 +2416,9 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(lower, &lower_key, 0);
- c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
- root->node->start, 0,
- BTRFS_NESTING_NEW_ROOT);
+ c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &lower_key, level, root->node->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -2589,8 +2547,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
mid = (c_nritems + 1) / 2;
btrfs_node_key(c, &disk_key, mid);
- split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
- c->start, 0, BTRFS_NESTING_SPLIT);
+ split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, level, c->start, 0,
+ BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
return PTR_ERR(split);
@@ -3381,10 +3340,10 @@ again:
* BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
* use BTRFS_NESTING_NEW_ROOT.
*/
- right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
- l->start, 0, num_doubles ?
- BTRFS_NESTING_NEW_ROOT :
- BTRFS_NESTING_SPLIT);
+ right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, 0, l->start, 0,
+ num_doubles ? BTRFS_NESTING_NEW_ROOT :
+ BTRFS_NESTING_SPLIT);
if (IS_ERR(right))
return PTR_ERR(right);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 06bc842ecdb3..ca848b183474 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -974,7 +974,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
if (qrecord_inserted)
- btrfs_qgroup_trace_extent_post(fs_info, record);
+ btrfs_qgroup_trace_extent_post(trans, record);
return 0;
}
@@ -1069,7 +1069,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
if (qrecord_inserted)
- return btrfs_qgroup_trace_extent_post(fs_info, record);
+ return btrfs_qgroup_trace_extent_post(trans, record);
return 0;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b117dd3b8172..a59ab7b9aea0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -209,7 +209,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
- const int num_pages = fs_info->nodesize >> PAGE_SHIFT;
+ const int num_pages = num_extent_pages(buf);
const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
char *kaddr;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d296483d148f..268ce58d4569 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6019,6 +6019,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
mutex_lock(&fs_info->fs_devices->device_list_mutex);
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ continue;
+
ret = btrfs_trim_free_extents(device, &group_trimmed);
if (ret) {
dev_failed++;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 28a05ba47060..ee34497500e1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -399,7 +399,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
/*
* Copy data from userspace to the current page
*/
- copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
+ copied = copy_page_from_iter_atomic(page, offset, count, i);
/* Flush processor's dcache for this page */
flush_dcache_page(page);
@@ -413,20 +413,19 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
* The rest of the btrfs_file_write code will fall
* back to page at a time copies after we return 0.
*/
- if (!PageUptodate(page) && copied < count)
- copied = 0;
+ if (unlikely(copied < count)) {
+ if (!PageUptodate(page)) {
+ iov_iter_revert(i, copied);
+ copied = 0;
+ }
+ if (!copied)
+ break;
+ }
- iov_iter_advance(i, copied);
write_bytes -= copied;
total_copied += copied;
-
- /* Return to btrfs_file_write_iter to fault page */
- if (unlikely(copied == 0))
- break;
-
- if (copied < PAGE_SIZE - offset) {
- offset += copied;
- } else {
+ offset += copied;
+ if (offset == PAGE_SIZE) {
pg++;
offset = 0;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e6eb20987351..06f9f167222b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2271,13 +2271,127 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
}
+/*
+ * Split an extent_map at [start, start + len]
+ *
+ * This function is intended to be used only for extract_ordered_extent().
+ */
+static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
+ u64 pre, u64 post)
+{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ struct extent_map *em;
+ struct extent_map *split_pre = NULL;
+ struct extent_map *split_mid = NULL;
+ struct extent_map *split_post = NULL;
+ int ret = 0;
+ int modified;
+ unsigned long flags;
+
+ /* Sanity check */
+ if (pre == 0 && post == 0)
+ return 0;
+
+ split_pre = alloc_extent_map();
+ if (pre)
+ split_mid = alloc_extent_map();
+ if (post)
+ split_post = alloc_extent_map();
+ if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ASSERT(pre + post < len);
+
+ lock_extent(&inode->io_tree, start, start + len - 1);
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (!em) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ ASSERT(em->len == len);
+ ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+ ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
+
+ flags = em->flags;
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ clear_bit(EXTENT_FLAG_LOGGING, &flags);
+ modified = !list_empty(&em->list);
+
+ /* First, replace the em with a new extent_map starting from * em->start */
+ split_pre->start = em->start;
+ split_pre->len = (pre ? pre : em->len - post);
+ split_pre->orig_start = split_pre->start;
+ split_pre->block_start = em->block_start;
+ split_pre->block_len = split_pre->len;
+ split_pre->orig_block_len = split_pre->block_len;
+ split_pre->ram_bytes = split_pre->len;
+ split_pre->flags = flags;
+ split_pre->compress_type = em->compress_type;
+ split_pre->generation = em->generation;
+
+ replace_extent_mapping(em_tree, em, split_pre, modified);
+
+ /*
+ * Now we only have an extent_map at:
+ * [em->start, em->start + pre] if pre != 0
+ * [em->start, em->start + em->len - post] if pre == 0
+ */
+
+ if (pre) {
+ /* Insert the middle extent_map */
+ split_mid->start = em->start + pre;
+ split_mid->len = em->len - pre - post;
+ split_mid->orig_start = split_mid->start;
+ split_mid->block_start = em->block_start + pre;
+ split_mid->block_len = split_mid->len;
+ split_mid->orig_block_len = split_mid->block_len;
+ split_mid->ram_bytes = split_mid->len;
+ split_mid->flags = flags;
+ split_mid->compress_type = em->compress_type;
+ split_mid->generation = em->generation;
+ add_extent_mapping(em_tree, split_mid, modified);
+ }
+
+ if (post) {
+ split_post->start = em->start + em->len - post;
+ split_post->len = post;
+ split_post->orig_start = split_post->start;
+ split_post->block_start = em->block_start + em->len - post;
+ split_post->block_len = split_post->len;
+ split_post->orig_block_len = split_post->block_len;
+ split_post->ram_bytes = split_post->len;
+ split_post->flags = flags;
+ split_post->compress_type = em->compress_type;
+ split_post->generation = em->generation;
+ add_extent_mapping(em_tree, split_post, modified);
+ }
+
+ /* Once for us */
+ free_extent_map(em);
+ /* Once for the tree */
+ free_extent_map(em);
+
+out_unlock:
+ write_unlock(&em_tree->lock);
+ unlock_extent(&inode->io_tree, start, start + len - 1);
+out:
+ free_extent_map(split_pre);
+ free_extent_map(split_mid);
+ free_extent_map(split_post);
+
+ return ret;
+}
+
static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
struct bio *bio, loff_t file_offset)
{
struct btrfs_ordered_extent *ordered;
- struct extent_map *em = NULL, *em_new = NULL;
- struct extent_map_tree *em_tree = &inode->extent_tree;
u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 file_len;
u64 len = bio->bi_iter.bi_size;
u64 end = start + len;
u64 ordered_end;
@@ -2317,41 +2431,16 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
goto out;
}
+ file_len = ordered->num_bytes;
pre = start - ordered->disk_bytenr;
post = ordered_end - end;
ret = btrfs_split_ordered_extent(ordered, pre, post);
if (ret)
goto out;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, ordered->file_offset, len);
- if (!em) {
- read_unlock(&em_tree->lock);
- ret = -EIO;
- goto out;
- }
- read_unlock(&em_tree->lock);
-
- ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
- /*
- * We cannot reuse em_new here but have to create a new one, as
- * unpin_extent_cache() expects the start of the extent map to be the
- * logical offset of the file, which does not hold true anymore after
- * splitting.
- */
- em_new = create_io_em(inode, em->start + pre, len,
- em->start + pre, em->block_start + pre, len,
- len, len, BTRFS_COMPRESS_NONE,
- BTRFS_ORDERED_REGULAR);
- if (IS_ERR(em_new)) {
- ret = PTR_ERR(em_new);
- goto out;
- }
- free_extent_map(em_new);
+ ret = split_zoned_em(inode, file_offset, file_len, pre, post);
out:
- free_extent_map(em);
btrfs_put_ordered_extent(ordered);
return errno_to_blk_status(ret);
@@ -2903,7 +2992,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- if (ordered_extent->disk)
+ if (ordered_extent->bdev)
btrfs_rewrite_logical_zoned(ordered_extent);
btrfs_free_io_failure_record(inode, start, end);
@@ -9137,8 +9226,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
bool dest_log_pinned = false;
bool need_abort = false;
- /* we only allow rename subvolume link between subvolumes */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+ /*
+ * For non-subvolumes allow exchange only within one subvolume, in the
+ * same inode namespace. Two subvolumes (represented as directory) can
+ * be exchanged as they're a logical link and have a fixed inode number.
+ */
+ if (root != dest &&
+ (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
+ new_ino != BTRFS_FIRST_FREE_OBJECTID))
return -EXDEV;
/* close the race window with snapshot create/destroy ioctl */
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 6eb41b7c0c84..5c0f8481e25e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -190,8 +190,6 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
entry->truncated_len = (u64)-1;
entry->qgroup_rsv = ret;
entry->physical = (u64)-1;
- entry->disk = NULL;
- entry->partno = (u8)-1;
ASSERT(type == BTRFS_ORDERED_REGULAR ||
type == BTRFS_ORDERED_NOCOW ||
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 566472004edd..b2d88aba8420 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -145,8 +145,7 @@ struct btrfs_ordered_extent {
* command in a workqueue context
*/
u64 physical;
- struct gendisk *disk;
- u8 partno;
+ struct block_device *bdev;
};
/*
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 07ec06d4e972..0fa121171ca1 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1704,17 +1704,39 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
+int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord)
{
struct ulist *old_root;
u64 bytenr = qrecord->bytenr;
int ret;
- ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false);
+ /*
+ * We are always called in a context where we are already holding a
+ * transaction handle. Often we are called when adding a data delayed
+ * reference from btrfs_truncate_inode_items() (truncating or unlinking),
+ * in which case we will be holding a write lock on extent buffer from a
+ * subvolume tree. In this case we can't allow btrfs_find_all_roots() to
+ * acquire fs_info->commit_root_sem, because that is a higher level lock
+ * that must be acquired before locking any extent buffers.
+ *
+ * So we want btrfs_find_all_roots() to not acquire the commit_root_sem
+ * but we can't pass it a non-NULL transaction handle, because otherwise
+ * it would not use commit roots and would lock extent buffers, causing
+ * a deadlock if it ends up trying to read lock the same extent buffer
+ * that was previously write locked at btrfs_truncate_inode_items().
+ *
+ * So pass a NULL transaction handle to btrfs_find_all_roots() and
+ * explicitly tell it to not acquire the commit_root_sem - if we are
+ * holding a transaction handle we don't need its protection.
+ */
+ ASSERT(trans != NULL);
+
+ ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
+ false, true);
if (ret < 0) {
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- btrfs_warn(fs_info,
+ trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ btrfs_warn(trans->fs_info,
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
ret);
return 0;
@@ -1758,7 +1780,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
kfree(record);
return 0;
}
- return btrfs_qgroup_trace_extent_post(fs_info, record);
+ return btrfs_qgroup_trace_extent_post(trans, record);
}
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
@@ -2629,7 +2651,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
/* Search commit root to find old_roots */
ret = btrfs_find_all_roots(NULL, fs_info,
record->bytenr, 0,
- &record->old_roots, false);
+ &record->old_roots, false, false);
if (ret < 0)
goto cleanup;
}
@@ -2645,7 +2667,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
* current root. It's safe inside commit_transaction().
*/
ret = btrfs_find_all_roots(trans, fs_info,
- record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
+ record->bytenr, BTRFS_SEQ_LAST, &new_roots, false, false);
if (ret < 0)
goto cleanup;
if (qgroup_to_skip) {
@@ -3179,7 +3201,7 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
num_bytes = found.offset;
ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
- &roots, false);
+ &roots, false, false);
if (ret < 0)
goto out;
/* For rescan, just pass old_roots as NULL */
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 7283e4f549af..880e9df0dac1 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -298,7 +298,7 @@ int btrfs_qgroup_trace_extent_nolock(
* using current root, then we can move all expensive backref walk out of
* transaction committing, but not now as qgroup accounting will be wrong again.
*/
-int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
+int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord);
/*
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index f3137285a9e2..98b5aaba46f1 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -224,7 +224,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
* quota.
*/
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -237,7 +237,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
return ret;
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -261,7 +261,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
new_roots = NULL;
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -273,7 +273,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
return -EINVAL;
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -325,7 +325,7 @@ static int test_multiple_refs(struct btrfs_root *root,
}
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -338,7 +338,7 @@ static int test_multiple_refs(struct btrfs_root *root,
return ret;
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -360,7 +360,7 @@ static int test_multiple_refs(struct btrfs_root *root,
}
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -373,7 +373,7 @@ static int test_multiple_refs(struct btrfs_root *root,
return ret;
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -401,7 +401,7 @@ static int test_multiple_refs(struct btrfs_root *root,
}
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -414,7 +414,7 @@ static int test_multiple_refs(struct btrfs_root *root,
return ret;
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 50318231c1a8..14b9fdc8aaa9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -254,23 +254,21 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans)
}
/*
- * To be called after all the new block groups attached to the transaction
- * handle have been created (btrfs_create_pending_block_groups()).
+ * To be called after doing the chunk btree updates right after allocating a new
+ * chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a
+ * chunk after all chunk btree updates and after finishing the second phase of
+ * chunk allocation (btrfs_create_pending_block_groups()) in case some block
+ * group had its chunk item insertion delayed to the second phase.
*/
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_transaction *cur_trans = trans->transaction;
if (!trans->chunk_bytes_reserved)
return;
- WARN_ON_ONCE(!list_empty(&trans->new_bgs));
-
btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
trans->chunk_bytes_reserved, NULL);
- atomic64_sub(trans->chunk_bytes_reserved, &cur_trans->chunk_bytes_reserved);
- cond_wake_up(&cur_trans->chunk_reserve_wait);
trans->chunk_bytes_reserved = 0;
}
@@ -386,8 +384,6 @@ loop:
spin_lock_init(&cur_trans->dropped_roots_lock);
INIT_LIST_HEAD(&cur_trans->releasing_ebs);
spin_lock_init(&cur_trans->releasing_ebs_lock);
- atomic64_set(&cur_trans->chunk_bytes_reserved, 0);
- init_waitqueue_head(&cur_trans->chunk_reserve_wait);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
@@ -701,7 +697,6 @@ again:
h->fs_info = root->fs_info;
h->type = type;
- h->can_flush_pending_bgs = true;
INIT_LIST_HEAD(&h->new_bgs);
smp_mb();
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 07d76029f598..ba45065f9451 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -96,13 +96,6 @@ struct btrfs_transaction {
spinlock_t releasing_ebs_lock;
struct list_head releasing_ebs;
-
- /*
- * The number of bytes currently reserved, by all transaction handles
- * attached to this transaction, for metadata extents of the chunk tree.
- */
- atomic64_t chunk_bytes_reserved;
- wait_queue_head_t chunk_reserve_wait;
};
#define __TRANS_FREEZABLE (1U << 0)
@@ -139,7 +132,7 @@ struct btrfs_trans_handle {
short aborted;
bool adding_csums;
bool allocating_chunk;
- bool can_flush_pending_bgs;
+ bool removing_chunk;
bool reloc_reserved;
bool in_fsync;
struct btrfs_root *root;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cab451d19547..e6430ac9bbe8 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3173,7 +3173,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (!log_root_tree->node) {
ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
if (ret) {
- mutex_unlock(&fs_info->tree_log_mutex);
+ mutex_unlock(&fs_info->tree_root->log_mutex);
goto out;
}
}
@@ -5526,16 +5526,29 @@ log_extents:
spin_lock(&inode->lock);
inode->logged_trans = trans->transid;
/*
- * Don't update last_log_commit if we logged that an inode exists
- * after it was loaded to memory (full_sync bit set).
- * This is to prevent data loss when we do a write to the inode,
- * then the inode gets evicted after all delalloc was flushed,
- * then we log it exists (due to a rename for example) and then
- * fsync it. This last fsync would do nothing (not logging the
- * extents previously written).
+ * Don't update last_log_commit if we logged that an inode exists.
+ * We do this for two reasons:
+ *
+ * 1) We might have had buffered writes to this inode that were
+ * flushed and had their ordered extents completed in this
+ * transaction, but we did not previously log the inode with
+ * LOG_INODE_ALL. Later the inode was evicted and after that
+ * it was loaded again and this LOG_INODE_EXISTS log operation
+ * happened. We must make sure that if an explicit fsync against
+ * the inode is performed later, it logs the new extents, an
+ * updated inode item, etc, and syncs the log. The same logic
+ * applies to direct IO writes instead of buffered writes.
+ *
+ * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
+ * is logged with an i_size of 0 or whatever value was logged
+ * before. If later the i_size of the inode is increased by a
+ * truncate operation, the log is synced through an fsync of
+ * some other inode and then finally an explicit fsync against
+ * this inode is made, we must make sure this fsync logs the
+ * inode with the new i_size, the hole between old i_size and
+ * the new i_size, and syncs the log.
*/
- if (inode_only != LOG_INODE_EXISTS ||
- !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+ if (inode_only != LOG_INODE_EXISTS)
inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
}
@@ -6490,8 +6503,8 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* if this inode hasn't been logged and directory we're renaming it
* from hasn't been logged, we don't need to log it
*/
- if (inode->logged_trans < trans->transid &&
- (!old_dir || old_dir->logged_trans < trans->transid))
+ if (!inode_logged(trans, inode) &&
+ (!old_dir || !inode_logged(trans, old_dir)))
return;
/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 807502cd6510..70f94b75f25a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1078,6 +1078,7 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
list_del_init(&device->dev_alloc_list);
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ fs_devices->rw_devices--;
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
@@ -1745,19 +1746,14 @@ again:
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
} else {
- btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
goto out;
}
*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
ret = btrfs_del_item(trans, root, path);
- if (ret) {
- btrfs_handle_fs_error(fs_info, ret,
- "Failed to remove dev extent item");
- } else {
+ if (ret == 0)
set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
- }
out:
btrfs_free_path(path);
return ret;
@@ -2942,7 +2938,7 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
u32 cur;
struct btrfs_key key;
- mutex_lock(&fs_info->chunk_mutex);
+ lockdep_assert_held(&fs_info->chunk_mutex);
array_size = btrfs_super_sys_array_size(super_copy);
ptr = super_copy->sys_chunk_array;
@@ -2972,7 +2968,6 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
cur += len;
}
}
- mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
@@ -3012,6 +3007,29 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
return em;
}
+static int remove_chunk_item(struct btrfs_trans_handle *trans,
+ struct map_lookup *map, u64 chunk_offset)
+{
+ int i;
+
+ /*
+ * Removing chunk items and updating the device items in the chunks btree
+ * requires holding the chunk_mutex.
+ * See the comment at btrfs_chunk_alloc() for the details.
+ */
+ lockdep_assert_held(&trans->fs_info->chunk_mutex);
+
+ for (i = 0; i < map->num_stripes; i++) {
+ int ret;
+
+ ret = btrfs_update_device(trans, map->stripes[i].dev);
+ if (ret)
+ return ret;
+ }
+
+ return btrfs_free_chunk(trans, chunk_offset);
+}
+
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -3032,14 +3050,16 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
return PTR_ERR(em);
}
map = em->map_lookup;
- mutex_lock(&fs_info->chunk_mutex);
- check_system_chunk(trans, map->type);
- mutex_unlock(&fs_info->chunk_mutex);
/*
- * Take the device list mutex to prevent races with the final phase of
- * a device replace operation that replaces the device object associated
- * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
+ * First delete the device extent items from the devices btree.
+ * We take the device_list_mutex to avoid racing with the finishing phase
+ * of a device replace operation. See the comment below before acquiring
+ * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
+ * because that can result in a deadlock when deleting the device extent
+ * items from the devices btree - COWing an extent buffer from the btree
+ * may result in allocating a new metadata chunk, which would attempt to
+ * lock again fs_info->chunk_mutex.
*/
mutex_lock(&fs_devices->device_list_mutex);
for (i = 0; i < map->num_stripes; i++) {
@@ -3061,18 +3081,73 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
btrfs_clear_space_info_full(fs_info);
mutex_unlock(&fs_info->chunk_mutex);
}
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
- ret = btrfs_update_device(trans, device);
+ /*
+ * We acquire fs_info->chunk_mutex for 2 reasons:
+ *
+ * 1) Just like with the first phase of the chunk allocation, we must
+ * reserve system space, do all chunk btree updates and deletions, and
+ * update the system chunk array in the superblock while holding this
+ * mutex. This is for similar reasons as explained on the comment at
+ * the top of btrfs_chunk_alloc();
+ *
+ * 2) Prevent races with the final phase of a device replace operation
+ * that replaces the device object associated with the map's stripes,
+ * because the device object's id can change at any time during that
+ * final phase of the device replace operation
+ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+ * replaced device and then see it with an ID of
+ * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
+ * the device item, which does not exists on the chunk btree.
+ * The finishing phase of device replace acquires both the
+ * device_list_mutex and the chunk_mutex, in that order, so we are
+ * safe by just acquiring the chunk_mutex.
+ */
+ trans->removing_chunk = true;
+ mutex_lock(&fs_info->chunk_mutex);
+
+ check_system_chunk(trans, map->type);
+
+ ret = remove_chunk_item(trans, map, chunk_offset);
+ /*
+ * Normally we should not get -ENOSPC since we reserved space before
+ * through the call to check_system_chunk().
+ *
+ * Despite our system space_info having enough free space, we may not
+ * be able to allocate extents from its block groups, because all have
+ * an incompatible profile, which will force us to allocate a new system
+ * block group with the right profile, or right after we called
+ * check_system_space() above, a scrub turned the only system block group
+ * with enough free space into RO mode.
+ * This is explained with more detail at do_chunk_alloc().
+ *
+ * So if we get -ENOSPC, allocate a new system chunk and retry once.
+ */
+ if (ret == -ENOSPC) {
+ const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
+ struct btrfs_block_group *sys_bg;
+
+ sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+ if (IS_ERR(sys_bg)) {
+ ret = PTR_ERR(sys_bg);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
if (ret) {
- mutex_unlock(&fs_devices->device_list_mutex);
btrfs_abort_transaction(trans, ret);
goto out;
}
- }
- mutex_unlock(&fs_devices->device_list_mutex);
- ret = btrfs_free_chunk(trans, chunk_offset);
- if (ret) {
+ ret = remove_chunk_item(trans, map, chunk_offset);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ } else if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3087,6 +3162,15 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
}
}
+ mutex_unlock(&fs_info->chunk_mutex);
+ trans->removing_chunk = false;
+
+ /*
+ * We are done with chunk btree updates and deletions, so release the
+ * system space we previously reserved (with check_system_chunk()).
+ */
+ btrfs_trans_release_chunk_metadata(trans);
+
ret = btrfs_remove_block_group(trans, chunk_offset, em);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -3094,6 +3178,10 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
}
out:
+ if (trans->removing_chunk) {
+ mutex_unlock(&fs_info->chunk_mutex);
+ trans->removing_chunk = false;
+ }
/* once for us */
free_extent_map(em);
return ret;
@@ -4860,13 +4948,12 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
u32 array_size;
u8 *ptr;
- mutex_lock(&fs_info->chunk_mutex);
+ lockdep_assert_held(&fs_info->chunk_mutex);
+
array_size = btrfs_super_sys_array_size(super_copy);
if (array_size + item_size + sizeof(disk_key)
- > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
- mutex_unlock(&fs_info->chunk_mutex);
+ > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
return -EFBIG;
- }
ptr = super_copy->sys_chunk_array + array_size;
btrfs_cpu_key_to_disk(&disk_key, key);
@@ -4875,7 +4962,6 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
memcpy(ptr, chunk, item_size);
item_size += sizeof(disk_key);
btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
- mutex_unlock(&fs_info->chunk_mutex);
return 0;
}
@@ -5225,13 +5311,14 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
}
}
-static int create_chunk(struct btrfs_trans_handle *trans,
+static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
struct alloc_chunk_ctl *ctl,
struct btrfs_device_info *devices_info)
{
struct btrfs_fs_info *info = trans->fs_info;
struct map_lookup *map = NULL;
struct extent_map_tree *em_tree;
+ struct btrfs_block_group *block_group;
struct extent_map *em;
u64 start = ctl->start;
u64 type = ctl->type;
@@ -5241,7 +5328,7 @@ static int create_chunk(struct btrfs_trans_handle *trans,
map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
if (!map)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
map->num_stripes = ctl->num_stripes;
for (i = 0; i < ctl->ndevs; ++i) {
@@ -5263,7 +5350,7 @@ static int create_chunk(struct btrfs_trans_handle *trans,
em = alloc_extent_map();
if (!em) {
kfree(map);
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
}
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->map_lookup = map;
@@ -5279,12 +5366,12 @@ static int create_chunk(struct btrfs_trans_handle *trans,
if (ret) {
write_unlock(&em_tree->lock);
free_extent_map(em);
- return ret;
+ return ERR_PTR(ret);
}
write_unlock(&em_tree->lock);
- ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
- if (ret)
+ block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
+ if (IS_ERR(block_group))
goto error_del_extent;
for (i = 0; i < map->num_stripes; i++) {
@@ -5304,7 +5391,7 @@ static int create_chunk(struct btrfs_trans_handle *trans,
check_raid56_incompat_flag(info, type);
check_raid1c34_incompat_flag(info, type);
- return 0;
+ return block_group;
error_del_extent:
write_lock(&em_tree->lock);
@@ -5316,34 +5403,36 @@ error_del_extent:
/* One for the tree reference */
free_extent_map(em);
- return ret;
+ return block_group;
}
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
+struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ u64 type)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_fs_devices *fs_devices = info->fs_devices;
struct btrfs_device_info *devices_info = NULL;
struct alloc_chunk_ctl ctl;
+ struct btrfs_block_group *block_group;
int ret;
lockdep_assert_held(&info->chunk_mutex);
if (!alloc_profile_is_valid(type, 0)) {
ASSERT(0);
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
}
if (list_empty(&fs_devices->alloc_list)) {
if (btrfs_test_opt(info, ENOSPC_DEBUG))
btrfs_debug(info, "%s: no writable device", __func__);
- return -ENOSPC;
+ return ERR_PTR(-ENOSPC);
}
if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(info, "invalid chunk type 0x%llx requested", type);
ASSERT(0);
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
}
ctl.start = find_next_chunk(info);
@@ -5353,46 +5442,43 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
GFP_NOFS);
if (!devices_info)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
ret = gather_device_info(fs_devices, &ctl, devices_info);
- if (ret < 0)
+ if (ret < 0) {
+ block_group = ERR_PTR(ret);
goto out;
+ }
ret = decide_stripe_size(fs_devices, &ctl, devices_info);
- if (ret < 0)
+ if (ret < 0) {
+ block_group = ERR_PTR(ret);
goto out;
+ }
- ret = create_chunk(trans, &ctl, devices_info);
+ block_group = create_chunk(trans, &ctl, devices_info);
out:
kfree(devices_info);
- return ret;
+ return block_group;
}
/*
- * Chunk allocation falls into two parts. The first part does work
- * that makes the new allocated chunk usable, but does not do any operation
- * that modifies the chunk tree. The second part does the work that
- * requires modifying the chunk tree. This division is important for the
- * bootstrap process of adding storage to a seed btrfs.
+ * This function, btrfs_finish_chunk_alloc(), belongs to phase 2.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
*/
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
u64 chunk_offset, u64 chunk_size)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *extent_root = fs_info->extent_root;
- struct btrfs_root *chunk_root = fs_info->chunk_root;
- struct btrfs_key key;
struct btrfs_device *device;
- struct btrfs_chunk *chunk;
- struct btrfs_stripe *stripe;
struct extent_map *em;
struct map_lookup *map;
- size_t item_size;
u64 dev_offset;
u64 stripe_size;
- int i = 0;
+ int i;
int ret = 0;
em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
@@ -5400,53 +5486,117 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
return PTR_ERR(em);
map = em->map_lookup;
- item_size = btrfs_chunk_item_size(map->num_stripes);
stripe_size = em->orig_block_len;
- chunk = kzalloc(item_size, GFP_NOFS);
- if (!chunk) {
- ret = -ENOMEM;
- goto out;
- }
-
/*
* Take the device list mutex to prevent races with the final phase of
* a device replace operation that replaces the device object associated
* with the map's stripes, because the device object's id can change
* at any time during that final phase of the device replace operation
- * (dev-replace.c:btrfs_dev_replace_finishing()).
+ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+ * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
+ * resulting in persisting a device extent item with such ID.
*/
mutex_lock(&fs_info->fs_devices->device_list_mutex);
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
dev_offset = map->stripes[i].physical;
- ret = btrfs_update_device(trans, device);
- if (ret)
- break;
ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
dev_offset, stripe_size);
if (ret)
break;
}
- if (ret) {
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ free_extent_map(em);
+ return ret;
+}
+
+/*
+ * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
+ * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
+ * chunks.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
+int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_root *chunk_root = fs_info->chunk_root;
+ struct btrfs_key key;
+ struct btrfs_chunk *chunk;
+ struct btrfs_stripe *stripe;
+ struct extent_map *em;
+ struct map_lookup *map;
+ size_t item_size;
+ int i;
+ int ret;
+
+ /*
+ * We take the chunk_mutex for 2 reasons:
+ *
+ * 1) Updates and insertions in the chunk btree must be done while holding
+ * the chunk_mutex, as well as updating the system chunk array in the
+ * superblock. See the comment on top of btrfs_chunk_alloc() for the
+ * details;
+ *
+ * 2) To prevent races with the final phase of a device replace operation
+ * that replaces the device object associated with the map's stripes,
+ * because the device object's id can change at any time during that
+ * final phase of the device replace operation
+ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+ * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
+ * which would cause a failure when updating the device item, which does
+ * not exists, or persisting a stripe of the chunk item with such ID.
+ * Here we can't use the device_list_mutex because our caller already
+ * has locked the chunk_mutex, and the final phase of device replace
+ * acquires both mutexes - first the device_list_mutex and then the
+ * chunk_mutex. Using any of those two mutexes protects us from a
+ * concurrent device replace.
+ */
+ lockdep_assert_held(&fs_info->chunk_mutex);
+
+ em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ map = em->map_lookup;
+ item_size = btrfs_chunk_item_size(map->num_stripes);
+
+ chunk = kzalloc(item_size, GFP_NOFS);
+ if (!chunk) {
+ ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
goto out;
}
+ for (i = 0; i < map->num_stripes; i++) {
+ struct btrfs_device *device = map->stripes[i].dev;
+
+ ret = btrfs_update_device(trans, device);
+ if (ret)
+ goto out;
+ }
+
stripe = &chunk->stripe;
for (i = 0; i < map->num_stripes; i++) {
- device = map->stripes[i].dev;
- dev_offset = map->stripes[i].physical;
+ struct btrfs_device *device = map->stripes[i].dev;
+ const u64 dev_offset = map->stripes[i].physical;
btrfs_set_stack_stripe_devid(stripe, device->devid);
btrfs_set_stack_stripe_offset(stripe, dev_offset);
memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
stripe++;
}
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- btrfs_set_stack_chunk_length(chunk, chunk_size);
+ btrfs_set_stack_chunk_length(chunk, bg->length);
btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
btrfs_set_stack_chunk_type(chunk, map->type);
@@ -5458,15 +5608,18 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.type = BTRFS_CHUNK_ITEM_KEY;
- key.offset = chunk_offset;
+ key.offset = bg->start;
ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
- if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
- /*
- * TODO: Cleanup of inserted chunk root in case of
- * failure.
- */
+ if (ret)
+ goto out;
+
+ bg->chunk_item_inserted = 1;
+
+ if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
+ if (ret)
+ goto out;
}
out:
@@ -5479,16 +5632,41 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
u64 alloc_profile;
- int ret;
+ struct btrfs_block_group *meta_bg;
+ struct btrfs_block_group *sys_bg;
+
+ /*
+ * When adding a new device for sprouting, the seed device is read-only
+ * so we must first allocate a metadata and a system chunk. But before
+ * adding the block group items to the extent, device and chunk btrees,
+ * we must first:
+ *
+ * 1) Create both chunks without doing any changes to the btrees, as
+ * otherwise we would get -ENOSPC since the block groups from the
+ * seed device are read-only;
+ *
+ * 2) Add the device item for the new sprout device - finishing the setup
+ * of a new block group requires updating the device item in the chunk
+ * btree, so it must exist when we attempt to do it. The previous step
+ * ensures this does not fail with -ENOSPC.
+ *
+ * After that we can add the block group items to their btrees:
+ * update existing device item in the chunk btree, add a new block group
+ * item to the extent btree, add a new chunk item to the chunk btree and
+ * finally add the new device extent items to the devices btree.
+ */
alloc_profile = btrfs_metadata_alloc_profile(fs_info);
- ret = btrfs_alloc_chunk(trans, alloc_profile);
- if (ret)
- return ret;
+ meta_bg = btrfs_alloc_chunk(trans, alloc_profile);
+ if (IS_ERR(meta_bg))
+ return PTR_ERR(meta_bg);
alloc_profile = btrfs_system_alloc_profile(fs_info);
- ret = btrfs_alloc_chunk(trans, alloc_profile);
- return ret;
+ sys_bg = btrfs_alloc_chunk(trans, alloc_profile);
+ if (IS_ERR(sys_bg))
+ return PTR_ERR(sys_bg);
+
+ return 0;
}
static inline int btrfs_chunk_max_errors(struct map_lookup *map)
@@ -7415,10 +7593,18 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
total_dev++;
} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
struct btrfs_chunk *chunk;
+
+ /*
+ * We are only called at mount time, so no need to take
+ * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
+ * we always lock first fs_info->chunk_mutex before
+ * acquiring any locks on the chunk tree. This is a
+ * requirement for chunk allocation, see the comment on
+ * top of btrfs_chunk_alloc() for details.
+ */
+ ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
- mutex_lock(&fs_info->chunk_mutex);
ret = read_one_chunk(&found_key, leaf, chunk);
- mutex_unlock(&fs_info->chunk_mutex);
if (ret)
goto error;
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index c7fc7caf575c..55a8ba244716 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -450,7 +450,8 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
struct btrfs_io_geometry *io_geom);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type);
+struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ u64 type);
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int mirror_num);
@@ -509,6 +510,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical);
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
u64 chunk_offset, u64 chunk_size);
+int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *bg);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
u64 logical, u64 length);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 297c0b1c0634..907c2cc45c9c 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1349,8 +1349,7 @@ void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
return;
ordered->physical = physical;
- ordered->disk = bio->bi_bdev->bd_disk;
- ordered->partno = bio->bi_bdev->bd_partno;
+ ordered->bdev = bio->bi_bdev;
btrfs_put_ordered_extent(ordered);
}
@@ -1362,18 +1361,16 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
struct extent_map_tree *em_tree;
struct extent_map *em;
struct btrfs_ordered_sum *sum;
- struct block_device *bdev;
u64 orig_logical = ordered->disk_bytenr;
u64 *logical = NULL;
int nr, stripe_len;
/* Zoned devices should not have partitions. So, we can assume it is 0 */
- ASSERT(ordered->partno == 0);
- bdev = bdgrab(ordered->disk->part0);
- if (WARN_ON(!bdev))
+ ASSERT(!bdev_is_partition(ordered->bdev));
+ if (WARN_ON(!ordered->bdev))
return;
- if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, bdev,
+ if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev,
ordered->physical, &logical, &nr,
&stripe_len)))
goto out;
@@ -1402,7 +1399,6 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
out:
kfree(logical);
- bdput(bdev);
}
bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index c1570fada3d8..a1e2813731d1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -82,10 +82,6 @@ static int ceph_set_page_dirty(struct page *page)
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc;
- int ret;
-
- if (unlikely(!mapping))
- return !TestSetPageDirty(page);
if (PageDirty(page)) {
dout("%p set_page_dirty %p idx %lu -- already dirty\n",
@@ -130,11 +126,7 @@ static int ceph_set_page_dirty(struct page *page)
BUG_ON(PagePrivate(page));
attach_page_private(page, snapc);
- ret = __set_page_dirty_nobuffers(page);
- WARN_ON(!PageLocked(page));
- WARN_ON(!page->mapping);
-
- return ret;
+ return __set_page_dirty_nobuffers(page);
}
/*
@@ -226,7 +218,7 @@ static void finish_netfs_read(struct ceph_osd_request *req)
int err = req->r_result;
ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
- req->r_end_latency, err);
+ req->r_end_latency, osd_data->length, err);
dout("%s: result %d subreq->len=%zu i_size=%lld\n", __func__, req->r_result,
subreq->len, i_size_read(req->r_inode));
@@ -313,7 +305,7 @@ static void ceph_readahead_cleanup(struct address_space *mapping, void *priv)
ceph_put_cap_refs(ci, got);
}
-const struct netfs_read_request_ops ceph_netfs_read_ops = {
+static const struct netfs_read_request_ops ceph_netfs_read_ops = {
.init_rreq = ceph_init_rreq,
.is_cache_enabled = ceph_is_cache_enabled,
.begin_cache_operation = ceph_begin_cache_operation,
@@ -560,7 +552,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
err = ceph_osdc_wait_request(osdc, req);
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
- req->r_end_latency, err);
+ req->r_end_latency, len, err);
ceph_osdc_put_request(req);
if (err == 0)
@@ -635,6 +627,7 @@ static void writepages_finish(struct ceph_osd_request *req)
struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ unsigned int len = 0;
bool remove_page;
dout("writepages_finish %p rc %d\n", inode, rc);
@@ -647,9 +640,6 @@ static void writepages_finish(struct ceph_osd_request *req)
ceph_clear_error_write(ci);
}
- ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
- req->r_end_latency, rc);
-
/*
* We lost the cache cap, need to truncate the page before
* it is unlocked, otherwise we'd truncate it later in the
@@ -666,6 +656,7 @@ static void writepages_finish(struct ceph_osd_request *req)
osd_data = osd_req_op_extent_osd_data(req, i);
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+ len += osd_data->length;
num_pages = calc_pages_for((u64)osd_data->alignment,
(u64)osd_data->length);
total_pages += num_pages;
@@ -696,6 +687,9 @@ static void writepages_finish(struct ceph_osd_request *req)
release_pages(osd_data->pages, num_pages);
}
+ ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
+ req->r_end_latency, len, rc);
+
ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
osd_data = osd_req_op_extent_osd_data(req, 0);
@@ -1711,7 +1705,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
- req->r_end_latency, err);
+ req->r_end_latency, len, err);
out_put:
ceph_osdc_put_request(req);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a5e93b185515..2a2900903f8c 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -645,9 +645,7 @@ void ceph_add_cap(struct inode *inode,
dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
session->s_mds, cap_id, ceph_cap_string(issued), seq);
- spin_lock(&session->s_gen_ttl_lock);
- gen = session->s_cap_gen;
- spin_unlock(&session->s_gen_ttl_lock);
+ gen = atomic_read(&session->s_cap_gen);
cap = __get_cap_for_mds(ci, mds);
if (!cap) {
@@ -785,10 +783,8 @@ static int __cap_is_valid(struct ceph_cap *cap)
unsigned long ttl;
u32 gen;
- spin_lock(&cap->session->s_gen_ttl_lock);
- gen = cap->session->s_cap_gen;
+ gen = atomic_read(&cap->session->s_cap_gen);
ttl = cap->session->s_cap_ttl;
- spin_unlock(&cap->session->s_gen_ttl_lock);
if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
dout("__cap_is_valid %p cap %p issued %s "
@@ -1182,7 +1178,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
* s_cap_gen while session is in the reconnect state.
*/
if (queue_release &&
- (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) {
+ (!session->s_cap_reconnect ||
+ cap->cap_gen == atomic_read(&session->s_cap_gen))) {
cap->queue_release = 1;
if (removed) {
__ceph_queue_cap_release(session, cap);
@@ -1534,7 +1531,7 @@ static inline int __send_flush_snap(struct inode *inode,
* asynchronously back to the MDS once sync writes complete and dirty
* data is written out.
*
- * Called under i_ceph_lock. Takes s_mutex as needed.
+ * Called under i_ceph_lock.
*/
static void __ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session *session)
@@ -1656,7 +1653,6 @@ retry:
mds = ci->i_auth_cap->session->s_mds;
if (session && session->s_mds != mds) {
dout(" oops, wrong session %p mutex\n", session);
- mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
session = NULL;
}
@@ -1665,10 +1661,6 @@ retry:
mutex_lock(&mdsc->mutex);
session = __ceph_lookup_mds_session(mdsc, mds);
mutex_unlock(&mdsc->mutex);
- if (session) {
- dout(" inverting session/ino locks on %p\n", session);
- mutex_lock(&session->s_mutex);
- }
goto retry;
}
@@ -1680,12 +1672,10 @@ retry:
out:
spin_unlock(&ci->i_ceph_lock);
- if (psession) {
+ if (psession)
*psession = session;
- } else if (session) {
- mutex_unlock(&session->s_mutex);
+ else
ceph_put_mds_session(session);
- }
/* we flushed them all; remove this inode from the queue */
spin_lock(&mdsc->snap_flush_lock);
list_del_init(&ci->i_snap_flush_item);
@@ -1915,7 +1905,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_cap *cap;
u64 flush_tid, oldest_flush_tid;
int file_wanted, used, cap_used;
- int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
int issued, implemented, want, retain, revoking, flushing = 0;
int mds = -1; /* keep track of how far we've gone through i_caps list
to avoid an infinite loop on retry */
@@ -1923,14 +1912,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
bool queue_invalidate = false;
bool tried_invalidate = false;
+ if (session)
+ ceph_get_mds_session(session);
+
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
-
- goto retry_locked;
retry:
- spin_lock(&ci->i_ceph_lock);
-retry_locked:
/* Caps wanted by virtue of active open files. */
file_wanted = __ceph_caps_file_wanted(ci);
@@ -2010,7 +1998,7 @@ retry_locked:
ci->i_rdcache_revoking = ci->i_rdcache_gen;
}
tried_invalidate = true;
- goto retry_locked;
+ goto retry;
}
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
@@ -2024,8 +2012,6 @@ retry_locked:
((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
continue;
- /* NOTE: no side-effects allowed, until we take s_mutex */
-
/*
* If we have an auth cap, we don't need to consider any
* overlapping caps as used.
@@ -2088,37 +2074,8 @@ retry_locked:
continue; /* nope, all good */
ack:
- if (session && session != cap->session) {
- dout("oops, wrong session %p mutex\n", session);
- mutex_unlock(&session->s_mutex);
- session = NULL;
- }
- if (!session) {
- session = cap->session;
- if (mutex_trylock(&session->s_mutex) == 0) {
- dout("inverting session/ino locks on %p\n",
- session);
- session = ceph_get_mds_session(session);
- spin_unlock(&ci->i_ceph_lock);
- if (took_snap_rwsem) {
- up_read(&mdsc->snap_rwsem);
- took_snap_rwsem = 0;
- }
- if (session) {
- mutex_lock(&session->s_mutex);
- ceph_put_mds_session(session);
- } else {
- /*
- * Because we take the reference while
- * holding the i_ceph_lock, it should
- * never be NULL. Throw a warning if it
- * ever is.
- */
- WARN_ON_ONCE(true);
- }
- goto retry;
- }
- }
+ ceph_put_mds_session(session);
+ session = ceph_get_mds_session(cap->session);
/* kick flushing and flush snaps before sending normal
* cap message */
@@ -2130,20 +2087,7 @@ ack:
if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
__ceph_flush_snaps(ci, session);
- goto retry_locked;
- }
-
- /* take snap_rwsem after session mutex */
- if (!took_snap_rwsem) {
- if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
- dout("inverting snap/in locks on %p\n",
- inode);
- spin_unlock(&ci->i_ceph_lock);
- down_read(&mdsc->snap_rwsem);
- took_snap_rwsem = 1;
- goto retry;
- }
- took_snap_rwsem = 1;
+ goto retry;
}
if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
@@ -2165,9 +2109,10 @@ ack:
__prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used,
want, retain, flushing, flush_tid, oldest_flush_tid);
- spin_unlock(&ci->i_ceph_lock);
+ spin_unlock(&ci->i_ceph_lock);
__send_cap(&arg, ci);
+ spin_lock(&ci->i_ceph_lock);
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
@@ -2182,13 +2127,9 @@ ack:
spin_unlock(&ci->i_ceph_lock);
+ ceph_put_mds_session(session);
if (queue_invalidate)
ceph_queue_invalidate(inode);
-
- if (session)
- mutex_unlock(&session->s_mutex);
- if (took_snap_rwsem)
- up_read(&mdsc->snap_rwsem);
}
/*
@@ -2198,26 +2139,17 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_session *session = NULL;
int flushing = 0;
u64 flush_tid = 0, oldest_flush_tid = 0;
-retry:
spin_lock(&ci->i_ceph_lock);
retry_locked:
if (ci->i_dirty_caps && ci->i_auth_cap) {
struct ceph_cap *cap = ci->i_auth_cap;
struct cap_msg_args arg;
+ struct ceph_mds_session *session = cap->session;
- if (session != cap->session) {
- spin_unlock(&ci->i_ceph_lock);
- if (session)
- mutex_unlock(&session->s_mutex);
- session = cap->session;
- mutex_lock(&session->s_mutex);
- goto retry;
- }
- if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) {
+ if (session->s_state < CEPH_MDS_SESSION_OPEN) {
spin_unlock(&ci->i_ceph_lock);
goto out;
}
@@ -2254,9 +2186,6 @@ retry_locked:
spin_unlock(&ci->i_ceph_lock);
}
out:
- if (session)
- mutex_unlock(&session->s_mutex);
-
*ptid = flush_tid;
return flushing;
}
@@ -3213,8 +3142,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
if (complete_capsnap)
wake_up_all(&ci->i_cap_wq);
while (put-- > 0) {
- /* avoid calling iput_final() in osd dispatch threads */
- ceph_async_iput(inode);
+ iput(inode);
}
}
@@ -3288,7 +3216,7 @@ static void handle_cap_grant(struct inode *inode,
u64 size = le64_to_cpu(grant->size);
u64 max_size = le64_to_cpu(grant->max_size);
unsigned char check_caps = 0;
- bool was_stale = cap->cap_gen < session->s_cap_gen;
+ bool was_stale = cap->cap_gen < atomic_read(&session->s_cap_gen);
bool wake = false;
bool writeback = false;
bool queue_trunc = false;
@@ -3340,7 +3268,7 @@ static void handle_cap_grant(struct inode *inode,
}
/* side effects now are allowed */
- cap->cap_gen = session->s_cap_gen;
+ cap->cap_gen = atomic_read(&session->s_cap_gen);
cap->seq = seq;
__check_cap_issue(ci, cap, newcaps);
@@ -3553,13 +3481,12 @@ static void handle_cap_grant(struct inode *inode,
if (wake)
wake_up_all(&ci->i_cap_wq);
+ mutex_unlock(&session->s_mutex);
if (check_caps == 1)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL,
session);
else if (check_caps == 2)
ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session);
- else
- mutex_unlock(&session->s_mutex);
}
/*
@@ -4203,8 +4130,7 @@ done:
mutex_unlock(&session->s_mutex);
done_unlocked:
ceph_put_string(extra_info.pool_ns);
- /* avoid calling iput_final() in mds dispatch threads */
- ceph_async_iput(inode);
+ iput(inode);
return;
flush_cap_releases:
@@ -4224,11 +4150,19 @@ bad:
/*
* Delayed work handler to process end of delayed cap release LRU list.
+ *
+ * If new caps are added to the list while processing it, these won't get
+ * processed in this run. In this case, the ci->i_hold_caps_max will be
+ * returned so that the work can be scheduled accordingly.
*/
-void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
+unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
{
struct inode *inode;
struct ceph_inode_info *ci;
+ struct ceph_mount_options *opt = mdsc->fsc->mount_options;
+ unsigned long delay_max = opt->caps_wanted_delay_max * HZ;
+ unsigned long loop_start = jiffies;
+ unsigned long delay = 0;
dout("check_delayed_caps\n");
spin_lock(&mdsc->cap_delay_lock);
@@ -4236,6 +4170,11 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
ci = list_first_entry(&mdsc->cap_delay_list,
struct ceph_inode_info,
i_cap_delay_list);
+ if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) {
+ dout("%s caps added recently. Exiting loop", __func__);
+ delay = ci->i_hold_caps_max;
+ break;
+ }
if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
time_before(jiffies, ci->i_hold_caps_max))
break;
@@ -4246,12 +4185,13 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
spin_unlock(&mdsc->cap_delay_lock);
dout("check_delayed_caps on %p\n", inode);
ceph_check_caps(ci, 0, NULL);
- /* avoid calling iput_final() in tick thread */
- ceph_async_iput(inode);
+ iput(inode);
spin_lock(&mdsc->cap_delay_lock);
}
}
spin_unlock(&mdsc->cap_delay_lock);
+
+ return delay;
}
/*
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 425f3356332a..38b78b45811f 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -127,7 +127,7 @@ static int mdsc_show(struct seq_file *s, void *p)
return 0;
}
-#define CEPH_METRIC_SHOW(name, total, avg, min, max, sq) { \
+#define CEPH_LAT_METRIC_SHOW(name, total, avg, min, max, sq) { \
s64 _total, _avg, _min, _max, _sq, _st; \
_avg = ktime_to_us(avg); \
_min = ktime_to_us(min == KTIME_MAX ? 0 : min); \
@@ -140,6 +140,12 @@ static int mdsc_show(struct seq_file *s, void *p)
name, total, _avg, _min, _max, _st); \
}
+#define CEPH_SZ_METRIC_SHOW(name, total, avg, min, max, sum) { \
+ u64 _min = min == U64_MAX ? 0 : min; \
+ seq_printf(s, "%-14s%-12lld%-16llu%-16llu%-16llu%llu\n", \
+ name, total, avg, _min, max, sum); \
+}
+
static int metric_show(struct seq_file *s, void *p)
{
struct ceph_fs_client *fsc = s->private;
@@ -147,6 +153,7 @@ static int metric_show(struct seq_file *s, void *p)
struct ceph_client_metric *m = &mdsc->metric;
int nr_caps = 0;
s64 total, sum, avg, min, max, sq;
+ u64 sum_sz, avg_sz, min_sz, max_sz;
sum = percpu_counter_sum(&m->total_inodes);
seq_printf(s, "item total\n");
@@ -170,7 +177,7 @@ static int metric_show(struct seq_file *s, void *p)
max = m->read_latency_max;
sq = m->read_latency_sq_sum;
spin_unlock(&m->read_metric_lock);
- CEPH_METRIC_SHOW("read", total, avg, min, max, sq);
+ CEPH_LAT_METRIC_SHOW("read", total, avg, min, max, sq);
spin_lock(&m->write_metric_lock);
total = m->total_writes;
@@ -180,7 +187,7 @@ static int metric_show(struct seq_file *s, void *p)
max = m->write_latency_max;
sq = m->write_latency_sq_sum;
spin_unlock(&m->write_metric_lock);
- CEPH_METRIC_SHOW("write", total, avg, min, max, sq);
+ CEPH_LAT_METRIC_SHOW("write", total, avg, min, max, sq);
spin_lock(&m->metadata_metric_lock);
total = m->total_metadatas;
@@ -190,7 +197,29 @@ static int metric_show(struct seq_file *s, void *p)
max = m->metadata_latency_max;
sq = m->metadata_latency_sq_sum;
spin_unlock(&m->metadata_metric_lock);
- CEPH_METRIC_SHOW("metadata", total, avg, min, max, sq);
+ CEPH_LAT_METRIC_SHOW("metadata", total, avg, min, max, sq);
+
+ seq_printf(s, "\n");
+ seq_printf(s, "item total avg_sz(bytes) min_sz(bytes) max_sz(bytes) total_sz(bytes)\n");
+ seq_printf(s, "----------------------------------------------------------------------------------------\n");
+
+ spin_lock(&m->read_metric_lock);
+ total = m->total_reads;
+ sum_sz = m->read_size_sum;
+ avg_sz = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum_sz, total) : 0;
+ min_sz = m->read_size_min;
+ max_sz = m->read_size_max;
+ spin_unlock(&m->read_metric_lock);
+ CEPH_SZ_METRIC_SHOW("read", total, avg_sz, min_sz, max_sz, sum_sz);
+
+ spin_lock(&m->write_metric_lock);
+ total = m->total_writes;
+ sum_sz = m->write_size_sum;
+ avg_sz = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum_sz, total) : 0;
+ min_sz = m->write_size_min;
+ max_sz = m->write_size_max;
+ spin_unlock(&m->write_metric_lock);
+ CEPH_SZ_METRIC_SHOW("write", total, avg_sz, min_sz, max_sz, sum_sz);
seq_printf(s, "\n");
seq_printf(s, "item total miss hit\n");
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 9ba79b6531fb..133dbd9338e7 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -788,6 +788,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
mask |= CEPH_CAP_XATTR_SHARED;
req->r_args.getattr.mask = cpu_to_le32(mask);
+ ihold(dir);
req->r_parent = dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
err = ceph_mdsc_do_request(mdsc, NULL, req);
@@ -868,6 +869,7 @@ static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir,
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_parent = dir;
+ ihold(dir);
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_args.mknod.mode = cpu_to_le32(mode);
req->r_args.mknod.rdev = cpu_to_le32(rdev);
@@ -929,6 +931,8 @@ static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir,
goto out;
}
req->r_parent = dir;
+ ihold(dir);
+
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
@@ -993,6 +997,7 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_parent = dir;
+ ihold(dir);
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_args.mkdir.mode = cpu_to_le32(mode);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
@@ -1037,6 +1042,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
req->r_num_caps = 2;
req->r_old_dentry = dget(old_dentry);
req->r_parent = dir;
+ ihold(dir);
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -1158,6 +1164,7 @@ retry:
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_parent = dir;
+ ihold(dir);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
@@ -1232,6 +1239,7 @@ static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
req->r_old_dentry = dget(old_dentry);
req->r_old_dentry_dir = old_dir;
req->r_parent = new_dir;
+ ihold(new_dir);
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -1548,10 +1556,8 @@ static bool __dentry_lease_is_valid(struct ceph_dentry_info *di)
u32 gen;
unsigned long ttl;
- spin_lock(&session->s_gen_ttl_lock);
- gen = session->s_cap_gen;
+ gen = atomic_read(&session->s_cap_gen);
ttl = session->s_cap_ttl;
- spin_unlock(&session->s_gen_ttl_lock);
if (di->lease_gen == gen &&
time_before(jiffies, ttl) &&
@@ -1730,6 +1736,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_parent = dir;
+ ihold(dir);
mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
if (ceph_security_xattr_wanted(dir))
@@ -1809,8 +1816,7 @@ static void ceph_d_release(struct dentry *dentry)
dentry->d_fsdata = NULL;
spin_unlock(&dentry->d_lock);
- if (di->lease_session)
- ceph_put_mds_session(di->lease_session);
+ ceph_put_mds_session(di->lease_session);
kmem_cache_free(ceph_dentry_cachep, di);
}
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 65540a4429b2..1d65934c1262 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -542,6 +542,7 @@ static int ceph_get_name(struct dentry *parent, char *name,
ihold(inode);
req->r_ino2 = ceph_vino(d_inode(parent));
req->r_parent = d_inode(parent);
+ ihold(req->r_parent);
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_num_caps = 2;
err = ceph_mdsc_do_request(mdsc, NULL, req);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d51af3698032..d1755ac1d964 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -706,6 +706,7 @@ retry:
mask |= CEPH_CAP_XATTR_SHARED;
req->r_args.open.mask = cpu_to_le32(mask);
req->r_parent = dir;
+ ihold(dir);
if (flags & O_CREAT) {
struct ceph_file_layout lo;
@@ -903,7 +904,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
ceph_update_read_metrics(&fsc->mdsc->metric,
req->r_start_latency,
req->r_end_latency,
- ret);
+ len, ret);
ceph_osdc_put_request(req);
@@ -1035,12 +1036,12 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
struct ceph_aio_request *aio_req = req->r_priv;
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
+ unsigned int len = osd_data->bvec_pos.iter.bi_size;
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
BUG_ON(!osd_data->num_bvecs);
- dout("ceph_aio_complete_req %p rc %d bytes %u\n",
- inode, rc, osd_data->bvec_pos.iter.bi_size);
+ dout("ceph_aio_complete_req %p rc %d bytes %u\n", inode, rc, len);
if (rc == -EOLDSNAPC) {
struct ceph_aio_work *aio_work;
@@ -1058,9 +1059,9 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
} else if (!aio_req->write) {
if (rc == -ENOENT)
rc = 0;
- if (rc >= 0 && osd_data->bvec_pos.iter.bi_size > rc) {
+ if (rc >= 0 && len > rc) {
struct iov_iter i;
- int zlen = osd_data->bvec_pos.iter.bi_size - rc;
+ int zlen = len - rc;
/*
* If read is satisfied by single OSD request,
@@ -1077,8 +1078,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
}
iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs,
- osd_data->num_bvecs,
- osd_data->bvec_pos.iter.bi_size);
+ osd_data->num_bvecs, len);
iov_iter_advance(&i, rc);
iov_iter_zero(zlen, &i);
}
@@ -1088,10 +1088,10 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
if (req->r_start_latency) {
if (aio_req->write)
ceph_update_write_metrics(metric, req->r_start_latency,
- req->r_end_latency, rc);
+ req->r_end_latency, len, rc);
else
ceph_update_read_metrics(metric, req->r_start_latency,
- req->r_end_latency, rc);
+ req->r_end_latency, len, rc);
}
put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs,
@@ -1299,10 +1299,10 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
if (write)
ceph_update_write_metrics(metric, req->r_start_latency,
- req->r_end_latency, ret);
+ req->r_end_latency, len, ret);
else
ceph_update_read_metrics(metric, req->r_start_latency,
- req->r_end_latency, ret);
+ req->r_end_latency, len, ret);
size = i_size_read(inode);
if (!write) {
@@ -1476,7 +1476,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
- req->r_end_latency, ret);
+ req->r_end_latency, len, ret);
out:
ceph_osdc_put_request(req);
if (ret != 0) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index df0c8a724609..1bd2cc015913 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1124,7 +1124,7 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
return;
}
- if (di->lease_gen == session->s_cap_gen &&
+ if (di->lease_gen == atomic_read(&session->s_cap_gen) &&
time_before(ttl, di->time))
return; /* we already have a newer lease. */
@@ -1135,7 +1135,7 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
if (!di->lease_session)
di->lease_session = ceph_get_mds_session(session);
- di->lease_gen = session->s_cap_gen;
+ di->lease_gen = atomic_read(&session->s_cap_gen);
di->lease_seq = le32_to_cpu(lease->seq);
di->lease_renew_after = half_ttl;
di->lease_renew_from = 0;
@@ -1154,8 +1154,7 @@ static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry,
__update_dentry_lease(dir, dentry, lease, session, from_time,
&old_lease_session);
spin_unlock(&dentry->d_lock);
- if (old_lease_session)
- ceph_put_mds_session(old_lease_session);
+ ceph_put_mds_session(old_lease_session);
}
/*
@@ -1200,8 +1199,7 @@ static void update_dentry_lease_careful(struct dentry *dentry,
from_time, &old_lease_session);
out_unlock:
spin_unlock(&dentry->d_lock);
- if (old_lease_session)
- ceph_put_mds_session(old_lease_session);
+ ceph_put_mds_session(old_lease_session);
}
/*
@@ -1568,8 +1566,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
unlock_new_inode(in);
}
- /* avoid calling iput_final() in mds dispatch threads */
- ceph_async_iput(in);
+ iput(in);
}
return err;
@@ -1766,13 +1763,11 @@ retry_lookup:
if (ret < 0) {
pr_err("ceph_fill_inode badness on %p\n", in);
if (d_really_is_negative(dn)) {
- /* avoid calling iput_final() in mds
- * dispatch threads */
if (in->i_state & I_NEW) {
ihold(in);
discard_new_inode(in);
}
- ceph_async_iput(in);
+ iput(in);
}
d_drop(dn);
err = ret;
@@ -1785,7 +1780,7 @@ retry_lookup:
if (ceph_security_xattr_deadlock(in)) {
dout(" skip splicing dn %p to inode %p"
" (security xattr deadlock)\n", dn, in);
- ceph_async_iput(in);
+ iput(in);
skipped++;
goto next_item;
}
@@ -1836,25 +1831,6 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size)
return ret;
}
-/*
- * Put reference to inode, but avoid calling iput_final() in current thread.
- * iput_final() may wait for reahahead pages. The wait can cause deadlock in
- * some contexts.
- */
-void ceph_async_iput(struct inode *inode)
-{
- if (!inode)
- return;
- for (;;) {
- if (atomic_add_unless(&inode->i_count, -1, 1))
- break;
- if (queue_work(ceph_inode_to_client(inode)->inode_wq,
- &ceph_inode(inode)->i_work))
- break;
- /* queue work failed, i_count must be at least 2 */
- }
-}
-
void ceph_queue_inode_work(struct inode *inode, int work_bit)
{
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index e5af591d3bd4..afdc20213876 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -664,6 +664,9 @@ struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s)
void ceph_put_mds_session(struct ceph_mds_session *s)
{
+ if (IS_ERR_OR_NULL(s))
+ return;
+
dout("mdsc put_session %p %d -> %d\n", s,
refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1);
if (refcount_dec_and_test(&s->s_ref)) {
@@ -746,8 +749,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
- spin_lock_init(&s->s_gen_ttl_lock);
- s->s_cap_gen = 1;
+ atomic_set(&s->s_cap_gen, 1);
s->s_cap_ttl = jiffies - 1;
spin_lock_init(&s->s_cap_lock);
@@ -822,14 +824,13 @@ void ceph_mdsc_release_request(struct kref *kref)
ceph_msg_put(req->r_reply);
if (req->r_inode) {
ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
- /* avoid calling iput_final() in mds dispatch threads */
- ceph_async_iput(req->r_inode);
+ iput(req->r_inode);
}
if (req->r_parent) {
ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
- ceph_async_iput(req->r_parent);
+ iput(req->r_parent);
}
- ceph_async_iput(req->r_target_inode);
+ iput(req->r_target_inode);
if (req->r_dentry)
dput(req->r_dentry);
if (req->r_old_dentry)
@@ -843,7 +844,7 @@ void ceph_mdsc_release_request(struct kref *kref)
*/
ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
- ceph_async_iput(req->r_old_dentry_dir);
+ iput(req->r_old_dentry_dir);
}
kfree(req->r_path1);
kfree(req->r_path2);
@@ -958,8 +959,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
}
if (req->r_unsafe_dir) {
- /* avoid calling iput_final() in mds dispatch threads */
- ceph_async_iput(req->r_unsafe_dir);
+ iput(req->r_unsafe_dir);
req->r_unsafe_dir = NULL;
}
@@ -1130,7 +1130,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
if (!cap) {
spin_unlock(&ci->i_ceph_lock);
- ceph_async_iput(inode);
+ iput(inode);
goto random;
}
mds = cap->session->s_mds;
@@ -1139,9 +1139,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
cap == ci->i_auth_cap ? "auth " : "", cap);
spin_unlock(&ci->i_ceph_lock);
out:
- /* avoid calling iput_final() while holding mdsc->mutex or
- * in mds dispatch threads */
- ceph_async_iput(inode);
+ iput(inode);
return mds;
random:
@@ -1438,8 +1436,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
for (i = 0; i < mi->num_export_targets; i++) {
ts = __open_export_target_session(mdsc, mi->export_targets[i]);
- if (!IS_ERR(ts))
- ceph_put_mds_session(ts);
+ ceph_put_mds_session(ts);
}
}
@@ -1545,9 +1542,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
spin_unlock(&session->s_cap_lock);
if (last_inode) {
- /* avoid calling iput_final() while holding
- * s_mutex or in mds dispatch threads */
- ceph_async_iput(last_inode);
+ iput(last_inode);
last_inode = NULL;
}
if (old_cap) {
@@ -1581,7 +1576,7 @@ out:
session->s_cap_iterator = NULL;
spin_unlock(&session->s_cap_lock);
- ceph_async_iput(last_inode);
+ iput(last_inode);
if (old_cap)
ceph_put_cap(session->s_mdsc, old_cap);
@@ -1721,8 +1716,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
spin_unlock(&session->s_cap_lock);
inode = ceph_find_inode(sb, vino);
- /* avoid calling iput_final() while holding s_mutex */
- ceph_async_iput(inode);
+ iput(inode);
spin_lock(&session->s_cap_lock);
}
@@ -1761,7 +1755,7 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
ci->i_requested_max_size = 0;
spin_unlock(&ci->i_ceph_lock);
} else if (ev == RENEWCAPS) {
- if (cap->cap_gen < cap->session->s_cap_gen) {
+ if (cap->cap_gen < atomic_read(&cap->session->s_cap_gen)) {
/* mds did not re-issue stale cap */
spin_lock(&ci->i_ceph_lock);
cap->issued = cap->implemented = CEPH_CAP_PIN;
@@ -2988,7 +2982,6 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
ceph_take_cap_refs(ci, CEPH_CAP_PIN, false);
__ceph_touch_fmode(ci, mdsc, fmode);
spin_unlock(&ci->i_ceph_lock);
- ihold(req->r_parent);
}
if (req->r_old_dentry_dir)
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
@@ -3499,10 +3492,8 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_STALE:
pr_info("mds%d caps went stale, renewing\n",
session->s_mds);
- spin_lock(&session->s_gen_ttl_lock);
- session->s_cap_gen++;
+ atomic_inc(&session->s_cap_gen);
session->s_cap_ttl = jiffies - 1;
- spin_unlock(&session->s_gen_ttl_lock);
send_renew_caps(mdsc, session);
break;
@@ -3771,7 +3762,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
cap->mseq = 0; /* and migrate_seq */
- cap->cap_gen = cap->session->s_cap_gen;
+ cap->cap_gen = atomic_read(&cap->session->s_cap_gen);
/* These are lost when the session goes away */
if (S_ISDIR(inode->i_mode)) {
@@ -4011,9 +4002,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
dout("session %p state %s\n", session,
ceph_session_state_name(session->s_state));
- spin_lock(&session->s_gen_ttl_lock);
- session->s_cap_gen++;
- spin_unlock(&session->s_gen_ttl_lock);
+ atomic_inc(&session->s_cap_gen);
spin_lock(&session->s_cap_lock);
/* don't know if session is readonly */
@@ -4344,7 +4333,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
case CEPH_MDS_LEASE_RENEW:
if (di->lease_session == session &&
- di->lease_gen == session->s_cap_gen &&
+ di->lease_gen == atomic_read(&session->s_cap_gen) &&
di->lease_renew_from &&
di->lease_renew_after == 0) {
unsigned long duration =
@@ -4372,8 +4361,7 @@ release:
out:
mutex_unlock(&session->s_mutex);
- /* avoid calling iput_final() in mds dispatch threads */
- ceph_async_iput(inode);
+ iput(inode);
return;
bad:
@@ -4468,7 +4456,7 @@ bool check_session_state(struct ceph_mds_session *s)
break;
case CEPH_MDS_SESSION_CLOSING:
/* Should never reach this when we're unmounting */
- WARN_ON_ONCE(true);
+ WARN_ON_ONCE(s->s_ttl);
fallthrough;
case CEPH_MDS_SESSION_NEW:
case CEPH_MDS_SESSION_RESTARTING:
@@ -4502,22 +4490,29 @@ void inc_session_sequence(struct ceph_mds_session *s)
}
/*
- * delayed work -- periodically trim expired leases, renew caps with mds
+ * delayed work -- periodically trim expired leases, renew caps with mds. If
+ * the @delay parameter is set to 0 or if it's more than 5 secs, the default
+ * workqueue delay value of 5 secs will be used.
*/
-static void schedule_delayed(struct ceph_mds_client *mdsc)
+static void schedule_delayed(struct ceph_mds_client *mdsc, unsigned long delay)
{
- int delay = 5;
- unsigned hz = round_jiffies_relative(HZ * delay);
- schedule_delayed_work(&mdsc->delayed_work, hz);
+ unsigned long max_delay = HZ * 5;
+
+ /* 5 secs default delay */
+ if (!delay || (delay > max_delay))
+ delay = max_delay;
+ schedule_delayed_work(&mdsc->delayed_work,
+ round_jiffies_relative(delay));
}
static void delayed_work(struct work_struct *work)
{
- int i;
struct ceph_mds_client *mdsc =
container_of(work, struct ceph_mds_client, delayed_work.work);
+ unsigned long delay;
int renew_interval;
int renew_caps;
+ int i;
dout("mdsc delayed_work\n");
@@ -4557,7 +4552,7 @@ static void delayed_work(struct work_struct *work)
}
mutex_unlock(&mdsc->mutex);
- ceph_check_delayed_caps(mdsc);
+ delay = ceph_check_delayed_caps(mdsc);
ceph_queue_cap_reclaim_work(mdsc);
@@ -4565,7 +4560,7 @@ static void delayed_work(struct work_struct *work)
maybe_recover_session(mdsc);
- schedule_delayed(mdsc);
+ schedule_delayed(mdsc, delay);
}
int ceph_mdsc_init(struct ceph_fs_client *fsc)
@@ -5042,7 +5037,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
mdsc->mdsmap->m_epoch);
mutex_unlock(&mdsc->mutex);
- schedule_delayed(mdsc);
+ schedule_delayed(mdsc, 0);
return;
bad_unlock:
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 15c11a0f2caf..20e42d8b66c6 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -186,10 +186,8 @@ struct ceph_mds_session {
struct ceph_auth_handshake s_auth;
- /* protected by s_gen_ttl_lock */
- spinlock_t s_gen_ttl_lock;
- u32 s_cap_gen; /* inc each time we get mds stale msg */
- unsigned long s_cap_ttl; /* when session caps expire */
+ atomic_t s_cap_gen; /* inc each time we get mds stale msg */
+ unsigned long s_cap_ttl; /* when session caps expire. protected by s_mutex */
/* protected by s_cap_lock */
spinlock_t s_cap_lock;
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index 28b6b42ad677..5ac151eb0d49 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -20,8 +20,11 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
struct ceph_opened_files *files;
struct ceph_pinned_icaps *icaps;
struct ceph_opened_inodes *inodes;
+ struct ceph_read_io_size *rsize;
+ struct ceph_write_io_size *wsize;
struct ceph_client_metric *m = &mdsc->metric;
u64 nr_caps = atomic64_read(&m->total_caps);
+ u32 header_len = sizeof(struct ceph_metric_header);
struct ceph_msg *msg;
struct timespec64 ts;
s64 sum;
@@ -30,7 +33,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write)
+ sizeof(*meta) + sizeof(*dlease) + sizeof(*files)
- + sizeof(*icaps) + sizeof(*inodes);
+ + sizeof(*icaps) + sizeof(*inodes) + sizeof(*rsize)
+ + sizeof(*wsize);
msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
if (!msg) {
@@ -43,10 +47,10 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
/* encode the cap metric */
cap = (struct ceph_metric_cap *)(head + 1);
- cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
- cap->ver = 1;
- cap->compat = 1;
- cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
+ cap->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
+ cap->header.ver = 1;
+ cap->header.compat = 1;
+ cap->header.data_len = cpu_to_le32(sizeof(*cap) - header_len);
cap->hit = cpu_to_le64(percpu_counter_sum(&m->i_caps_hit));
cap->mis = cpu_to_le64(percpu_counter_sum(&m->i_caps_mis));
cap->total = cpu_to_le64(nr_caps);
@@ -54,10 +58,10 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
/* encode the read latency metric */
read = (struct ceph_metric_read_latency *)(cap + 1);
- read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
- read->ver = 1;
- read->compat = 1;
- read->data_len = cpu_to_le32(sizeof(*read) - 10);
+ read->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
+ read->header.ver = 1;
+ read->header.compat = 1;
+ read->header.data_len = cpu_to_le32(sizeof(*read) - header_len);
sum = m->read_latency_sum;
jiffies_to_timespec64(sum, &ts);
read->sec = cpu_to_le32(ts.tv_sec);
@@ -66,10 +70,10 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
/* encode the write latency metric */
write = (struct ceph_metric_write_latency *)(read + 1);
- write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
- write->ver = 1;
- write->compat = 1;
- write->data_len = cpu_to_le32(sizeof(*write) - 10);
+ write->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
+ write->header.ver = 1;
+ write->header.compat = 1;
+ write->header.data_len = cpu_to_le32(sizeof(*write) - header_len);
sum = m->write_latency_sum;
jiffies_to_timespec64(sum, &ts);
write->sec = cpu_to_le32(ts.tv_sec);
@@ -78,10 +82,10 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
/* encode the metadata latency metric */
meta = (struct ceph_metric_metadata_latency *)(write + 1);
- meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
- meta->ver = 1;
- meta->compat = 1;
- meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
+ meta->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
+ meta->header.ver = 1;
+ meta->header.compat = 1;
+ meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len);
sum = m->metadata_latency_sum;
jiffies_to_timespec64(sum, &ts);
meta->sec = cpu_to_le32(ts.tv_sec);
@@ -90,10 +94,10 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
/* encode the dentry lease metric */
dlease = (struct ceph_metric_dlease *)(meta + 1);
- dlease->type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE);
- dlease->ver = 1;
- dlease->compat = 1;
- dlease->data_len = cpu_to_le32(sizeof(*dlease) - 10);
+ dlease->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE);
+ dlease->header.ver = 1;
+ dlease->header.compat = 1;
+ dlease->header.data_len = cpu_to_le32(sizeof(*dlease) - header_len);
dlease->hit = cpu_to_le64(percpu_counter_sum(&m->d_lease_hit));
dlease->mis = cpu_to_le64(percpu_counter_sum(&m->d_lease_mis));
dlease->total = cpu_to_le64(atomic64_read(&m->total_dentries));
@@ -103,34 +107,54 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
/* encode the opened files metric */
files = (struct ceph_opened_files *)(dlease + 1);
- files->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_FILES);
- files->ver = 1;
- files->compat = 1;
- files->data_len = cpu_to_le32(sizeof(*files) - 10);
+ files->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_FILES);
+ files->header.ver = 1;
+ files->header.compat = 1;
+ files->header.data_len = cpu_to_le32(sizeof(*files) - header_len);
files->opened_files = cpu_to_le64(atomic64_read(&m->opened_files));
files->total = cpu_to_le64(sum);
items++;
/* encode the pinned icaps metric */
icaps = (struct ceph_pinned_icaps *)(files + 1);
- icaps->type = cpu_to_le32(CLIENT_METRIC_TYPE_PINNED_ICAPS);
- icaps->ver = 1;
- icaps->compat = 1;
- icaps->data_len = cpu_to_le32(sizeof(*icaps) - 10);
+ icaps->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_PINNED_ICAPS);
+ icaps->header.ver = 1;
+ icaps->header.compat = 1;
+ icaps->header.data_len = cpu_to_le32(sizeof(*icaps) - header_len);
icaps->pinned_icaps = cpu_to_le64(nr_caps);
icaps->total = cpu_to_le64(sum);
items++;
/* encode the opened inodes metric */
inodes = (struct ceph_opened_inodes *)(icaps + 1);
- inodes->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_INODES);
- inodes->ver = 1;
- inodes->compat = 1;
- inodes->data_len = cpu_to_le32(sizeof(*inodes) - 10);
+ inodes->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_INODES);
+ inodes->header.ver = 1;
+ inodes->header.compat = 1;
+ inodes->header.data_len = cpu_to_le32(sizeof(*inodes) - header_len);
inodes->opened_inodes = cpu_to_le64(percpu_counter_sum(&m->opened_inodes));
inodes->total = cpu_to_le64(sum);
items++;
+ /* encode the read io size metric */
+ rsize = (struct ceph_read_io_size *)(inodes + 1);
+ rsize->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_IO_SIZES);
+ rsize->header.ver = 1;
+ rsize->header.compat = 1;
+ rsize->header.data_len = cpu_to_le32(sizeof(*rsize) - header_len);
+ rsize->total_ops = cpu_to_le64(m->total_reads);
+ rsize->total_size = cpu_to_le64(m->read_size_sum);
+ items++;
+
+ /* encode the write io size metric */
+ wsize = (struct ceph_write_io_size *)(rsize + 1);
+ wsize->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_IO_SIZES);
+ wsize->header.ver = 1;
+ wsize->header.compat = 1;
+ wsize->header.data_len = cpu_to_le32(sizeof(*wsize) - header_len);
+ wsize->total_ops = cpu_to_le64(m->total_writes);
+ wsize->total_size = cpu_to_le64(m->write_size_sum);
+ items++;
+
put_unaligned_le32(items, &head->num);
msg->front.iov_len = len;
msg->hdr.version = cpu_to_le16(1);
@@ -225,6 +249,9 @@ int ceph_metric_init(struct ceph_client_metric *m)
m->read_latency_max = 0;
m->total_reads = 0;
m->read_latency_sum = 0;
+ m->read_size_min = U64_MAX;
+ m->read_size_max = 0;
+ m->read_size_sum = 0;
spin_lock_init(&m->write_metric_lock);
m->write_latency_sq_sum = 0;
@@ -232,6 +259,9 @@ int ceph_metric_init(struct ceph_client_metric *m)
m->write_latency_max = 0;
m->total_writes = 0;
m->write_latency_sum = 0;
+ m->write_size_min = U64_MAX;
+ m->write_size_max = 0;
+ m->write_size_sum = 0;
spin_lock_init(&m->metadata_metric_lock);
m->metadata_latency_sq_sum = 0;
@@ -281,23 +311,21 @@ void ceph_metric_destroy(struct ceph_client_metric *m)
cancel_delayed_work_sync(&m->delayed_work);
- if (m->session)
- ceph_put_mds_session(m->session);
+ ceph_put_mds_session(m->session);
}
-static inline void __update_latency(ktime_t *totalp, ktime_t *lsump,
- ktime_t *min, ktime_t *max,
- ktime_t *sq_sump, ktime_t lat)
-{
- ktime_t total, avg, sq, lsum;
-
- total = ++(*totalp);
- lsum = (*lsump += lat);
+#define METRIC_UPDATE_MIN_MAX(min, max, new) \
+{ \
+ if (unlikely(new < min)) \
+ min = new; \
+ if (unlikely(new > max)) \
+ max = new; \
+}
- if (unlikely(lat < *min))
- *min = lat;
- if (unlikely(lat > *max))
- *max = lat;
+static inline void __update_stdev(ktime_t total, ktime_t lsum,
+ ktime_t *sq_sump, ktime_t lat)
+{
+ ktime_t avg, sq;
if (unlikely(total == 1))
return;
@@ -312,33 +340,51 @@ static inline void __update_latency(ktime_t *totalp, ktime_t *lsump,
void ceph_update_read_metrics(struct ceph_client_metric *m,
ktime_t r_start, ktime_t r_end,
- int rc)
+ unsigned int size, int rc)
{
ktime_t lat = ktime_sub(r_end, r_start);
+ ktime_t total;
if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT))
return;
spin_lock(&m->read_metric_lock);
- __update_latency(&m->total_reads, &m->read_latency_sum,
- &m->read_latency_min, &m->read_latency_max,
- &m->read_latency_sq_sum, lat);
+ total = ++m->total_reads;
+ m->read_size_sum += size;
+ m->read_latency_sum += lat;
+ METRIC_UPDATE_MIN_MAX(m->read_size_min,
+ m->read_size_max,
+ size);
+ METRIC_UPDATE_MIN_MAX(m->read_latency_min,
+ m->read_latency_max,
+ lat);
+ __update_stdev(total, m->read_latency_sum,
+ &m->read_latency_sq_sum, lat);
spin_unlock(&m->read_metric_lock);
}
void ceph_update_write_metrics(struct ceph_client_metric *m,
ktime_t r_start, ktime_t r_end,
- int rc)
+ unsigned int size, int rc)
{
ktime_t lat = ktime_sub(r_end, r_start);
+ ktime_t total;
if (unlikely(rc && rc != -ETIMEDOUT))
return;
spin_lock(&m->write_metric_lock);
- __update_latency(&m->total_writes, &m->write_latency_sum,
- &m->write_latency_min, &m->write_latency_max,
- &m->write_latency_sq_sum, lat);
+ total = ++m->total_writes;
+ m->write_size_sum += size;
+ m->write_latency_sum += lat;
+ METRIC_UPDATE_MIN_MAX(m->write_size_min,
+ m->write_size_max,
+ size);
+ METRIC_UPDATE_MIN_MAX(m->write_latency_min,
+ m->write_latency_max,
+ lat);
+ __update_stdev(total, m->write_latency_sum,
+ &m->write_latency_sq_sum, lat);
spin_unlock(&m->write_metric_lock);
}
@@ -347,13 +393,18 @@ void ceph_update_metadata_metrics(struct ceph_client_metric *m,
int rc)
{
ktime_t lat = ktime_sub(r_end, r_start);
+ ktime_t total;
if (unlikely(rc && rc != -ENOENT))
return;
spin_lock(&m->metadata_metric_lock);
- __update_latency(&m->total_metadatas, &m->metadata_latency_sum,
- &m->metadata_latency_min, &m->metadata_latency_max,
- &m->metadata_latency_sq_sum, lat);
+ total = ++m->total_metadatas;
+ m->metadata_latency_sum += lat;
+ METRIC_UPDATE_MIN_MAX(m->metadata_latency_min,
+ m->metadata_latency_max,
+ lat);
+ __update_stdev(total, m->metadata_latency_sum,
+ &m->metadata_latency_sq_sum, lat);
spin_unlock(&m->metadata_metric_lock);
}
diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
index e984eb2bb14b..0133955a3c6a 100644
--- a/fs/ceph/metric.h
+++ b/fs/ceph/metric.h
@@ -17,8 +17,10 @@ enum ceph_metric_type {
CLIENT_METRIC_TYPE_OPENED_FILES,
CLIENT_METRIC_TYPE_PINNED_ICAPS,
CLIENT_METRIC_TYPE_OPENED_INODES,
+ CLIENT_METRIC_TYPE_READ_IO_SIZES,
+ CLIENT_METRIC_TYPE_WRITE_IO_SIZES,
- CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_OPENED_INODES,
+ CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_WRITE_IO_SIZES,
};
/*
@@ -34,18 +36,22 @@ enum ceph_metric_type {
CLIENT_METRIC_TYPE_OPENED_FILES, \
CLIENT_METRIC_TYPE_PINNED_ICAPS, \
CLIENT_METRIC_TYPE_OPENED_INODES, \
+ CLIENT_METRIC_TYPE_READ_IO_SIZES, \
+ CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \
\
CLIENT_METRIC_TYPE_MAX, \
}
-/* metric caps header */
-struct ceph_metric_cap {
+struct ceph_metric_header {
__le32 type; /* ceph metric type */
-
__u8 ver;
__u8 compat;
-
__le32 data_len; /* length of sizeof(hit + mis + total) */
+} __packed;
+
+/* metric caps header */
+struct ceph_metric_cap {
+ struct ceph_metric_header header;
__le64 hit;
__le64 mis;
__le64 total;
@@ -53,48 +59,28 @@ struct ceph_metric_cap {
/* metric read latency header */
struct ceph_metric_read_latency {
- __le32 type; /* ceph metric type */
-
- __u8 ver;
- __u8 compat;
-
- __le32 data_len; /* length of sizeof(sec + nsec) */
+ struct ceph_metric_header header;
__le32 sec;
__le32 nsec;
} __packed;
/* metric write latency header */
struct ceph_metric_write_latency {
- __le32 type; /* ceph metric type */
-
- __u8 ver;
- __u8 compat;
-
- __le32 data_len; /* length of sizeof(sec + nsec) */
+ struct ceph_metric_header header;
__le32 sec;
__le32 nsec;
} __packed;
/* metric metadata latency header */
struct ceph_metric_metadata_latency {
- __le32 type; /* ceph metric type */
-
- __u8 ver;
- __u8 compat;
-
- __le32 data_len; /* length of sizeof(sec + nsec) */
+ struct ceph_metric_header header;
__le32 sec;
__le32 nsec;
} __packed;
/* metric dentry lease header */
struct ceph_metric_dlease {
- __le32 type; /* ceph metric type */
-
- __u8 ver;
- __u8 compat;
-
- __le32 data_len; /* length of sizeof(hit + mis + total) */
+ struct ceph_metric_header header;
__le64 hit;
__le64 mis;
__le64 total;
@@ -102,40 +88,39 @@ struct ceph_metric_dlease {
/* metric opened files header */
struct ceph_opened_files {
- __le32 type; /* ceph metric type */
-
- __u8 ver;
- __u8 compat;
-
- __le32 data_len; /* length of sizeof(opened_files + total) */
+ struct ceph_metric_header header;
__le64 opened_files;
__le64 total;
} __packed;
/* metric pinned i_caps header */
struct ceph_pinned_icaps {
- __le32 type; /* ceph metric type */
-
- __u8 ver;
- __u8 compat;
-
- __le32 data_len; /* length of sizeof(pinned_icaps + total) */
+ struct ceph_metric_header header;
__le64 pinned_icaps;
__le64 total;
} __packed;
/* metric opened inodes header */
struct ceph_opened_inodes {
- __le32 type; /* ceph metric type */
-
- __u8 ver;
- __u8 compat;
-
- __le32 data_len; /* length of sizeof(opened_inodes + total) */
+ struct ceph_metric_header header;
__le64 opened_inodes;
__le64 total;
} __packed;
+/* metric read io size header */
+struct ceph_read_io_size {
+ struct ceph_metric_header header;
+ __le64 total_ops;
+ __le64 total_size;
+} __packed;
+
+/* metric write io size header */
+struct ceph_write_io_size {
+ struct ceph_metric_header header;
+ __le64 total_ops;
+ __le64 total_size;
+} __packed;
+
struct ceph_metric_head {
__le32 num; /* the number of metrics that will be sent */
} __packed;
@@ -152,6 +137,9 @@ struct ceph_client_metric {
spinlock_t read_metric_lock;
u64 total_reads;
+ u64 read_size_sum;
+ u64 read_size_min;
+ u64 read_size_max;
ktime_t read_latency_sum;
ktime_t read_latency_sq_sum;
ktime_t read_latency_min;
@@ -159,6 +147,9 @@ struct ceph_client_metric {
spinlock_t write_metric_lock;
u64 total_writes;
+ u64 write_size_sum;
+ u64 write_size_min;
+ u64 write_size_max;
ktime_t write_latency_sum;
ktime_t write_latency_sq_sum;
ktime_t write_latency_min;
@@ -206,10 +197,10 @@ static inline void ceph_update_cap_mis(struct ceph_client_metric *m)
extern void ceph_update_read_metrics(struct ceph_client_metric *m,
ktime_t r_start, ktime_t r_end,
- int rc);
+ unsigned int size, int rc);
extern void ceph_update_write_metrics(struct ceph_client_metric *m,
ktime_t r_start, ktime_t r_end,
- int rc);
+ unsigned int size, int rc);
extern void ceph_update_metadata_metrics(struct ceph_client_metric *m,
ktime_t r_start, ktime_t r_end,
int rc);
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 4e32c9600ecc..620c691af40e 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -74,8 +74,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
le64_to_cpu(h->max_files));
spin_unlock(&ci->i_ceph_lock);
- /* avoid calling iput_final() in dispatch thread */
- ceph_async_iput(inode);
+ iput(inode);
}
static struct ceph_quotarealm_inode *
@@ -247,8 +246,7 @@ restart:
ci = ceph_inode(in);
has_quota = __ceph_has_any_quota(ci);
- /* avoid calling iput_final() while holding mdsc->snap_rwsem */
- ceph_async_iput(in);
+ iput(in);
next = realm->parent;
if (has_quota || !next)
@@ -383,8 +381,7 @@ restart:
pr_warn("Invalid quota check op (%d)\n", op);
exceeded = true; /* Just break the loop */
}
- /* avoid calling iput_final() while holding mdsc->snap_rwsem */
- ceph_async_iput(in);
+ iput(in);
next = realm->parent;
if (exceeded || !next)
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 4ce18055d931..4c6bd1042c94 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -60,24 +60,26 @@
/*
* increase ref count for the realm
*
- * caller must hold snap_rwsem for write.
+ * caller must hold snap_rwsem.
*/
void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
- dout("get_realm %p %d -> %d\n", realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
+ lockdep_assert_held(&mdsc->snap_rwsem);
+
/*
- * since we _only_ increment realm refs or empty the empty
- * list with snap_rwsem held, adjusting the empty list here is
- * safe. we do need to protect against concurrent empty list
- * additions, however.
+ * The 0->1 and 1->0 transitions must take the snap_empty_lock
+ * atomically with the refcount change. Go ahead and bump the
+ * nref here, unless it's 0, in which case we take the spinlock
+ * and then do the increment and remove it from the list.
*/
- if (atomic_inc_return(&realm->nref) == 1) {
- spin_lock(&mdsc->snap_empty_lock);
+ if (atomic_inc_not_zero(&realm->nref))
+ return;
+
+ spin_lock(&mdsc->snap_empty_lock);
+ if (atomic_inc_return(&realm->nref) == 1)
list_del_init(&realm->empty_item);
- spin_unlock(&mdsc->snap_empty_lock);
- }
+ spin_unlock(&mdsc->snap_empty_lock);
}
static void __insert_snap_realm(struct rb_root *root,
@@ -113,6 +115,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
{
struct ceph_snap_realm *realm;
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
realm = kzalloc(sizeof(*realm), GFP_NOFS);
if (!realm)
return ERR_PTR(-ENOMEM);
@@ -135,7 +139,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
/*
* lookup the realm rooted at @ino.
*
- * caller must hold snap_rwsem for write.
+ * caller must hold snap_rwsem.
*/
static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
u64 ino)
@@ -143,6 +147,8 @@ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
struct rb_node *n = mdsc->snap_realms.rb_node;
struct ceph_snap_realm *r;
+ lockdep_assert_held(&mdsc->snap_rwsem);
+
while (n) {
r = rb_entry(n, struct ceph_snap_realm, node);
if (ino < r->ino)
@@ -176,6 +182,8 @@ static void __put_snap_realm(struct ceph_mds_client *mdsc,
static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
rb_erase(&realm->node, &mdsc->snap_realms);
@@ -198,28 +206,30 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
static void __put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
- dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
+ /*
+ * We do not require the snap_empty_lock here, as any caller that
+ * increments the value must hold the snap_rwsem.
+ */
if (atomic_dec_and_test(&realm->nref))
__destroy_snap_realm(mdsc, realm);
}
/*
- * caller needn't hold any locks
+ * See comments in ceph_get_snap_realm. Caller needn't hold any locks.
*/
void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
- dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
- if (!atomic_dec_and_test(&realm->nref))
+ if (!atomic_dec_and_lock(&realm->nref, &mdsc->snap_empty_lock))
return;
if (down_write_trylock(&mdsc->snap_rwsem)) {
+ spin_unlock(&mdsc->snap_empty_lock);
__destroy_snap_realm(mdsc, realm);
up_write(&mdsc->snap_rwsem);
} else {
- spin_lock(&mdsc->snap_empty_lock);
list_add(&realm->empty_item, &mdsc->snap_empty);
spin_unlock(&mdsc->snap_empty_lock);
}
@@ -236,6 +246,8 @@ static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
{
struct ceph_snap_realm *realm;
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
spin_lock(&mdsc->snap_empty_lock);
while (!list_empty(&mdsc->snap_empty)) {
realm = list_first_entry(&mdsc->snap_empty,
@@ -269,6 +281,8 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
{
struct ceph_snap_realm *parent;
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
if (realm->parent_ino == parentino)
return 0;
@@ -460,7 +474,7 @@ static bool has_new_snaps(struct ceph_snap_context *o,
* Caller must hold snap_rwsem for read (i.e., the realm topology won't
* change).
*/
-void ceph_queue_cap_snap(struct ceph_inode_info *ci)
+static void ceph_queue_cap_snap(struct ceph_inode_info *ci)
{
struct inode *inode = &ci->vfs_inode;
struct ceph_cap_snap *capsnap;
@@ -663,15 +677,13 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
if (!inode)
continue;
spin_unlock(&realm->inodes_with_caps_lock);
- /* avoid calling iput_final() while holding
- * mdsc->snap_rwsem or in mds dispatch threads */
- ceph_async_iput(lastinode);
+ iput(lastinode);
lastinode = inode;
ceph_queue_cap_snap(ci);
spin_lock(&realm->inodes_with_caps_lock);
}
spin_unlock(&realm->inodes_with_caps_lock);
- ceph_async_iput(lastinode);
+ iput(lastinode);
dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
}
@@ -696,6 +708,8 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
int err = -ENOMEM;
LIST_HEAD(dirty_realms);
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
dout("update_snap_trace deletion=%d\n", deletion);
more:
ceph_decode_need(&p, e, sizeof(*ri), bad);
@@ -791,7 +805,7 @@ more:
return 0;
bad:
- err = -EINVAL;
+ err = -EIO;
fail:
if (realm && !IS_ERR(realm))
ceph_put_snap_realm(mdsc, realm);
@@ -823,17 +837,12 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
ihold(inode);
spin_unlock(&mdsc->snap_flush_lock);
ceph_flush_snaps(ci, &session);
- /* avoid calling iput_final() while holding
- * session->s_mutex or in mds dispatch threads */
- ceph_async_iput(inode);
+ iput(inode);
spin_lock(&mdsc->snap_flush_lock);
}
spin_unlock(&mdsc->snap_flush_lock);
- if (session) {
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
- }
+ ceph_put_mds_session(session);
dout("flush_snaps done\n");
}
@@ -969,14 +978,12 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
ceph_get_snap_realm(mdsc, realm);
ceph_put_snap_realm(mdsc, oldrealm);
- /* avoid calling iput_final() while holding
- * mdsc->snap_rwsem or mds in dispatch threads */
- ceph_async_iput(inode);
+ iput(inode);
continue;
skip_inode:
spin_unlock(&ci->i_ceph_lock);
- ceph_async_iput(inode);
+ iput(inode);
}
/* we may have taken some of the old realm's children. */
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 839e6b0239ee..9215a2f4535c 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -931,7 +931,6 @@ extern int ceph_update_snap_trace(struct ceph_mds_client *m,
extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg);
-extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap);
extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
@@ -989,8 +988,6 @@ extern int ceph_inode_holds_cap(struct inode *inode, int mask);
extern bool ceph_inode_set_size(struct inode *inode, loff_t size);
extern void __ceph_do_pending_vmtruncate(struct inode *inode);
-extern void ceph_async_iput(struct inode *inode);
-
void ceph_queue_inode_work(struct inode *inode, int work_bit);
static inline void ceph_queue_vmtruncate(struct inode *inode)
@@ -1170,7 +1167,7 @@ extern void ceph_flush_snaps(struct ceph_inode_info *ci,
extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session);
-extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
+extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
extern int ceph_drop_caps_for_unlink(struct inode *inode);
extern int ceph_encode_inode_release(void **p, struct inode *inode,
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index ec57cdb1590f..007427ba75e5 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -151,6 +151,9 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
return ERR_PTR(-EINVAL);
if (ref) {
+ if (WARN_ON_ONCE(!ref->node_name || ref->path_consumed < 0))
+ return ERR_PTR(-EINVAL);
+
if (strlen(fullpath) - ref->path_consumed) {
prepath = fullpath + ref->path_consumed;
/* skip initial delimiter */
@@ -173,7 +176,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
}
}
- rc = dns_resolve_server_name_to_ip(name, &srvIP);
+ rc = dns_resolve_server_name_to_ip(name, &srvIP, NULL);
if (rc < 0) {
cifs_dbg(FYI, "%s: Failed to resolve server part of %s to IP: %d\n",
__func__, name, rc);
@@ -208,6 +211,10 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
else
noff = tkn_e - (sb_mountdata + off) + 1;
+ if (strncasecmp(sb_mountdata + off, "cruid=", 6) == 0) {
+ off += noff;
+ continue;
+ }
if (strncasecmp(sb_mountdata + off, "unc=", 4) == 0) {
off += noff;
continue;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 9fb874dd8d24..64b71c4e2a9d 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -65,6 +65,7 @@ bool lookupCacheEnabled = true;
bool disable_legacy_dialects; /* false by default */
bool enable_gcm_256 = true;
bool require_gcm_256; /* false by default */
+bool enable_negotiate_signing; /* false by default */
unsigned int global_secflags = CIFSSEC_DEF;
/* unsigned int ntlmv2_support = 0; */
unsigned int sign_CIFS_PDUs = 1;
@@ -104,6 +105,9 @@ MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encr
module_param(require_gcm_256, bool, 0644);
MODULE_PARM_DESC(require_gcm_256, "Require strongest (256 bit) GCM encryption. Default: n/N/0");
+module_param(enable_negotiate_signing, bool, 0644);
+MODULE_PARM_DESC(enable_negotiate_signing, "Enable negotiating packet signing algorithm with server. Default: n/N/0");
+
module_param(disable_legacy_dialects, bool, 0644);
MODULE_PARM_DESC(disable_legacy_dialects, "To improve security it may be "
"helpful to restrict the ability to "
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 177f3e7ab86d..d25a4099b32e 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -153,5 +153,5 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type,
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.32"
+#define CIFS_VERSION "2.33"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 3100f8b66e60..c6a9542ca281 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -75,6 +75,9 @@
#define SMB_ECHO_INTERVAL_MAX 600
#define SMB_ECHO_INTERVAL_DEFAULT 60
+/* dns resolution interval in seconds */
+#define SMB_DNS_RESOLVE_INTERVAL_DEFAULT 600
+
/* maximum number of PDUs in one compound */
#define MAX_COMPOUND 5
@@ -577,6 +580,7 @@ struct TCP_Server_Info {
char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
struct smb_version_operations *ops;
struct smb_version_values *vals;
+ /* updates to tcpStatus protected by GlobalMid_Lock */
enum statusEnum tcpStatus; /* what we think the status is */
char *hostname; /* hostname portion of UNC string */
struct socket *ssocket;
@@ -645,6 +649,7 @@ struct TCP_Server_Info {
/* point to the SMBD connection if RDMA is used instead of socket */
struct smbd_connection *smbd_conn;
struct delayed_work echo; /* echo ping workqueue job */
+ struct delayed_work resolve; /* dns resolution workqueue job */
char *smallbuf; /* pointer to current "small" buffer */
char *bigbuf; /* pointer to current "big" buffer */
/* Total size of this PDU. Only valid from cifs_demultiplex_thread */
@@ -666,9 +671,11 @@ struct TCP_Server_Info {
unsigned int max_write;
unsigned int min_offload;
__le16 compress_algorithm;
+ __u16 signing_algorithm;
__le16 cipher_type;
/* save initital negprot hash */
__u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE];
+ bool signing_negotiated; /* true if valid signing context rcvd from server */
bool posix_ext_supported;
struct delayed_work reconnect; /* reconnect workqueue job */
struct mutex reconnect_mutex; /* prevent simultaneous reconnects */
@@ -686,6 +693,9 @@ struct TCP_Server_Info {
bool use_swn_dstaddr;
struct sockaddr_storage swn_dstaddr;
#endif
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ bool is_dfs_conn; /* if a dfs connection */
+#endif
};
struct cifs_credits {
@@ -1601,6 +1611,11 @@ struct dfs_info3_param {
int ttl;
};
+struct file_list {
+ struct list_head list;
+ struct cifsFileInfo *cfile;
+};
+
/*
* common struct for holding inode info when searching for or updating an
* inode with new info
@@ -1785,7 +1800,7 @@ require use of the stronger protocol */
* list operations on pending_mid_q and oplockQ
* updates to XID counters, multiplex id and SMB sequence numbers
* list operations on global DnotifyReqList
- * updates to ses->status
+ * updates to ses->status and TCP_Server_Info->tcpStatus
* updates to server->CurrentMid
* tcp_ses_lock protects:
* list operations on tcp and SMB session lists
@@ -1868,6 +1883,7 @@ extern unsigned int global_secflags; /* if on, session setup sent
extern unsigned int sign_CIFS_PDUs; /* enable smb packet signing */
extern bool enable_gcm_256; /* allow optional negotiate of strongest signing (aes-gcm-256) */
extern bool require_gcm_256; /* require use of strongest signing (aes-gcm-256) */
+extern bool enable_negotiate_signing; /* request use of faster (GMAC) signing if available */
extern bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
extern unsigned int CIFSMaxBufSize; /* max size not including hdr */
extern unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 0923f72d27e9..f6e235001358 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1785,6 +1785,7 @@ struct smb_com_transaction2_sfi_req {
__u16 Fid;
__le16 InformationLevel;
__u16 Reserved4;
+ __u8 payload[];
} __attribute__((packed));
struct smb_com_transaction2_sfi_rsp {
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 58ebec4d4413..65d1a65bfc37 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -873,8 +873,11 @@ PsxDelete:
InformationLevel) - 4;
offset = param_offset + params;
- /* Setup pointer to Request Data (inode type) */
- pRqD = (struct unlink_psx_rq *)(((char *)&pSMB->hdr.Protocol) + offset);
+ /* Setup pointer to Request Data (inode type).
+ * Note that SMB offsets are from the beginning of SMB which is 4 bytes
+ * in, after RFC1001 field
+ */
+ pRqD = (struct unlink_psx_rq *)((char *)(pSMB) + offset + 4);
pRqD->type = cpu_to_le16(type);
pSMB->ParameterOffset = cpu_to_le16(param_offset);
pSMB->DataOffset = cpu_to_le16(offset);
@@ -1081,7 +1084,8 @@ PsxCreat:
param_offset = offsetof(struct smb_com_transaction2_spi_req,
InformationLevel) - 4;
offset = param_offset + params;
- pdata = (OPEN_PSX_REQ *)(((char *)&pSMB->hdr.Protocol) + offset);
+ /* SMB offsets are from the beginning of SMB which is 4 bytes in, after RFC1001 field */
+ pdata = (OPEN_PSX_REQ *)((char *)(pSMB) + offset + 4);
pdata->Level = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
pdata->Permissions = cpu_to_le64(mode);
pdata->PosixOpenFlags = cpu_to_le32(posix_flags);
@@ -2537,8 +2541,9 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
pSMB->TotalDataCount = pSMB->DataCount;
pSMB->TotalParameterCount = pSMB->ParameterCount;
pSMB->ParameterOffset = cpu_to_le16(param_offset);
+ /* SMB offsets are from the beginning of SMB which is 4 bytes in, after RFC1001 field */
parm_data = (struct cifs_posix_lock *)
- (((char *) &pSMB->hdr.Protocol) + offset);
+ (((char *)pSMB) + offset + 4);
parm_data->lock_type = cpu_to_le16(lock_type);
if (waitFlag) {
@@ -2767,7 +2772,8 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon,
param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
offset = param_offset + params;
- data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
+ /* SMB offsets are from the beginning of SMB which is 4 bytes in, after RFC1001 field */
+ data_offset = (char *)(pSMB) + offset + 4;
rename_info = (struct set_file_rename *) data_offset;
pSMB->MaxParameterCount = cpu_to_le16(2);
pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB from sess */
@@ -2925,7 +2931,8 @@ createSymLinkRetry:
InformationLevel) - 4;
offset = param_offset + params;
- data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
+ /* SMB offsets are from the beginning of SMB which is 4 bytes in, after RFC1001 field */
+ data_offset = (char *)pSMB + offset + 4;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len_target =
cifsConvertToUTF16((__le16 *) data_offset, toName,
@@ -3009,7 +3016,8 @@ createHardLinkRetry:
InformationLevel) - 4;
offset = param_offset + params;
- data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
+ /* SMB offsets are from the beginning of SMB which is 4 bytes in, after RFC1001 field */
+ data_offset = (char *)pSMB + offset + 4;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len_target =
cifsConvertToUTF16((__le16 *) data_offset, fromName,
@@ -5626,9 +5634,9 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon,
pSMB->TotalDataCount = pSMB->DataCount;
pSMB->TotalParameterCount = pSMB->ParameterCount;
pSMB->ParameterOffset = cpu_to_le16(param_offset);
+ /* SMB offsets are from the beginning of SMB which is 4 bytes in, after RFC1001 field */
parm_data =
- (struct file_end_of_file_info *) (((char *) &pSMB->hdr.Protocol)
- + offset);
+ (struct file_end_of_file_info *)(((char *)pSMB) + offset + 4);
pSMB->DataOffset = cpu_to_le16(offset);
parm_data->FileSize = cpu_to_le64(size);
pSMB->Fid = cfile->fid.netfid;
@@ -5761,7 +5769,8 @@ CIFSSMBSetFileDisposition(const unsigned int xid, struct cifs_tcon *tcon,
param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
offset = param_offset + params;
- data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
+ /* SMB offsets are from the beginning of SMB which is 4 bytes in, after RFC1001 field */
+ data_offset = (char *)(pSMB) + offset + 4;
count = 1;
pSMB->MaxParameterCount = cpu_to_le16(2);
@@ -6062,9 +6071,8 @@ setPermsRetry:
param_offset = offsetof(struct smb_com_transaction2_spi_req,
InformationLevel) - 4;
offset = param_offset + params;
- data_offset =
- (FILE_UNIX_BASIC_INFO *) ((char *) &pSMB->hdr.Protocol +
- offset);
+ /* SMB offsets are from the beginning of SMB which is 4 bytes in, after RFC1001 field */
+ data_offset = (FILE_UNIX_BASIC_INFO *)((char *) pSMB + offset + 4);
memset(data_offset, 0, count);
pSMB->DataOffset = cpu_to_le16(offset);
pSMB->ParameterOffset = cpu_to_le16(param_offset);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 5d269f583dac..3781eee9360a 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -78,6 +78,8 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
int rc;
int len;
char *unc, *ipaddr = NULL;
+ time64_t expiry, now;
+ unsigned long ttl = SMB_DNS_RESOLVE_INTERVAL_DEFAULT;
if (!server->hostname)
return -EINVAL;
@@ -91,13 +93,13 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
}
scnprintf(unc, len, "\\\\%s", server->hostname);
- rc = dns_resolve_server_name_to_ip(unc, &ipaddr);
+ rc = dns_resolve_server_name_to_ip(unc, &ipaddr, &expiry);
kfree(unc);
if (rc < 0) {
cifs_dbg(FYI, "%s: failed to resolve server part of %s to IP: %d\n",
__func__, server->hostname, rc);
- return rc;
+ goto requeue_resolve;
}
spin_lock(&cifs_tcp_ses_lock);
@@ -106,7 +108,45 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
spin_unlock(&cifs_tcp_ses_lock);
kfree(ipaddr);
- return !rc ? -1 : 0;
+ /* rc == 1 means success here */
+ if (rc) {
+ now = ktime_get_real_seconds();
+ if (expiry && expiry > now)
+ /*
+ * To make sure we don't use the cached entry, retry 1s
+ * after expiry.
+ */
+ ttl = (expiry - now + 1);
+ }
+ rc = !rc ? -1 : 0;
+
+requeue_resolve:
+ cifs_dbg(FYI, "%s: next dns resolution scheduled for %lu seconds in the future\n",
+ __func__, ttl);
+ mod_delayed_work(cifsiod_wq, &server->resolve, (ttl * HZ));
+
+ return rc;
+}
+
+
+static void cifs_resolve_server(struct work_struct *work)
+{
+ int rc;
+ struct TCP_Server_Info *server = container_of(work,
+ struct TCP_Server_Info, resolve.work);
+
+ mutex_lock(&server->srv_mutex);
+
+ /*
+ * Resolve the hostname again to make sure that IP address is up-to-date.
+ */
+ rc = reconn_set_ipaddr_from_hostname(server);
+ if (rc) {
+ cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n",
+ __func__, rc);
+ }
+
+ mutex_unlock(&server->srv_mutex);
}
#ifdef CONFIG_CIFS_DFS_UPCALL
@@ -180,7 +220,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
#ifdef CONFIG_CIFS_DFS_UPCALL
struct super_block *sb = NULL;
struct cifs_sb_info *cifs_sb = NULL;
- struct dfs_cache_tgt_list tgt_list = {0};
+ struct dfs_cache_tgt_list tgt_list = DFS_CACHE_TGT_LIST_INIT(tgt_list);
struct dfs_cache_tgt_iterator *tgt_it = NULL;
#endif
@@ -680,6 +720,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
spin_unlock(&cifs_tcp_ses_lock);
cancel_delayed_work_sync(&server->echo);
+ cancel_delayed_work_sync(&server->resolve);
spin_lock(&GlobalMid_Lock);
server->tcpStatus = CifsExiting;
@@ -1227,6 +1268,16 @@ cifs_find_tcp_session(struct smb3_fs_context *ctx)
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ /*
+ * DFS failover implementation in cifs_reconnect() requires unique tcp sessions for
+ * DFS connections to do failover properly, so avoid sharing them with regular
+ * shares or even links that may connect to same server but having completely
+ * different failover targets.
+ */
+ if (server->is_dfs_conn)
+ continue;
+#endif
/*
* Skip ses channels since they're only handled in lower layers
* (e.g. cifs_send_recv).
@@ -1254,12 +1305,16 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
return;
}
+ /* srv_count can never go negative */
+ WARN_ON(server->srv_count < 0);
+
put_net(cifs_net_ns(server));
list_del_init(&server->tcp_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
cancel_delayed_work_sync(&server->echo);
+ cancel_delayed_work_sync(&server->resolve);
if (from_reconnect)
/*
@@ -1342,6 +1397,7 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx)
INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
+ INIT_DELAYED_WORK(&tcp_ses->resolve, cifs_resolve_server);
INIT_DELAYED_WORK(&tcp_ses->reconnect, smb2_reconnect_server);
mutex_init(&tcp_ses->reconnect_mutex);
memcpy(&tcp_ses->srcaddr, &ctx->srcaddr,
@@ -1403,6 +1459,11 @@ smbd_connected:
goto out_err_crypto_release;
}
tcp_ses->min_offload = ctx->min_offload;
+ /*
+ * at this point we are the only ones with the pointer
+ * to the struct since the kernel thread not created yet
+ * no need to spinlock this update of tcpStatus
+ */
tcp_ses->tcpStatus = CifsNeedNegotiate;
if ((ctx->max_credits < 20) || (ctx->max_credits > 60000))
@@ -1422,6 +1483,12 @@ smbd_connected:
/* queue echo request delayed work */
queue_delayed_work(cifsiod_wq, &tcp_ses->echo, tcp_ses->echo_interval);
+ /* queue dns resolution delayed work */
+ cifs_dbg(FYI, "%s: next dns resolution scheduled for %d seconds in the future\n",
+ __func__, SMB_DNS_RESOLVE_INTERVAL_DEFAULT);
+
+ queue_delayed_work(cifsiod_wq, &tcp_ses->resolve, (SMB_DNS_RESOLVE_INTERVAL_DEFAULT * HZ));
+
return tcp_ses;
out_err_crypto_release:
@@ -1600,6 +1667,9 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
}
spin_unlock(&cifs_tcp_ses_lock);
+ /* ses_count can never go negative */
+ WARN_ON(ses->ses_count < 0);
+
spin_lock(&GlobalMid_Lock);
if (ses->status == CifsGood)
ses->status = CifsExiting;
@@ -1967,6 +2037,9 @@ cifs_put_tcon(struct cifs_tcon *tcon)
return;
}
+ /* tc_count can never go negative */
+ WARN_ON(tcon->tc_count < 0);
+
if (tcon->use_witness) {
int rc;
@@ -2905,6 +2978,23 @@ static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
}
#ifdef CONFIG_CIFS_DFS_UPCALL
+static int mount_get_dfs_conns(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_sb,
+ unsigned int *xid, struct TCP_Server_Info **nserver,
+ struct cifs_ses **nses, struct cifs_tcon **ntcon)
+{
+ int rc;
+
+ ctx->nosharesock = true;
+ rc = mount_get_conns(ctx, cifs_sb, xid, nserver, nses, ntcon);
+ if (*nserver) {
+ cifs_dbg(FYI, "%s: marking tcp session as a dfs connection\n", __func__);
+ spin_lock(&cifs_tcp_ses_lock);
+ (*nserver)->is_dfs_conn = true;
+ spin_unlock(&cifs_tcp_ses_lock);
+ }
+ return rc;
+}
+
/*
* cifs_build_path_to_root returns full path to root when we do not have an
* existing connection (tcon)
@@ -3040,7 +3130,7 @@ static int do_dfs_failover(const char *path, const char *full_path, struct cifs_
{
int rc;
char *npath = NULL;
- struct dfs_cache_tgt_list tgt_list = {0};
+ struct dfs_cache_tgt_list tgt_list = DFS_CACHE_TGT_LIST_INIT(tgt_list);
struct dfs_cache_tgt_iterator *tgt_it = NULL;
struct smb3_fs_context tmp_ctx = {NULL};
@@ -3100,7 +3190,7 @@ static int do_dfs_failover(const char *path, const char *full_path, struct cifs_
tmp_ctx.prepath);
mount_put_conns(cifs_sb, *xid, *server, *ses, *tcon);
- rc = mount_get_conns(&tmp_ctx, cifs_sb, xid, server, ses, tcon);
+ rc = mount_get_dfs_conns(&tmp_ctx, cifs_sb, xid, server, ses, tcon);
if (!rc || (*server && *ses)) {
/*
* We were able to connect to new target server. Update current context with
@@ -3399,7 +3489,12 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
goto error;
}
- ctx->nosharesock = true;
+ mount_put_conns(cifs_sb, xid, server, ses, tcon);
+ /*
+ * Ignore error check here because we may failover to other targets from cached a
+ * referral.
+ */
+ (void)mount_get_dfs_conns(ctx, cifs_sb, &xid, &server, &ses, &tcon);
/* Get path of DFS root */
ref_path = build_unc_path_to_root(ctx, cifs_sb, false);
@@ -3428,7 +3523,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
/* Connect to new DFS target only if we were redirected */
if (oldmnt != cifs_sb->ctx->mount_options) {
mount_put_conns(cifs_sb, xid, server, ses, tcon);
- rc = mount_get_conns(ctx, cifs_sb, &xid, &server, &ses, &tcon);
+ rc = mount_get_dfs_conns(ctx, cifs_sb, &xid, &server, &ses, &tcon);
}
if (rc && !server && !ses) {
/* Failed to connect. Try to connect to other targets in the referral. */
@@ -3454,7 +3549,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
rc = -ELOOP;
} while (rc == -EREMOTE);
- if (rc || !tcon)
+ if (rc || !tcon || !ses)
goto error;
kfree(ref_path);
@@ -4090,7 +4185,8 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
if (!tree)
return -ENOMEM;
- if (!tcon->dfs_path) {
+ /* If it is not dfs or there was no cached dfs referral, then reconnect to same share */
+ if (!tcon->dfs_path || dfs_cache_noreq_find(tcon->dfs_path + 1, &ref, &tl)) {
if (tcon->ipc) {
scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname);
rc = ops->tree_connect(xid, tcon->ses, tree, tcon, nlsc);
@@ -4100,9 +4196,6 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
goto out;
}
- rc = dfs_cache_noreq_find(tcon->dfs_path + 1, &ref, &tl);
- if (rc)
- goto out;
isroot = ref.server_type == DFS_TYPE_ROOT;
free_dfs_info_param(&ref);
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index 7c1769714609..283745592844 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -19,6 +19,7 @@
#include "cifs_debug.h"
#include "cifs_unicode.h"
#include "smb2glob.h"
+#include "dns_resolve.h"
#include "dfs_cache.h"
@@ -911,6 +912,7 @@ static int get_targets(struct cache_entry *ce, struct dfs_cache_tgt_list *tl)
err_free_it:
list_for_each_entry_safe(it, nit, head, it_list) {
+ list_del(&it->it_list);
kfree(it->it_name);
kfree(it);
}
@@ -1293,6 +1295,194 @@ int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it,
return 0;
}
+static bool target_share_equal(struct TCP_Server_Info *server, const char *s1, const char *s2)
+{
+ char unc[sizeof("\\\\") + SERVER_NAME_LENGTH] = {0};
+ const char *host;
+ size_t hostlen;
+ char *ip = NULL;
+ struct sockaddr sa;
+ bool match;
+ int rc;
+
+ if (strcasecmp(s1, s2))
+ return false;
+
+ /*
+ * Resolve share's hostname and check if server address matches. Otherwise just ignore it
+ * as we could not have upcall to resolve hostname or failed to convert ip address.
+ */
+ match = true;
+ extract_unc_hostname(s1, &host, &hostlen);
+ scnprintf(unc, sizeof(unc), "\\\\%.*s", (int)hostlen, host);
+
+ rc = dns_resolve_server_name_to_ip(unc, &ip, NULL);
+ if (rc < 0) {
+ cifs_dbg(FYI, "%s: could not resolve %.*s. assuming server address matches.\n",
+ __func__, (int)hostlen, host);
+ return true;
+ }
+
+ if (!cifs_convert_address(&sa, ip, strlen(ip))) {
+ cifs_dbg(VFS, "%s: failed to convert address \'%s\'. skip address matching.\n",
+ __func__, ip);
+ } else {
+ mutex_lock(&server->srv_mutex);
+ match = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr, &sa);
+ mutex_unlock(&server->srv_mutex);
+ }
+
+ kfree(ip);
+ return match;
+}
+
+/*
+ * Mark dfs tcon for reconnecting when the currently connected tcon does not match any of the new
+ * target shares in @refs.
+ */
+static void mark_for_reconnect_if_needed(struct cifs_tcon *tcon, struct dfs_cache_tgt_list *tl,
+ const struct dfs_info3_param *refs, int numrefs)
+{
+ struct dfs_cache_tgt_iterator *it;
+ int i;
+
+ for (it = dfs_cache_get_tgt_iterator(tl); it; it = dfs_cache_get_next_tgt(tl, it)) {
+ for (i = 0; i < numrefs; i++) {
+ if (target_share_equal(tcon->ses->server, dfs_cache_get_tgt_name(it),
+ refs[i].node_name))
+ return;
+ }
+ }
+
+ cifs_dbg(FYI, "%s: no cached or matched targets. mark dfs share for reconnect.\n", __func__);
+ for (i = 0; i < tcon->ses->chan_count; i++) {
+ spin_lock(&GlobalMid_Lock);
+ if (tcon->ses->chans[i].server->tcpStatus != CifsExiting)
+ tcon->ses->chans[i].server->tcpStatus = CifsNeedReconnect;
+ spin_unlock(&GlobalMid_Lock);
+ }
+}
+
+/* Refresh dfs referral of tcon and mark it for reconnect if needed */
+static int refresh_tcon(struct cifs_ses **sessions, struct cifs_tcon *tcon, bool force_refresh)
+{
+ const char *path = tcon->dfs_path + 1;
+ struct cifs_ses *ses;
+ struct cache_entry *ce;
+ struct dfs_info3_param *refs = NULL;
+ int numrefs = 0;
+ bool needs_refresh = false;
+ struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
+ int rc = 0;
+ unsigned int xid;
+
+ ses = find_ipc_from_server_path(sessions, path);
+ if (IS_ERR(ses)) {
+ cifs_dbg(FYI, "%s: could not find ipc session\n", __func__);
+ return PTR_ERR(ses);
+ }
+
+ down_read(&htable_rw_lock);
+ ce = lookup_cache_entry(path);
+ needs_refresh = force_refresh || IS_ERR(ce) || cache_entry_expired(ce);
+ if (!IS_ERR(ce)) {
+ rc = get_targets(ce, &tl);
+ if (rc)
+ cifs_dbg(FYI, "%s: could not get dfs targets: %d\n", __func__, rc);
+ }
+ up_read(&htable_rw_lock);
+
+ if (!needs_refresh) {
+ rc = 0;
+ goto out;
+ }
+
+ xid = get_xid();
+ rc = get_dfs_referral(xid, ses, path, &refs, &numrefs);
+ free_xid(xid);
+
+ /* Create or update a cache entry with the new referral */
+ if (!rc) {
+ dump_refs(refs, numrefs);
+
+ down_write(&htable_rw_lock);
+ ce = lookup_cache_entry(path);
+ if (IS_ERR(ce))
+ add_cache_entry_locked(refs, numrefs);
+ else if (force_refresh || cache_entry_expired(ce))
+ update_cache_entry_locked(ce, refs, numrefs);
+ up_write(&htable_rw_lock);
+
+ mark_for_reconnect_if_needed(tcon, &tl, refs, numrefs);
+ }
+
+out:
+ dfs_cache_free_tgts(&tl);
+ free_dfs_info_array(refs, numrefs);
+ return rc;
+}
+
+/**
+ * dfs_cache_remount_fs - remount a DFS share
+ *
+ * Reconfigure dfs mount by forcing a new DFS referral and if the currently cached targets do not
+ * match any of the new targets, mark it for reconnect.
+ *
+ * @cifs_sb: cifs superblock.
+ *
+ * Return zero if remounted, otherwise non-zero.
+ */
+int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb)
+{
+ struct cifs_tcon *tcon;
+ struct mount_group *mg;
+ struct cifs_ses *sessions[CACHE_MAX_ENTRIES + 1] = {NULL};
+ int rc;
+
+ if (!cifs_sb || !cifs_sb->master_tlink)
+ return -EINVAL;
+
+ tcon = cifs_sb_master_tcon(cifs_sb);
+ if (!tcon->dfs_path) {
+ cifs_dbg(FYI, "%s: not a dfs tcon\n", __func__);
+ return 0;
+ }
+
+ if (uuid_is_null(&cifs_sb->dfs_mount_id)) {
+ cifs_dbg(FYI, "%s: tcon has no dfs mount group id\n", __func__);
+ return -EINVAL;
+ }
+
+ mutex_lock(&mount_group_list_lock);
+ mg = find_mount_group_locked(&cifs_sb->dfs_mount_id);
+ if (IS_ERR(mg)) {
+ mutex_unlock(&mount_group_list_lock);
+ cifs_dbg(FYI, "%s: tcon has ipc session to refresh referral\n", __func__);
+ return PTR_ERR(mg);
+ }
+ kref_get(&mg->refcount);
+ mutex_unlock(&mount_group_list_lock);
+
+ spin_lock(&mg->lock);
+ memcpy(&sessions, mg->sessions, mg->num_sessions * sizeof(mg->sessions[0]));
+ spin_unlock(&mg->lock);
+
+ /*
+ * After reconnecting to a different server, unique ids won't match anymore, so we disable
+ * serverino. This prevents dentry revalidation to think the dentry are stale (ESTALE).
+ */
+ cifs_autodisable_serverino(cifs_sb);
+ /*
+ * Force the use of prefix path to support failover on DFS paths that resolve to targets
+ * that have different prefix paths.
+ */
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
+ rc = refresh_tcon(sessions, tcon, true);
+
+ kref_put(&mg->refcount, mount_group_release);
+ return rc;
+}
+
/*
* Refresh all active dfs mounts regardless of whether they are in cache or not.
* (cache can be cleared)
@@ -1303,7 +1493,6 @@ static void refresh_mounts(struct cifs_ses **sessions)
struct cifs_ses *ses;
struct cifs_tcon *tcon, *ntcon;
struct list_head tcons;
- unsigned int xid;
INIT_LIST_HEAD(&tcons);
@@ -1321,44 +1510,8 @@ static void refresh_mounts(struct cifs_ses **sessions)
spin_unlock(&cifs_tcp_ses_lock);
list_for_each_entry_safe(tcon, ntcon, &tcons, ulist) {
- const char *path = tcon->dfs_path + 1;
- struct cache_entry *ce;
- struct dfs_info3_param *refs = NULL;
- int numrefs = 0;
- bool needs_refresh = false;
- int rc = 0;
-
list_del_init(&tcon->ulist);
-
- ses = find_ipc_from_server_path(sessions, path);
- if (IS_ERR(ses))
- goto next_tcon;
-
- down_read(&htable_rw_lock);
- ce = lookup_cache_entry(path);
- needs_refresh = IS_ERR(ce) || cache_entry_expired(ce);
- up_read(&htable_rw_lock);
-
- if (!needs_refresh)
- goto next_tcon;
-
- xid = get_xid();
- rc = get_dfs_referral(xid, ses, path, &refs, &numrefs);
- free_xid(xid);
-
- /* Create or update a cache entry with the new referral */
- if (!rc) {
- down_write(&htable_rw_lock);
- ce = lookup_cache_entry(path);
- if (IS_ERR(ce))
- add_cache_entry_locked(refs, numrefs);
- else if (cache_entry_expired(ce))
- update_cache_entry_locked(ce, refs, numrefs);
- up_write(&htable_rw_lock);
- }
-
-next_tcon:
- free_dfs_info_array(refs, numrefs);
+ refresh_tcon(sessions, tcon, false);
cifs_put_tcon(tcon);
}
}
diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h
index b29d3ae64829..52070d1df189 100644
--- a/fs/cifs/dfs_cache.h
+++ b/fs/cifs/dfs_cache.h
@@ -13,6 +13,8 @@
#include <linux/uuid.h>
#include "cifsglob.h"
+#define DFS_CACHE_TGT_LIST_INIT(var) { .tl_numtgts = 0, .tl_list = LIST_HEAD_INIT((var).tl_list), }
+
struct dfs_cache_tgt_list {
int tl_numtgts;
struct list_head tl_list;
@@ -44,6 +46,7 @@ int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it,
void dfs_cache_put_refsrv_sessions(const uuid_t *mount_id);
void dfs_cache_add_refsrv_session(const uuid_t *mount_id, struct cifs_ses *ses);
char *dfs_cache_canonical_path(const char *path, const struct nls_table *cp, int remap);
+int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb);
static inline struct dfs_cache_tgt_iterator *
dfs_cache_get_next_tgt(struct dfs_cache_tgt_list *tl,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 79402ca0ddfa..5f8a302ffcb2 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -100,7 +100,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)
pplen = cifs_sb->prepath ? strlen(cifs_sb->prepath) + 1 : 0;
- s = dentry_path_raw(direntry, page, PAGE_SIZE);
+ s = dentry_path_raw(direntry, page, PATH_MAX);
if (IS_ERR(s))
return s;
if (!s[1]) // for root we want "", not "/"
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index d15b82d569ef..8c616aaeb7c4 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -24,6 +24,7 @@
* dns_resolve_server_name_to_ip - Resolve UNC server name to ip address.
* @unc: UNC path specifying the server (with '/' as delimiter)
* @ip_addr: Where to return the IP address.
+ * @expiry: Where to return the expiry time for the dns record.
*
* The IP address will be returned in string form, and the caller is
* responsible for freeing it.
@@ -31,7 +32,7 @@
* Returns length of result on success, -ve on error.
*/
int
-dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
+dns_resolve_server_name_to_ip(const char *unc, char **ip_addr, time64_t *expiry)
{
struct sockaddr_storage ss;
const char *hostname, *sep;
@@ -66,13 +67,14 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
/* Perform the upcall */
rc = dns_query(current->nsproxy->net_ns, NULL, hostname, len,
- NULL, ip_addr, NULL, false);
+ NULL, ip_addr, expiry, false);
if (rc < 0)
cifs_dbg(FYI, "%s: unable to resolve: %*.*s\n",
__func__, len, len, hostname);
else
- cifs_dbg(FYI, "%s: resolved: %*.*s to %s\n",
- __func__, len, len, hostname, *ip_addr);
+ cifs_dbg(FYI, "%s: resolved: %*.*s to %s expiry %llu\n",
+ __func__, len, len, hostname, *ip_addr,
+ expiry ? (*expiry) : 0);
return rc;
name_is_IP_address:
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
index 5be060b82b13..9fa2807ef79e 100644
--- a/fs/cifs/dns_resolve.h
+++ b/fs/cifs/dns_resolve.h
@@ -12,7 +12,7 @@
#define _DNS_RESOLVE_H
#ifdef __KERNEL__
-extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr);
+extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr, time64_t *expiry);
#endif /* KERNEL */
#endif /* _DNS_RESOLVE_H */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index cd108607a070..bb98fbdd22a9 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -4619,7 +4619,7 @@ read_complete:
static int cifs_readpage(struct file *file, struct page *page)
{
- loff_t offset = (loff_t)page->index << PAGE_SHIFT;
+ loff_t offset = page_file_offset(page);
int rc = -EACCES;
unsigned int xid;
@@ -4848,34 +4848,33 @@ void cifs_oplock_break(struct work_struct *work)
oplock_break_ack:
/*
- * releasing stale oplock after recent reconnect of smb session using
- * a now incorrect file handle is not a data integrity issue but do
- * not bother sending an oplock release if session to server still is
- * disconnected since oplock already released by the server
- */
- if (!cfile->oplock_break_cancelled) {
- rc = tcon->ses->server->ops->oplock_response(tcon, &cfile->fid,
- cinode);
- cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
- }
- /*
* When oplock break is received and there are no active
* file handles but cached, then schedule deferred close immediately.
* So, new open will not use cached handle.
*/
spin_lock(&CIFS_I(inode)->deferred_lock);
is_deferred = cifs_is_deferred_close(cfile, &dclose);
+ spin_unlock(&CIFS_I(inode)->deferred_lock);
if (is_deferred &&
cfile->deferred_close_scheduled &&
delayed_work_pending(&cfile->deferred)) {
- /*
- * If there is no pending work, mod_delayed_work queues new work.
- * So, Increase the ref count to avoid use-after-free.
- */
- if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0))
- cifsFileInfo_get(cfile);
+ if (cancel_delayed_work(&cfile->deferred)) {
+ _cifsFileInfo_put(cfile, false, false);
+ goto oplock_break_done;
+ }
}
- spin_unlock(&CIFS_I(inode)->deferred_lock);
+ /*
+ * releasing stale oplock after recent reconnect of smb session using
+ * a now incorrect file handle is not a data integrity issue but do
+ * not bother sending an oplock release if session to server still is
+ * disconnected since oplock already released by the server
+ */
+ if (!cfile->oplock_break_cancelled) {
+ rc = tcon->ses->server->ops->oplock_response(tcon, &cfile->fid,
+ cinode);
+ cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
+ }
+oplock_break_done:
_cifsFileInfo_put(cfile, false /* do not wait for ourself */, false);
cifs_done_oplock_break(cinode);
}
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 92d4ab029c91..eed59bc1d913 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -13,6 +13,9 @@
#include <linux/magic.h>
#include <linux/security.h>
#include <net/net_namespace.h>
+#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs_cache.h"
+#endif
*/
#include <linux/ctype.h>
@@ -322,7 +325,6 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
new_ctx->UNC = NULL;
new_ctx->source = NULL;
new_ctx->iocharset = NULL;
-
/*
* Make sure to stay in sync with smb3_cleanup_fs_context_contents()
*/
@@ -780,6 +782,10 @@ static int smb3_reconfigure(struct fs_context *fc)
smb3_cleanup_fs_context_contents(cifs_sb->ctx);
rc = smb3_fs_context_dup(cifs_sb->ctx, ctx);
smb3_update_mnt_flags(cifs_sb);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ if (!rc)
+ rc = dfs_cache_remount_fs(cifs_sb);
+#endif
return rc;
}
@@ -792,6 +798,8 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
int i, opt;
bool is_smb3 = !strcmp(fc->fs_type->name, "smb3");
bool skip_parsing = false;
+ kuid_t uid;
+ kgid_t gid;
cifs_dbg(FYI, "CIFS: parsing cifs mount option '%s'\n", param->key);
@@ -904,18 +912,38 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
}
break;
case Opt_uid:
- ctx->linux_uid.val = result.uint_32;
+ uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(uid))
+ goto cifs_parse_mount_err;
+ ctx->linux_uid = uid;
ctx->uid_specified = true;
break;
case Opt_cruid:
- ctx->cred_uid.val = result.uint_32;
+ uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(uid))
+ goto cifs_parse_mount_err;
+ ctx->cred_uid = uid;
+ ctx->cruid_specified = true;
+ break;
+ case Opt_backupuid:
+ uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(uid))
+ goto cifs_parse_mount_err;
+ ctx->backupuid = uid;
+ ctx->backupuid_specified = true;
break;
case Opt_backupgid:
- ctx->backupgid.val = result.uint_32;
+ gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(gid))
+ goto cifs_parse_mount_err;
+ ctx->backupgid = gid;
ctx->backupgid_specified = true;
break;
case Opt_gid:
- ctx->linux_gid.val = result.uint_32;
+ gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(gid))
+ goto cifs_parse_mount_err;
+ ctx->linux_gid = gid;
ctx->gid_specified = true;
break;
case Opt_port:
diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h
index 2a71c8e411ac..b6243972edf3 100644
--- a/fs/cifs/fs_context.h
+++ b/fs/cifs/fs_context.h
@@ -155,6 +155,7 @@ enum cifs_param {
struct smb3_fs_context {
bool uid_specified;
+ bool cruid_specified;
bool gid_specified;
bool sloppy;
bool got_ip;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index b96b253e7635..65f8a70cece3 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1625,7 +1625,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
goto unlink_out;
}
- cifs_close_all_deferred_files(tcon);
+ cifs_close_deferred_file(CIFS_I(inode));
if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
rc = CIFSPOSIXDelFile(xid, tcon, full_path,
@@ -2084,6 +2084,7 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir,
FILE_UNIX_BASIC_INFO *info_buf_target;
unsigned int xid;
int rc, tmprc;
+ int retry_count = 0;
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
@@ -2113,10 +2114,24 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir,
goto cifs_rename_exit;
}
- cifs_close_all_deferred_files(tcon);
+ cifs_close_deferred_file(CIFS_I(d_inode(source_dentry)));
+ if (d_inode(target_dentry) != NULL)
+ cifs_close_deferred_file(CIFS_I(d_inode(target_dentry)));
+
rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
to_name);
+ if (rc == -EACCES) {
+ while (retry_count < 3) {
+ cifs_close_all_deferred_files(tcon);
+ rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
+ to_name);
+ if (rc != -EACCES)
+ break;
+ retry_count++;
+ }
+ }
+
/*
* No-replace is the natural behavior for CIFS, so skip unlink hacks.
*/
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 184138b4eb8c..9469f1cf0b46 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -723,13 +723,31 @@ void
cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode)
{
struct cifsFileInfo *cfile = NULL;
- struct cifs_deferred_close *dclose;
+ struct file_list *tmp_list, *tmp_next_list;
+ struct list_head file_head;
+
+ if (cifs_inode == NULL)
+ return;
+ INIT_LIST_HEAD(&file_head);
+ spin_lock(&cifs_inode->open_file_lock);
list_for_each_entry(cfile, &cifs_inode->openFileList, flist) {
- spin_lock(&cifs_inode->deferred_lock);
- if (cifs_is_deferred_close(cfile, &dclose))
- mod_delayed_work(deferredclose_wq, &cfile->deferred, 0);
- spin_unlock(&cifs_inode->deferred_lock);
+ if (delayed_work_pending(&cfile->deferred)) {
+ if (cancel_delayed_work(&cfile->deferred)) {
+ tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
+ if (tmp_list == NULL)
+ continue;
+ tmp_list->cfile = cfile;
+ list_add_tail(&tmp_list->list, &file_head);
+ }
+ }
+ }
+ spin_unlock(&cifs_inode->open_file_lock);
+
+ list_for_each_entry_safe(tmp_list, tmp_next_list, &file_head, list) {
+ _cifsFileInfo_put(tmp_list->cfile, true, false);
+ list_del(&tmp_list->list);
+ kfree(tmp_list);
}
}
@@ -738,20 +756,30 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon)
{
struct cifsFileInfo *cfile;
struct list_head *tmp;
+ struct file_list *tmp_list, *tmp_next_list;
+ struct list_head file_head;
+ INIT_LIST_HEAD(&file_head);
spin_lock(&tcon->open_file_lock);
list_for_each(tmp, &tcon->openFileList) {
cfile = list_entry(tmp, struct cifsFileInfo, tlist);
if (delayed_work_pending(&cfile->deferred)) {
- /*
- * If there is no pending work, mod_delayed_work queues new work.
- * So, Increase the ref count to avoid use-after-free.
- */
- if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0))
- cifsFileInfo_get(cfile);
+ if (cancel_delayed_work(&cfile->deferred)) {
+ tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
+ if (tmp_list == NULL)
+ continue;
+ tmp_list->cfile = cfile;
+ list_add_tail(&tmp_list->list, &file_head);
+ }
}
}
spin_unlock(&tcon->open_file_lock);
+
+ list_for_each_entry_safe(tmp_list, tmp_next_list, &file_head, list) {
+ _cifsFileInfo_put(tmp_list->cfile, true, false);
+ list_del(&tmp_list->list);
+ kfree(tmp_list);
+ }
}
/* parses DFS refferal V3 structure
@@ -1187,7 +1215,7 @@ int match_target_ip(struct TCP_Server_Info *server,
cifs_dbg(FYI, "%s: target name: %s\n", __func__, target + 2);
- rc = dns_resolve_server_name_to_ip(target, &tip);
+ rc = dns_resolve_server_name_to_ip(target, &tip, NULL);
if (rc < 0)
goto out;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index e4c8f603dd58..2dfd0d8297eb 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -557,8 +557,8 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
p = buf;
while (bytes_left >= sizeof(*p)) {
info->speed = le64_to_cpu(p->LinkSpeed);
- info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE);
- info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE);
+ info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0;
+ info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0;
cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, *iface_count);
cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed);
@@ -2910,6 +2910,8 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
/* ipc tcons are not refcounted */
spin_lock(&cifs_tcp_ses_lock);
tcon->tc_count--;
+ /* tc_count can never go negative */
+ WARN_ON(tcon->tc_count < 0);
spin_unlock(&cifs_tcp_ses_lock);
}
kfree(utf16_path);
@@ -3616,6 +3618,7 @@ static int smb3_simple_fallocate_write_range(unsigned int xid,
{
struct cifs_io_parms io_parms = {0};
int nbytes;
+ int rc = 0;
struct kvec iov[2];
io_parms.netfid = cfile->fid.netfid;
@@ -3623,13 +3626,25 @@ static int smb3_simple_fallocate_write_range(unsigned int xid,
io_parms.tcon = tcon;
io_parms.persistent_fid = cfile->fid.persistent_fid;
io_parms.volatile_fid = cfile->fid.volatile_fid;
- io_parms.offset = off;
- io_parms.length = len;
- /* iov[0] is reserved for smb header */
- iov[1].iov_base = buf;
- iov[1].iov_len = io_parms.length;
- return SMB2_write(xid, &io_parms, &nbytes, iov, 1);
+ while (len) {
+ io_parms.offset = off;
+ io_parms.length = len;
+ if (io_parms.length > SMB2_MAX_BUFFER_SIZE)
+ io_parms.length = SMB2_MAX_BUFFER_SIZE;
+ /* iov[0] is reserved for smb header */
+ iov[1].iov_base = buf;
+ iov[1].iov_len = io_parms.length;
+ rc = SMB2_write(xid, &io_parms, &nbytes, iov, 1);
+ if (rc)
+ break;
+ if (nbytes > len)
+ return -EINVAL;
+ buf += nbytes;
+ off += nbytes;
+ len -= nbytes;
+ }
+ return rc;
}
static int smb3_simple_fallocate_range(unsigned int xid,
@@ -3653,11 +3668,6 @@ static int smb3_simple_fallocate_range(unsigned int xid,
(char **)&out_data, &out_data_len);
if (rc)
goto out;
- /*
- * It is already all allocated
- */
- if (out_data_len == 0)
- goto out;
buf = kzalloc(1024 * 1024, GFP_KERNEL);
if (buf == NULL) {
@@ -3780,6 +3790,24 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
goto out;
}
+ if (keep_size == true) {
+ /*
+ * We can not preallocate pages beyond the end of the file
+ * in SMB2
+ */
+ if (off >= i_size_read(inode)) {
+ rc = 0;
+ goto out;
+ }
+ /*
+ * For fallocates that are partially beyond the end of file,
+ * clamp len so we only fallocate up to the end of file.
+ */
+ if (off + len > i_size_read(inode)) {
+ len = i_size_read(inode) - off;
+ }
+ }
+
if ((keep_size == true) || (i_size_read(inode) >= off + len)) {
/*
* At this point, we are trying to fallocate an internal
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 962826dc3316..b6d2e3591927 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -433,6 +433,29 @@ build_compression_ctxt(struct smb2_compression_capabilities_context *pneg_ctxt)
pneg_ctxt->CompressionAlgorithms[2] = SMB3_COMPRESS_LZNT1;
}
+static unsigned int
+build_signing_ctxt(struct smb2_signing_capabilities *pneg_ctxt)
+{
+ unsigned int ctxt_len = sizeof(struct smb2_signing_capabilities);
+ unsigned short num_algs = 1; /* number of signing algorithms sent */
+
+ pneg_ctxt->ContextType = SMB2_SIGNING_CAPABILITIES;
+ /*
+ * Context Data length must be rounded to multiple of 8 for some servers
+ */
+ pneg_ctxt->DataLength = cpu_to_le16(DIV_ROUND_UP(
+ sizeof(struct smb2_signing_capabilities) -
+ sizeof(struct smb2_neg_context) +
+ (num_algs * 2 /* sizeof u16 */), 8) * 8);
+ pneg_ctxt->SigningAlgorithmCount = cpu_to_le16(num_algs);
+ pneg_ctxt->SigningAlgorithms[0] = cpu_to_le16(SIGNING_ALG_AES_CMAC);
+
+ ctxt_len += 2 /* sizeof le16 */ * num_algs;
+ ctxt_len = DIV_ROUND_UP(ctxt_len, 8) * 8;
+ return ctxt_len;
+ /* TBD add SIGNING_ALG_AES_GMAC and/or SIGNING_ALG_HMAC_SHA256 */
+}
+
static void
build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt)
{
@@ -498,7 +521,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
struct TCP_Server_Info *server, unsigned int *total_len)
{
char *pneg_ctxt;
- unsigned int ctxt_len;
+ unsigned int ctxt_len, neg_context_count;
if (*total_len > 200) {
/* In case length corrupted don't want to overrun smb buffer */
@@ -525,6 +548,17 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
*total_len += ctxt_len;
pneg_ctxt += ctxt_len;
+ ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt,
+ server->hostname);
+ *total_len += ctxt_len;
+ pneg_ctxt += ctxt_len;
+
+ build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt);
+ *total_len += sizeof(struct smb2_posix_neg_context);
+ pneg_ctxt += sizeof(struct smb2_posix_neg_context);
+
+ neg_context_count = 4;
+
if (server->compress_algorithm) {
build_compression_ctxt((struct smb2_compression_capabilities_context *)
pneg_ctxt);
@@ -533,17 +567,20 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
8) * 8;
*total_len += ctxt_len;
pneg_ctxt += ctxt_len;
- req->NegotiateContextCount = cpu_to_le16(5);
- } else
- req->NegotiateContextCount = cpu_to_le16(4);
+ neg_context_count++;
+ }
- ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt,
- server->hostname);
- *total_len += ctxt_len;
- pneg_ctxt += ctxt_len;
+ if (enable_negotiate_signing) {
+ ctxt_len = build_signing_ctxt((struct smb2_signing_capabilities *)
+ pneg_ctxt);
+ *total_len += ctxt_len;
+ pneg_ctxt += ctxt_len;
+ neg_context_count++;
+ }
+
+ /* check for and add transport_capabilities and signing capabilities */
+ req->NegotiateContextCount = cpu_to_le16(neg_context_count);
- build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt);
- *total_len += sizeof(struct smb2_posix_neg_context);
}
static void decode_preauth_context(struct smb2_preauth_neg_context *ctxt)
@@ -632,6 +669,31 @@ static int decode_encrypt_ctx(struct TCP_Server_Info *server,
return 0;
}
+static void decode_signing_ctx(struct TCP_Server_Info *server,
+ struct smb2_signing_capabilities *pctxt)
+{
+ unsigned int len = le16_to_cpu(pctxt->DataLength);
+
+ if ((len < 4) || (len > 16)) {
+ pr_warn_once("server sent bad signing negcontext\n");
+ return;
+ }
+ if (le16_to_cpu(pctxt->SigningAlgorithmCount) != 1) {
+ pr_warn_once("Invalid signing algorithm count\n");
+ return;
+ }
+ if (le16_to_cpu(pctxt->SigningAlgorithms[0]) > 2) {
+ pr_warn_once("unknown signing algorithm\n");
+ return;
+ }
+
+ server->signing_negotiated = true;
+ server->signing_algorithm = le16_to_cpu(pctxt->SigningAlgorithms[0]);
+ cifs_dbg(FYI, "signing algorithm %d chosen\n",
+ server->signing_algorithm);
+}
+
+
static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,
struct TCP_Server_Info *server,
unsigned int len_of_smb)
@@ -675,6 +737,9 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,
(struct smb2_compression_capabilities_context *)pctx);
else if (pctx->ContextType == SMB2_POSIX_EXTENSIONS_AVAILABLE)
server->posix_ext_supported = true;
+ else if (pctx->ContextType == SMB2_SIGNING_CAPABILITIES)
+ decode_signing_ctx(server,
+ (struct smb2_signing_capabilities *)pctx);
else
cifs_server_dbg(VFS, "unknown negcontext of type %d ignored\n",
le16_to_cpu(pctx->ContextType));
@@ -2361,7 +2426,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
memcpy(aclptr, &acl, sizeof(struct cifs_acl));
buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd);
- *len = ptr - (__u8 *)buf;
+ *len = roundup(ptr - (__u8 *)buf, 8);
return buf;
}
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index a5c48b85549a..e9cac7970b66 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -13,7 +13,7 @@
#define _SMB2PDU_H
#include <net/sock.h>
-#include <cifsacl.h>
+#include "cifsacl.h"
/*
* Note that, due to trying to use names similar to the protocol specifications,
@@ -329,7 +329,7 @@ struct smb2_neg_context {
__le16 ContextType;
__le16 DataLength;
__le32 Reserved;
- /* Followed by array of data */
+ /* Followed by array of data. NOTE: some servers require padding to 8 byte boundary */
} __packed;
#define SMB311_LINUX_CLIENT_SALT_SIZE 32
@@ -394,6 +394,8 @@ struct smb2_compression_capabilities_context {
__u16 Padding;
__u32 Flags;
__le16 CompressionAlgorithms[3];
+ __u16 Pad; /* Some servers require pad to DataLen multiple of 8 */
+ /* Check if pad needed */
} __packed;
/*
@@ -420,6 +422,7 @@ struct smb2_transport_capabilities_context {
__le16 DataLength;
__u32 Reserved;
__le32 Flags;
+ __u32 Pad;
} __packed;
/*
@@ -458,6 +461,7 @@ struct smb2_signing_capabilities {
__u32 Reserved;
__le16 SigningAlgorithmCount;
__le16 SigningAlgorithms[];
+ /* Followed by padding to 8 byte boundary (required by some servers) */
} __packed;
#define POSIX_CTXT_DATA_LEN 16
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index f65f9a692ca2..75a95de320cf 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -431,7 +431,9 @@ unmask:
* be taken as the remainder of this one. We need to kill the
* socket so the server throws away the partial SMB
*/
+ spin_lock(&GlobalMid_Lock);
server->tcpStatus = CifsNeedReconnect;
+ spin_unlock(&GlobalMid_Lock);
trace_smb3_partial_send_reconnect(server->CurrentMid,
server->conn_id, server->hostname);
}
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index e26060dae70a..0ad32150611e 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -14,7 +14,7 @@
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
-
+#include <linux/uio.h>
#include <linux/configfs.h>
#include "configfs_internal.h"
@@ -77,28 +77,9 @@ static int fill_read_buffer(struct file *file, struct configfs_buffer *buffer)
return 0;
}
-/**
- * configfs_read_file - read an attribute.
- * @file: file pointer.
- * @buf: buffer to fill.
- * @count: number of bytes to read.
- * @ppos: starting offset in file.
- *
- * Userspace wants to read an attribute file. The attribute descriptor
- * is in the file's ->d_fsdata. The target item is in the directory's
- * ->d_fsdata.
- *
- * We call fill_read_buffer() to allocate and fill the buffer from the
- * item's show() method exactly once (if the read is happening from
- * the beginning of the file). That should fill the entire buffer with
- * all the data the item has to offer for that attribute.
- * We then call flush_read_buffer() to copy the buffer to userspace
- * in the increments specified.
- */
-
-static ssize_t
-configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+static ssize_t configfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
+ struct file *file = iocb->ki_filp;
struct configfs_buffer *buffer = file->private_data;
ssize_t retval = 0;
@@ -108,43 +89,27 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
if (retval)
goto out;
}
- pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
- __func__, count, *ppos, buffer->page);
- retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
- buffer->count);
+ pr_debug("%s: count = %zd, pos = %lld, buf = %s\n",
+ __func__, iov_iter_count(to), iocb->ki_pos, buffer->page);
+ if (iocb->ki_pos >= buffer->count)
+ goto out;
+ retval = copy_to_iter(buffer->page + iocb->ki_pos,
+ buffer->count - iocb->ki_pos, to);
+ iocb->ki_pos += retval;
+ if (retval == 0)
+ retval = -EFAULT;
out:
mutex_unlock(&buffer->mutex);
return retval;
}
-/**
- * configfs_read_bin_file - read a binary attribute.
- * @file: file pointer.
- * @buf: buffer to fill.
- * @count: number of bytes to read.
- * @ppos: starting offset in file.
- *
- * Userspace wants to read a binary attribute file. The attribute
- * descriptor is in the file's ->d_fsdata. The target item is in the
- * directory's ->d_fsdata.
- *
- * We check whether we need to refill the buffer. If so we will
- * call the attributes' attr->read() twice. The first time we
- * will pass a NULL as a buffer pointer, which the attributes' method
- * will use to return the size of the buffer required. If no error
- * occurs we will allocate the buffer using vmalloc and call
- * attr->read() again passing that buffer as an argument.
- * Then we just copy to user-space using simple_read_from_buffer.
- */
-
-static ssize_t
-configfs_read_bin_file(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t configfs_bin_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
+ struct file *file = iocb->ki_filp;
struct configfs_fragment *frag = to_frag(file);
struct configfs_buffer *buffer = file->private_data;
ssize_t retval = 0;
- ssize_t len = min_t(size_t, count, PAGE_SIZE);
+ ssize_t len;
mutex_lock(&buffer->mutex);
@@ -200,42 +165,35 @@ configfs_read_bin_file(struct file *file, char __user *buf,
buffer->needs_read_fill = 0;
}
- retval = simple_read_from_buffer(buf, count, ppos, buffer->bin_buffer,
- buffer->bin_buffer_size);
+ if (iocb->ki_pos >= buffer->bin_buffer_size)
+ goto out;
+ retval = copy_to_iter(buffer->bin_buffer + iocb->ki_pos,
+ buffer->bin_buffer_size - iocb->ki_pos, to);
+ iocb->ki_pos += retval;
+ if (retval == 0)
+ retval = -EFAULT;
out:
mutex_unlock(&buffer->mutex);
return retval;
}
-
-/**
- * fill_write_buffer - copy buffer from userspace.
- * @buffer: data buffer for file.
- * @buf: data from user.
- * @count: number of bytes in @userbuf.
- *
- * Allocate @buffer->page if it hasn't been already, then
- * copy the user-supplied buffer into it.
- */
-
-static int
-fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size_t count)
+/* Fill @buffer with data coming from @from. */
+static int fill_write_buffer(struct configfs_buffer *buffer,
+ struct iov_iter *from)
{
- int error;
+ int copied;
if (!buffer->page)
buffer->page = (char *)__get_free_pages(GFP_KERNEL, 0);
if (!buffer->page)
return -ENOMEM;
- if (count >= SIMPLE_ATTR_SIZE)
- count = SIMPLE_ATTR_SIZE - 1;
- error = copy_from_user(buffer->page,buf,count);
+ copied = copy_from_iter(buffer->page, SIMPLE_ATTR_SIZE - 1, from);
buffer->needs_read_fill = 1;
/* if buf is assumed to contain a string, terminate it by \0,
* so e.g. sscanf() can scan the string easily */
- buffer->page[count] = 0;
- return error ? -EFAULT : count;
+ buffer->page[copied] = 0;
+ return copied ? : -EFAULT;
}
static int
@@ -252,58 +210,36 @@ flush_write_buffer(struct file *file, struct configfs_buffer *buffer, size_t cou
}
-/**
- * configfs_write_file - write an attribute.
- * @file: file pointer
- * @buf: data to write
- * @count: number of bytes
- * @ppos: starting offset
- *
- * Similar to configfs_read_file(), though working in the opposite direction.
- * We allocate and fill the data from the user in fill_write_buffer(),
- * then push it to the config_item in flush_write_buffer().
- * There is no easy way for us to know if userspace is only doing a partial
- * write, so we don't support them. We expect the entire buffer to come
- * on the first write.
- * Hint: if you're writing a value, first read the file, modify only
- * the value you're changing, then write entire buffer back.
+/*
+ * There is no easy way for us to know if userspace is only doing a partial
+ * write, so we don't support them. We expect the entire buffer to come on the
+ * first write.
+ * Hint: if you're writing a value, first read the file, modify only the value
+ * you're changing, then write entire buffer back.
*/
-
-static ssize_t
-configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+static ssize_t configfs_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
+ struct file *file = iocb->ki_filp;
struct configfs_buffer *buffer = file->private_data;
- ssize_t len;
+ int len;
mutex_lock(&buffer->mutex);
- len = fill_write_buffer(buffer, buf, count);
+ len = fill_write_buffer(buffer, from);
if (len > 0)
len = flush_write_buffer(file, buffer, len);
if (len > 0)
- *ppos += len;
+ iocb->ki_pos += len;
mutex_unlock(&buffer->mutex);
return len;
}
-/**
- * configfs_write_bin_file - write a binary attribute.
- * @file: file pointer
- * @buf: data to write
- * @count: number of bytes
- * @ppos: starting offset
- *
- * Writing to a binary attribute file is similar to a normal read.
- * We buffer the consecutive writes (binary attribute files do not
- * support lseek) in a continuously growing buffer, but we don't
- * commit until the close of the file.
- */
-
-static ssize_t
-configfs_write_bin_file(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t configfs_bin_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
{
+ struct file *file = iocb->ki_filp;
struct configfs_buffer *buffer = file->private_data;
void *tbuf = NULL;
+ size_t end_offset;
ssize_t len;
mutex_lock(&buffer->mutex);
@@ -316,15 +252,14 @@ configfs_write_bin_file(struct file *file, const char __user *buf,
buffer->write_in_progress = true;
/* buffer grows? */
- if (*ppos + count > buffer->bin_buffer_size) {
-
- if (buffer->cb_max_size &&
- *ppos + count > buffer->cb_max_size) {
+ end_offset = iocb->ki_pos + iov_iter_count(from);
+ if (end_offset > buffer->bin_buffer_size) {
+ if (buffer->cb_max_size && end_offset > buffer->cb_max_size) {
len = -EFBIG;
goto out;
}
- tbuf = vmalloc(*ppos + count);
+ tbuf = vmalloc(end_offset);
if (tbuf == NULL) {
len = -ENOMEM;
goto out;
@@ -339,16 +274,17 @@ configfs_write_bin_file(struct file *file, const char __user *buf,
/* clear the new area */
memset(tbuf + buffer->bin_buffer_size, 0,
- *ppos + count - buffer->bin_buffer_size);
+ end_offset - buffer->bin_buffer_size);
buffer->bin_buffer = tbuf;
- buffer->bin_buffer_size = *ppos + count;
+ buffer->bin_buffer_size = end_offset;
}
- len = simple_write_to_buffer(buffer->bin_buffer,
- buffer->bin_buffer_size, ppos, buf, count);
+ len = copy_from_iter(buffer->bin_buffer + iocb->ki_pos,
+ buffer->bin_buffer_size - iocb->ki_pos, from);
+ iocb->ki_pos += len;
out:
mutex_unlock(&buffer->mutex);
- return len;
+ return len ? : -EFAULT;
}
static int __configfs_open_file(struct inode *inode, struct file *file, int type)
@@ -466,11 +402,8 @@ static int configfs_release_bin_file(struct inode *inode, struct file *file)
{
struct configfs_buffer *buffer = file->private_data;
- buffer->read_in_progress = false;
-
if (buffer->write_in_progress) {
struct configfs_fragment *frag = to_frag(file);
- buffer->write_in_progress = false;
down_read(&frag->frag_sem);
if (!frag->frag_dead) {
@@ -480,29 +413,26 @@ static int configfs_release_bin_file(struct inode *inode, struct file *file)
buffer->bin_buffer_size);
}
up_read(&frag->frag_sem);
- /* vfree on NULL is safe */
- vfree(buffer->bin_buffer);
- buffer->bin_buffer = NULL;
- buffer->bin_buffer_size = 0;
- buffer->needs_read_fill = 1;
}
+ vfree(buffer->bin_buffer);
+
configfs_release(inode, file);
return 0;
}
const struct file_operations configfs_file_operations = {
- .read = configfs_read_file,
- .write = configfs_write_file,
+ .read_iter = configfs_read_iter,
+ .write_iter = configfs_write_iter,
.llseek = generic_file_llseek,
.open = configfs_open_file,
.release = configfs_release,
};
const struct file_operations configfs_bin_file_operations = {
- .read = configfs_read_bin_file,
- .write = configfs_write_bin_file,
+ .read_iter = configfs_bin_read_iter,
+ .write_iter = configfs_bin_write_iter,
.llseek = NULL, /* bin file is not seekable */
.open = configfs_open_bin_file,
.release = configfs_release_bin_file,
@@ -532,7 +462,7 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib
/**
* configfs_create_bin_file - create a binary attribute file for an item.
* @item: item we're creating for.
- * @attr: atrribute descriptor.
+ * @bin_attr: atrribute descriptor.
*/
int configfs_create_bin_file(struct config_item *item,
diff --git a/fs/coredump.c b/fs/coredump.c
index c3d8fc14b993..07afb5ddb1c4 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -755,8 +755,8 @@ void do_coredump(const kernel_siginfo_t *siginfo)
task_lock(&init_task);
get_fs_root(init_task.fs, &root);
task_unlock(&init_task);
- cprm.file = file_open_root(root.dentry, root.mnt,
- cn.corename, open_flags, 0600);
+ cprm.file = file_open_root(&root, cn.corename,
+ open_flags, 0600);
path_put(&root);
} else {
cprm.file = filp_open(cn.corename, open_flags, 0600);
diff --git a/fs/d_path.c b/fs/d_path.c
index 270d62133996..23a53f7b5c71 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -8,14 +8,27 @@
#include <linux/prefetch.h>
#include "mount.h"
-static int prepend(char **buffer, int *buflen, const char *str, int namelen)
+struct prepend_buffer {
+ char *buf;
+ int len;
+};
+#define DECLARE_BUFFER(__name, __buf, __len) \
+ struct prepend_buffer __name = {.buf = __buf + __len, .len = __len}
+
+static char *extract_string(struct prepend_buffer *p)
{
- *buflen -= namelen;
- if (*buflen < 0)
- return -ENAMETOOLONG;
- *buffer -= namelen;
- memcpy(*buffer, str, namelen);
- return 0;
+ if (likely(p->len >= 0))
+ return p->buf;
+ return ERR_PTR(-ENAMETOOLONG);
+}
+
+static void prepend(struct prepend_buffer *p, const char *str, int namelen)
+{
+ p->len -= namelen;
+ if (likely(p->len >= 0)) {
+ p->buf -= namelen;
+ memcpy(p->buf, str, namelen);
+ }
}
/**
@@ -35,22 +48,58 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen)
*
* Load acquire is needed to make sure that we see that terminating NUL.
*/
-static int prepend_name(char **buffer, int *buflen, const struct qstr *name)
+static bool prepend_name(struct prepend_buffer *p, const struct qstr *name)
{
const char *dname = smp_load_acquire(&name->name); /* ^^^ */
u32 dlen = READ_ONCE(name->len);
- char *p;
+ char *s;
- *buflen -= dlen + 1;
- if (*buflen < 0)
- return -ENAMETOOLONG;
- p = *buffer -= dlen + 1;
- *p++ = '/';
+ p->len -= dlen + 1;
+ if (unlikely(p->len < 0))
+ return false;
+ s = p->buf -= dlen + 1;
+ *s++ = '/';
while (dlen--) {
char c = *dname++;
if (!c)
break;
- *p++ = c;
+ *s++ = c;
+ }
+ return true;
+}
+
+static int __prepend_path(const struct dentry *dentry, const struct mount *mnt,
+ const struct path *root, struct prepend_buffer *p)
+{
+ while (dentry != root->dentry || &mnt->mnt != root->mnt) {
+ const struct dentry *parent = READ_ONCE(dentry->d_parent);
+
+ if (dentry == mnt->mnt.mnt_root) {
+ struct mount *m = READ_ONCE(mnt->mnt_parent);
+ struct mnt_namespace *mnt_ns;
+
+ if (likely(mnt != m)) {
+ dentry = READ_ONCE(mnt->mnt_mountpoint);
+ mnt = m;
+ continue;
+ }
+ /* Global root */
+ mnt_ns = READ_ONCE(mnt->mnt_ns);
+ /* open-coded is_mounted() to use local mnt_ns */
+ if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns))
+ return 1; // absolute root
+ else
+ return 2; // detached or not attached yet
+ }
+
+ if (unlikely(dentry == parent))
+ /* Escaped? */
+ return 3;
+
+ prefetch(parent);
+ if (!prepend_name(p, &dentry->d_name))
+ break;
+ dentry = parent;
}
return 0;
}
@@ -74,15 +123,11 @@ static int prepend_name(char **buffer, int *buflen, const struct qstr *name)
*/
static int prepend_path(const struct path *path,
const struct path *root,
- char **buffer, int *buflen)
+ struct prepend_buffer *p)
{
- struct dentry *dentry;
- struct vfsmount *vfsmnt;
- struct mount *mnt;
- int error = 0;
unsigned seq, m_seq = 0;
- char *bptr;
- int blen;
+ struct prepend_buffer b;
+ int error;
rcu_read_lock();
restart_mnt:
@@ -90,50 +135,9 @@ restart_mnt:
seq = 0;
rcu_read_lock();
restart:
- bptr = *buffer;
- blen = *buflen;
- error = 0;
- dentry = path->dentry;
- vfsmnt = path->mnt;
- mnt = real_mount(vfsmnt);
+ b = *p;
read_seqbegin_or_lock(&rename_lock, &seq);
- while (dentry != root->dentry || vfsmnt != root->mnt) {
- struct dentry * parent;
-
- if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
- struct mount *parent = READ_ONCE(mnt->mnt_parent);
- struct mnt_namespace *mnt_ns;
-
- /* Escaped? */
- if (dentry != vfsmnt->mnt_root) {
- bptr = *buffer;
- blen = *buflen;
- error = 3;
- break;
- }
- /* Global root? */
- if (mnt != parent) {
- dentry = READ_ONCE(mnt->mnt_mountpoint);
- mnt = parent;
- vfsmnt = &mnt->mnt;
- continue;
- }
- mnt_ns = READ_ONCE(mnt->mnt_ns);
- /* open-coded is_mounted() to use local mnt_ns */
- if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns))
- error = 1; // absolute root
- else
- error = 2; // detached or not attached yet
- break;
- }
- parent = dentry->d_parent;
- prefetch(parent);
- error = prepend_name(&bptr, &blen, &dentry->d_name);
- if (error)
- break;
-
- dentry = parent;
- }
+ error = __prepend_path(path->dentry, real_mount(path->mnt), root, &b);
if (!(seq & 1))
rcu_read_unlock();
if (need_seqretry(&rename_lock, seq)) {
@@ -150,14 +154,13 @@ restart:
}
done_seqretry(&mount_lock, m_seq);
- if (error >= 0 && bptr == *buffer) {
- if (--blen < 0)
- error = -ENAMETOOLONG;
- else
- *--bptr = '/';
- }
- *buffer = bptr;
- *buflen = blen;
+ if (unlikely(error == 3))
+ b = *p;
+
+ if (b.len == p->len)
+ prepend(&b, "/", 1);
+
+ *p = b;
return error;
}
@@ -181,56 +184,24 @@ char *__d_path(const struct path *path,
const struct path *root,
char *buf, int buflen)
{
- char *res = buf + buflen;
- int error;
-
- prepend(&res, &buflen, "\0", 1);
- error = prepend_path(path, root, &res, &buflen);
+ DECLARE_BUFFER(b, buf, buflen);
- if (error < 0)
- return ERR_PTR(error);
- if (error > 0)
+ prepend(&b, "", 1);
+ if (unlikely(prepend_path(path, root, &b) > 0))
return NULL;
- return res;
+ return extract_string(&b);
}
char *d_absolute_path(const struct path *path,
char *buf, int buflen)
{
struct path root = {};
- char *res = buf + buflen;
- int error;
-
- prepend(&res, &buflen, "\0", 1);
- error = prepend_path(path, &root, &res, &buflen);
-
- if (error > 1)
- error = -EINVAL;
- if (error < 0)
- return ERR_PTR(error);
- return res;
-}
-
-/*
- * same as __d_path but appends "(deleted)" for unlinked files.
- */
-static int path_with_deleted(const struct path *path,
- const struct path *root,
- char **buf, int *buflen)
-{
- prepend(buf, buflen, "\0", 1);
- if (d_unlinked(path->dentry)) {
- int error = prepend(buf, buflen, " (deleted)", 10);
- if (error)
- return error;
- }
-
- return prepend_path(path, root, buf, buflen);
-}
+ DECLARE_BUFFER(b, buf, buflen);
-static int prepend_unreachable(char **buffer, int *buflen)
-{
- return prepend(buffer, buflen, "(unreachable)", 13);
+ prepend(&b, "", 1);
+ if (unlikely(prepend_path(path, &root, &b) > 1))
+ return ERR_PTR(-EINVAL);
+ return extract_string(&b);
}
static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
@@ -261,9 +232,8 @@ static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
*/
char *d_path(const struct path *path, char *buf, int buflen)
{
- char *res = buf + buflen;
+ DECLARE_BUFFER(b, buf, buflen);
struct path root;
- int error;
/*
* We have various synthetic filesystems that never get mounted. On
@@ -282,12 +252,14 @@ char *d_path(const struct path *path, char *buf, int buflen)
rcu_read_lock();
get_fs_root_rcu(current->fs, &root);
- error = path_with_deleted(path, &root, &res, &buflen);
+ if (unlikely(d_unlinked(path->dentry)))
+ prepend(&b, " (deleted)", 11);
+ else
+ prepend(&b, "", 1);
+ prepend_path(path, &root, &b);
rcu_read_unlock();
- if (error < 0)
- res = ERR_PTR(error);
- return res;
+ return extract_string(&b);
}
EXPORT_SYMBOL(d_path);
@@ -314,47 +286,34 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
{
- char *end = buffer + buflen;
+ DECLARE_BUFFER(b, buffer, buflen);
/* these dentries are never renamed, so d_lock is not needed */
- if (prepend(&end, &buflen, " (deleted)", 11) ||
- prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) ||
- prepend(&end, &buflen, "/", 1))
- end = ERR_PTR(-ENAMETOOLONG);
- return end;
+ prepend(&b, " (deleted)", 11);
+ prepend(&b, dentry->d_name.name, dentry->d_name.len);
+ prepend(&b, "/", 1);
+ return extract_string(&b);
}
/*
* Write full pathname from the root of the filesystem into the buffer.
*/
-static char *__dentry_path(const struct dentry *d, char *buf, int buflen)
+static char *__dentry_path(const struct dentry *d, struct prepend_buffer *p)
{
const struct dentry *dentry;
- char *end, *retval;
- int len, seq = 0;
- int error = 0;
-
- if (buflen < 2)
- goto Elong;
+ struct prepend_buffer b;
+ int seq = 0;
rcu_read_lock();
restart:
dentry = d;
- end = buf + buflen;
- len = buflen;
- prepend(&end, &len, "\0", 1);
- /* Get '/' right */
- retval = end-1;
- *retval = '/';
+ b = *p;
read_seqbegin_or_lock(&rename_lock, &seq);
while (!IS_ROOT(dentry)) {
const struct dentry *parent = dentry->d_parent;
prefetch(parent);
- error = prepend_name(&end, &len, &dentry->d_name);
- if (error)
+ if (!prepend_name(&b, &dentry->d_name))
break;
-
- retval = end;
dentry = parent;
}
if (!(seq & 1))
@@ -364,36 +323,29 @@ restart:
goto restart;
}
done_seqretry(&rename_lock, seq);
- if (error)
- goto Elong;
- return retval;
-Elong:
- return ERR_PTR(-ENAMETOOLONG);
+ if (b.len == p->len)
+ prepend(&b, "/", 1);
+ return extract_string(&b);
}
char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen)
{
- return __dentry_path(dentry, buf, buflen);
+ DECLARE_BUFFER(b, buf, buflen);
+
+ prepend(&b, "", 1);
+ return __dentry_path(dentry, &b);
}
EXPORT_SYMBOL(dentry_path_raw);
char *dentry_path(const struct dentry *dentry, char *buf, int buflen)
{
- char *p = NULL;
- char *retval;
-
- if (d_unlinked(dentry)) {
- p = buf + buflen;
- if (prepend(&p, &buflen, "//deleted", 10) != 0)
- goto Elong;
- buflen++;
- }
- retval = __dentry_path(dentry, buf, buflen);
- if (!IS_ERR(retval) && p)
- *p = '/'; /* restore '/' overriden with '\0' */
- return retval;
-Elong:
- return ERR_PTR(-ENAMETOOLONG);
+ DECLARE_BUFFER(b, buf, buflen);
+
+ if (unlikely(d_unlinked(dentry)))
+ prepend(&b, "//deleted", 10);
+ else
+ prepend(&b, "", 1);
+ return __dentry_path(dentry, &b);
}
static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
@@ -438,38 +390,28 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
rcu_read_lock();
get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);
- error = -ENOENT;
- if (!d_unlinked(pwd.dentry)) {
- unsigned long len;
- char *cwd = page + PATH_MAX;
- int buflen = PATH_MAX;
-
- prepend(&cwd, &buflen, "\0", 1);
- error = prepend_path(&pwd, &root, &cwd, &buflen);
+ if (unlikely(d_unlinked(pwd.dentry))) {
rcu_read_unlock();
+ error = -ENOENT;
+ } else {
+ unsigned len;
+ DECLARE_BUFFER(b, page, PATH_MAX);
- if (error < 0)
- goto out;
-
- /* Unreachable from current root */
- if (error > 0) {
- error = prepend_unreachable(&cwd, &buflen);
- if (error)
- goto out;
- }
+ prepend(&b, "", 1);
+ if (unlikely(prepend_path(&pwd, &root, &b) > 0))
+ prepend(&b, "(unreachable)", 13);
+ rcu_read_unlock();
- error = -ERANGE;
- len = PATH_MAX + page - cwd;
- if (len <= size) {
+ len = PATH_MAX - b.len;
+ if (unlikely(len > PATH_MAX))
+ error = -ENAMETOOLONG;
+ else if (unlikely(len > size))
+ error = -ERANGE;
+ else if (copy_to_user(buf, b.buf, len))
+ error = -EFAULT;
+ else
error = len;
- if (copy_to_user(buf, cwd, len))
- error = -EFAULT;
- }
- } else {
- rcu_read_unlock();
}
-
-out:
__putname(page);
return error;
}
diff --git a/fs/dax.c b/fs/dax.c
index da41f9363568..99b4e78d888f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -722,7 +722,7 @@ static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_d
return rc;
id = dax_read_lock();
- rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(PAGE_SIZE), &kaddr, NULL);
+ rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
if (rc < 0) {
dax_read_unlock(id);
return rc;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index ba7c01cd9a5d..df00231d3ecc 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -582,22 +582,12 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n");
* This function creates a file in debugfs with the given name that
* contains the value of the variable @value. If the @mode variable is so
* set, it can be read from, and written to.
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will
- * be returned.
*/
-struct dentry *debugfs_create_ulong(const char *name, umode_t mode,
- struct dentry *parent, unsigned long *value)
+void debugfs_create_ulong(const char *name, umode_t mode, struct dentry *parent,
+ unsigned long *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value,
- &fops_ulong, &fops_ulong_ro,
- &fops_ulong_wo);
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_ulong,
+ &fops_ulong_ro, &fops_ulong_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_ulong);
@@ -846,20 +836,11 @@ static const struct file_operations fops_bool_wo = {
* This function creates a file in debugfs with the given name that
* contains the value of the variable @value. If the @mode variable is so
* set, it can be read from, and written to.
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will
- * be returned.
*/
-struct dentry *debugfs_create_bool(const char *name, umode_t mode,
- struct dentry *parent, bool *value)
+void debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent,
+ bool *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_bool,
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_bool,
&fops_bool_ro, &fops_bool_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_bool);
@@ -980,7 +961,8 @@ static const struct file_operations fops_blob = {
/**
* debugfs_create_blob - create a debugfs file that is used to read a binary blob
* @name: a pointer to a string containing the name of the file to create.
- * @mode: the permission that the file should have
+ * @mode: the read permission that the file should have (other permissions are
+ * masked out)
* @parent: a pointer to the parent dentry for this file. This should be a
* directory dentry if set. If this parameter is %NULL, then the
* file will be created in the root of the debugfs filesystem.
@@ -1004,7 +986,7 @@ struct dentry *debugfs_create_blob(const char *name, umode_t mode,
struct dentry *parent,
struct debugfs_blob_wrapper *blob)
{
- return debugfs_create_file_unsafe(name, mode, parent, blob, &fops_blob);
+ return debugfs_create_file_unsafe(name, mode & 0444, parent, blob, &fops_blob);
}
EXPORT_SYMBOL_GPL(debugfs_create_blob);
diff --git a/fs/exec.c b/fs/exec.c
index f2bcdbeb3afb..38f63451b928 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -84,9 +84,6 @@ static DEFINE_RWLOCK(binfmt_lock);
void __register_binfmt(struct linux_binfmt * fmt, int insert)
{
- BUG_ON(!fmt);
- if (WARN_ON(!fmt->load_binary))
- return;
write_lock(&binfmt_lock);
insert ? list_add(&fmt->lh, &formats) :
list_add_tail(&fmt->lh, &formats);
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index c4523648472a..cb1c0d8c1714 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -63,7 +63,7 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb,
static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_entry *dir_entry)
{
int i, dentries_per_clu, dentries_per_clu_bits = 0, num_ext;
- unsigned int type, clu_offset;
+ unsigned int type, clu_offset, max_dentries;
sector_t sector;
struct exfat_chain dir, clu;
struct exfat_uni_name uni_name;
@@ -86,6 +86,8 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent
dentries_per_clu = sbi->dentries_per_clu;
dentries_per_clu_bits = ilog2(dentries_per_clu);
+ max_dentries = (unsigned int)min_t(u64, MAX_EXFAT_DENTRIES,
+ (u64)sbi->num_clusters << dentries_per_clu_bits);
clu_offset = dentry >> dentries_per_clu_bits;
exfat_chain_dup(&clu, &dir);
@@ -109,7 +111,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent
}
}
- while (clu.dir != EXFAT_EOF_CLUSTER) {
+ while (clu.dir != EXFAT_EOF_CLUSTER && dentry < max_dentries) {
i = dentry & (dentries_per_clu - 1);
for ( ; i < dentries_per_clu; i++, dentry++) {
@@ -245,7 +247,7 @@ static int exfat_iterate(struct file *filp, struct dir_context *ctx)
if (err)
goto unlock;
get_new:
- if (cpos >= i_size_read(inode))
+ if (ei->flags == ALLOC_NO_FAT_CHAIN && cpos >= i_size_read(inode))
goto end_of_dir;
err = exfat_readdir(inode, &cpos, &de);
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index d38d17a77e76..5539ffc20d16 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -690,7 +690,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
if (!sb->s_root) {
exfat_err(sb, "failed to get the root dentry");
err = -ENOMEM;
- goto put_inode;
+ goto free_table;
}
return 0;
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 14292dba3a12..2c2f179b6977 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -106,12 +106,11 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
return err;
}
-static bool ext2_check_page(struct page *page, int quiet)
+static bool ext2_check_page(struct page *page, int quiet, char *kaddr)
{
struct inode *dir = page->mapping->host;
struct super_block *sb = dir->i_sb;
unsigned chunk_size = ext2_chunk_size(dir);
- char *kaddr = page_address(page);
u32 max_inumber = le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count);
unsigned offs, rec_len;
unsigned limit = PAGE_SIZE;
@@ -205,7 +204,8 @@ static struct page * ext2_get_page(struct inode *dir, unsigned long n,
if (!IS_ERR(page)) {
*page_addr = kmap_local_page(page);
if (unlikely(!PageChecked(page))) {
- if (PageError(page) || !ext2_check_page(page, quiet))
+ if (PageError(page) || !ext2_check_page(page, quiet,
+ *page_addr))
goto fail;
}
}
@@ -584,10 +584,10 @@ out_unlock:
* ext2_delete_entry deletes a directory entry by merging it with the
* previous entry. Page is up-to-date.
*/
-int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page )
+int ext2_delete_entry (struct ext2_dir_entry_2 *dir, struct page *page,
+ char *kaddr)
{
struct inode *inode = page->mapping->host;
- char *kaddr = page_address(page);
unsigned from = ((char*)dir - kaddr) & ~(ext2_chunk_size(inode)-1);
unsigned to = ((char *)dir - kaddr) +
ext2_rec_len_from_disk(dir->rec_len);
@@ -607,7 +607,7 @@ int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page )
de = ext2_next_entry(de);
}
if (pde)
- from = (char*)pde - (char*)page_address(page);
+ from = (char *)pde - kaddr;
pos = page_offset(page) + from;
lock_page(page);
err = ext2_prepare_chunk(page, pos, to - from);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index b0a694820cb7..e512630cb63e 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -740,7 +740,8 @@ extern int ext2_inode_by_name(struct inode *dir,
extern int ext2_make_empty(struct inode *, struct inode *);
extern struct ext2_dir_entry_2 *ext2_find_entry(struct inode *, const struct qstr *,
struct page **, void **res_page_addr);
-extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *);
+extern int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page,
+ char *kaddr);
extern int ext2_empty_dir (struct inode *);
extern struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p, void **pa);
extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, void *,
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 1f69b81655b6..5f6b7560eb3f 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -293,7 +293,7 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry)
goto out;
}
- err = ext2_delete_entry (de, page);
+ err = ext2_delete_entry (de, page, page_addr);
ext2_put_page(page, page_addr);
if (err)
goto out;
@@ -397,7 +397,7 @@ static int ext2_rename (struct user_namespace * mnt_userns,
old_inode->i_ctime = current_time(old_inode);
mark_inode_dirty(old_inode);
- ext2_delete_entry(old_de, old_page);
+ ext2_delete_entry(old_de, old_page, old_page_addr);
if (dir_de) {
if (old_dir != new_dir)
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index be799040a415..b60f0152ea57 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -244,9 +244,6 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
* "bh" may be NULL: a metadata block may have been freed from memory
* but there may still be a record of it in the journal, and that record
* still needs to be revoked.
- *
- * If the handle isn't valid we're not journaling, but we still need to
- * call into ext4_journal_revoke() to put the buffer head.
*/
int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
int is_metadata, struct inode *inode,
@@ -327,6 +324,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
set_buffer_meta(bh);
set_buffer_prio(bh);
+ set_buffer_uptodate(bh);
if (ext4_handle_valid(handle)) {
err = jbd2_journal_dirty_metadata(handle, bh);
/* Errors can only happen due to aborted journal or a nasty bug */
@@ -355,7 +353,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
err);
}
} else {
- set_buffer_uptodate(bh);
if (inode)
mark_buffer_dirty_inode(bh, inode);
else
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index e27f34bceb8d..6eed6170aded 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -692,6 +692,13 @@ static long ext4_ioctl_group_add(struct file *file,
if (err)
return err;
+ if (ext4_has_feature_bigalloc(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online resizing not supported with bigalloc");
+ err = -EOPNOTSUPP;
+ goto group_add_out;
+ }
+
err = mnt_want_write_file(file);
if (err)
goto group_add_out;
@@ -816,7 +823,7 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg)
if (!EXT4_SB(sb)->s_journal)
return -ENODEV;
- if (flags & ~JBD2_JOURNAL_FLUSH_VALID)
+ if (flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID)
return -EINVAL;
q = bdev_get_queue(EXT4_SB(sb)->s_journal->j_dev);
@@ -914,6 +921,13 @@ setversion_out:
goto group_extend_out;
}
+ if (ext4_has_feature_bigalloc(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online resizing not supported with bigalloc");
+ err = -EOPNOTSUPP;
+ goto group_extend_out;
+ }
+
err = mnt_want_write_file(filp);
if (err)
goto group_extend_out;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c2c22c2baac0..089c958aa2c3 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1909,10 +1909,11 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block,
if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
/* Should never happen! (but apparently sometimes does?!?) */
WARN_ON(1);
- ext4_error(e4b->bd_sb, "corruption or bug in mb_find_extent "
- "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
- block, order, needed, ex->fe_group, ex->fe_start,
- ex->fe_len, ex->fe_logical);
+ ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0,
+ "corruption or bug in mb_find_extent "
+ "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
+ block, order, needed, ex->fe_group, ex->fe_start,
+ ex->fe_len, ex->fe_logical);
ex->fe_len = 0;
ex->fe_start = 0;
ex->fe_group = 0;
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 6cb598b549ca..cebea4270817 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -138,7 +138,7 @@ static int kmmpd(void *data)
unsigned mmp_check_interval;
unsigned long last_update_time;
unsigned long diff;
- int retval;
+ int retval = 0;
mmp_block = le64_to_cpu(es->s_mmp_block);
mmp = (struct mmp_struct *)(bh->b_data);
@@ -156,7 +156,12 @@ static int kmmpd(void *data)
memcpy(mmp->mmp_nodename, init_utsname()->nodename,
sizeof(mmp->mmp_nodename));
- while (!kthread_should_stop()) {
+ while (!kthread_should_stop() && !sb_rdonly(sb)) {
+ if (!ext4_has_feature_mmp(sb)) {
+ ext4_warning(sb, "kmmpd being stopped since MMP feature"
+ " has been disabled.");
+ goto wait_to_exit;
+ }
if (++seq > EXT4_MMP_SEQ_MAX)
seq = 1;
@@ -177,16 +182,6 @@ static int kmmpd(void *data)
failed_writes++;
}
- if (!(le32_to_cpu(es->s_feature_incompat) &
- EXT4_FEATURE_INCOMPAT_MMP)) {
- ext4_warning(sb, "kmmpd being stopped since MMP feature"
- " has been disabled.");
- goto exit_thread;
- }
-
- if (sb_rdonly(sb))
- break;
-
diff = jiffies - last_update_time;
if (diff < mmp_update_interval * HZ)
schedule_timeout_interruptible(mmp_update_interval *
@@ -207,7 +202,7 @@ static int kmmpd(void *data)
ext4_error_err(sb, -retval,
"error reading MMP data: %d",
retval);
- goto exit_thread;
+ goto wait_to_exit;
}
mmp_check = (struct mmp_struct *)(bh_check->b_data);
@@ -221,7 +216,7 @@ static int kmmpd(void *data)
ext4_error_err(sb, EBUSY, "abort");
put_bh(bh_check);
retval = -EBUSY;
- goto exit_thread;
+ goto wait_to_exit;
}
put_bh(bh_check);
}
@@ -244,7 +239,13 @@ static int kmmpd(void *data)
retval = write_mmp_block(sb, bh);
-exit_thread:
+wait_to_exit:
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!kthread_should_stop())
+ schedule();
+ }
+ set_current_state(TASK_RUNNING);
return retval;
}
@@ -391,5 +392,3 @@ failed:
brelse(bh);
return 1;
}
-
-
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5fd56f616cf0..f3bbcd4efb56 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2517,7 +2517,7 @@ again:
goto journal_error;
err = ext4_handle_dirty_dx_node(handle, dir,
frame->bh);
- if (err)
+ if (restart || err)
goto journal_error;
} else {
struct dx_root *dxroot;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index fc885914c88a..7a9f1adef679 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -74,10 +74,6 @@ int ext4_resize_begin(struct super_block *sb)
return -EPERM;
}
- if (ext4_has_feature_bigalloc(sb)) {
- ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc");
- return -EOPNOTSUPP;
- }
if (ext4_has_feature_sparse_super2(sb)) {
ext4_msg(sb, KERN_ERR, "Online resizing not supported with sparse_super2");
return -EOPNOTSUPP;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 20344633bdd9..dfa09a277b56 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -705,15 +705,23 @@ static void flush_stashed_error_work(struct work_struct *work)
* ext4 error handling code during handling of previous errors.
*/
if (!sb_rdonly(sbi->s_sb) && journal) {
+ struct buffer_head *sbh = sbi->s_sbh;
handle = jbd2_journal_start(journal, 1);
if (IS_ERR(handle))
goto write_directly;
- if (jbd2_journal_get_write_access(handle, sbi->s_sbh)) {
+ if (jbd2_journal_get_write_access(handle, sbh)) {
jbd2_journal_stop(handle);
goto write_directly;
}
ext4_update_super(sbi->s_sb);
- if (jbd2_journal_dirty_metadata(handle, sbi->s_sbh)) {
+ if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
+ ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to "
+ "superblock detected");
+ clear_buffer_write_io_error(sbh);
+ set_buffer_uptodate(sbh);
+ }
+
+ if (jbd2_journal_dirty_metadata(handle, sbh)) {
jbd2_journal_stop(handle);
goto write_directly;
}
@@ -1176,7 +1184,6 @@ static void ext4_put_super(struct super_block *sb)
ext4_unregister_sysfs(sb);
if (sbi->s_journal) {
- jbd2_journal_unregister_shrinker(sbi->s_journal);
aborted = is_journal_aborted(sbi->s_journal);
err = jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
@@ -5168,7 +5175,6 @@ failed_mount_wq:
sbi->s_ea_block_cache = NULL;
if (sbi->s_journal) {
- jbd2_journal_unregister_shrinker(sbi->s_journal);
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
@@ -5494,12 +5500,6 @@ static int ext4_load_journal(struct super_block *sb,
ext4_commit_super(sb);
}
- err = jbd2_journal_register_shrinker(journal);
- if (err) {
- EXT4_SB(sb)->s_journal = NULL;
- goto err_out;
- }
-
return 0;
err_out:
@@ -5985,7 +5985,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
*/
ext4_mark_recovery_complete(sb, es);
}
- ext4_stop_mmpd(sbi);
} else {
/* Make sure we can mount this feature set readwrite */
if (ext4_has_feature_readonly(sb) ||
@@ -6099,6 +6098,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
ext4_release_system_zone(sb);
+ if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
+ ext4_stop_mmpd(sbi);
+
/*
* Some options can be enabled by ext4 and/or by VFS mount flag
* either way we need to make sure it matches in both *flags and
@@ -6132,6 +6134,8 @@ restore_opts:
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(to_free[i]);
#endif
+ if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
+ ext4_stop_mmpd(sbi);
kfree(orig_data);
return err;
}
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index f795049e63d5..6c208108d69c 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -444,7 +444,7 @@ static int f2fs_set_meta_page_dirty(struct page *page)
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
- f2fs_set_page_private(page, 0);
+ set_page_private_reference(page);
return 1;
}
return 0;
@@ -1018,7 +1018,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page)
inode_inc_dirty_pages(inode);
spin_unlock(&sbi->inode_lock[type]);
- f2fs_set_page_private(page, 0);
+ set_page_private_reference(page);
}
void f2fs_remove_dirty_inode(struct inode *inode)
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 925a5ca3744a..455561826c7d 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -12,9 +12,11 @@
#include <linux/lzo.h>
#include <linux/lz4.h>
#include <linux/zstd.h>
+#include <linux/pagevec.h>
#include "f2fs.h"
#include "node.h"
+#include "segment.h"
#include <trace/events/f2fs.h>
static struct kmem_cache *cic_entry_slab;
@@ -74,7 +76,7 @@ bool f2fs_is_compressed_page(struct page *page)
return false;
if (!page_private(page))
return false;
- if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page))
+ if (page_private_nonpointer(page))
return false;
f2fs_bug_on(F2FS_M_SB(page->mapping),
@@ -85,8 +87,7 @@ bool f2fs_is_compressed_page(struct page *page)
static void f2fs_set_compressed_page(struct page *page,
struct inode *inode, pgoff_t index, void *data)
{
- SetPagePrivate(page);
- set_page_private(page, (unsigned long)data);
+ attach_page_private(page, (void *)data);
/* i_crypto_info and iv index */
page->index = index;
@@ -589,8 +590,7 @@ static void f2fs_compress_free_page(struct page *page)
{
if (!page)
return;
- set_page_private(page, (unsigned long)NULL);
- ClearPagePrivate(page);
+ detach_page_private(page);
page->mapping = NULL;
unlock_page(page);
mempool_free(page, compress_page_pool);
@@ -738,7 +738,7 @@ out:
return ret;
}
-static void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
+void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode);
struct f2fs_inode_info *fi = F2FS_I(dic->inode);
@@ -837,7 +837,8 @@ out_end_io:
* page being waited on in the cluster, and if so, it decompresses the cluster
* (or in the case of a failure, cleans up without actually decompressing).
*/
-void f2fs_end_read_compressed_page(struct page *page, bool failed)
+void f2fs_end_read_compressed_page(struct page *page, bool failed,
+ block_t blkaddr)
{
struct decompress_io_ctx *dic =
(struct decompress_io_ctx *)page_private(page);
@@ -847,6 +848,9 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed)
if (failed)
WRITE_ONCE(dic->failed, true);
+ else if (blkaddr)
+ f2fs_cache_compressed_page(sbi, page,
+ dic->inode->i_ino, blkaddr);
if (atomic_dec_and_test(&dic->remaining_pages))
f2fs_decompress_cluster(dic);
@@ -876,7 +880,7 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index)
return is_page_in_cluster(cc, index);
}
-static bool __cluster_may_compress(struct compress_ctx *cc)
+static bool cluster_has_invalid_data(struct compress_ctx *cc)
{
loff_t i_size = i_size_read(cc->inode);
unsigned nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE);
@@ -889,19 +893,22 @@ static bool __cluster_may_compress(struct compress_ctx *cc)
/* beyond EOF */
if (page->index >= nr_pages)
- return false;
+ return true;
}
- return true;
+ return false;
}
-static int __f2fs_cluster_blocks(struct compress_ctx *cc, bool compr)
+static int __f2fs_cluster_blocks(struct inode *inode,
+ unsigned int cluster_idx, bool compr)
{
struct dnode_of_data dn;
+ unsigned int cluster_size = F2FS_I(inode)->i_cluster_size;
+ unsigned int start_idx = cluster_idx <<
+ F2FS_I(inode)->i_log_cluster_size;
int ret;
- set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
- ret = f2fs_get_dnode_of_data(&dn, start_idx_of_cluster(cc),
- LOOKUP_NODE);
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
if (ret) {
if (ret == -ENOENT)
ret = 0;
@@ -912,7 +919,7 @@ static int __f2fs_cluster_blocks(struct compress_ctx *cc, bool compr)
int i;
ret = 1;
- for (i = 1; i < cc->cluster_size; i++) {
+ for (i = 1; i < cluster_size; i++) {
block_t blkaddr;
blkaddr = data_blkaddr(dn.inode,
@@ -925,6 +932,10 @@ static int __f2fs_cluster_blocks(struct compress_ctx *cc, bool compr)
ret++;
}
}
+
+ f2fs_bug_on(F2FS_I_SB(inode),
+ !compr && ret != cluster_size &&
+ !is_inode_flag_set(inode, FI_COMPRESS_RELEASED));
}
fail:
f2fs_put_dnode(&dn);
@@ -934,25 +945,15 @@ fail:
/* return # of compressed blocks in compressed cluster */
static int f2fs_compressed_blocks(struct compress_ctx *cc)
{
- return __f2fs_cluster_blocks(cc, true);
+ return __f2fs_cluster_blocks(cc->inode, cc->cluster_idx, true);
}
/* return # of valid blocks in compressed cluster */
-static int f2fs_cluster_blocks(struct compress_ctx *cc)
-{
- return __f2fs_cluster_blocks(cc, false);
-}
-
int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index)
{
- struct compress_ctx cc = {
- .inode = inode,
- .log_cluster_size = F2FS_I(inode)->i_log_cluster_size,
- .cluster_size = F2FS_I(inode)->i_cluster_size,
- .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size,
- };
-
- return f2fs_cluster_blocks(&cc);
+ return __f2fs_cluster_blocks(inode,
+ index >> F2FS_I(inode)->i_log_cluster_size,
+ false);
}
static bool cluster_may_compress(struct compress_ctx *cc)
@@ -961,13 +962,11 @@ static bool cluster_may_compress(struct compress_ctx *cc)
return false;
if (f2fs_is_atomic_file(cc->inode))
return false;
- if (f2fs_is_mmap_file(cc->inode))
- return false;
if (!f2fs_cluster_is_full(cc))
return false;
if (unlikely(f2fs_cp_error(F2FS_I_SB(cc->inode))))
return false;
- return __cluster_may_compress(cc);
+ return !cluster_has_invalid_data(cc);
}
static void set_cluster_writeback(struct compress_ctx *cc)
@@ -995,21 +994,16 @@ static int prepare_compress_overwrite(struct compress_ctx *cc,
struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
struct address_space *mapping = cc->inode->i_mapping;
struct page *page;
- struct dnode_of_data dn;
sector_t last_block_in_bio;
unsigned fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT;
pgoff_t start_idx = start_idx_of_cluster(cc);
int i, ret;
- bool prealloc;
retry:
- ret = f2fs_cluster_blocks(cc);
+ ret = f2fs_is_compressed_cluster(cc->inode, start_idx);
if (ret <= 0)
return ret;
- /* compressed case */
- prealloc = (ret < cc->cluster_size);
-
ret = f2fs_init_compress_ctx(cc);
if (ret)
return ret;
@@ -1067,25 +1061,6 @@ release_and_retry:
}
}
- if (prealloc) {
- f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
-
- set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
-
- for (i = cc->cluster_size - 1; i > 0; i--) {
- ret = f2fs_get_block(&dn, start_idx + i);
- if (ret) {
- i = cc->cluster_size;
- break;
- }
-
- if (dn.data_blkaddr != NEW_ADDR)
- break;
- }
-
- f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
- }
-
if (likely(!ret)) {
*fsdata = cc->rpages;
*pagep = cc->rpages[offset_in_cluster(cc, index)];
@@ -1216,6 +1191,12 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
loff_t psize;
int i, err;
+ /* we should bypass data pages to proceed the kworkder jobs */
+ if (unlikely(f2fs_cp_error(sbi))) {
+ mapping_set_error(cc->rpages[0]->mapping, -EIO);
+ goto out_free;
+ }
+
if (IS_NOQUOTA(inode)) {
/*
* We need to wait for node_write to avoid block allocation during
@@ -1399,7 +1380,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
for (i = 0; i < cic->nr_rpages; i++) {
WARN_ON(!cic->rpages[i]);
- clear_cold_data(cic->rpages[i]);
+ clear_page_private_gcing(cic->rpages[i]);
end_page_writeback(cic->rpages[i]);
}
@@ -1685,6 +1666,164 @@ void f2fs_put_page_dic(struct page *page)
f2fs_put_dic(dic);
}
+const struct address_space_operations f2fs_compress_aops = {
+ .releasepage = f2fs_release_page,
+ .invalidatepage = f2fs_invalidate_page,
+};
+
+struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi)
+{
+ return sbi->compress_inode->i_mapping;
+}
+
+void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+ if (!sbi->compress_inode)
+ return;
+ invalidate_mapping_pages(COMPRESS_MAPPING(sbi), blkaddr, blkaddr);
+}
+
+void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
+ nid_t ino, block_t blkaddr)
+{
+ struct page *cpage;
+ int ret;
+
+ if (!test_opt(sbi, COMPRESS_CACHE))
+ return;
+
+ if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ))
+ return;
+
+ if (!f2fs_available_free_memory(sbi, COMPRESS_PAGE))
+ return;
+
+ cpage = find_get_page(COMPRESS_MAPPING(sbi), blkaddr);
+ if (cpage) {
+ f2fs_put_page(cpage, 0);
+ return;
+ }
+
+ cpage = alloc_page(__GFP_NOWARN | __GFP_IO);
+ if (!cpage)
+ return;
+
+ ret = add_to_page_cache_lru(cpage, COMPRESS_MAPPING(sbi),
+ blkaddr, GFP_NOFS);
+ if (ret) {
+ f2fs_put_page(cpage, 0);
+ return;
+ }
+
+ set_page_private_data(cpage, ino);
+
+ if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ))
+ goto out;
+
+ memcpy(page_address(cpage), page_address(page), PAGE_SIZE);
+ SetPageUptodate(cpage);
+out:
+ f2fs_put_page(cpage, 1);
+}
+
+bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
+ block_t blkaddr)
+{
+ struct page *cpage;
+ bool hitted = false;
+
+ if (!test_opt(sbi, COMPRESS_CACHE))
+ return false;
+
+ cpage = f2fs_pagecache_get_page(COMPRESS_MAPPING(sbi),
+ blkaddr, FGP_LOCK | FGP_NOWAIT, GFP_NOFS);
+ if (cpage) {
+ if (PageUptodate(cpage)) {
+ atomic_inc(&sbi->compress_page_hit);
+ memcpy(page_address(page),
+ page_address(cpage), PAGE_SIZE);
+ hitted = true;
+ }
+ f2fs_put_page(cpage, 1);
+ }
+
+ return hitted;
+}
+
+void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ struct address_space *mapping = sbi->compress_inode->i_mapping;
+ struct pagevec pvec;
+ pgoff_t index = 0;
+ pgoff_t end = MAX_BLKADDR(sbi);
+
+ if (!mapping->nrpages)
+ return;
+
+ pagevec_init(&pvec);
+
+ do {
+ unsigned int nr_pages;
+ int i;
+
+ nr_pages = pagevec_lookup_range(&pvec, mapping,
+ &index, end - 1);
+ if (!nr_pages)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (page->index > end)
+ break;
+
+ lock_page(page);
+ if (page->mapping != mapping) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (ino != get_page_private_data(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ generic_error_remove_page(mapping, page);
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ } while (index < end);
+}
+
+int f2fs_init_compress_inode(struct f2fs_sb_info *sbi)
+{
+ struct inode *inode;
+
+ if (!test_opt(sbi, COMPRESS_CACHE))
+ return 0;
+
+ inode = f2fs_iget(sbi->sb, F2FS_COMPRESS_INO(sbi));
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ sbi->compress_inode = inode;
+
+ sbi->compress_percent = COMPRESS_PERCENT;
+ sbi->compress_watermark = COMPRESS_WATERMARK;
+
+ atomic_set(&sbi->compress_page_hit, 0);
+
+ return 0;
+}
+
+void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi)
+{
+ if (!sbi->compress_inode)
+ return;
+ iput(sbi->compress_inode);
+ sbi->compress_inode = NULL;
+}
+
int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi)
{
dev_t dev = sbi->sb->s_bdev->bd_dev;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 009a09fb9d88..d2cf48c5a2e4 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -58,18 +58,19 @@ static bool __is_cp_guaranteed(struct page *page)
if (!mapping)
return false;
- if (f2fs_is_compressed_page(page))
- return false;
-
inode = mapping->host;
sbi = F2FS_I_SB(inode);
if (inode->i_ino == F2FS_META_INO(sbi) ||
inode->i_ino == F2FS_NODE_INO(sbi) ||
- S_ISDIR(inode->i_mode) ||
- (S_ISREG(inode->i_mode) &&
+ S_ISDIR(inode->i_mode))
+ return true;
+
+ if (f2fs_is_compressed_page(page))
+ return false;
+ if ((S_ISREG(inode->i_mode) &&
(f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) ||
- is_cold_data(page))
+ page_private_gcing(page))
return true;
return false;
}
@@ -131,7 +132,7 @@ static void f2fs_finish_read_bio(struct bio *bio)
if (f2fs_is_compressed_page(page)) {
if (bio->bi_status)
- f2fs_end_read_compressed_page(page, true);
+ f2fs_end_read_compressed_page(page, true, 0);
f2fs_put_page_dic(page);
continue;
}
@@ -227,15 +228,19 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx)
struct bio_vec *bv;
struct bvec_iter_all iter_all;
bool all_compressed = true;
+ block_t blkaddr = SECTOR_TO_BLOCK(ctx->bio->bi_iter.bi_sector);
bio_for_each_segment_all(bv, ctx->bio, iter_all) {
struct page *page = bv->bv_page;
/* PG_error was set if decryption failed. */
if (f2fs_is_compressed_page(page))
- f2fs_end_read_compressed_page(page, PageError(page));
+ f2fs_end_read_compressed_page(page, PageError(page),
+ blkaddr);
else
all_compressed = false;
+
+ blkaddr++;
}
/*
@@ -299,9 +304,8 @@ static void f2fs_write_end_io(struct bio *bio)
struct page *page = bvec->bv_page;
enum count_type type = WB_DATA_TYPE(page);
- if (IS_DUMMY_WRITTEN_PAGE(page)) {
- set_page_private(page, (unsigned long)NULL);
- ClearPagePrivate(page);
+ if (page_private_dummy(page)) {
+ clear_page_private_dummy(page);
unlock_page(page);
mempool_free(page, sbi->write_io_dummy);
@@ -331,7 +335,7 @@ static void f2fs_write_end_io(struct bio *bio)
dec_page_count(sbi, type);
if (f2fs_in_warm_node_list(sbi, page))
f2fs_del_fsync_node_entry(sbi, page);
- clear_cold_data(page);
+ clear_page_private_gcing(page);
end_page_writeback(page);
}
if (!get_pages(sbi, F2FS_WB_CP_DATA) &&
@@ -455,10 +459,11 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi,
GFP_NOIO | __GFP_NOFAIL);
f2fs_bug_on(sbi, !page);
- zero_user_segment(page, 0, PAGE_SIZE);
- SetPagePrivate(page);
- set_page_private(page, DUMMY_WRITTEN_PAGE);
lock_page(page);
+
+ zero_user_segment(page, 0, PAGE_SIZE);
+ set_page_private_dummy(page);
+
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
f2fs_bug_on(sbi, 1);
}
@@ -1351,9 +1356,11 @@ alloc:
old_blkaddr = dn->data_blkaddr;
f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr,
&sum, seg_type, NULL);
- if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
+ if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) {
invalidate_mapping_pages(META_MAPPING(sbi),
old_blkaddr, old_blkaddr);
+ f2fs_invalidate_compress_page(sbi, old_blkaddr);
+ }
f2fs_update_data_blkaddr(dn, dn->data_blkaddr);
/*
@@ -2173,7 +2180,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
goto out_put_dnode;
}
- for (i = 0; i < dic->nr_cpages; i++) {
+ for (i = 0; i < cc->nr_cpages; i++) {
struct page *page = dic->cpages[i];
block_t blkaddr;
struct bio_post_read_ctx *ctx;
@@ -2181,6 +2188,14 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
blkaddr = data_blkaddr(dn.inode, dn.node_page,
dn.ofs_in_node + i + 1);
+ f2fs_wait_on_block_writeback(inode, blkaddr);
+
+ if (f2fs_load_compressed_page(sbi, page, blkaddr)) {
+ if (atomic_dec_and_test(&dic->remaining_pages))
+ f2fs_decompress_cluster(dic);
+ continue;
+ }
+
if (bio && (!page_is_mergeable(sbi, bio,
*last_block_in_bio, blkaddr) ||
!f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) {
@@ -2202,8 +2217,6 @@ submit_and_realloc:
}
}
- f2fs_wait_on_block_writeback(inode, blkaddr);
-
if (bio_add_page(bio, page, blocksize, 0) < blocksize)
goto submit_and_realloc;
@@ -2459,6 +2472,10 @@ static inline bool check_inplace_update_policy(struct inode *inode,
bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio)
{
+ /* swap file is migrating in aligned write mode */
+ if (is_inode_flag_set(inode, FI_ALIGNED_WRITE))
+ return false;
+
if (f2fs_is_pinned_file(inode))
return true;
@@ -2481,10 +2498,15 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
return true;
if (f2fs_is_atomic_file(inode))
return true;
+
+ /* swap file is migrating in aligned write mode */
+ if (is_inode_flag_set(inode, FI_ALIGNED_WRITE))
+ return true;
+
if (fio) {
- if (is_cold_data(fio->page))
+ if (page_private_gcing(fio->page))
return true;
- if (IS_ATOMIC_WRITTEN_PAGE(fio->page))
+ if (page_private_dummy(fio->page))
return true;
if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
f2fs_is_checkpointed_data(sbi, fio->old_blkaddr)))
@@ -2540,7 +2562,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
/* This page is already truncated */
if (fio->old_blkaddr == NULL_ADDR) {
ClearPageUptodate(page);
- clear_cold_data(page);
+ clear_page_private_gcing(page);
goto out_writepage;
}
got_it:
@@ -2750,7 +2772,7 @@ out:
inode_dec_dirty_pages(inode);
if (err) {
ClearPageUptodate(page);
- clear_cold_data(page);
+ clear_page_private_gcing(page);
}
if (wbc->for_reclaim) {
@@ -3224,7 +3246,7 @@ restart:
f2fs_do_read_inline_data(page, ipage);
set_inode_flag(inode, FI_DATA_EXIST);
if (inode->i_nlink)
- set_inline_node(ipage);
+ set_page_private_inline(ipage);
} else {
err = f2fs_convert_inline_page(&dn, page);
if (err)
@@ -3615,12 +3637,20 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
}
}
- clear_cold_data(page);
+ clear_page_private_gcing(page);
+
+ if (test_opt(sbi, COMPRESS_CACHE)) {
+ if (f2fs_compressed_file(inode))
+ f2fs_invalidate_compress_pages(sbi, inode->i_ino);
+ if (inode->i_ino == F2FS_COMPRESS_INO(sbi))
+ clear_page_private_data(page);
+ }
- if (IS_ATOMIC_WRITTEN_PAGE(page))
+ if (page_private_atomic(page))
return f2fs_drop_inmem_page(inode, page);
- f2fs_clear_page_private(page);
+ detach_page_private(page);
+ set_page_private(page, 0);
}
int f2fs_release_page(struct page *page, gfp_t wait)
@@ -3630,11 +3660,23 @@ int f2fs_release_page(struct page *page, gfp_t wait)
return 0;
/* This is atomic written page, keep Private */
- if (IS_ATOMIC_WRITTEN_PAGE(page))
+ if (page_private_atomic(page))
return 0;
- clear_cold_data(page);
- f2fs_clear_page_private(page);
+ if (test_opt(F2FS_P_SB(page), COMPRESS_CACHE)) {
+ struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+ struct inode *inode = page->mapping->host;
+
+ if (f2fs_compressed_file(inode))
+ f2fs_invalidate_compress_pages(sbi, inode->i_ino);
+ if (inode->i_ino == F2FS_COMPRESS_INO(sbi))
+ clear_page_private_data(page);
+ }
+
+ clear_page_private_gcing(page);
+
+ detach_page_private(page);
+ set_page_private(page, 0);
return 1;
}
@@ -3650,7 +3692,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
return __set_page_dirty_nobuffers(page);
if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) {
- if (!IS_ATOMIC_WRITTEN_PAGE(page)) {
+ if (!page_private_atomic(page)) {
f2fs_register_inmem_page(inode, page);
return 1;
}
@@ -3742,7 +3784,7 @@ int f2fs_migrate_page(struct address_space *mapping,
{
int rc, extra_count;
struct f2fs_inode_info *fi = F2FS_I(mapping->host);
- bool atomic_written = IS_ATOMIC_WRITTEN_PAGE(page);
+ bool atomic_written = page_private_atomic(page);
BUG_ON(PageWriteback(page));
@@ -3777,9 +3819,16 @@ int f2fs_migrate_page(struct address_space *mapping,
get_page(newpage);
}
+ /* guarantee to start from no stale private field */
+ set_page_private(newpage, 0);
if (PagePrivate(page)) {
- f2fs_set_page_private(newpage, page_private(page));
- f2fs_clear_page_private(page);
+ set_page_private(newpage, page_private(page));
+ SetPagePrivate(newpage);
+ get_page(newpage);
+
+ set_page_private(page, 0);
+ ClearPagePrivate(page);
+ put_page(page);
}
if (mode != MIGRATE_SYNC_NO_COPY)
@@ -3792,67 +3841,66 @@ int f2fs_migrate_page(struct address_space *mapping,
#endif
#ifdef CONFIG_SWAP
-static int f2fs_is_file_aligned(struct inode *inode)
+static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
+ unsigned int blkcnt)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- block_t main_blkaddr = SM_I(sbi)->main_blkaddr;
- block_t cur_lblock;
- block_t last_lblock;
- block_t pblock;
- unsigned long nr_pblocks;
- unsigned int blocks_per_sec = BLKS_PER_SEC(sbi);
- unsigned int not_aligned = 0;
+ unsigned int blkofs;
+ unsigned int blk_per_sec = BLKS_PER_SEC(sbi);
+ unsigned int secidx = start_blk / blk_per_sec;
+ unsigned int end_sec = secidx + blkcnt / blk_per_sec;
int ret = 0;
- cur_lblock = 0;
- last_lblock = bytes_to_blks(inode, i_size_read(inode));
+ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ down_write(&F2FS_I(inode)->i_mmap_sem);
- while (cur_lblock < last_lblock) {
- struct f2fs_map_blocks map;
+ set_inode_flag(inode, FI_ALIGNED_WRITE);
- memset(&map, 0, sizeof(map));
- map.m_lblk = cur_lblock;
- map.m_len = last_lblock - cur_lblock;
- map.m_next_pgofs = NULL;
- map.m_next_extent = NULL;
- map.m_seg_type = NO_CHECK_TYPE;
- map.m_may_create = false;
+ for (; secidx < end_sec; secidx++) {
+ down_write(&sbi->pin_sem);
- ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP);
- if (ret)
- goto out;
+ f2fs_lock_op(sbi);
+ f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
+ f2fs_unlock_op(sbi);
- /* hole */
- if (!(map.m_flags & F2FS_MAP_FLAGS)) {
- f2fs_err(sbi, "Swapfile has holes\n");
- ret = -ENOENT;
- goto out;
- }
+ set_inode_flag(inode, FI_DO_DEFRAG);
- pblock = map.m_pblk;
- nr_pblocks = map.m_len;
+ for (blkofs = 0; blkofs < blk_per_sec; blkofs++) {
+ struct page *page;
+ unsigned int blkidx = secidx * blk_per_sec + blkofs;
- if ((pblock - main_blkaddr) & (blocks_per_sec - 1) ||
- nr_pblocks & (blocks_per_sec - 1)) {
- if (f2fs_is_pinned_file(inode)) {
- f2fs_err(sbi, "Swapfile does not align to section");
- ret = -EINVAL;
- goto out;
+ page = f2fs_get_lock_data_page(inode, blkidx, true);
+ if (IS_ERR(page)) {
+ up_write(&sbi->pin_sem);
+ ret = PTR_ERR(page);
+ goto done;
}
- not_aligned++;
+
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
}
- cur_lblock += nr_pblocks;
+ clear_inode_flag(inode, FI_DO_DEFRAG);
+
+ ret = filemap_fdatawrite(inode->i_mapping);
+
+ up_write(&sbi->pin_sem);
+
+ if (ret)
+ break;
}
- if (not_aligned)
- f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n"
- "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()",
- not_aligned);
-out:
+
+done:
+ clear_inode_flag(inode, FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_ALIGNED_WRITE);
+
+ up_write(&F2FS_I(inode)->i_mmap_sem);
+ up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+
return ret;
}
-static int check_swap_activate_fast(struct swap_info_struct *sis,
+static int check_swap_activate(struct swap_info_struct *sis,
struct file *swap_file, sector_t *span)
{
struct address_space *mapping = swap_file->f_mapping;
@@ -3865,7 +3913,8 @@ static int check_swap_activate_fast(struct swap_info_struct *sis,
sector_t highest_pblock = 0;
int nr_extents = 0;
unsigned long nr_pblocks;
- unsigned int blocks_per_sec = BLKS_PER_SEC(sbi);
+ unsigned int blks_per_sec = BLKS_PER_SEC(sbi);
+ unsigned int sec_blks_mask = BLKS_PER_SEC(sbi) - 1;
unsigned int not_aligned = 0;
int ret = 0;
@@ -3878,7 +3927,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis,
while (cur_lblock < last_lblock && cur_lblock < sis->max) {
struct f2fs_map_blocks map;
-
+retry:
cond_resched();
memset(&map, 0, sizeof(map));
@@ -3895,7 +3944,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis,
/* hole */
if (!(map.m_flags & F2FS_MAP_FLAGS)) {
- f2fs_err(sbi, "Swapfile has holes\n");
+ f2fs_err(sbi, "Swapfile has holes");
ret = -EINVAL;
goto out;
}
@@ -3903,16 +3952,28 @@ static int check_swap_activate_fast(struct swap_info_struct *sis,
pblock = map.m_pblk;
nr_pblocks = map.m_len;
- if ((pblock - SM_I(sbi)->main_blkaddr) & (blocks_per_sec - 1) ||
- nr_pblocks & (blocks_per_sec - 1)) {
- if (f2fs_is_pinned_file(inode)) {
- f2fs_err(sbi, "Swapfile does not align to section");
- ret = -EINVAL;
- goto out;
- }
+ if ((pblock - SM_I(sbi)->main_blkaddr) & sec_blks_mask ||
+ nr_pblocks & sec_blks_mask) {
not_aligned++;
- }
+ nr_pblocks = roundup(nr_pblocks, blks_per_sec);
+ if (cur_lblock + nr_pblocks > sis->max)
+ nr_pblocks -= blks_per_sec;
+
+ if (!nr_pblocks) {
+ /* this extent is last one */
+ nr_pblocks = map.m_len;
+ f2fs_warn(sbi, "Swapfile: last extent is not aligned to section");
+ goto next;
+ }
+
+ ret = f2fs_migrate_blocks(inode, cur_lblock,
+ nr_pblocks);
+ if (ret)
+ goto out;
+ goto retry;
+ }
+next:
if (cur_lblock + nr_pblocks >= sis->max)
nr_pblocks = sis->max - cur_lblock;
@@ -3939,120 +4000,11 @@ static int check_swap_activate_fast(struct swap_info_struct *sis,
sis->max = cur_lblock;
sis->pages = cur_lblock - 1;
sis->highest_bit = cur_lblock - 1;
-
- if (not_aligned)
- f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n"
- "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()",
- not_aligned);
-out:
- return ret;
-}
-
-/* Copied from generic_swapfile_activate() to check any holes */
-static int check_swap_activate(struct swap_info_struct *sis,
- struct file *swap_file, sector_t *span)
-{
- struct address_space *mapping = swap_file->f_mapping;
- struct inode *inode = mapping->host;
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- unsigned blocks_per_page;
- unsigned long page_no;
- sector_t probe_block;
- sector_t last_block;
- sector_t lowest_block = -1;
- sector_t highest_block = 0;
- int nr_extents = 0;
- int ret = 0;
-
- if (PAGE_SIZE == F2FS_BLKSIZE)
- return check_swap_activate_fast(sis, swap_file, span);
-
- ret = f2fs_is_file_aligned(inode);
- if (ret)
- goto out;
-
- blocks_per_page = bytes_to_blks(inode, PAGE_SIZE);
-
- /*
- * Map all the blocks into the extent list. This code doesn't try
- * to be very smart.
- */
- probe_block = 0;
- page_no = 0;
- last_block = bytes_to_blks(inode, i_size_read(inode));
- while ((probe_block + blocks_per_page) <= last_block &&
- page_no < sis->max) {
- unsigned block_in_page;
- sector_t first_block;
- sector_t block = 0;
-
- cond_resched();
-
- block = probe_block;
- ret = bmap(inode, &block);
- if (ret)
- goto out;
- if (!block)
- goto bad_bmap;
- first_block = block;
-
- /*
- * It must be PAGE_SIZE aligned on-disk
- */
- if (first_block & (blocks_per_page - 1)) {
- probe_block++;
- goto reprobe;
- }
-
- for (block_in_page = 1; block_in_page < blocks_per_page;
- block_in_page++) {
-
- block = probe_block + block_in_page;
- ret = bmap(inode, &block);
- if (ret)
- goto out;
- if (!block)
- goto bad_bmap;
-
- if (block != first_block + block_in_page) {
- /* Discontiguity */
- probe_block++;
- goto reprobe;
- }
- }
-
- first_block >>= (PAGE_SHIFT - inode->i_blkbits);
- if (page_no) { /* exclude the header page */
- if (first_block < lowest_block)
- lowest_block = first_block;
- if (first_block > highest_block)
- highest_block = first_block;
- }
-
- /*
- * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
- */
- ret = add_swap_extent(sis, page_no, 1, first_block);
- if (ret < 0)
- goto out;
- nr_extents += ret;
- page_no++;
- probe_block += blocks_per_page;
-reprobe:
- continue;
- }
- ret = nr_extents;
- *span = 1 + highest_block - lowest_block;
- if (page_no == 0)
- page_no = 1; /* force Empty message */
- sis->max = page_no;
- sis->pages = page_no - 1;
- sis->highest_bit = page_no - 1;
out:
+ if (not_aligned)
+ f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%u * N)",
+ not_aligned, blks_per_sec * F2FS_BLKSIZE);
return ret;
-bad_bmap:
- f2fs_err(sbi, "Swapfile has holes\n");
- return -EINVAL;
}
static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
@@ -4067,6 +4019,12 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (f2fs_readonly(F2FS_I_SB(inode)->sb))
return -EROFS;
+ if (f2fs_lfs_mode(F2FS_I_SB(inode))) {
+ f2fs_err(F2FS_I_SB(inode),
+ "Swapfile not supported in LFS mode");
+ return -EINVAL;
+ }
+
ret = f2fs_convert_inline_inode(inode);
if (ret)
return ret;
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index c03949a7ccff..833325038ef3 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -152,6 +152,12 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->node_pages = NODE_MAPPING(sbi)->nrpages;
if (sbi->meta_inode)
si->meta_pages = META_MAPPING(sbi)->nrpages;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (sbi->compress_inode) {
+ si->compress_pages = COMPRESS_MAPPING(sbi)->nrpages;
+ si->compress_page_hit = atomic_read(&sbi->compress_page_hit);
+ }
+#endif
si->nats = NM_I(sbi)->nat_cnt[TOTAL_NAT];
si->dirty_nats = NM_I(sbi)->nat_cnt[DIRTY_NAT];
si->sits = MAIN_SEGS(sbi);
@@ -309,6 +315,12 @@ get_cache:
si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
}
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (sbi->compress_inode) {
+ unsigned npages = COMPRESS_MAPPING(sbi)->nrpages;
+ si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
+ }
+#endif
}
static int stat_show(struct seq_file *s, void *v)
@@ -476,6 +488,7 @@ static int stat_show(struct seq_file *s, void *v)
"volatile IO: %4d (Max. %4d)\n",
si->inmem_pages, si->aw_cnt, si->max_aw_cnt,
si->vw_cnt, si->max_vw_cnt);
+ seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit);
seq_printf(s, " - nodes: %4d in %4d\n",
si->ndirty_node, si->node_pages);
seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n",
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index dc7ce79672b8..456651682daf 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -16,6 +16,10 @@
#include "xattr.h"
#include <trace/events/f2fs.h>
+#ifdef CONFIG_UNICODE
+extern struct kmem_cache *f2fs_cf_name_slab;
+#endif
+
static unsigned long dir_blocks(struct inode *inode)
{
return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1))
@@ -77,11 +81,10 @@ int f2fs_init_casefolded_name(const struct inode *dir,
{
#ifdef CONFIG_UNICODE
struct super_block *sb = dir->i_sb;
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
if (IS_CASEFOLDED(dir)) {
- fname->cf_name.name = f2fs_kmalloc(sbi, F2FS_NAME_LEN,
- GFP_NOFS);
+ fname->cf_name.name = kmem_cache_alloc(f2fs_cf_name_slab,
+ GFP_NOFS);
if (!fname->cf_name.name)
return -ENOMEM;
fname->cf_name.len = utf8_casefold(sb->s_encoding,
@@ -89,7 +92,7 @@ int f2fs_init_casefolded_name(const struct inode *dir,
fname->cf_name.name,
F2FS_NAME_LEN);
if ((int)fname->cf_name.len <= 0) {
- kfree(fname->cf_name.name);
+ kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name);
fname->cf_name.name = NULL;
if (sb_has_strict_encoding(sb))
return -EINVAL;
@@ -172,8 +175,10 @@ void f2fs_free_filename(struct f2fs_filename *fname)
fname->crypto_buf.name = NULL;
#endif
#ifdef CONFIG_UNICODE
- kfree(fname->cf_name.name);
- fname->cf_name.name = NULL;
+ if (fname->cf_name.name) {
+ kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name);
+ fname->cf_name.name = NULL;
+ }
#endif
}
@@ -929,11 +934,15 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
!f2fs_truncate_hole(dir, page->index, page->index + 1)) {
f2fs_clear_page_cache_dirty_tag(page);
clear_page_dirty_for_io(page);
- f2fs_clear_page_private(page);
ClearPageUptodate(page);
- clear_cold_data(page);
+
+ clear_page_private_gcing(page);
+
inode_dec_dirty_pages(dir);
f2fs_remove_dirty_inode(dir);
+
+ detach_page_private(page);
+ set_page_private(page, 0);
}
f2fs_put_page(page, 1);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index c83d90125ebd..ee8eb33e2c25 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -98,6 +98,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_ATGC 0x08000000
#define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000
#define F2FS_MOUNT_GC_MERGE 0x20000000
+#define F2FS_MOUNT_COMPRESS_CACHE 0x40000000
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -150,8 +151,10 @@ struct f2fs_mount_info {
unsigned char compress_level; /* compress level */
bool compress_chksum; /* compressed data chksum */
unsigned char compress_ext_cnt; /* extension count */
+ unsigned char nocompress_ext_cnt; /* nocompress extension count */
int compress_mode; /* compression mode */
unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */
+ unsigned char noextensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */
};
#define F2FS_FEATURE_ENCRYPT 0x0001
@@ -168,6 +171,7 @@ struct f2fs_mount_info {
#define F2FS_FEATURE_SB_CHKSUM 0x0800
#define F2FS_FEATURE_CASEFOLD 0x1000
#define F2FS_FEATURE_COMPRESSION 0x2000
+#define F2FS_FEATURE_RO 0x4000
#define __F2FS_HAS_FEATURE(raw_super, mask) \
((raw_super->feature & cpu_to_le32(mask)) != 0)
@@ -706,6 +710,8 @@ enum {
FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */
FI_MMAP_FILE, /* indicate file was mmapped */
FI_ENABLE_COMPRESS, /* enable compression in "user" compression mode */
+ FI_COMPRESS_RELEASED, /* compressed blocks were released */
+ FI_ALIGNED_WRITE, /* enable aligned write */
FI_MAX, /* max flag, never be used */
};
@@ -939,6 +945,7 @@ static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode,
#define NR_CURSEG_DATA_TYPE (3)
#define NR_CURSEG_NODE_TYPE (3)
#define NR_CURSEG_INMEM_TYPE (2)
+#define NR_CURSEG_RO_TYPE (2)
#define NR_CURSEG_PERSIST_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE)
#define NR_CURSEG_TYPE (NR_CURSEG_INMEM_TYPE + NR_CURSEG_PERSIST_TYPE)
@@ -1291,17 +1298,119 @@ enum {
*/
};
+static inline int f2fs_test_bit(unsigned int nr, char *addr);
+static inline void f2fs_set_bit(unsigned int nr, char *addr);
+static inline void f2fs_clear_bit(unsigned int nr, char *addr);
+
/*
- * this value is set in page as a private data which indicate that
- * the page is atomically written, and it is in inmem_pages list.
+ * Layout of f2fs page.private:
+ *
+ * Layout A: lowest bit should be 1
+ * | bit0 = 1 | bit1 | bit2 | ... | bit MAX | private data .... |
+ * bit 0 PAGE_PRIVATE_NOT_POINTER
+ * bit 1 PAGE_PRIVATE_ATOMIC_WRITE
+ * bit 2 PAGE_PRIVATE_DUMMY_WRITE
+ * bit 3 PAGE_PRIVATE_ONGOING_MIGRATION
+ * bit 4 PAGE_PRIVATE_INLINE_INODE
+ * bit 5 PAGE_PRIVATE_REF_RESOURCE
+ * bit 6- f2fs private data
+ *
+ * Layout B: lowest bit should be 0
+ * page.private is a wrapped pointer.
*/
-#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1)
-#define DUMMY_WRITTEN_PAGE ((unsigned long)-2)
+enum {
+ PAGE_PRIVATE_NOT_POINTER, /* private contains non-pointer data */
+ PAGE_PRIVATE_ATOMIC_WRITE, /* data page from atomic write path */
+ PAGE_PRIVATE_DUMMY_WRITE, /* data page for padding aligned IO */
+ PAGE_PRIVATE_ONGOING_MIGRATION, /* data page which is on-going migrating */
+ PAGE_PRIVATE_INLINE_INODE, /* inode page contains inline data */
+ PAGE_PRIVATE_REF_RESOURCE, /* dirty page has referenced resources */
+ PAGE_PRIVATE_MAX
+};
-#define IS_ATOMIC_WRITTEN_PAGE(page) \
- (page_private(page) == ATOMIC_WRITTEN_PAGE)
-#define IS_DUMMY_WRITTEN_PAGE(page) \
- (page_private(page) == DUMMY_WRITTEN_PAGE)
+#define PAGE_PRIVATE_GET_FUNC(name, flagname) \
+static inline bool page_private_##name(struct page *page) \
+{ \
+ return PagePrivate(page) && \
+ test_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)) && \
+ test_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \
+}
+
+#define PAGE_PRIVATE_SET_FUNC(name, flagname) \
+static inline void set_page_private_##name(struct page *page) \
+{ \
+ if (!PagePrivate(page)) { \
+ get_page(page); \
+ SetPagePrivate(page); \
+ set_page_private(page, 0); \
+ } \
+ set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); \
+ set_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \
+}
+
+#define PAGE_PRIVATE_CLEAR_FUNC(name, flagname) \
+static inline void clear_page_private_##name(struct page *page) \
+{ \
+ clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \
+ if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) { \
+ set_page_private(page, 0); \
+ if (PagePrivate(page)) { \
+ ClearPagePrivate(page); \
+ put_page(page); \
+ }\
+ } \
+}
+
+PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER);
+PAGE_PRIVATE_GET_FUNC(reference, REF_RESOURCE);
+PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE);
+PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION);
+PAGE_PRIVATE_GET_FUNC(atomic, ATOMIC_WRITE);
+PAGE_PRIVATE_GET_FUNC(dummy, DUMMY_WRITE);
+
+PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE);
+PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE);
+PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION);
+PAGE_PRIVATE_SET_FUNC(atomic, ATOMIC_WRITE);
+PAGE_PRIVATE_SET_FUNC(dummy, DUMMY_WRITE);
+
+PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE);
+PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE);
+PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION);
+PAGE_PRIVATE_CLEAR_FUNC(atomic, ATOMIC_WRITE);
+PAGE_PRIVATE_CLEAR_FUNC(dummy, DUMMY_WRITE);
+
+static inline unsigned long get_page_private_data(struct page *page)
+{
+ unsigned long data = page_private(page);
+
+ if (!test_bit(PAGE_PRIVATE_NOT_POINTER, &data))
+ return 0;
+ return data >> PAGE_PRIVATE_MAX;
+}
+
+static inline void set_page_private_data(struct page *page, unsigned long data)
+{
+ if (!PagePrivate(page)) {
+ get_page(page);
+ SetPagePrivate(page);
+ set_page_private(page, 0);
+ }
+ set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page));
+ page_private(page) |= data << PAGE_PRIVATE_MAX;
+}
+
+static inline void clear_page_private_data(struct page *page)
+{
+ page_private(page) &= (1 << PAGE_PRIVATE_MAX) - 1;
+ if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) {
+ set_page_private(page, 0);
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ put_page(page);
+ }
+ }
+}
/* For compression */
enum compress_algorithm_type {
@@ -1317,6 +1426,9 @@ enum compress_flag {
COMPRESS_MAX_FLAG,
};
+#define COMPRESS_WATERMARK 20
+#define COMPRESS_PERCENT 20
+
#define COMPRESS_DATA_RESERVED_SIZE 4
struct compress_data {
__le32 clen; /* compressed data size */
@@ -1594,6 +1706,9 @@ struct f2fs_sb_info {
struct kobject s_stat_kobj; /* /sys/fs/f2fs/<devname>/stat */
struct completion s_stat_kobj_unregister;
+ struct kobject s_feature_list_kobj; /* /sys/fs/f2fs/<devname>/feature_list */
+ struct completion s_feature_list_kobj_unregister;
+
/* For shrinker support */
struct list_head s_list;
int s_ndevs; /* number of devices */
@@ -1626,6 +1741,12 @@ struct f2fs_sb_info {
u64 compr_written_block;
u64 compr_saved_block;
u32 compr_new_inode;
+
+ /* For compressed block cache */
+ struct inode *compress_inode; /* cache compressed blocks */
+ unsigned int compress_percent; /* cache page percentage */
+ unsigned int compress_watermark; /* cache page watermark */
+ atomic_t compress_page_hit; /* cache hit count */
#endif
};
@@ -2678,6 +2799,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode,
case FI_DATA_EXIST:
case FI_INLINE_DOTS:
case FI_PIN_FILE:
+ case FI_COMPRESS_RELEASED:
f2fs_mark_inode_dirty_sync(inode, true);
}
}
@@ -2799,6 +2921,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri)
set_bit(FI_EXTRA_ATTR, fi->flags);
if (ri->i_inline & F2FS_PIN_FILE)
set_bit(FI_PIN_FILE, fi->flags);
+ if (ri->i_inline & F2FS_COMPRESS_RELEASED)
+ set_bit(FI_COMPRESS_RELEASED, fi->flags);
}
static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri)
@@ -2819,6 +2943,8 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri)
ri->i_inline |= F2FS_EXTRA_ATTR;
if (is_inode_flag_set(inode, FI_PIN_FILE))
ri->i_inline |= F2FS_PIN_FILE;
+ if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED))
+ ri->i_inline |= F2FS_COMPRESS_RELEASED;
}
static inline int f2fs_has_extra_attr(struct inode *inode)
@@ -3027,25 +3153,6 @@ static inline bool is_dot_dotdot(const u8 *name, size_t len)
return false;
}
-static inline bool f2fs_may_extent_tree(struct inode *inode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-
- if (!test_opt(sbi, EXTENT_CACHE) ||
- is_inode_flag_set(inode, FI_NO_EXTENT) ||
- is_inode_flag_set(inode, FI_COMPRESSED_FILE))
- return false;
-
- /*
- * for recovered files during mount do not create extents
- * if shrinker is not registered.
- */
- if (list_empty(&sbi->s_list))
- return false;
-
- return S_ISREG(inode->i_mode);
-}
-
static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
size_t size, gfp_t flags)
{
@@ -3169,20 +3276,6 @@ static inline bool __is_valid_data_blkaddr(block_t blkaddr)
return true;
}
-static inline void f2fs_set_page_private(struct page *page,
- unsigned long data)
-{
- if (PagePrivate(page))
- return;
-
- attach_page_private(page, (void *)data);
-}
-
-static inline void f2fs_clear_page_private(struct page *page)
-{
- detach_page_private(page);
-}
-
/*
* file.c
*/
@@ -3566,6 +3659,8 @@ void f2fs_destroy_garbage_collection_cache(void);
*/
int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only);
bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi);
+int __init f2fs_create_recovery_cache(void);
+void f2fs_destroy_recovery_cache(void);
/*
* debug.c
@@ -3604,7 +3699,8 @@ struct f2fs_stat_info {
unsigned int bimodal, avg_vblocks;
int util_free, util_valid, util_invalid;
int rsvd_segs, overp_segs;
- int dirty_count, node_pages, meta_pages;
+ int dirty_count, node_pages, meta_pages, compress_pages;
+ int compress_page_hit;
int prefree_count, call_count, cp_count, bg_cp_count;
int tot_segs, node_segs, data_segs, free_segs, free_secs;
int bg_node_segs, bg_data_segs;
@@ -3940,7 +4036,9 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page);
bool f2fs_is_compress_backend_ready(struct inode *inode);
int f2fs_init_compress_mempool(void);
void f2fs_destroy_compress_mempool(void);
-void f2fs_end_read_compressed_page(struct page *page, bool failed);
+void f2fs_decompress_cluster(struct decompress_io_ctx *dic);
+void f2fs_end_read_compressed_page(struct page *page, bool failed,
+ block_t blkaddr);
bool f2fs_cluster_is_empty(struct compress_ctx *cc);
bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page);
@@ -3958,10 +4056,19 @@ void f2fs_put_page_dic(struct page *page);
int f2fs_init_compress_ctx(struct compress_ctx *cc);
void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse);
void f2fs_init_compress_info(struct f2fs_sb_info *sbi);
+int f2fs_init_compress_inode(struct f2fs_sb_info *sbi);
+void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi);
int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi);
void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi);
int __init f2fs_init_compress_cache(void);
void f2fs_destroy_compress_cache(void);
+struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi);
+void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, block_t blkaddr);
+void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
+ nid_t ino, block_t blkaddr);
+bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
+ block_t blkaddr);
+void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino);
#define inc_compr_inode_stat(inode) \
do { \
struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \
@@ -3990,7 +4097,9 @@ static inline struct page *f2fs_compress_control_page(struct page *page)
}
static inline int f2fs_init_compress_mempool(void) { return 0; }
static inline void f2fs_destroy_compress_mempool(void) { }
-static inline void f2fs_end_read_compressed_page(struct page *page, bool failed)
+static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { }
+static inline void f2fs_end_read_compressed_page(struct page *page,
+ bool failed, block_t blkaddr)
{
WARN_ON_ONCE(1);
}
@@ -3998,10 +4107,20 @@ static inline void f2fs_put_page_dic(struct page *page)
{
WARN_ON_ONCE(1);
}
+static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; }
+static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { }
static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { }
static inline int __init f2fs_init_compress_cache(void) { return 0; }
static inline void f2fs_destroy_compress_cache(void) { }
+static inline void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi,
+ block_t blkaddr) { }
+static inline void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi,
+ struct page *page, nid_t ino, block_t blkaddr) { }
+static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi,
+ struct page *page, block_t blkaddr) { return false; }
+static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi,
+ nid_t ino) { }
#define inc_compr_inode_stat(inode) do { } while (0)
#endif
@@ -4066,6 +4185,27 @@ F2FS_FEATURE_FUNCS(verity, VERITY);
F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM);
F2FS_FEATURE_FUNCS(casefold, CASEFOLD);
F2FS_FEATURE_FUNCS(compression, COMPRESSION);
+F2FS_FEATURE_FUNCS(readonly, RO);
+
+static inline bool f2fs_may_extent_tree(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (!test_opt(sbi, EXTENT_CACHE) ||
+ is_inode_flag_set(inode, FI_NO_EXTENT) ||
+ (is_inode_flag_set(inode, FI_COMPRESSED_FILE) &&
+ !f2fs_sb_has_readonly(sbi)))
+ return false;
+
+ /*
+ * for recovered files during mount do not create extents
+ * if shrinker is not registered.
+ */
+ if (list_empty(&sbi->s_list))
+ return false;
+
+ return S_ISREG(inode->i_mode);
+}
#ifdef CONFIG_BLK_DEV_ZONED
static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi,
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index ceb575f99048..6afd4562335f 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -63,6 +63,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
if (unlikely(IS_IMMUTABLE(inode)))
return VM_FAULT_SIGBUS;
+ if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED))
+ return VM_FAULT_SIGBUS;
+
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
goto err;
@@ -85,10 +88,6 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
err = ret;
goto err;
} else if (ret) {
- if (ret < F2FS_I(inode)->i_cluster_size) {
- err = -EAGAIN;
- goto err;
- }
need_alloc = false;
}
}
@@ -117,7 +116,6 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = f2fs_get_block(&dn, page->index);
- f2fs_put_dnode(&dn);
f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
}
@@ -3203,7 +3201,7 @@ int f2fs_precache_extents(struct inode *inode)
map.m_lblk = m_next_extent;
}
- return err;
+ return 0;
}
static int f2fs_ioc_precache_extents(struct file *filp, unsigned long arg)
@@ -3237,7 +3235,7 @@ static int f2fs_ioc_enable_verity(struct file *filp, unsigned long arg)
if (!f2fs_sb_has_verity(F2FS_I_SB(inode))) {
f2fs_warn(F2FS_I_SB(inode),
- "Can't enable fs-verity on inode %lu: the verity feature is not enabled on this filesystem.\n",
+ "Can't enable fs-verity on inode %lu: the verity feature is not enabled on this filesystem",
inode->i_ino);
return -EOPNOTSUPP;
}
@@ -3425,7 +3423,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
goto out;
}
- if (IS_IMMUTABLE(inode)) {
+ if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
ret = -EINVAL;
goto out;
}
@@ -3434,8 +3432,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
if (ret)
goto out;
- F2FS_I(inode)->i_flags |= F2FS_IMMUTABLE_FL;
- f2fs_set_inode_flags(inode);
+ set_inode_flag(inode, FI_COMPRESS_RELEASED);
inode->i_ctime = current_time(inode);
f2fs_mark_inode_dirty_sync(inode, true);
@@ -3590,7 +3587,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
inode_lock(inode);
- if (!IS_IMMUTABLE(inode)) {
+ if (!is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
ret = -EINVAL;
goto unlock_inode;
}
@@ -3635,8 +3632,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
up_write(&F2FS_I(inode)->i_mmap_sem);
if (ret >= 0) {
- F2FS_I(inode)->i_flags &= ~F2FS_IMMUTABLE_FL;
- f2fs_set_inode_flags(inode);
+ clear_inode_flag(inode, FI_COMPRESS_RELEASED);
inode->i_ctime = current_time(inode);
f2fs_mark_inode_dirty_sync(inode, true);
}
@@ -4023,9 +4019,8 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg)
LLONG_MAX);
if (ret)
- f2fs_warn(sbi, "%s: The file might be partially decompressed "
- "(errno=%d). Please delete the file.\n",
- __func__, ret);
+ f2fs_warn(sbi, "%s: The file might be partially decompressed (errno=%d). Please delete the file.",
+ __func__, ret);
out:
inode_unlock(inode);
file_end_write(filp);
@@ -4097,9 +4092,8 @@ static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg)
clear_inode_flag(inode, FI_ENABLE_COMPRESS);
if (ret)
- f2fs_warn(sbi, "%s: The file might be partially compressed "
- "(errno=%d). Please delete the file.\n",
- __func__, ret);
+ f2fs_warn(sbi, "%s: The file might be partially compressed (errno=%d). Please delete the file.",
+ __func__, ret);
out:
inode_unlock(inode);
file_end_write(filp);
@@ -4254,6 +4248,11 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto unlock;
}
+ if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
+ ret = -EPERM;
+ goto unlock;
+ }
+
ret = generic_write_checks(iocb, from);
if (ret > 0) {
bool preallocated = false;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 8d1f17ab94d8..0e42ee5f7770 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1031,8 +1031,8 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
if (unlikely(check_valid_map(sbi, segno, offset))) {
if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) {
- f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u\n",
- blkaddr, source_blkaddr, segno);
+ f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u",
+ blkaddr, source_blkaddr, segno);
f2fs_bug_on(sbi, 1);
}
}
@@ -1261,6 +1261,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
f2fs_put_page(mpage, 1);
invalidate_mapping_pages(META_MAPPING(fio.sbi),
fio.old_blkaddr, fio.old_blkaddr);
+ f2fs_invalidate_compress_page(fio.sbi, fio.old_blkaddr);
set_page_dirty(fio.encrypted_page);
if (clear_page_dirty_for_io(fio.encrypted_page))
@@ -1336,7 +1337,7 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
goto out;
}
set_page_dirty(page);
- set_cold_data(page);
+ set_page_private_gcing(page);
} else {
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(inode),
@@ -1362,11 +1363,11 @@ retry:
f2fs_remove_dirty_inode(inode);
}
- set_cold_data(page);
+ set_page_private_gcing(page);
err = f2fs_do_write_data_page(&fio);
if (err) {
- clear_cold_data(page);
+ clear_page_private_gcing(page);
if (err == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC,
DEFAULT_IO_TIMEOUT);
@@ -1450,10 +1451,8 @@ next_step:
if (phase == 3) {
inode = f2fs_iget(sb, dni.ino);
- if (IS_ERR(inode) || is_bad_inode(inode)) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
+ if (IS_ERR(inode) || is_bad_inode(inode))
continue;
- }
if (!down_write_trylock(
&F2FS_I(inode)->i_gc_rwsem[WRITE])) {
@@ -1822,6 +1821,7 @@ static void init_atgc_management(struct f2fs_sb_info *sbi)
am->candidate_ratio = DEF_GC_THREAD_CANDIDATE_RATIO;
am->max_candidate_count = DEF_GC_THREAD_MAX_CANDIDATE_COUNT;
am->age_weight = DEF_GC_THREAD_AGE_WEIGHT;
+ am->age_threshold = DEF_GC_THREAD_AGE_THRESHOLD;
}
void f2fs_build_gc_manager(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 92652ca7a7c8..56a20d5c15da 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -173,7 +173,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
/* clear inline data and flag after data writeback */
f2fs_truncate_inline_inode(dn->inode, dn->inode_page, 0);
- clear_inline_node(dn->inode_page);
+ clear_page_private_inline(dn->inode_page);
clear_out:
stat_dec_inline_inode(dn->inode);
clear_inode_flag(dn->inode, FI_INLINE_DATA);
@@ -255,7 +255,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
set_inode_flag(inode, FI_APPEND_WRITE);
set_inode_flag(inode, FI_DATA_EXIST);
- clear_inline_node(dn.inode_page);
+ clear_page_private_inline(dn.inode_page);
f2fs_put_dnode(&dn);
return 0;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index b401f08569f7..9141147b5bb0 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -18,6 +18,10 @@
#include <trace/events/f2fs.h>
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+extern const struct address_space_operations f2fs_compress_aops;
+#endif
+
void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync)
{
if (is_inode_flag_set(inode, FI_NEW_INODE))
@@ -494,6 +498,11 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi))
goto make_now;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (ino == F2FS_COMPRESS_INO(sbi))
+ goto make_now;
+#endif
+
ret = do_read_inode(inode);
if (ret)
goto bad_inode;
@@ -504,6 +513,12 @@ make_now:
} else if (ino == F2FS_META_INO(sbi)) {
inode->i_mapping->a_ops = &f2fs_meta_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+ } else if (ino == F2FS_COMPRESS_INO(sbi)) {
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ inode->i_mapping->a_ops = &f2fs_compress_aops;
+#endif
+ mapping_set_gfp_mask(inode->i_mapping,
+ GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE);
} else if (S_ISREG(inode->i_mode)) {
inode->i_op = &f2fs_file_inode_operations;
inode->i_fop = &f2fs_file_operations;
@@ -646,7 +661,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
/* deleted inode */
if (inode->i_nlink == 0)
- clear_inline_node(node_page);
+ clear_page_private_inline(node_page);
F2FS_I(inode)->i_disk_time[0] = inode->i_atime;
F2FS_I(inode)->i_disk_time[1] = inode->i_ctime;
@@ -723,8 +738,12 @@ void f2fs_evict_inode(struct inode *inode)
trace_f2fs_evict_inode(inode);
truncate_inode_pages_final(&inode->i_data);
+ if (test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode))
+ f2fs_invalidate_compress_pages(sbi, inode->i_ino);
+
if (inode->i_ino == F2FS_NODE_INO(sbi) ||
- inode->i_ino == F2FS_META_INO(sbi))
+ inode->i_ino == F2FS_META_INO(sbi) ||
+ inode->i_ino == F2FS_COMPRESS_INO(sbi))
goto out_clear;
f2fs_bug_on(sbi, get_dirty_pages(inode));
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index a9cd9cf97229..e149c8c66a71 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -153,7 +153,8 @@ fail_drop:
return ERR_PTR(err);
}
-static inline int is_extension_exist(const unsigned char *s, const char *sub)
+static inline int is_extension_exist(const unsigned char *s, const char *sub,
+ bool tmp_ext)
{
size_t slen = strlen(s);
size_t sublen = strlen(sub);
@@ -169,6 +170,13 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub)
if (slen < sublen + 2)
return 0;
+ if (!tmp_ext) {
+ /* file has no temp extension */
+ if (s[slen - sublen - 1] != '.')
+ return 0;
+ return !strncasecmp(s + slen - sublen, sub, sublen);
+ }
+
for (i = 1; i < slen - sublen; i++) {
if (s[i] != '.')
continue;
@@ -194,7 +202,7 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *
hot_count = sbi->raw_super->hot_ext_count;
for (i = 0; i < cold_count + hot_count; i++) {
- if (is_extension_exist(name, extlist[i]))
+ if (is_extension_exist(name, extlist[i], true))
break;
}
@@ -279,14 +287,16 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
const unsigned char *name)
{
__u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
- unsigned char (*ext)[F2FS_EXTENSION_LEN];
- unsigned int ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
+ unsigned char (*noext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).noextensions;
+ unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions;
+ unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
+ unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt;
int i, cold_count, hot_count;
if (!f2fs_sb_has_compression(sbi) ||
- is_inode_flag_set(inode, FI_COMPRESSED_FILE) ||
F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL ||
- !f2fs_may_compress(inode))
+ !f2fs_may_compress(inode) ||
+ (!ext_cnt && !noext_cnt))
return;
down_read(&sbi->sb_lock);
@@ -295,7 +305,7 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
hot_count = sbi->raw_super->hot_ext_count;
for (i = cold_count; i < cold_count + hot_count; i++) {
- if (is_extension_exist(name, extlist[i])) {
+ if (is_extension_exist(name, extlist[i], false)) {
up_read(&sbi->sb_lock);
return;
}
@@ -303,10 +313,18 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
up_read(&sbi->sb_lock);
- ext = F2FS_OPTION(sbi).extensions;
+ for (i = 0; i < noext_cnt; i++) {
+ if (is_extension_exist(name, noext[i], false)) {
+ f2fs_disable_compressed_file(inode);
+ return;
+ }
+ }
+
+ if (is_inode_flag_set(inode, FI_COMPRESSED_FILE))
+ return;
for (i = 0; i < ext_cnt; i++) {
- if (!is_extension_exist(name, ext[i]))
+ if (!is_extension_exist(name, ext[i], false))
continue;
set_compress_context(inode);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index e67ce5f13b98..0be9e2d7120e 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -97,6 +97,20 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
mem_size = (atomic_read(&dcc->discard_cmd_cnt) *
sizeof(struct discard_cmd)) >> PAGE_SHIFT;
res = mem_size < (avail_ram * nm_i->ram_thresh / 100);
+ } else if (type == COMPRESS_PAGE) {
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ unsigned long free_ram = val.freeram;
+
+ /*
+ * free memory is lower than watermark or cached page count
+ * exceed threshold, deny caching compress page.
+ */
+ res = (free_ram > avail_ram * sbi->compress_watermark / 100) &&
+ (COMPRESS_MAPPING(sbi)->nrpages <
+ free_ram * sbi->compress_percent / 100);
+#else
+ res = false;
+#endif
} else {
if (!sbi->sb->s_bdi->wb.dirty_exceeded)
return true;
@@ -1535,13 +1549,10 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
trace_f2fs_writepage(page, NODE);
if (unlikely(f2fs_cp_error(sbi))) {
- if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) {
- ClearPageUptodate(page);
- dec_page_count(sbi, F2FS_DIRTY_NODES);
- unlock_page(page);
- return 0;
- }
- goto redirty_out;
+ ClearPageUptodate(page);
+ dec_page_count(sbi, F2FS_DIRTY_NODES);
+ unlock_page(page);
+ return 0;
}
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
@@ -1860,8 +1871,8 @@ continue_unlock:
}
/* flush inline_data, if it's async context. */
- if (is_inline_node(page)) {
- clear_inline_node(page);
+ if (page_private_inline(page)) {
+ clear_page_private_inline(page);
unlock_page(page);
flush_inline_data(sbi, ino_of_node(page));
continue;
@@ -1941,8 +1952,8 @@ continue_unlock:
goto write_node;
/* flush inline_data */
- if (is_inline_node(page)) {
- clear_inline_node(page);
+ if (page_private_inline(page)) {
+ clear_page_private_inline(page);
unlock_page(page);
flush_inline_data(sbi, ino_of_node(page));
goto lock_node;
@@ -2096,7 +2107,7 @@ static int f2fs_set_node_page_dirty(struct page *page)
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
- f2fs_set_page_private(page, 0);
+ set_page_private_reference(page);
return 1;
}
return 0;
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 7a45c0f10629..ff14a6e5ac1c 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -38,6 +38,9 @@
/* return value for read_node_page */
#define LOCKED_PAGE 1
+/* check pinned file's alignment status of physical blocks */
+#define FILE_NOT_ALIGNED 1
+
/* For flag in struct node_info */
enum {
IS_CHECKPOINTED, /* is it checkpointed before? */
@@ -148,6 +151,7 @@ enum mem_type {
EXTENT_CACHE, /* indicates extent cache */
INMEM_PAGES, /* indicates inmemory pages */
DISCARD_CACHE, /* indicates memory of cached discard cmds */
+ COMPRESS_PAGE, /* indicates memory of cached compressed pages */
BASE_CHECK, /* check kernel status */
};
@@ -389,20 +393,6 @@ static inline nid_t get_nid(struct page *p, int off, bool i)
* - Mark cold node blocks in their node footer
* - Mark cold data pages in page cache
*/
-static inline int is_cold_data(struct page *page)
-{
- return PageChecked(page);
-}
-
-static inline void set_cold_data(struct page *page)
-{
- SetPageChecked(page);
-}
-
-static inline void clear_cold_data(struct page *page)
-{
- ClearPageChecked(page);
-}
static inline int is_node(struct page *page, int type)
{
@@ -414,21 +404,6 @@ static inline int is_node(struct page *page, int type)
#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT)
#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT)
-static inline int is_inline_node(struct page *page)
-{
- return PageChecked(page);
-}
-
-static inline void set_inline_node(struct page *page)
-{
- SetPageChecked(page);
-}
-
-static inline void clear_inline_node(struct page *page)
-{
- ClearPageChecked(page);
-}
-
static inline void set_cold_node(struct page *page, bool is_dir)
{
struct f2fs_node *rn = F2FS_NODE(page);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 422146c6d866..695eacfe776c 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -45,6 +45,10 @@
static struct kmem_cache *fsync_entry_slab;
+#ifdef CONFIG_UNICODE
+extern struct kmem_cache *f2fs_cf_name_slab;
+#endif
+
bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi)
{
s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count);
@@ -145,7 +149,7 @@ static int init_recovered_filename(const struct inode *dir,
f2fs_hash_filename(dir, fname);
#ifdef CONFIG_UNICODE
/* Case-sensitive match is fine for recovery */
- kfree(fname->cf_name.name);
+ kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name);
fname->cf_name.name = NULL;
#endif
} else {
@@ -788,13 +792,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY);
#endif
- fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
- sizeof(struct fsync_inode_entry));
- if (!fsync_entry_slab) {
- err = -ENOMEM;
- goto out;
- }
-
INIT_LIST_HEAD(&inode_list);
INIT_LIST_HEAD(&tmp_inode_list);
INIT_LIST_HEAD(&dir_list);
@@ -867,8 +864,6 @@ skip:
}
}
- kmem_cache_destroy(fsync_entry_slab);
-out:
#ifdef CONFIG_QUOTA
/* Turn quotas off */
if (quota_enabled)
@@ -878,3 +873,17 @@ out:
return ret ? ret : err;
}
+
+int __init f2fs_create_recovery_cache(void)
+{
+ fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
+ sizeof(struct fsync_inode_entry));
+ if (!fsync_entry_slab)
+ return -ENOMEM;
+ return 0;
+}
+
+void f2fs_destroy_recovery_cache(void)
+{
+ kmem_cache_destroy(fsync_entry_slab);
+}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 51dc79fad4fe..15cc89eef28d 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -186,10 +186,7 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page)
{
struct inmem_pages *new;
- if (PagePrivate(page))
- set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE);
- else
- f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE);
+ set_page_private_atomic(page);
new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
@@ -272,9 +269,10 @@ next:
/* we don't need to invalidate this in the sccessful status */
if (drop || recover) {
ClearPageUptodate(page);
- clear_cold_data(page);
+ clear_page_private_gcing(page);
}
- f2fs_clear_page_private(page);
+ detach_page_private(page);
+ set_page_private(page, 0);
f2fs_put_page(page, 1);
list_del(&cur->list);
@@ -357,7 +355,7 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page)
struct list_head *head = &fi->inmem_pages;
struct inmem_pages *cur = NULL;
- f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page));
+ f2fs_bug_on(sbi, !page_private_atomic(page));
mutex_lock(&fi->inmem_lock);
list_for_each_entry(cur, head, list) {
@@ -373,9 +371,12 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page)
kmem_cache_free(inmem_entry_slab, cur);
ClearPageUptodate(page);
- f2fs_clear_page_private(page);
+ clear_page_private_atomic(page);
f2fs_put_page(page, 0);
+ detach_page_private(page);
+ set_page_private(page, 0);
+
trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE);
}
@@ -2321,6 +2322,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
return;
invalidate_mapping_pages(META_MAPPING(sbi), addr, addr);
+ f2fs_invalidate_compress_page(sbi, addr);
/* add it into sit main buffer */
down_write(&sit_i->sentry_lock);
@@ -3289,7 +3291,10 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
if (fio->type == DATA) {
struct inode *inode = fio->page->mapping->host;
- if (is_cold_data(fio->page)) {
+ if (is_inode_flag_set(inode, FI_ALIGNED_WRITE))
+ return CURSEG_COLD_DATA_PINNED;
+
+ if (page_private_gcing(fio->page)) {
if (fio->sbi->am.atgc_enabled &&
(fio->io_type == FS_DATA_IO) &&
(fio->sbi->gc_mode != GC_URGENT_HIGH))
@@ -3468,9 +3473,11 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
reallocate:
f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
&fio->new_blkaddr, sum, type, fio);
- if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO)
+ if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) {
invalidate_mapping_pages(META_MAPPING(fio->sbi),
fio->old_blkaddr, fio->old_blkaddr);
+ f2fs_invalidate_compress_page(fio->sbi, fio->old_blkaddr);
+ }
/* writeout dirty page into bdev */
f2fs_submit_page_write(fio);
@@ -3660,6 +3667,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) {
invalidate_mapping_pages(META_MAPPING(sbi),
old_blkaddr, old_blkaddr);
+ f2fs_invalidate_compress_page(sbi, old_blkaddr);
if (!from_gc)
update_segment_mtime(sbi, old_blkaddr, 0);
update_sit_entry(sbi, old_blkaddr, -1);
@@ -3919,7 +3927,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
/* sanity check for summary blocks */
if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES ||
sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) {
- f2fs_err(sbi, "invalid journal entries nats %u sits %u\n",
+ f2fs_err(sbi, "invalid journal entries nats %u sits %u",
nats_in_cursum(nat_j), sits_in_cursum(sit_j));
return -EINVAL;
}
@@ -4682,6 +4690,10 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi)
struct seg_entry *se = get_seg_entry(sbi, curseg->segno);
unsigned int blkofs = curseg->next_blkoff;
+ if (f2fs_sb_has_readonly(sbi) &&
+ i != CURSEG_HOT_DATA && i != CURSEG_HOT_NODE)
+ continue;
+
sanity_check_seg_type(sbi, curseg->seg_type);
if (f2fs_test_bit(blkofs, se->cur_valid_map))
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 7d325bfaf65a..8fecd3050ccd 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -148,8 +148,10 @@ enum {
Opt_compress_algorithm,
Opt_compress_log_size,
Opt_compress_extension,
+ Opt_nocompress_extension,
Opt_compress_chksum,
Opt_compress_mode,
+ Opt_compress_cache,
Opt_atgc,
Opt_gc_merge,
Opt_nogc_merge,
@@ -222,8 +224,10 @@ static match_table_t f2fs_tokens = {
{Opt_compress_algorithm, "compress_algorithm=%s"},
{Opt_compress_log_size, "compress_log_size=%u"},
{Opt_compress_extension, "compress_extension=%s"},
+ {Opt_nocompress_extension, "nocompress_extension=%s"},
{Opt_compress_chksum, "compress_chksum"},
{Opt_compress_mode, "compress_mode=%s"},
+ {Opt_compress_cache, "compress_cache"},
{Opt_atgc, "atgc"},
{Opt_gc_merge, "gc_merge"},
{Opt_nogc_merge, "nogc_merge"},
@@ -275,6 +279,24 @@ static int f2fs_sb_read_encoding(const struct f2fs_super_block *sb,
return 0;
}
+
+struct kmem_cache *f2fs_cf_name_slab;
+static int __init f2fs_create_casefold_cache(void)
+{
+ f2fs_cf_name_slab = f2fs_kmem_cache_create("f2fs_casefolded_name",
+ F2FS_NAME_LEN);
+ if (!f2fs_cf_name_slab)
+ return -ENOMEM;
+ return 0;
+}
+
+static void f2fs_destroy_casefold_cache(void)
+{
+ kmem_cache_destroy(f2fs_cf_name_slab);
+}
+#else
+static int __init f2fs_create_casefold_cache(void) { return 0; }
+static void f2fs_destroy_casefold_cache(void) { }
#endif
static inline void limit_reserve_root(struct f2fs_sb_info *sbi)
@@ -473,6 +495,43 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
+/*
+ * 1. The same extension name cannot not appear in both compress and non-compress extension
+ * at the same time.
+ * 2. If the compress extension specifies all files, the types specified by the non-compress
+ * extension will be treated as special cases and will not be compressed.
+ * 3. Don't allow the non-compress extension specifies all files.
+ */
+static int f2fs_test_compress_extension(struct f2fs_sb_info *sbi)
+{
+ unsigned char (*ext)[F2FS_EXTENSION_LEN];
+ unsigned char (*noext)[F2FS_EXTENSION_LEN];
+ int ext_cnt, noext_cnt, index = 0, no_index = 0;
+
+ ext = F2FS_OPTION(sbi).extensions;
+ ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
+ noext = F2FS_OPTION(sbi).noextensions;
+ noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt;
+
+ if (!noext_cnt)
+ return 0;
+
+ for (no_index = 0; no_index < noext_cnt; no_index++) {
+ if (!strcasecmp("*", noext[no_index])) {
+ f2fs_info(sbi, "Don't allow the nocompress extension specifies all files");
+ return -EINVAL;
+ }
+ for (index = 0; index < ext_cnt; index++) {
+ if (!strcasecmp(ext[index], noext[no_index])) {
+ f2fs_info(sbi, "Don't allow the same extension %s appear in both compress and nocompress extension",
+ ext[index]);
+ return -EINVAL;
+ }
+ }
+ }
+ return 0;
+}
+
#ifdef CONFIG_F2FS_FS_LZ4
static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str)
{
@@ -546,7 +605,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
substring_t args[MAX_OPT_ARGS];
#ifdef CONFIG_F2FS_FS_COMPRESSION
unsigned char (*ext)[F2FS_EXTENSION_LEN];
- int ext_cnt;
+ unsigned char (*noext)[F2FS_EXTENSION_LEN];
+ int ext_cnt, noext_cnt;
#endif
char *p, *name;
int arg = 0;
@@ -555,7 +615,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
int ret;
if (!options)
- return 0;
+ goto default_check;
while ((p = strsep(&options, ",")) != NULL) {
int token;
@@ -1049,6 +1109,30 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
F2FS_OPTION(sbi).compress_ext_cnt++;
kfree(name);
break;
+ case Opt_nocompress_extension:
+ if (!f2fs_sb_has_compression(sbi)) {
+ f2fs_info(sbi, "Image doesn't support compression");
+ break;
+ }
+ name = match_strdup(&args[0]);
+ if (!name)
+ return -ENOMEM;
+
+ noext = F2FS_OPTION(sbi).noextensions;
+ noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt;
+
+ if (strlen(name) >= F2FS_EXTENSION_LEN ||
+ noext_cnt >= COMPRESS_EXT_NUM) {
+ f2fs_err(sbi,
+ "invalid extension length/number");
+ kfree(name);
+ return -EINVAL;
+ }
+
+ strcpy(noext[noext_cnt], name);
+ F2FS_OPTION(sbi).nocompress_ext_cnt++;
+ kfree(name);
+ break;
case Opt_compress_chksum:
F2FS_OPTION(sbi).compress_chksum = true;
break;
@@ -1066,12 +1150,17 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
}
kfree(name);
break;
+ case Opt_compress_cache:
+ set_opt(sbi, COMPRESS_CACHE);
+ break;
#else
case Opt_compress_algorithm:
case Opt_compress_log_size:
case Opt_compress_extension:
+ case Opt_nocompress_extension:
case Opt_compress_chksum:
case Opt_compress_mode:
+ case Opt_compress_cache:
f2fs_info(sbi, "compression options not supported");
break;
#endif
@@ -1090,6 +1179,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
return -EINVAL;
}
}
+default_check:
#ifdef CONFIG_QUOTA
if (f2fs_check_quota_options(sbi))
return -EINVAL;
@@ -1122,6 +1212,13 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
}
#endif
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (f2fs_test_compress_extension(sbi)) {
+ f2fs_err(sbi, "invalid compress or nocompress extension");
+ return -EINVAL;
+ }
+#endif
+
if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) {
f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO",
F2FS_IO_SIZE_KB(sbi));
@@ -1153,7 +1250,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
}
if (test_opt(sbi, DISABLE_CHECKPOINT) && f2fs_lfs_mode(sbi)) {
- f2fs_err(sbi, "LFS not compatible with checkpoint=disable\n");
+ f2fs_err(sbi, "LFS not compatible with checkpoint=disable");
return -EINVAL;
}
@@ -1162,6 +1259,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
*/
if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE)
F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
+
+ if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) {
+ f2fs_err(sbi, "Allow to mount readonly mode only");
+ return -EROFS;
+ }
return 0;
}
@@ -1403,6 +1505,8 @@ static void f2fs_put_super(struct super_block *sb)
f2fs_bug_on(sbi, sbi->fsync_node_num);
+ f2fs_destroy_compress_inode(sbi);
+
iput(sbi->node_inode);
sbi->node_inode = NULL;
@@ -1665,6 +1769,11 @@ static inline void f2fs_show_compress_options(struct seq_file *seq,
F2FS_OPTION(sbi).extensions[i]);
}
+ for (i = 0; i < F2FS_OPTION(sbi).nocompress_ext_cnt; i++) {
+ seq_printf(seq, ",nocompress_extension=%s",
+ F2FS_OPTION(sbi).noextensions[i]);
+ }
+
if (F2FS_OPTION(sbi).compress_chksum)
seq_puts(seq, ",compress_chksum");
@@ -1672,6 +1781,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq,
seq_printf(seq, ",compress_mode=%s", "fs");
else if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER)
seq_printf(seq, ",compress_mode=%s", "user");
+
+ if (test_opt(sbi, COMPRESS_CACHE))
+ seq_puts(seq, ",compress_cache");
}
#endif
@@ -1819,7 +1931,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
static void default_options(struct f2fs_sb_info *sbi)
{
/* init some FS parameters */
- F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE;
+ if (f2fs_sb_has_readonly(sbi))
+ F2FS_OPTION(sbi).active_logs = NR_CURSEG_RO_TYPE;
+ else
+ F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE;
+
F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
@@ -1949,6 +2065,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT);
bool no_io_align = !F2FS_IO_ALIGNED(sbi);
bool no_atgc = !test_opt(sbi, ATGC);
+ bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE);
bool checkpoint_changed;
#ifdef CONFIG_QUOTA
int i, j;
@@ -2004,6 +2121,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
if (f2fs_readonly(sb) && (*flags & SB_RDONLY))
goto skip;
+ if (f2fs_sb_has_readonly(sbi) && !(*flags & SB_RDONLY)) {
+ err = -EROFS;
+ goto restore_opts;
+ }
+
#ifdef CONFIG_QUOTA
if (!f2fs_readonly(sb) && (*flags & SB_RDONLY)) {
err = dquot_suspend(sb, -1);
@@ -2041,6 +2163,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
+ if (no_compress_cache == !!test_opt(sbi, COMPRESS_CACHE)) {
+ err = -EINVAL;
+ f2fs_warn(sbi, "switch compress_cache option is not allowed");
+ goto restore_opts;
+ }
+
if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) {
err = -EINVAL;
f2fs_warn(sbi, "disabling checkpoint not compatible with read-only");
@@ -3137,14 +3265,15 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi)
ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count);
- if (unlikely(fsmeta < F2FS_MIN_META_SEGMENTS ||
+ if (!f2fs_sb_has_readonly(sbi) &&
+ unlikely(fsmeta < F2FS_MIN_META_SEGMENTS ||
ovp_segments == 0 || reserved_segments == 0)) {
f2fs_err(sbi, "Wrong layout: check mkfs.f2fs version");
return 1;
}
-
user_block_count = le64_to_cpu(ckpt->user_block_count);
- segment_count_main = le32_to_cpu(raw_super->segment_count_main);
+ segment_count_main = le32_to_cpu(raw_super->segment_count_main) +
+ (f2fs_sb_has_readonly(sbi) ? 1 : 0);
log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg);
if (!user_block_count || user_block_count >=
segment_count_main << log_blocks_per_seg) {
@@ -3175,6 +3304,10 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi)
if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs ||
le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg)
return 1;
+
+ if (f2fs_sb_has_readonly(sbi))
+ goto check_data;
+
for (j = i + 1; j < NR_CURSEG_NODE_TYPE; j++) {
if (le32_to_cpu(ckpt->cur_node_segno[i]) ==
le32_to_cpu(ckpt->cur_node_segno[j])) {
@@ -3185,10 +3318,15 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi)
}
}
}
+check_data:
for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) {
if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs ||
le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg)
return 1;
+
+ if (f2fs_sb_has_readonly(sbi))
+ goto skip_cross;
+
for (j = i + 1; j < NR_CURSEG_DATA_TYPE; j++) {
if (le32_to_cpu(ckpt->cur_data_segno[i]) ==
le32_to_cpu(ckpt->cur_data_segno[j])) {
@@ -3210,7 +3348,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi)
}
}
}
-
+skip_cross:
sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize);
nat_bitmap_size = le32_to_cpu(ckpt->nat_ver_bitmap_bytesize);
@@ -3555,7 +3693,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
#ifdef CONFIG_BLK_DEV_ZONED
if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM &&
!f2fs_sb_has_blkzoned(sbi)) {
- f2fs_err(sbi, "Zoned block device feature not enabled\n");
+ f2fs_err(sbi, "Zoned block device feature not enabled");
return -EINVAL;
}
if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) {
@@ -3940,10 +4078,14 @@ try_onemore:
goto free_node_inode;
}
- err = f2fs_register_sysfs(sbi);
+ err = f2fs_init_compress_inode(sbi);
if (err)
goto free_root_inode;
+ err = f2fs_register_sysfs(sbi);
+ if (err)
+ goto free_compress_inode;
+
#ifdef CONFIG_QUOTA
/* Enable quota usage during mount */
if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) {
@@ -4084,6 +4226,8 @@ free_meta:
/* evict some inodes being cached by GC */
evict_inodes(sb);
f2fs_unregister_sysfs(sbi);
+free_compress_inode:
+ f2fs_destroy_compress_inode(sbi);
free_root_inode:
dput(sb->s_root);
sb->s_root = NULL;
@@ -4162,6 +4306,15 @@ static void kill_f2fs_super(struct super_block *sb)
f2fs_stop_gc_thread(sbi);
f2fs_stop_discard_thread(sbi);
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ /*
+ * latter evict_inode() can bypass checking and invalidating
+ * compress inode cache.
+ */
+ if (test_opt(sbi, COMPRESS_CACHE))
+ truncate_inode_pages_final(COMPRESS_MAPPING(sbi));
+#endif
+
if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) ||
!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
struct cp_control cpc = {
@@ -4227,9 +4380,12 @@ static int __init init_f2fs_fs(void)
err = f2fs_create_checkpoint_caches();
if (err)
goto free_segment_manager_caches;
- err = f2fs_create_extent_cache();
+ err = f2fs_create_recovery_cache();
if (err)
goto free_checkpoint_caches;
+ err = f2fs_create_extent_cache();
+ if (err)
+ goto free_recovery_cache;
err = f2fs_create_garbage_collection_cache();
if (err)
goto free_extent_cache;
@@ -4258,7 +4414,12 @@ static int __init init_f2fs_fs(void)
err = f2fs_init_compress_cache();
if (err)
goto free_compress_mempool;
+ err = f2fs_create_casefold_cache();
+ if (err)
+ goto free_compress_cache;
return 0;
+free_compress_cache:
+ f2fs_destroy_compress_cache();
free_compress_mempool:
f2fs_destroy_compress_mempool();
free_bioset:
@@ -4278,6 +4439,8 @@ free_garbage_collection_cache:
f2fs_destroy_garbage_collection_cache();
free_extent_cache:
f2fs_destroy_extent_cache();
+free_recovery_cache:
+ f2fs_destroy_recovery_cache();
free_checkpoint_caches:
f2fs_destroy_checkpoint_caches();
free_segment_manager_caches:
@@ -4292,6 +4455,7 @@ fail:
static void __exit exit_f2fs_fs(void)
{
+ f2fs_destroy_casefold_cache();
f2fs_destroy_compress_cache();
f2fs_destroy_compress_mempool();
f2fs_destroy_bioset();
@@ -4303,6 +4467,7 @@ static void __exit exit_f2fs_fs(void)
f2fs_exit_sysfs();
f2fs_destroy_garbage_collection_cache();
f2fs_destroy_extent_cache();
+ f2fs_destroy_recovery_cache();
f2fs_destroy_checkpoint_caches();
f2fs_destroy_segment_manager_caches();
f2fs_destroy_node_manager_caches();
@@ -4315,4 +4480,5 @@ module_exit(exit_f2fs_fs)
MODULE_AUTHOR("Samsung Electronics's Praesto Team");
MODULE_DESCRIPTION("Flash Friendly File System");
MODULE_LICENSE("GPL");
+MODULE_SOFTDEP("pre: crc32");
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 39b522ec73e7..6642246206bd 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -37,6 +37,7 @@ enum {
#endif
RESERVED_BLOCKS, /* struct f2fs_sb_info */
CPRC_INFO, /* struct ckpt_req_control */
+ ATGC_INFO, /* struct atgc_management */
};
struct f2fs_attr {
@@ -75,6 +76,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
#endif
else if (struct_type == CPRC_INFO)
return (unsigned char *)&sbi->cprc_info;
+ else if (struct_type == ATGC_INFO)
+ return (unsigned char *)&sbi->am;
return NULL;
}
@@ -155,6 +158,9 @@ static ssize_t features_show(struct f2fs_attr *a,
if (f2fs_sb_has_casefold(sbi))
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "casefold");
+ if (f2fs_sb_has_readonly(sbi))
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len ? ", " : "", "readonly");
if (f2fs_sb_has_compression(sbi))
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "compression");
@@ -495,6 +501,20 @@ out:
}
#endif
+ if (!strcmp(a->attr.name, "atgc_candidate_ratio")) {
+ if (t > 100)
+ return -EINVAL;
+ sbi->am.candidate_ratio = t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "atgc_age_weight")) {
+ if (t > 100)
+ return -EINVAL;
+ sbi->am.age_weight = t;
+ return count;
+ }
+
*ui = (unsigned int)t;
return count;
@@ -546,46 +566,49 @@ static void f2fs_sb_release(struct kobject *kobj)
complete(&sbi->s_kobj_unregister);
}
-enum feat_id {
- FEAT_CRYPTO = 0,
- FEAT_BLKZONED,
- FEAT_ATOMIC_WRITE,
- FEAT_EXTRA_ATTR,
- FEAT_PROJECT_QUOTA,
- FEAT_INODE_CHECKSUM,
- FEAT_FLEXIBLE_INLINE_XATTR,
- FEAT_QUOTA_INO,
- FEAT_INODE_CRTIME,
- FEAT_LOST_FOUND,
- FEAT_VERITY,
- FEAT_SB_CHECKSUM,
- FEAT_CASEFOLD,
- FEAT_COMPRESSION,
- FEAT_TEST_DUMMY_ENCRYPTION_V2,
-};
-
+/*
+ * Note that there are three feature list entries:
+ * 1) /sys/fs/f2fs/features
+ * : shows runtime features supported by in-kernel f2fs along with Kconfig.
+ * - ref. F2FS_FEATURE_RO_ATTR()
+ *
+ * 2) /sys/fs/f2fs/$s_id/features <deprecated>
+ * : shows on-disk features enabled by mkfs.f2fs, used for old kernels. This
+ * won't add new feature anymore, and thus, users should check entries in 3)
+ * instead of this 2).
+ *
+ * 3) /sys/fs/f2fs/$s_id/feature_list
+ * : shows on-disk features enabled by mkfs.f2fs per instance, which follows
+ * sysfs entry rule where each entry should expose single value.
+ * This list covers old feature list provided by 2) and beyond. Therefore,
+ * please add new on-disk feature in this list only.
+ * - ref. F2FS_SB_FEATURE_RO_ATTR()
+ */
static ssize_t f2fs_feature_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- switch (a->id) {
- case FEAT_CRYPTO:
- case FEAT_BLKZONED:
- case FEAT_ATOMIC_WRITE:
- case FEAT_EXTRA_ATTR:
- case FEAT_PROJECT_QUOTA:
- case FEAT_INODE_CHECKSUM:
- case FEAT_FLEXIBLE_INLINE_XATTR:
- case FEAT_QUOTA_INO:
- case FEAT_INODE_CRTIME:
- case FEAT_LOST_FOUND:
- case FEAT_VERITY:
- case FEAT_SB_CHECKSUM:
- case FEAT_CASEFOLD:
- case FEAT_COMPRESSION:
- case FEAT_TEST_DUMMY_ENCRYPTION_V2:
+ return sprintf(buf, "supported\n");
+}
+
+#define F2FS_FEATURE_RO_ATTR(_name) \
+static struct f2fs_attr f2fs_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = 0444 }, \
+ .show = f2fs_feature_show, \
+}
+
+static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ if (F2FS_HAS_FEATURE(sbi, a->id))
return sprintf(buf, "supported\n");
- }
- return 0;
+ return sprintf(buf, "unsupported\n");
+}
+
+#define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \
+static struct f2fs_attr f2fs_attr_sb_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = 0444 }, \
+ .show = f2fs_sb_feature_show, \
+ .id = F2FS_FEATURE_##_feat, \
}
#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \
@@ -605,13 +628,6 @@ static struct f2fs_attr f2fs_attr_##_name = { \
#define F2FS_GENERAL_RO_ATTR(name) \
static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL)
-#define F2FS_FEATURE_RO_ATTR(_name, _id) \
-static struct f2fs_attr f2fs_attr_##_name = { \
- .attr = {.name = __stringify(_name), .mode = 0444 }, \
- .show = f2fs_feature_show, \
- .id = _id, \
-}
-
#define F2FS_STAT_ATTR(_struct_type, _struct_name, _name, _elname) \
static struct f2fs_attr f2fs_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = 0444 }, \
@@ -685,31 +701,44 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks);
#endif
#ifdef CONFIG_FS_ENCRYPTION
-F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO);
-F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2, FEAT_TEST_DUMMY_ENCRYPTION_V2);
+F2FS_FEATURE_RO_ATTR(encryption);
+F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2);
+#ifdef CONFIG_UNICODE
+F2FS_FEATURE_RO_ATTR(encrypted_casefold);
#endif
+#endif /* CONFIG_FS_ENCRYPTION */
#ifdef CONFIG_BLK_DEV_ZONED
-F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED);
+F2FS_FEATURE_RO_ATTR(block_zoned);
#endif
-F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE);
-F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR);
-F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA);
-F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM);
-F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR);
-F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO);
-F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME);
-F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND);
+F2FS_FEATURE_RO_ATTR(atomic_write);
+F2FS_FEATURE_RO_ATTR(extra_attr);
+F2FS_FEATURE_RO_ATTR(project_quota);
+F2FS_FEATURE_RO_ATTR(inode_checksum);
+F2FS_FEATURE_RO_ATTR(flexible_inline_xattr);
+F2FS_FEATURE_RO_ATTR(quota_ino);
+F2FS_FEATURE_RO_ATTR(inode_crtime);
+F2FS_FEATURE_RO_ATTR(lost_found);
#ifdef CONFIG_FS_VERITY
-F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY);
+F2FS_FEATURE_RO_ATTR(verity);
#endif
-F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM);
-F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD);
+F2FS_FEATURE_RO_ATTR(sb_checksum);
+#ifdef CONFIG_UNICODE
+F2FS_FEATURE_RO_ATTR(casefold);
+#endif
+F2FS_FEATURE_RO_ATTR(readonly);
#ifdef CONFIG_F2FS_FS_COMPRESSION
-F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION);
+F2FS_FEATURE_RO_ATTR(compression);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode);
#endif
+F2FS_FEATURE_RO_ATTR(pin_file);
+
+/* For ATGC */
+F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_ratio, candidate_ratio);
+F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_count, max_candidate_count);
+F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_weight, age_weight);
+F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -778,6 +807,11 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(compr_saved_block),
ATTR_LIST(compr_new_inode),
#endif
+ /* For ATGC */
+ ATTR_LIST(atgc_candidate_ratio),
+ ATTR_LIST(atgc_candidate_count),
+ ATTR_LIST(atgc_age_weight),
+ ATTR_LIST(atgc_age_threshold),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
@@ -786,7 +820,10 @@ static struct attribute *f2fs_feat_attrs[] = {
#ifdef CONFIG_FS_ENCRYPTION
ATTR_LIST(encryption),
ATTR_LIST(test_dummy_encryption_v2),
+#ifdef CONFIG_UNICODE
+ ATTR_LIST(encrypted_casefold),
#endif
+#endif /* CONFIG_FS_ENCRYPTION */
#ifdef CONFIG_BLK_DEV_ZONED
ATTR_LIST(block_zoned),
#endif
@@ -802,10 +839,14 @@ static struct attribute *f2fs_feat_attrs[] = {
ATTR_LIST(verity),
#endif
ATTR_LIST(sb_checksum),
+#ifdef CONFIG_UNICODE
ATTR_LIST(casefold),
+#endif
+ ATTR_LIST(readonly),
#ifdef CONFIG_F2FS_FS_COMPRESSION
ATTR_LIST(compression),
#endif
+ ATTR_LIST(pin_file),
NULL,
};
ATTRIBUTE_GROUPS(f2fs_feat);
@@ -817,6 +858,40 @@ static struct attribute *f2fs_stat_attrs[] = {
};
ATTRIBUTE_GROUPS(f2fs_stat);
+F2FS_SB_FEATURE_RO_ATTR(encryption, ENCRYPT);
+F2FS_SB_FEATURE_RO_ATTR(block_zoned, BLKZONED);
+F2FS_SB_FEATURE_RO_ATTR(extra_attr, EXTRA_ATTR);
+F2FS_SB_FEATURE_RO_ATTR(project_quota, PRJQUOTA);
+F2FS_SB_FEATURE_RO_ATTR(inode_checksum, INODE_CHKSUM);
+F2FS_SB_FEATURE_RO_ATTR(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR);
+F2FS_SB_FEATURE_RO_ATTR(quota_ino, QUOTA_INO);
+F2FS_SB_FEATURE_RO_ATTR(inode_crtime, INODE_CRTIME);
+F2FS_SB_FEATURE_RO_ATTR(lost_found, LOST_FOUND);
+F2FS_SB_FEATURE_RO_ATTR(verity, VERITY);
+F2FS_SB_FEATURE_RO_ATTR(sb_checksum, SB_CHKSUM);
+F2FS_SB_FEATURE_RO_ATTR(casefold, CASEFOLD);
+F2FS_SB_FEATURE_RO_ATTR(compression, COMPRESSION);
+F2FS_SB_FEATURE_RO_ATTR(readonly, RO);
+
+static struct attribute *f2fs_sb_feat_attrs[] = {
+ ATTR_LIST(sb_encryption),
+ ATTR_LIST(sb_block_zoned),
+ ATTR_LIST(sb_extra_attr),
+ ATTR_LIST(sb_project_quota),
+ ATTR_LIST(sb_inode_checksum),
+ ATTR_LIST(sb_flexible_inline_xattr),
+ ATTR_LIST(sb_quota_ino),
+ ATTR_LIST(sb_inode_crtime),
+ ATTR_LIST(sb_lost_found),
+ ATTR_LIST(sb_verity),
+ ATTR_LIST(sb_sb_checksum),
+ ATTR_LIST(sb_casefold),
+ ATTR_LIST(sb_compression),
+ ATTR_LIST(sb_readonly),
+ NULL,
+};
+ATTRIBUTE_GROUPS(f2fs_sb_feat);
+
static const struct sysfs_ops f2fs_attr_ops = {
.show = f2fs_attr_show,
.store = f2fs_attr_store,
@@ -883,6 +958,33 @@ static struct kobj_type f2fs_stat_ktype = {
.release = f2fs_stat_kobj_release,
};
+static ssize_t f2fs_sb_feat_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+ s_feature_list_kobj);
+ struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
+
+ return a->show ? a->show(a, sbi, buf) : 0;
+}
+
+static void f2fs_feature_list_kobj_release(struct kobject *kobj)
+{
+ struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+ s_feature_list_kobj);
+ complete(&sbi->s_feature_list_kobj_unregister);
+}
+
+static const struct sysfs_ops f2fs_feature_list_attr_ops = {
+ .show = f2fs_sb_feat_attr_show,
+};
+
+static struct kobj_type f2fs_feature_list_ktype = {
+ .default_groups = f2fs_sb_feat_groups,
+ .sysfs_ops = &f2fs_feature_list_attr_ops,
+ .release = f2fs_feature_list_kobj_release,
+};
+
static int __maybe_unused segment_info_seq_show(struct seq_file *seq,
void *offset)
{
@@ -1099,6 +1201,14 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
if (err)
goto put_stat_kobj;
+ sbi->s_feature_list_kobj.kset = &f2fs_kset;
+ init_completion(&sbi->s_feature_list_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_feature_list_kobj,
+ &f2fs_feature_list_ktype,
+ &sbi->s_kobj, "feature_list");
+ if (err)
+ goto put_feature_list_kobj;
+
if (f2fs_proc_root)
sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
@@ -1113,6 +1223,9 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
victim_bits_seq_show, sb);
}
return 0;
+put_feature_list_kobj:
+ kobject_put(&sbi->s_feature_list_kobj);
+ wait_for_completion(&sbi->s_feature_list_kobj_unregister);
put_stat_kobj:
kobject_put(&sbi->s_stat_kobj);
wait_for_completion(&sbi->s_stat_kobj_unregister);
@@ -1135,6 +1248,9 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi)
kobject_del(&sbi->s_stat_kobj);
kobject_put(&sbi->s_stat_kobj);
wait_for_completion(&sbi->s_stat_kobj_unregister);
+ kobject_del(&sbi->s_feature_list_kobj);
+ kobject_put(&sbi->s_feature_list_kobj);
+ wait_for_completion(&sbi->s_feature_list_kobj_unregister);
kobject_del(&sbi->s_kobj);
kobject_put(&sbi->s_kobj);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index dfc72f15be7f..f946bec8f1f1 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -369,8 +369,8 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
/* 32-bit arches must use fcntl64() */
case F_OFD_SETLK:
case F_OFD_SETLKW:
-#endif
fallthrough;
+#endif
case F_SETLK:
case F_SETLKW:
if (copy_from_user(&flock, argp, sizeof(flock)))
diff --git a/fs/fhandle.c b/fs/fhandle.c
index ec6feeccc276..6630c69c23a2 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -229,7 +229,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
path_put(&path);
return fd;
}
- file = file_open_root(path.dentry, path.mnt, "", open_flag, 0);
+ file = file_open_root(&path, "", open_flag, 0);
if (IS_ERR(file)) {
put_unused_fd(fd);
retval = PTR_ERR(file);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 06d04a74ab6c..4c3370548982 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -521,6 +521,9 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
*/
smp_mb();
+ if (IS_DAX(inode))
+ return false;
+
/* while holding I_WB_SWITCH, no one else can update the association */
spin_lock(&inode->i_lock);
if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 2834d1afa6e8..de1985eae535 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -80,6 +80,35 @@ static int vfs_parse_sb_flag(struct fs_context *fc, const char *key)
}
/**
+ * vfs_parse_fs_param_source - Handle setting "source" via parameter
+ * @fc: The filesystem context to modify
+ * @param: The parameter
+ *
+ * This is a simple helper for filesystems to verify that the "source" they
+ * accept is sane.
+ *
+ * Returns 0 on success, -ENOPARAM if this is not "source" parameter, and
+ * -EINVAL otherwise. In the event of failure, supplementary error information
+ * is logged.
+ */
+int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param)
+{
+ if (strcmp(param->key, "source") != 0)
+ return -ENOPARAM;
+
+ if (param->type != fs_value_is_string)
+ return invalf(fc, "Non-string source");
+
+ if (fc->source)
+ return invalf(fc, "Multiple sources");
+
+ fc->source = param->string;
+ param->string = NULL;
+ return 0;
+}
+EXPORT_SYMBOL(vfs_parse_fs_param_source);
+
+/**
* vfs_parse_fs_param - Add a single parameter to a superblock config
* @fc: The filesystem context to modify
* @param: The parameter
@@ -122,15 +151,9 @@ int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param)
/* If the filesystem doesn't take any arguments, give it the
* default handling of source.
*/
- if (strcmp(param->key, "source") == 0) {
- if (param->type != fs_value_is_string)
- return invalf(fc, "VFS: Non-string source");
- if (fc->source)
- return invalf(fc, "VFS: Multiple sources");
- fc->source = param->string;
- param->string = NULL;
- return 0;
- }
+ ret = vfs_parse_fs_param_source(fc, param);
+ if (ret != -ENOPARAM)
+ return ret;
return invalf(fc, "%s: Unknown parameter '%s'",
fc->fs_type->name, param->key);
@@ -504,16 +527,11 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
struct legacy_fs_context *ctx = fc->fs_private;
unsigned int size = ctx->data_size;
size_t len = 0;
+ int ret;
- if (strcmp(param->key, "source") == 0) {
- if (param->type != fs_value_is_string)
- return invalf(fc, "VFS: Legacy: Non-string source");
- if (fc->source)
- return invalf(fc, "VFS: Legacy: Multiple sources");
- fc->source = param->string;
- param->string = NULL;
- return 0;
- }
+ ret = vfs_parse_fs_param_source(fc, param);
+ if (ret != -ENOPARAM)
+ return ret;
if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS)
return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options");
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index fb733eb5aead..9d58371d22c2 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -213,7 +213,7 @@ static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx,
dmap->writable = writable;
if (!upgrade) {
/*
- * We don't take a refernce on inode. inode is valid right now
+ * We don't take a reference on inode. inode is valid right now
* and when inode is going away, cleanup logic should first
* cleanup dmap entries.
*/
@@ -622,7 +622,7 @@ static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
}
/*
- * If read beyond end of file happnes, fs code seems to return
+ * If read beyond end of file happens, fs code seems to return
* it as hole
*/
iomap_hole:
@@ -1207,7 +1207,7 @@ static void fuse_dax_free_mem_worker(struct work_struct *work)
ret);
}
- /* If number of free ranges are still below threhold, requeue */
+ /* If number of free ranges are still below threshold, requeue */
kick_dmap_free_worker(fcd, 1);
}
@@ -1235,8 +1235,6 @@ void fuse_dax_conn_free(struct fuse_conn *fc)
static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd)
{
long nr_pages, nr_ranges;
- void *kaddr;
- pfn_t pfn;
struct fuse_dax_mapping *range;
int ret, id;
size_t dax_size = -1;
@@ -1248,8 +1246,8 @@ static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd)
INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker);
id = dax_read_lock();
- nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), &kaddr,
- &pfn);
+ nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), NULL,
+ NULL);
dax_read_unlock(id);
if (nr_pages < 0) {
pr_debug("dax_direct_access() returned %ld\n", nr_pages);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index a5ceccc5ef00..1c8f79b3dd06 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -91,7 +91,7 @@ static void fuse_drop_waiting(struct fuse_conn *fc)
{
/*
* lockess check of fc->connected is okay, because atomic_dec_and_test()
- * provides a memory barrier mached with the one in fuse_wait_aborted()
+ * provides a memory barrier matched with the one in fuse_wait_aborted()
* to ensure no wake-up is missed.
*/
if (atomic_dec_and_test(&fc->num_waiting) &&
@@ -783,6 +783,7 @@ static int fuse_check_page(struct page *page)
1 << PG_uptodate |
1 << PG_lru |
1 << PG_active |
+ 1 << PG_workingset |
1 << PG_reclaim |
1 << PG_waiters))) {
dump_page(page, "fuse: trying to steal weird page");
@@ -1271,6 +1272,15 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
goto restart;
}
spin_lock(&fpq->lock);
+ /*
+ * Must not put request on fpq->io queue after having been shut down by
+ * fuse_abort_conn()
+ */
+ if (!fpq->connected) {
+ req->out.h.error = err = -ECONNABORTED;
+ goto out_end;
+
+ }
list_add(&req->list, &fpq->io);
spin_unlock(&fpq->lock);
cs->req = req;
@@ -1857,7 +1867,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
}
err = -EINVAL;
- if (oh.error <= -1000 || oh.error > 0)
+ if (oh.error <= -512 || oh.error > 0)
goto copy_finish;
spin_lock(&fpq->lock);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 1b6c001a7dd1..eade6f965b2e 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -252,7 +252,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
if (ret == -ENOMEM)
goto out;
if (ret || fuse_invalid_attr(&outarg.attr) ||
- inode_wrong_type(inode, outarg.attr.mode))
+ fuse_stale_inode(inode, outarg.generation, &outarg.attr))
goto invalid;
forget_all_cached_acls(inode);
@@ -309,68 +309,23 @@ static int fuse_dentry_delete(const struct dentry *dentry)
static struct vfsmount *fuse_dentry_automount(struct path *path)
{
struct fs_context *fsc;
- struct fuse_mount *parent_fm = get_fuse_mount_super(path->mnt->mnt_sb);
- struct fuse_conn *fc = parent_fm->fc;
- struct fuse_mount *fm;
struct vfsmount *mnt;
struct fuse_inode *mp_fi = get_fuse_inode(d_inode(path->dentry));
- struct super_block *sb;
- int err;
fsc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry);
- if (IS_ERR(fsc)) {
- err = PTR_ERR(fsc);
- goto out;
- }
-
- err = -ENOMEM;
- fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL);
- if (!fm)
- goto out_put_fsc;
-
- fsc->s_fs_info = fm;
- sb = sget_fc(fsc, NULL, set_anon_super_fc);
- if (IS_ERR(sb)) {
- err = PTR_ERR(sb);
- kfree(fm);
- goto out_put_fsc;
- }
- fm->fc = fuse_conn_get(fc);
-
- /* Initialize superblock, making @mp_fi its root */
- err = fuse_fill_super_submount(sb, mp_fi);
- if (err)
- goto out_put_sb;
+ if (IS_ERR(fsc))
+ return ERR_CAST(fsc);
- sb->s_flags |= SB_ACTIVE;
- fsc->root = dget(sb->s_root);
- /* We are done configuring the superblock, so unlock it */
- up_write(&sb->s_umount);
-
- down_write(&fc->killsb);
- list_add_tail(&fm->fc_entry, &fc->mounts);
- up_write(&fc->killsb);
+ /* Pass the FUSE inode of the mount for fuse_get_tree_submount() */
+ fsc->fs_private = mp_fi;
/* Create the submount */
- mnt = vfs_create_mount(fsc);
- if (IS_ERR(mnt)) {
- err = PTR_ERR(mnt);
- goto out_put_fsc;
- }
- mntget(mnt);
- put_fs_context(fsc);
- return mnt;
+ mnt = fc_mount(fsc);
+ if (!IS_ERR(mnt))
+ mntget(mnt);
-out_put_sb:
- /*
- * Only jump here when fsc->root is NULL and sb is still locked
- * (otherwise put_fs_context() will put the superblock)
- */
- deactivate_locked_super(sb);
-out_put_fsc:
put_fs_context(fsc);
-out:
- return ERR_PTR(err);
+ return mnt;
}
const struct dentry_operations fuse_dentry_operations = {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 09ef2a4d25ed..97f860cfc195 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -645,7 +645,7 @@ static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
* == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
*
* An example:
- * User requested DIO read of 64K. It was splitted into two 32K fuse requests,
+ * User requested DIO read of 64K. It was split into two 32K fuse requests,
* both submitted asynchronously. The first of them was ACKed by userspace as
* fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
* second request was ACKed as short, e.g. only 1K was read, resulting in
@@ -1171,14 +1171,12 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
- tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
+ tmp = copy_page_from_iter_atomic(page, offset, bytes, ii);
flush_dcache_page(page);
- iov_iter_advance(ii, tmp);
if (!tmp) {
unlock_page(page);
put_page(page);
- bytes = min(bytes, iov_iter_single_seg_count(ii));
goto again;
}
@@ -1405,7 +1403,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
nbytes += ret;
ret += start;
- npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
+ npages = DIV_ROUND_UP(ret, PAGE_SIZE);
ap->descs[ap->num_pages].offset = start;
fuse_page_descs_length_init(ap->descs, ap->num_pages, npages);
@@ -2907,11 +2905,13 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
};
int err;
bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
- (mode & FALLOC_FL_PUNCH_HOLE);
+ (mode & (FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_ZERO_RANGE));
bool block_faults = FUSE_IS_DAX(inode) && lock_inode;
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_ZERO_RANGE))
return -EOPNOTSUPP;
if (fm->fc->no_fallocate)
@@ -2926,7 +2926,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
goto out;
}
- if (mode & FALLOC_FL_PUNCH_HOLE) {
+ if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) {
loff_t endbyte = offset + length - 1;
err = fuse_writeback_range(inode, offset, endbyte);
@@ -2966,7 +2966,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
file_update_time(file);
}
- if (mode & FALLOC_FL_PUNCH_HOLE)
+ if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
truncate_pagecache_range(inode, offset, offset + length - 1);
fuse_invalidate_attr(inode);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 7e463e220053..07829ce78695 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -761,6 +761,9 @@ struct fuse_conn {
/* Auto-mount submounts announced by the server */
unsigned int auto_submounts:1;
+ /* Propagate syncfs() to server */
+ unsigned int sync_fs:1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
@@ -867,6 +870,13 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc)
return atomic64_read(&fc->attr_version);
}
+static inline bool fuse_stale_inode(const struct inode *inode, int generation,
+ struct fuse_attr *attr)
+{
+ return inode->i_generation != generation ||
+ inode_wrong_type(inode, attr->mode);
+}
+
static inline void fuse_make_bad(struct inode *inode)
{
remove_inode_hash(inode);
@@ -1082,15 +1092,6 @@ void fuse_send_init(struct fuse_mount *fm);
int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx);
/*
- * Fill in superblock for submounts
- * @sb: partially-initialized superblock to fill in
- * @parent_fi: The fuse_inode of the parent filesystem where this submount is
- * mounted
- */
-int fuse_fill_super_submount(struct super_block *sb,
- struct fuse_inode *parent_fi);
-
-/*
* Remove the mount from the connection
*
* Returns whether this was the last mount
@@ -1098,6 +1099,11 @@ int fuse_fill_super_submount(struct super_block *sb,
bool fuse_mount_remove(struct fuse_mount *fm);
/*
+ * Setup context ops for submounts
+ */
+int fuse_init_fs_context_submount(struct fs_context *fsc);
+
+/*
* Shut down the connection (possibly sending DESTROY request).
*/
void fuse_conn_destroy(struct fuse_mount *fm);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 393e36b74dc4..b9beb39a4a18 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -350,8 +350,8 @@ retry:
inode->i_generation = generation;
fuse_init_inode(inode, attr);
unlock_new_inode(inode);
- } else if (inode_wrong_type(inode, attr->mode)) {
- /* Inode has changed type, any I/O on the old should fail */
+ } else if (fuse_stale_inode(inode, generation, attr)) {
+ /* nodeid was reused, any I/O on the old inode should fail */
fuse_make_bad(inode);
iput(inode);
goto retry;
@@ -506,6 +506,45 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
return err;
}
+static int fuse_sync_fs(struct super_block *sb, int wait)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ struct fuse_conn *fc = fm->fc;
+ struct fuse_syncfs_in inarg;
+ FUSE_ARGS(args);
+ int err;
+
+ /*
+ * Userspace cannot handle the wait == 0 case. Avoid a
+ * gratuitous roundtrip.
+ */
+ if (!wait)
+ return 0;
+
+ /* The filesystem is being unmounted. Nothing to do. */
+ if (!sb->s_root)
+ return 0;
+
+ if (!fc->sync_fs)
+ return 0;
+
+ memset(&inarg, 0, sizeof(inarg));
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.opcode = FUSE_SYNCFS;
+ args.nodeid = get_node_id(sb->s_root->d_inode);
+ args.out_numargs = 0;
+
+ err = fuse_simple_request(fm, &args);
+ if (err == -ENOSYS) {
+ fc->sync_fs = 0;
+ err = 0;
+ }
+
+ return err;
+}
+
enum {
OPT_SOURCE,
OPT_SUBTYPE,
@@ -909,6 +948,7 @@ static const struct super_operations fuse_super_operations = {
.put_super = fuse_put_super,
.umount_begin = fuse_umount_begin,
.statfs = fuse_statfs,
+ .sync_fs = fuse_sync_fs,
.show_options = fuse_show_options,
};
@@ -1275,8 +1315,8 @@ static void fuse_sb_defaults(struct super_block *sb)
sb->s_xattr = fuse_no_acl_xattr_handlers;
}
-int fuse_fill_super_submount(struct super_block *sb,
- struct fuse_inode *parent_fi)
+static int fuse_fill_super_submount(struct super_block *sb,
+ struct fuse_inode *parent_fi)
{
struct fuse_mount *fm = get_fuse_mount_super(sb);
struct super_block *parent_sb = parent_fi->inode.i_sb;
@@ -1313,6 +1353,58 @@ int fuse_fill_super_submount(struct super_block *sb,
return 0;
}
+/* Filesystem context private data holds the FUSE inode of the mount point */
+static int fuse_get_tree_submount(struct fs_context *fsc)
+{
+ struct fuse_mount *fm;
+ struct fuse_inode *mp_fi = fsc->fs_private;
+ struct fuse_conn *fc = get_fuse_conn(&mp_fi->inode);
+ struct super_block *sb;
+ int err;
+
+ fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL);
+ if (!fm)
+ return -ENOMEM;
+
+ fsc->s_fs_info = fm;
+ sb = sget_fc(fsc, NULL, set_anon_super_fc);
+ if (IS_ERR(sb)) {
+ kfree(fm);
+ return PTR_ERR(sb);
+ }
+ fm->fc = fuse_conn_get(fc);
+
+ /* Initialize superblock, making @mp_fi its root */
+ err = fuse_fill_super_submount(sb, mp_fi);
+ if (err) {
+ fuse_conn_put(fc);
+ kfree(fm);
+ sb->s_fs_info = NULL;
+ deactivate_locked_super(sb);
+ return err;
+ }
+
+ down_write(&fc->killsb);
+ list_add_tail(&fm->fc_entry, &fc->mounts);
+ up_write(&fc->killsb);
+
+ sb->s_flags |= SB_ACTIVE;
+ fsc->root = dget(sb->s_root);
+
+ return 0;
+}
+
+static const struct fs_context_operations fuse_context_submount_ops = {
+ .get_tree = fuse_get_tree_submount,
+};
+
+int fuse_init_fs_context_submount(struct fs_context *fsc)
+{
+ fsc->ops = &fuse_context_submount_ops;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fuse_init_fs_context_submount);
+
int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
{
struct fuse_dev *fud = NULL;
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 277f7041d55a..bc267832310c 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -200,9 +200,12 @@ retry:
if (!d_in_lookup(dentry)) {
struct fuse_inode *fi;
inode = d_inode(dentry);
+ if (inode && get_node_id(inode) != o->nodeid)
+ inode = NULL;
if (!inode ||
- get_node_id(inode) != o->nodeid ||
- inode_wrong_type(inode, o->attr.mode)) {
+ fuse_stale_inode(inode, o->generation, &o->attr)) {
+ if (inode)
+ fuse_make_bad(inode);
d_invalidate(dentry);
dput(dentry);
goto retry;
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index bcb8a02e2d8b..8f52cdaa8445 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1447,6 +1447,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
fc->release = fuse_free_conn;
fc->delete_stale = true;
fc->auto_submounts = true;
+ fc->sync_fs = true;
/* Tell FUSE to split requests that exceed the virtqueue's size */
fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit,
@@ -1496,6 +1497,9 @@ static int virtio_fs_init_fs_context(struct fs_context *fsc)
{
struct fuse_fs_context *ctx;
+ if (fsc->purpose == FS_CONTEXT_FOR_SUBMOUNT)
+ return fuse_init_fs_context_submount(fsc);
+
ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index 4af318fbda77..ef9498a6e88a 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -25,7 +25,19 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
fd->key = ptr + tree->max_key_len + 2;
hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n",
tree->cnid, __builtin_return_address(0));
- mutex_lock(&tree->tree_lock);
+ switch (tree->cnid) {
+ case HFS_CAT_CNID:
+ mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX);
+ break;
+ case HFS_EXT_CNID:
+ mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX);
+ break;
+ case HFS_ATTR_CNID:
+ mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX);
+ break;
+ default:
+ return -EINVAL;
+ }
return 0;
}
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index b63a4df7327b..c0a73a6ffb28 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -15,16 +15,31 @@
#include "btree.h"
-void hfs_bnode_read(struct hfs_bnode *node, void *buf,
- int off, int len)
+void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
{
struct page *page;
+ int pagenum;
+ int bytes_read;
+ int bytes_to_read;
+ void *vaddr;
off += node->page_offset;
- page = node->page[0];
+ pagenum = off >> PAGE_SHIFT;
+ off &= ~PAGE_MASK; /* compute page offset for the first page */
- memcpy(buf, kmap(page) + off, len);
- kunmap(page);
+ for (bytes_read = 0; bytes_read < len; bytes_read += bytes_to_read) {
+ if (pagenum >= node->tree->pages_per_bnode)
+ break;
+ page = node->page[pagenum];
+ bytes_to_read = min_t(int, len - bytes_read, PAGE_SIZE - off);
+
+ vaddr = kmap_atomic(page);
+ memcpy(buf + bytes_read, vaddr + off, bytes_to_read);
+ kunmap_atomic(vaddr);
+
+ pagenum++;
+ off = 0; /* page offset only applies to the first page */
+ }
}
u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index 4ba45caf5939..0e6baee93245 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -13,6 +13,13 @@ typedef int (*btree_keycmp)(const btree_key *, const btree_key *);
#define NODE_HASH_SIZE 256
+/* B-tree mutex nested subclasses */
+enum hfs_btree_mutex_classes {
+ CATALOG_BTREE_MUTEX,
+ EXTENTS_BTREE_MUTEX,
+ ATTR_BTREE_MUTEX,
+};
+
/* A HFS BTree held in memory */
struct hfs_btree {
struct super_block *sb;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 44d07c9e3a7f..12d9bae39363 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -420,14 +420,12 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
if (!res) {
if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) {
res = -EIO;
- goto bail;
+ goto bail_hfs_find;
}
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength);
}
- if (res) {
- hfs_find_exit(&fd);
- goto bail_no_root;
- }
+ if (res)
+ goto bail_hfs_find;
res = -EINVAL;
root_inode = hfs_iget(sb, &fd.search_key->cat, &rec);
hfs_find_exit(&fd);
@@ -443,6 +441,8 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
/* everything's okay */
return 0;
+bail_hfs_find:
+ hfs_find_exit(&fd);
bail_no_root:
pr_err("get root inode failed\n");
bail:
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 70e8374ddac4..6fef67c2a9f0 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -281,6 +281,11 @@ int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct inode *inode = d_inode(path->dentry);
struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+ if (request_mask & STATX_BTIME) {
+ stat->result_mask |= STATX_BTIME;
+ stat->btime = hfsp_mt2ut(hip->create_date);
+ }
+
if (inode->i_flags & S_APPEND)
stat->attributes |= STATX_ATTR_APPEND;
if (inode->i_flags & S_IMMUTABLE)
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 4d169c5a2673..e2855ceefd39 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -204,7 +204,6 @@ check_attr_tree_state_again:
buf = kzalloc(node_size, GFP_NOFS);
if (!buf) {
- pr_err("failed to allocate memory for header node\n");
err = -ENOMEM;
goto end_attr_file_creation;
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 926eeb9bf4eb..cdfb1ae78a3f 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -77,7 +77,7 @@ enum hugetlb_param {
static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
fsparam_u32 ("gid", Opt_gid),
fsparam_string("min_size", Opt_min_size),
- fsparam_u32 ("mode", Opt_mode),
+ fsparam_u32oct("mode", Opt_mode),
fsparam_string("nr_inodes", Opt_nr_inodes),
fsparam_string("pagesize", Opt_pagesize),
fsparam_string("size", Opt_size),
diff --git a/fs/internal.h b/fs/internal.h
index 6aeae7ef3380..82e8eb32ff3d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -61,7 +61,6 @@ extern void __init chrdev_init(void);
*/
extern const struct fs_context_operations legacy_fs_context_ops;
extern int parse_monolithic_mount_data(struct fs_context *, void *);
-extern void fc_drop_locked(struct fs_context *);
extern void vfs_clean_context(struct fs_context *fc);
extern int finish_clean_context(struct fs_context *fc);
@@ -129,7 +128,7 @@ struct open_flags {
};
extern struct file *do_filp_open(int dfd, struct filename *pathname,
const struct open_flags *op);
-extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
+extern struct file *do_file_open_root(const struct path *,
const char *, const struct open_flags *);
extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 843d4a7bcd6e..7d2ed8c7dd31 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -129,7 +129,8 @@ struct io_cb_cancel_data {
bool cancel_all;
};
-static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index);
+static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first);
+static void io_wqe_dec_running(struct io_worker *worker);
static bool io_worker_get(struct io_worker *worker)
{
@@ -168,26 +169,21 @@ static void io_worker_exit(struct io_worker *worker)
{
struct io_wqe *wqe = worker->wqe;
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- unsigned flags;
if (refcount_dec_and_test(&worker->ref))
complete(&worker->ref_done);
wait_for_completion(&worker->ref_done);
- preempt_disable();
- current->flags &= ~PF_IO_WORKER;
- flags = worker->flags;
- worker->flags = 0;
- if (flags & IO_WORKER_F_RUNNING)
- atomic_dec(&acct->nr_running);
- worker->flags = 0;
- preempt_enable();
-
raw_spin_lock_irq(&wqe->lock);
- if (flags & IO_WORKER_F_FREE)
+ if (worker->flags & IO_WORKER_F_FREE)
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
acct->nr_workers--;
+ preempt_disable();
+ io_wqe_dec_running(worker);
+ worker->flags = 0;
+ current->flags &= ~PF_IO_WORKER;
+ preempt_enable();
raw_spin_unlock_irq(&wqe->lock);
kfree_rcu(worker, rcu);
@@ -214,15 +210,19 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
struct hlist_nulls_node *n;
struct io_worker *worker;
- n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list));
- if (is_a_nulls(n))
- return false;
-
- worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
- if (io_worker_get(worker)) {
- wake_up_process(worker->task);
+ /*
+ * Iterate free_list and see if we can find an idle worker to
+ * activate. If a given worker is on the free_list but in the process
+ * of exiting, keep trying.
+ */
+ hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) {
+ if (!io_worker_get(worker))
+ continue;
+ if (wake_up_process(worker->task)) {
+ io_worker_release(worker);
+ return true;
+ }
io_worker_release(worker);
- return true;
}
return false;
@@ -247,10 +247,21 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
ret = io_wqe_activate_free_worker(wqe);
rcu_read_unlock();
- if (!ret && acct->nr_workers < acct->max_workers) {
- atomic_inc(&acct->nr_running);
- atomic_inc(&wqe->wq->worker_refs);
- create_io_worker(wqe->wq, wqe, acct->index);
+ if (!ret) {
+ bool do_create = false, first = false;
+
+ raw_spin_lock_irq(&wqe->lock);
+ if (acct->nr_workers < acct->max_workers) {
+ atomic_inc(&acct->nr_running);
+ atomic_inc(&wqe->wq->worker_refs);
+ if (!acct->nr_workers)
+ first = true;
+ acct->nr_workers++;
+ do_create = true;
+ }
+ raw_spin_unlock_irq(&wqe->lock);
+ if (do_create)
+ create_io_worker(wqe->wq, wqe, acct->index, first);
}
}
@@ -271,10 +282,28 @@ static void create_worker_cb(struct callback_head *cb)
{
struct create_worker_data *cwd;
struct io_wq *wq;
+ struct io_wqe *wqe;
+ struct io_wqe_acct *acct;
+ bool do_create = false, first = false;
cwd = container_of(cb, struct create_worker_data, work);
- wq = cwd->wqe->wq;
- create_io_worker(wq, cwd->wqe, cwd->index);
+ wqe = cwd->wqe;
+ wq = wqe->wq;
+ acct = &wqe->acct[cwd->index];
+ raw_spin_lock_irq(&wqe->lock);
+ if (acct->nr_workers < acct->max_workers) {
+ if (!acct->nr_workers)
+ first = true;
+ acct->nr_workers++;
+ do_create = true;
+ }
+ raw_spin_unlock_irq(&wqe->lock);
+ if (do_create) {
+ create_io_worker(wq, wqe, cwd->index, first);
+ } else {
+ atomic_dec(&acct->nr_running);
+ io_worker_ref_put(wq);
+ }
kfree(cwd);
}
@@ -612,7 +641,7 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
raw_spin_unlock_irq(&worker->wqe->lock);
}
-static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
+static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first)
{
struct io_wqe_acct *acct = &wqe->acct[index];
struct io_worker *worker;
@@ -635,6 +664,9 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
kfree(worker);
fail:
atomic_dec(&acct->nr_running);
+ raw_spin_lock_irq(&wqe->lock);
+ acct->nr_workers--;
+ raw_spin_unlock_irq(&wqe->lock);
io_worker_ref_put(wq);
return;
}
@@ -650,9 +682,8 @@ fail:
worker->flags |= IO_WORKER_F_FREE;
if (index == IO_WQ_ACCT_BOUND)
worker->flags |= IO_WORKER_F_BOUND;
- if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND))
+ if (first && (worker->flags & IO_WORKER_F_BOUND))
worker->flags |= IO_WORKER_F_FIXED;
- acct->nr_workers++;
raw_spin_unlock_irq(&wqe->lock);
wake_up_new_task(tsk);
}
@@ -731,7 +762,12 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
int work_flags;
unsigned long flags;
- if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state)) {
+ /*
+ * If io-wq is exiting for this task, or if the request has explicitly
+ * been marked as one that should not get executed, cancel it here.
+ */
+ if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state) ||
+ (work->flags & IO_WQ_WORK_CANCEL)) {
io_run_cancel(work, wqe);
return;
}
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e55b21fc0ab2..a2e20a6fbfed 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -78,6 +78,7 @@
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
+#include <linux/tracehook.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -465,7 +466,8 @@ struct io_ring_ctx {
struct mm_struct *mm_account;
/* ctx exit and cancelation */
- struct callback_head *exit_task_work;
+ struct llist_head fallback_llist;
+ struct delayed_work fallback_work;
struct work_struct exit_work;
struct list_head tctx_list;
struct completion ref_comp;
@@ -784,9 +786,14 @@ struct async_poll {
struct io_poll_iocb *double_poll;
};
+typedef void (*io_req_tw_func_t)(struct io_kiocb *req);
+
struct io_task_work {
- struct io_wq_work_node node;
- task_work_func_t func;
+ union {
+ struct io_wq_work_node node;
+ struct llist_node fallback_node;
+ };
+ io_req_tw_func_t func;
};
enum {
@@ -849,10 +856,7 @@ struct io_kiocb {
/* used with ctx->iopoll_list with reads/writes */
struct list_head inflight_entry;
- union {
- struct io_task_work io_task_work;
- struct callback_head task_work;
- };
+ struct io_task_work io_task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
struct async_poll *apoll;
@@ -1071,6 +1075,8 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx);
static bool io_poll_remove_waitqs(struct io_kiocb *req);
static int io_req_prep_async(struct io_kiocb *req);
+static void io_fallback_req_func(struct work_struct *unused);
+
static struct kmem_cache *req_cachep;
static const struct file_operations io_uring_fops;
@@ -1202,6 +1208,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->tctx_list);
INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
INIT_LIST_HEAD(&ctx->locked_free_list);
+ INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
return ctx;
err:
kfree(ctx->dummy_ubuf);
@@ -1273,8 +1280,17 @@ static void io_prep_async_link(struct io_kiocb *req)
{
struct io_kiocb *cur;
- io_for_each_link(cur, req)
- io_prep_async_work(cur);
+ if (req->flags & REQ_F_LINK_TIMEOUT) {
+ struct io_ring_ctx *ctx = req->ctx;
+
+ spin_lock_irq(&ctx->completion_lock);
+ io_for_each_link(cur, req)
+ io_prep_async_work(cur);
+ spin_unlock_irq(&ctx->completion_lock);
+ } else {
+ io_for_each_link(cur, req)
+ io_prep_async_work(cur);
+ }
}
static void io_queue_async_work(struct io_kiocb *req)
@@ -1288,6 +1304,17 @@ static void io_queue_async_work(struct io_kiocb *req)
/* init ->work of the whole link before punting */
io_prep_async_link(req);
+
+ /*
+ * Not expected to happen, but if we do have a bug where this _can_
+ * happen, catch it here and ensure the request is marked as
+ * canceled. That will make io-wq go through the usual work cancel
+ * procedure rather than attempt to run this request (or create a new
+ * worker for it).
+ */
+ if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
+ req->work.flags |= IO_WQ_WORK_CANCEL;
+
trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
&req->work, req->flags);
io_wq_enqueue(tctx->io_wq, &req->work);
@@ -1473,7 +1500,8 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
all_flushed = list_empty(&ctx->cq_overflow_list);
if (all_flushed) {
clear_bit(0, &ctx->check_cq_overflow);
- ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
}
if (posted)
@@ -1552,7 +1580,9 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
}
if (list_empty(&ctx->cq_overflow_list)) {
set_bit(0, &ctx->check_cq_overflow);
- ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
+
}
ocqe->cqe.user_data = user_data;
ocqe->cqe.res = res;
@@ -1929,13 +1959,17 @@ static void tctx_task_work(struct callback_head *cb)
ctx = req->ctx;
percpu_ref_get(&ctx->refs);
}
- req->task_work.func(&req->task_work);
+ req->io_task_work.func(req);
node = next;
}
if (wq_list_empty(&tctx->task_list)) {
+ spin_lock_irq(&tctx->task_lock);
clear_bit(0, &tctx->task_state);
- if (wq_list_empty(&tctx->task_list))
+ if (wq_list_empty(&tctx->task_list)) {
+ spin_unlock_irq(&tctx->task_lock);
break;
+ }
+ spin_unlock_irq(&tctx->task_lock);
/* another tctx_task_work() is enqueued, yield */
if (test_and_set_bit(0, &tctx->task_state))
break;
@@ -1946,17 +1980,13 @@ static void tctx_task_work(struct callback_head *cb)
ctx_flush_and_put(ctx);
}
-static int io_req_task_work_add(struct io_kiocb *req)
+static void io_req_task_work_add(struct io_kiocb *req)
{
struct task_struct *tsk = req->task;
struct io_uring_task *tctx = tsk->io_uring;
enum task_work_notify_mode notify;
- struct io_wq_work_node *node, *prev;
+ struct io_wq_work_node *node;
unsigned long flags;
- int ret = 0;
-
- if (unlikely(tsk->flags & PF_EXITING))
- return -ESRCH;
WARN_ON_ONCE(!tctx);
@@ -1967,7 +1997,9 @@ static int io_req_task_work_add(struct io_kiocb *req)
/* task_work already pending, we're done */
if (test_bit(0, &tctx->task_state) ||
test_and_set_bit(0, &tctx->task_state))
- return 0;
+ return;
+ if (unlikely(tsk->flags & PF_EXITING))
+ goto fail;
/*
* SQPOLL kernel thread doesn't need notification, just a wakeup. For
@@ -1976,72 +2008,28 @@ static int io_req_task_work_add(struct io_kiocb *req)
* will do the job.
*/
notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
-
if (!task_work_add(tsk, &tctx->task_work, notify)) {
wake_up_process(tsk);
- return 0;
+ return;
}
-
- /*
- * Slow path - we failed, find and delete work. if the work is not
- * in the list, it got run and we're fine.
- */
+fail:
+ clear_bit(0, &tctx->task_state);
spin_lock_irqsave(&tctx->task_lock, flags);
- wq_list_for_each(node, prev, &tctx->task_list) {
- if (&req->io_task_work.node == node) {
- wq_list_del(&tctx->task_list, node, prev);
- ret = 1;
- break;
- }
- }
+ node = tctx->task_list.first;
+ INIT_WQ_LIST(&tctx->task_list);
spin_unlock_irqrestore(&tctx->task_lock, flags);
- clear_bit(0, &tctx->task_state);
- return ret;
-}
-
-static bool io_run_task_work_head(struct callback_head **work_head)
-{
- struct callback_head *work, *next;
- bool executed = false;
-
- do {
- work = xchg(work_head, NULL);
- if (!work)
- break;
-
- do {
- next = work->next;
- work->func(work);
- work = next;
- cond_resched();
- } while (work);
- executed = true;
- } while (1);
- return executed;
-}
-
-static void io_task_work_add_head(struct callback_head **work_head,
- struct callback_head *task_work)
-{
- struct callback_head *head;
-
- do {
- head = READ_ONCE(*work_head);
- task_work->next = head;
- } while (cmpxchg(work_head, head, task_work) != head);
-}
-
-static void io_req_task_work_add_fallback(struct io_kiocb *req,
- task_work_func_t cb)
-{
- init_task_work(&req->task_work, cb);
- io_task_work_add_head(&req->ctx->exit_task_work, &req->task_work);
+ while (node) {
+ req = container_of(node, struct io_kiocb, io_task_work.node);
+ node = node->next;
+ if (llist_add(&req->io_task_work.fallback_node,
+ &req->ctx->fallback_llist))
+ schedule_delayed_work(&req->ctx->fallback_work, 1);
+ }
}
-static void io_req_task_cancel(struct callback_head *cb)
+static void io_req_task_cancel(struct io_kiocb *req)
{
- struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
struct io_ring_ctx *ctx = req->ctx;
/* ctx is guaranteed to stay alive while we hold uring_lock */
@@ -2050,41 +2038,36 @@ static void io_req_task_cancel(struct callback_head *cb)
mutex_unlock(&ctx->uring_lock);
}
-static void __io_req_task_submit(struct io_kiocb *req)
+static void io_req_task_submit(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
/* ctx stays valid until unlock, even if we drop all ours ctx->refs */
mutex_lock(&ctx->uring_lock);
- if (!(current->flags & PF_EXITING) && !current->in_execve)
+ if (!(req->task->flags & PF_EXITING) && !req->task->in_execve)
__io_queue_sqe(req);
else
io_req_complete_failed(req, -EFAULT);
mutex_unlock(&ctx->uring_lock);
}
-static void io_req_task_submit(struct callback_head *cb)
-{
- struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
-
- __io_req_task_submit(req);
-}
-
static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{
req->result = ret;
- req->task_work.func = io_req_task_cancel;
-
- if (unlikely(io_req_task_work_add(req)))
- io_req_task_work_add_fallback(req, io_req_task_cancel);
+ req->io_task_work.func = io_req_task_cancel;
+ io_req_task_work_add(req);
}
static void io_req_task_queue(struct io_kiocb *req)
{
- req->task_work.func = io_req_task_submit;
+ req->io_task_work.func = io_req_task_submit;
+ io_req_task_work_add(req);
+}
- if (unlikely(io_req_task_work_add(req)))
- io_req_task_queue_fail(req, -ECANCELED);
+static void io_req_task_queue_reissue(struct io_kiocb *req)
+{
+ req->io_task_work.func = io_queue_async_work;
+ io_req_task_work_add(req);
}
static inline void io_queue_next(struct io_kiocb *req)
@@ -2195,18 +2178,10 @@ static inline void io_put_req(struct io_kiocb *req)
io_free_req(req);
}
-static void io_put_req_deferred_cb(struct callback_head *cb)
-{
- struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
-
- io_free_req(req);
-}
-
static void io_free_req_deferred(struct io_kiocb *req)
{
- req->task_work.func = io_put_req_deferred_cb;
- if (unlikely(io_req_task_work_add(req)))
- io_req_task_work_add_fallback(req, io_put_req_deferred_cb);
+ req->io_task_work.func = io_free_req;
+ io_req_task_work_add(req);
}
static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
@@ -2251,9 +2226,9 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
static inline bool io_run_task_work(void)
{
- if (current->task_works) {
+ if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
__set_current_state(TASK_RUNNING);
- task_work_run();
+ tracehook_notify_signal();
return true;
}
@@ -2264,7 +2239,7 @@ static inline bool io_run_task_work(void)
* Find and free completed poll iocbs
*/
static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
- struct list_head *done)
+ struct list_head *done, bool resubmit)
{
struct req_batch rb;
struct io_kiocb *req;
@@ -2279,11 +2254,11 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
req = list_first_entry(done, struct io_kiocb, inflight_entry);
list_del(&req->inflight_entry);
- if (READ_ONCE(req->result) == -EAGAIN &&
+ if (READ_ONCE(req->result) == -EAGAIN && resubmit &&
!(req->flags & REQ_F_DONT_REISSUE)) {
req->iopoll_completed = 0;
req_ref_get(req);
- io_queue_async_work(req);
+ io_req_task_queue_reissue(req);
continue;
}
@@ -2303,7 +2278,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
}
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
- long min)
+ long min, bool resubmit)
{
struct io_kiocb *req, *tmp;
LIST_HEAD(done);
@@ -2346,7 +2321,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
}
if (!list_empty(&done))
- io_iopoll_complete(ctx, nr_events, &done);
+ io_iopoll_complete(ctx, nr_events, &done, resubmit);
return ret;
}
@@ -2364,7 +2339,7 @@ static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
while (!list_empty(&ctx->iopoll_list)) {
unsigned int nr_events = 0;
- io_do_iopoll(ctx, &nr_events, 0);
+ io_do_iopoll(ctx, &nr_events, 0, false);
/* let it sleep and repeat later if can't complete a request */
if (nr_events == 0)
@@ -2415,14 +2390,18 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
* very same mutex.
*/
if (list_empty(&ctx->iopoll_list)) {
+ u32 tail = ctx->cached_cq_tail;
+
mutex_unlock(&ctx->uring_lock);
io_run_task_work();
mutex_lock(&ctx->uring_lock);
- if (list_empty(&ctx->iopoll_list))
+ /* some requests don't go through iopoll_list */
+ if (tail != ctx->cached_cq_tail ||
+ list_empty(&ctx->iopoll_list))
break;
}
- ret = io_do_iopoll(ctx, &nr_events, min);
+ ret = io_do_iopoll(ctx, &nr_events, min, true);
} while (!ret && nr_events < min && !need_resched());
out:
mutex_unlock(&ctx->uring_lock);
@@ -2472,6 +2451,12 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
*/
if (percpu_ref_is_dying(&ctx->refs))
return false;
+ /*
+ * Play it safe and assume not safe to re-import and reissue if we're
+ * not in the original thread group (or in task context).
+ */
+ if (!same_thread_group(req->task, current) || !in_task())
+ return false;
return true;
}
#else
@@ -2485,6 +2470,19 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
}
#endif
+static void io_fallback_req_func(struct work_struct *work)
+{
+ struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
+ fallback_work.work);
+ struct llist_node *node = llist_del_all(&ctx->fallback_llist);
+ struct io_kiocb *req, *tmp;
+
+ percpu_ref_get(&ctx->refs);
+ llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
+ req->io_task_work.func(req);
+ percpu_ref_put(&ctx->refs);
+}
+
static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
unsigned int issue_flags)
{
@@ -2791,7 +2789,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
req->flags &= ~REQ_F_REISSUE;
if (io_resubmit_prep(req)) {
req_ref_get(req);
- io_queue_async_work(req);
+ io_req_task_queue_reissue(req);
} else {
int cflags = 0;
@@ -4846,14 +4844,13 @@ IO_NETOP_FN(recv);
struct io_poll_table {
struct poll_table_struct pt;
struct io_kiocb *req;
+ int nr_entries;
int error;
};
static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
- __poll_t mask, task_work_func_t func)
+ __poll_t mask, io_req_tw_func_t func)
{
- int ret;
-
/* for instances that support it check for an event match first: */
if (mask && !(mask & poll->events))
return 0;
@@ -4863,7 +4860,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
list_del_init(&poll->wait.entry);
req->result = mask;
- req->task_work.func = func;
+ req->io_task_work.func = func;
/*
* If this fails, then the task is exiting. When a task exits, the
@@ -4871,11 +4868,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
* of executing it. We can't safely execute it anyway, as we may not
* have the needed state needed for it anyway.
*/
- ret = io_req_task_work_add(req);
- if (unlikely(ret)) {
- WRITE_ONCE(poll->canceled, true);
- io_req_task_work_add_fallback(req, func);
- }
+ io_req_task_work_add(req);
return 1;
}
@@ -4884,6 +4877,9 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
{
struct io_ring_ctx *ctx = req->ctx;
+ if (unlikely(req->task->flags & PF_EXITING))
+ WRITE_ONCE(poll->canceled, true);
+
if (!req->result && !READ_ONCE(poll->canceled)) {
struct poll_table_struct pt = { ._key = poll->events };
@@ -4949,7 +4945,6 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
if (req->poll.events & EPOLLONESHOT)
flags = 0;
if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
- io_poll_remove_waitqs(req);
req->poll.done = true;
flags = 0;
}
@@ -4960,9 +4955,8 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
return !(flags & IORING_CQE_F_MORE);
}
-static void io_poll_task_func(struct callback_head *cb)
+static void io_poll_task_func(struct io_kiocb *req)
{
- struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *nxt;
@@ -4973,6 +4967,7 @@ static void io_poll_task_func(struct callback_head *cb)
done = io_poll_complete(req, req->result);
if (done) {
+ io_poll_remove_double(req);
hash_del(&req->hash_node);
} else {
req->result = 0;
@@ -4984,7 +4979,7 @@ static void io_poll_task_func(struct callback_head *cb)
if (done) {
nxt = io_put_req_find_next(req);
if (nxt)
- __io_req_task_submit(nxt);
+ io_req_task_submit(nxt);
}
}
}
@@ -5004,7 +4999,7 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
list_del_init(&wait->entry);
- if (poll && poll->head) {
+ if (poll->head) {
bool done;
spin_lock(&poll->head->lock);
@@ -5043,11 +5038,11 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
struct io_kiocb *req = pt->req;
/*
- * If poll->head is already set, it's because the file being polled
- * uses multiple waitqueues for poll handling (eg one for read, one
- * for write). Setup a separate io_poll_iocb if this happens.
+ * The file being polled uses multiple waitqueues for poll handling
+ * (e.g. one for read, one for write). Setup a separate io_poll_iocb
+ * if this happens.
*/
- if (unlikely(poll->head)) {
+ if (unlikely(pt->nr_entries)) {
struct io_poll_iocb *poll_one = poll;
/* already have a 2nd entry, fail a third attempt */
@@ -5075,7 +5070,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
*poll_ptr = poll;
}
- pt->error = 0;
+ pt->nr_entries++;
poll->head = head;
if (poll->events & EPOLLEXCLUSIVE)
@@ -5093,9 +5088,8 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
}
-static void io_async_task_func(struct callback_head *cb)
+static void io_async_task_func(struct io_kiocb *req)
{
- struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
struct async_poll *apoll = req->apoll;
struct io_ring_ctx *ctx = req->ctx;
@@ -5111,7 +5105,7 @@ static void io_async_task_func(struct callback_head *cb)
spin_unlock_irq(&ctx->completion_lock);
if (!READ_ONCE(apoll->poll.canceled))
- __io_req_task_submit(req);
+ io_req_task_submit(req);
else
io_req_complete_failed(req, -ECANCELED);
}
@@ -5153,11 +5147,16 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
ipt->pt._key = mask;
ipt->req = req;
- ipt->error = -EINVAL;
+ ipt->error = 0;
+ ipt->nr_entries = 0;
mask = vfs_poll(req->file, &ipt->pt) & poll->events;
+ if (unlikely(!ipt->nr_entries) && !ipt->error)
+ ipt->error = -EINVAL;
spin_lock_irq(&ctx->completion_lock);
+ if (ipt->error || (mask && (poll->events & EPOLLONESHOT)))
+ io_poll_remove_double(req);
if (likely(poll->head)) {
spin_lock(&poll->head->lock);
if (unlikely(list_empty(&poll->wait.entry))) {
@@ -5228,7 +5227,6 @@ static int io_arm_poll_handler(struct io_kiocb *req)
ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
io_async_wake);
if (ret || ipt.error) {
- io_poll_remove_double(req);
spin_unlock_irq(&ctx->completion_lock);
if (ret)
return IO_APOLL_READY;
@@ -6068,10 +6066,12 @@ static bool io_drain_req(struct io_kiocb *req)
ret = io_req_prep_async(req);
if (ret)
- return ret;
+ goto fail;
io_prep_async_link(req);
de = kmalloc(sizeof(*de), GFP_KERNEL);
if (!de) {
+ ret = -ENOMEM;
+fail:
io_req_complete_failed(req, ret);
return true;
}
@@ -6809,14 +6809,16 @@ static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
{
/* Tell userspace we may need a wakeup call */
spin_lock_irq(&ctx->completion_lock);
- ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
spin_unlock_irq(&ctx->completion_lock);
}
static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
{
spin_lock_irq(&ctx->completion_lock);
- ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
spin_unlock_irq(&ctx->completion_lock);
}
@@ -6839,7 +6841,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
mutex_lock(&ctx->uring_lock);
if (!list_empty(&ctx->iopoll_list))
- io_do_iopoll(ctx, &nr_events, 0);
+ io_do_iopoll(ctx, &nr_events, 0, true);
/*
* Don't submit if refs are dying, good for io_uring_register(),
@@ -7138,16 +7140,6 @@ static void **io_alloc_page_table(size_t size)
return table;
}
-static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
-{
- spin_lock_bh(&ctx->rsrc_ref_lock);
-}
-
-static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx)
-{
- spin_unlock_bh(&ctx->rsrc_ref_lock);
-}
-
static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
percpu_ref_exit(&ref_node->refs);
@@ -7164,9 +7156,9 @@ static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
rsrc_node->rsrc_data = data_to_kill;
- io_rsrc_ref_lock(ctx);
+ spin_lock_irq(&ctx->rsrc_ref_lock);
list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
- io_rsrc_ref_unlock(ctx);
+ spin_unlock_irq(&ctx->rsrc_ref_lock);
atomic_inc(&data_to_kill->refs);
percpu_ref_kill(&rsrc_node->refs);
@@ -7205,17 +7197,19 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct
/* kill initial ref, already quiesced if zero */
if (atomic_dec_and_test(&data->refs))
break;
+ mutex_unlock(&ctx->uring_lock);
flush_delayed_work(&ctx->rsrc_put_work);
ret = wait_for_completion_interruptible(&data->done);
- if (!ret)
+ if (!ret) {
+ mutex_lock(&ctx->uring_lock);
break;
+ }
atomic_inc(&data->refs);
/* wait for all works potentially completing data->done */
flush_delayed_work(&ctx->rsrc_put_work);
reinit_completion(&data->done);
- mutex_unlock(&ctx->uring_lock);
ret = io_run_task_work_sig();
mutex_lock(&ctx->uring_lock);
} while (ret >= 0);
@@ -7674,9 +7668,10 @@ static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
struct io_ring_ctx *ctx = node->rsrc_data->ctx;
+ unsigned long flags;
bool first_add = false;
- io_rsrc_ref_lock(ctx);
+ spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
node->done = true;
while (!list_empty(&ctx->rsrc_ref_list)) {
@@ -7688,7 +7683,7 @@ static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
list_del(&node->node);
first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
}
- io_rsrc_ref_unlock(ctx);
+ spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
if (first_add)
mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
@@ -7946,15 +7941,19 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
struct io_wq_data data;
unsigned int concurrency;
+ mutex_lock(&ctx->uring_lock);
hash = ctx->hash_map;
if (!hash) {
hash = kzalloc(sizeof(*hash), GFP_KERNEL);
- if (!hash)
+ if (!hash) {
+ mutex_unlock(&ctx->uring_lock);
return ERR_PTR(-ENOMEM);
+ }
refcount_set(&hash->refs, 1);
init_waitqueue_head(&hash->wait);
ctx->hash_map = hash;
}
+ mutex_unlock(&ctx->uring_lock);
data.hash = hash;
data.task = task;
@@ -8028,9 +8027,11 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
f = fdget(p->wq_fd);
if (!f.file)
return -ENXIO;
- fdput(f);
- if (f.file->f_op != &io_uring_fops)
+ if (f.file->f_op != &io_uring_fops) {
+ fdput(f);
return -EINVAL;
+ }
+ fdput(f);
}
if (ctx->flags & IORING_SETUP_SQPOLL) {
struct task_struct *tsk;
@@ -8653,13 +8654,10 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
mutex_unlock(&ctx->uring_lock);
}
-static bool io_wait_rsrc_data(struct io_rsrc_data *data)
+static void io_wait_rsrc_data(struct io_rsrc_data *data)
{
- if (!data)
- return false;
- if (!atomic_dec_and_test(&data->refs))
+ if (data && !atomic_dec_and_test(&data->refs))
wait_for_completion(&data->done);
- return true;
}
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
@@ -8671,10 +8669,14 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
ctx->mm_account = NULL;
}
+ /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
+ io_wait_rsrc_data(ctx->buf_data);
+ io_wait_rsrc_data(ctx->file_data);
+
mutex_lock(&ctx->uring_lock);
- if (io_wait_rsrc_data(ctx->buf_data))
+ if (ctx->buf_data)
__io_sqe_buffers_unregister(ctx);
- if (io_wait_rsrc_data(ctx->file_data))
+ if (ctx->file_data)
__io_sqe_files_unregister(ctx);
if (ctx->rings)
__io_cqring_overflow_flush(ctx, true);
@@ -8767,11 +8769,6 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
return -EINVAL;
}
-static inline bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
-{
- return io_run_task_work_head(&ctx->exit_task_work);
-}
-
struct io_tctx_exit {
struct callback_head task_work;
struct completion completion;
@@ -8837,7 +8834,7 @@ static void io_ring_exit_work(struct work_struct *work)
/*
* Some may use context even when all refs and requests have been put,
* and they are free to do so while still holding uring_lock or
- * completion_lock, see __io_req_task_submit(). Apart from other work,
+ * completion_lock, see io_req_task_submit(). Apart from other work,
* this lock/unlock section also waits them to finish.
*/
mutex_lock(&ctx->uring_lock);
@@ -9036,7 +9033,6 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
ret |= io_kill_timeouts(ctx, task, cancel_all);
if (task)
ret |= io_run_task_work();
- ret |= io_run_ctx_fallback(ctx);
if (!ret)
break;
cond_resched();
@@ -9376,9 +9372,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
if (ctx->flags & IORING_SETUP_SQPOLL) {
io_cqring_overflow_flush(ctx, false);
- ret = -EOWNERDEAD;
- if (unlikely(ctx->sq_data->thread == NULL))
+ if (unlikely(ctx->sq_data->thread == NULL)) {
+ ret = -EOWNERDEAD;
goto out;
+ }
if (flags & IORING_ENTER_SQ_WAKEUP)
wake_up(&ctx->sq_data->wait);
if (flags & IORING_ENTER_SQ_WAIT) {
@@ -9846,10 +9843,11 @@ static int io_register_personality(struct io_ring_ctx *ctx)
ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
- if (!ret)
- return id;
- put_cred(creds);
- return ret;
+ if (ret < 0) {
+ put_cred(creds);
+ return ret;
+ }
+ return id;
}
static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 0065781935c7..87ccb3438bec 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -215,6 +215,7 @@ iomap_read_inline_data(struct inode *inode, struct page *page,
if (PageUptodate(page))
return;
+ BUG_ON(page_has_private(page));
BUG_ON(page->index);
BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
@@ -239,7 +240,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
{
struct iomap_readpage_ctx *ctx = data;
struct page *page = ctx->cur_page;
- struct iomap_page *iop = iomap_page_create(inode, page);
+ struct iomap_page *iop;
bool same_page = false, is_contig = false;
loff_t orig_pos = pos;
unsigned poff, plen;
@@ -252,6 +253,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
}
/* zero post-eof blocks as the page may be mapped */
+ iop = iomap_page_create(inode, page);
iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen);
if (plen == 0)
goto done;
@@ -746,10 +748,6 @@ again:
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
- *
- * Not only is this an optimisation, but it is also required
- * to check that the address is actually valid, when atomic
- * usercopies are used, below.
*/
if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
status = -EFAULT;
@@ -764,30 +762,29 @@ again:
if (mapping_writably_mapped(inode->i_mapping))
flush_dcache_page(page);
- copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+ copied = copy_page_from_iter_atomic(page, offset, bytes, i);
- copied = iomap_write_end(inode, pos, bytes, copied, page, iomap,
+ status = iomap_write_end(inode, pos, bytes, copied, page, iomap,
srcmap);
- cond_resched();
+ if (unlikely(copied != status))
+ iov_iter_revert(i, copied - status);
- iov_iter_advance(i, copied);
- if (unlikely(copied == 0)) {
+ cond_resched();
+ if (unlikely(status == 0)) {
/*
- * If we were unable to copy any data at all, we must
- * fall back to a single segment length write.
- *
- * If we didn't fallback here, we could livelock
- * because not all segments in the iov can be copied at
- * once without a pagefault.
+ * A short copy made iomap_write_end() reject the
+ * thing entirely. Might be memory poisoning
+ * halfway through, might be a race with munmap,
+ * might be severe memory pressure.
*/
- bytes = min_t(unsigned long, PAGE_SIZE - offset,
- iov_iter_single_seg_count(i));
+ if (copied)
+ bytes = copied;
goto again;
}
- pos += copied;
- written += copied;
- length -= copied;
+ pos += status;
+ written += status;
+ length -= status;
balance_dirty_pages_ratelimited(inode->i_mapping);
} while (iov_iter_count(i) && length);
@@ -972,7 +969,6 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
block_commit_write(page, 0, length);
} else {
WARN_ON_ONCE(!PageUptodate(page));
- iomap_page_create(inode, page);
set_page_dirty(page);
}
@@ -1309,14 +1305,13 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct inode *inode,
struct page *page, u64 end_offset)
{
- struct iomap_page *iop = to_iomap_page(page);
+ struct iomap_page *iop = iomap_page_create(inode, page);
struct iomap_ioend *ioend, *next;
unsigned len = i_blocksize(inode);
u64 file_offset; /* file offset of page */
int error = 0, count = 0, i;
LIST_HEAD(submit_list);
- WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop);
WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0);
/*
diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c
index dab1b02eba5b..ce6fb810854f 100644
--- a/fs/iomap/seek.c
+++ b/fs/iomap/seek.c
@@ -35,23 +35,20 @@ loff_t
iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
{
loff_t size = i_size_read(inode);
- loff_t length = size - offset;
loff_t ret;
/* Nothing to be found before or beyond the end of the file. */
if (offset < 0 || offset >= size)
return -ENXIO;
- while (length > 0) {
- ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
- &offset, iomap_seek_hole_actor);
+ while (offset < size) {
+ ret = iomap_apply(inode, offset, size - offset, IOMAP_REPORT,
+ ops, &offset, iomap_seek_hole_actor);
if (ret < 0)
return ret;
if (ret == 0)
break;
-
offset += ret;
- length -= ret;
}
return offset;
@@ -83,27 +80,23 @@ loff_t
iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
{
loff_t size = i_size_read(inode);
- loff_t length = size - offset;
loff_t ret;
/* Nothing to be found before or beyond the end of the file. */
if (offset < 0 || offset >= size)
return -ENXIO;
- while (length > 0) {
- ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
- &offset, iomap_seek_data_actor);
+ while (offset < size) {
+ ret = iomap_apply(inode, offset, size - offset, IOMAP_REPORT,
+ ops, &offset, iomap_seek_data_actor);
if (ret < 0)
return ret;
if (ret == 0)
- break;
-
+ return offset;
offset += ret;
- length -= ret;
}
- if (length <= 0)
- return -ENXIO;
- return offset;
+ /* We've reached the end of the file without finding data */
+ return -ENXIO;
}
EXPORT_SYMBOL_GPL(iomap_seek_data);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 51d1eb2ffeb9..746132998c57 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -701,7 +701,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
__buffer_unlink(jh);
jh->b_cp_transaction = NULL;
- percpu_counter_dec(&journal->j_jh_shrink_count);
+ percpu_counter_dec(&journal->j_checkpoint_jh_count);
jbd2_journal_put_journal_head(jh);
/* Is this transaction empty? */
@@ -764,7 +764,7 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
jh->b_cpnext->b_cpprev = jh;
}
transaction->t_checkpoint_list = jh;
- percpu_counter_inc(&transaction->t_journal->j_jh_shrink_count);
+ percpu_counter_inc(&transaction->t_journal->j_checkpoint_jh_count);
}
/*
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 152880c298ca..35302bc192eb 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1283,6 +1283,48 @@ static int jbd2_min_tag_size(void)
return sizeof(journal_block_tag_t) - 4;
}
+/**
+ * jbd2_journal_shrink_scan()
+ *
+ * Scan the checkpointed buffer on the checkpoint list and release the
+ * journal_head.
+ */
+static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ journal_t *journal = container_of(shrink, journal_t, j_shrinker);
+ unsigned long nr_to_scan = sc->nr_to_scan;
+ unsigned long nr_shrunk;
+ unsigned long count;
+
+ count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
+ trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count);
+
+ nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan);
+
+ count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
+ trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count);
+
+ return nr_shrunk;
+}
+
+/**
+ * jbd2_journal_shrink_count()
+ *
+ * Count the number of checkpoint buffers on the checkpoint list.
+ */
+static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ journal_t *journal = container_of(shrink, journal_t, j_shrinker);
+ unsigned long count;
+
+ count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
+ trace_jbd2_shrink_count(journal, sc->nr_to_scan, count);
+
+ return count;
+}
+
/*
* Management for journal control blocks: functions to create and
* destroy journal_t structures, and to initialise and read existing
@@ -1361,9 +1403,23 @@ static journal_t *journal_init_common(struct block_device *bdev,
journal->j_sb_buffer = bh;
journal->j_superblock = (journal_superblock_t *)bh->b_data;
+ journal->j_shrink_transaction = NULL;
+ journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan;
+ journal->j_shrinker.count_objects = jbd2_journal_shrink_count;
+ journal->j_shrinker.seeks = DEFAULT_SEEKS;
+ journal->j_shrinker.batch = journal->j_max_transaction_buffers;
+
+ if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL))
+ goto err_cleanup;
+
+ if (register_shrinker(&journal->j_shrinker)) {
+ percpu_counter_destroy(&journal->j_checkpoint_jh_count);
+ goto err_cleanup;
+ }
return journal;
err_cleanup:
+ brelse(journal->j_sb_buffer);
kfree(journal->j_wbuf);
jbd2_journal_destroy_revoke(journal);
kfree(journal);
@@ -2051,93 +2107,6 @@ recovery_error:
}
/**
- * jbd2_journal_shrink_scan()
- *
- * Scan the checkpointed buffer on the checkpoint list and release the
- * journal_head.
- */
-static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- journal_t *journal = container_of(shrink, journal_t, j_shrinker);
- unsigned long nr_to_scan = sc->nr_to_scan;
- unsigned long nr_shrunk;
- unsigned long count;
-
- count = percpu_counter_read_positive(&journal->j_jh_shrink_count);
- trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count);
-
- nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan);
-
- count = percpu_counter_read_positive(&journal->j_jh_shrink_count);
- trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count);
-
- return nr_shrunk;
-}
-
-/**
- * jbd2_journal_shrink_count()
- *
- * Count the number of checkpoint buffers on the checkpoint list.
- */
-static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- journal_t *journal = container_of(shrink, journal_t, j_shrinker);
- unsigned long count;
-
- count = percpu_counter_read_positive(&journal->j_jh_shrink_count);
- trace_jbd2_shrink_count(journal, sc->nr_to_scan, count);
-
- return count;
-}
-
-/**
- * jbd2_journal_register_shrinker()
- * @journal: Journal to act on.
- *
- * Init a percpu counter to record the checkpointed buffers on the checkpoint
- * list and register a shrinker to release their journal_head.
- */
-int jbd2_journal_register_shrinker(journal_t *journal)
-{
- int err;
-
- journal->j_shrink_transaction = NULL;
-
- err = percpu_counter_init(&journal->j_jh_shrink_count, 0, GFP_KERNEL);
- if (err)
- return err;
-
- journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan;
- journal->j_shrinker.count_objects = jbd2_journal_shrink_count;
- journal->j_shrinker.seeks = DEFAULT_SEEKS;
- journal->j_shrinker.batch = journal->j_max_transaction_buffers;
-
- err = register_shrinker(&journal->j_shrinker);
- if (err) {
- percpu_counter_destroy(&journal->j_jh_shrink_count);
- return err;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(jbd2_journal_register_shrinker);
-
-/**
- * jbd2_journal_unregister_shrinker()
- * @journal: Journal to act on.
- *
- * Unregister the checkpointed buffer shrinker and destroy the percpu counter.
- */
-void jbd2_journal_unregister_shrinker(journal_t *journal)
-{
- percpu_counter_destroy(&journal->j_jh_shrink_count);
- unregister_shrinker(&journal->j_shrinker);
-}
-EXPORT_SYMBOL(jbd2_journal_unregister_shrinker);
-
-/**
* jbd2_journal_destroy() - Release a journal_t structure.
* @journal: Journal to act on.
*
@@ -2209,8 +2178,10 @@ int jbd2_journal_destroy(journal_t *journal)
brelse(journal->j_sb_buffer);
}
- jbd2_journal_unregister_shrinker(journal);
-
+ if (journal->j_shrinker.flags & SHRINKER_REGISTERED) {
+ percpu_counter_destroy(&journal->j_checkpoint_jh_count);
+ unregister_shrinker(&journal->j_shrinker);
+ }
if (journal->j_proc_entry)
jbd2_stats_proc_exit(journal);
iput(journal->j_inode);
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 3663dd5a23bc..57ab424c05ff 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -151,7 +151,8 @@ void jfs_evict_inode(struct inode *inode)
if (test_cflag(COMMIT_Freewmap, inode))
jfs_free_zero_link(inode);
- diFree(inode);
+ if (JFS_SBI(inode->i_sb)->ipimap)
+ diFree(inode);
/*
* Free the inode from the quota allocation.
diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h
index d6af79e94263..6b231d0d0071 100644
--- a/fs/jfs/jfs_dinode.h
+++ b/fs/jfs/jfs_dinode.h
@@ -101,7 +101,6 @@ struct dinode {
u8 unused[16]; /* 16: */
dxd_t _dxd; /* 16: */
union {
- __le32 _rdev; /* 4: */
/*
* The fast symlink area
* is expected to overflow
@@ -109,9 +108,15 @@ struct dinode {
* needed (which will clear
* INLINEEA).
*/
- u8 _fastsymlink[128];
- } _u;
- u8 _inlineea[128];
+ struct {
+ union {
+ __le32 _rdev; /* 4: */
+ u8 _fastsymlink[128];
+ } _u;
+ u8 _inlineea[128];
+ };
+ u8 _inline_all[256];
+ };
} _special;
} _u2;
} _file;
@@ -122,6 +127,7 @@ struct dinode {
#define di_rdev u._file._u2._special._u._rdev
#define di_fastsymlink u._file._u2._special._u._fastsymlink
#define di_inlineea u._file._u2._special._inlineea
+#define di_inline_all u._file._u2._special._inline_all
} u;
};
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 7aee15608619..91f4ec93dab1 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -3660,7 +3660,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
* (the leftmost ag with average free space in it);
*/
//agpref:
- /* get the number of active ags and inacitve ags */
+ /* get the number of active ags and inactive ags */
actags = bmp->db_maxag + 1;
inactags = bmp->db_numag - actags;
ag_rem = bmp->db_mapsize & (bmp->db_agsize - 1); /* ??? */
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 937ca07b58b1..799d3837e7c2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -103,10 +103,8 @@ int diMount(struct inode *ipimap)
*/
/* allocate the in-memory inode map control structure. */
imap = kmalloc(sizeof(struct inomap), GFP_KERNEL);
- if (imap == NULL) {
- jfs_err("diMount: kmalloc returned NULL!");
+ if (imap == NULL)
return -ENOMEM;
- }
/* read the on-disk inode map control structure. */
@@ -763,7 +761,7 @@ int diWrite(tid_t tid, struct inode *ip)
lv = & dilinelock->lv[dilinelock->index];
lv->offset = (dioffset + 2 * 128) >> L2INODESLOTSIZE;
lv->length = 2;
- memcpy(&dp->di_fastsymlink, jfs_ip->i_inline, IDATASIZE);
+ memcpy(&dp->di_inline_all, jfs_ip->i_inline_all, IDATASIZE);
dilinelock->index++;
}
/*
@@ -3084,7 +3082,7 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip)
}
if (S_ISDIR(ip->i_mode)) {
- memcpy(&jfs_ip->i_dirtable, &dip->di_dirtable, 384);
+ memcpy(&jfs_ip->u.dir, &dip->u._dir, 384);
} else if (S_ISREG(ip->i_mode) || S_ISLNK(ip->i_mode)) {
memcpy(&jfs_ip->i_xtroot, &dip->di_xtroot, 288);
} else
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index a466ec41cfbb..721def69e732 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -77,11 +77,18 @@ struct jfs_inode_info {
unchar _unused[16]; /* 16: */
dxd_t _dxd; /* 16: */
/* _inline may overflow into _inline_ea when needed */
- unchar _inline[128]; /* 128: inline symlink */
/* _inline_ea may overlay the last part of
* file._xtroot if maxentry = XTROOTINITSLOT
*/
- unchar _inline_ea[128]; /* 128: inline extended attr */
+ union {
+ struct {
+ /* 128: inline symlink */
+ unchar _inline[128];
+ /* 128: inline extended attr */
+ unchar _inline_ea[128];
+ };
+ unchar _inline_all[256];
+ };
} link;
} u;
#ifdef CONFIG_QUOTA
@@ -96,6 +103,7 @@ struct jfs_inode_info {
#define i_dtroot u.dir._dtroot
#define i_inline u.link._inline
#define i_inline_ea u.link._inline_ea
+#define i_inline_all u.link._inline_all
#define IREAD_LOCK(ip, subclass) \
down_read_nested(&JFS_IP(ip)->rdwrlock, subclass)
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 9330eff210e0..78fd136ac13b 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1324,6 +1324,7 @@ int lmLogInit(struct jfs_log * log)
} else {
if (!uuid_equal(&logsuper->uuid, &log->uuid)) {
jfs_warn("wrong uuid on JFS log device");
+ rc = -EINVAL;
goto errout20;
}
log->size = le32_to_cpu(logsuper->size);
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 053295cd7bc6..042bbe6d8ac2 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -105,7 +105,7 @@ static DEFINE_SPINLOCK(jfsTxnLock);
#define TXN_LOCK() spin_lock(&jfsTxnLock)
#define TXN_UNLOCK() spin_unlock(&jfsTxnLock)
-#define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock);
+#define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock)
#define LAZY_LOCK(flags) spin_lock_irqsave(&TxAnchor.LazyLock, flags)
#define LAZY_UNLOCK(flags) spin_unlock_irqrestore(&TxAnchor.LazyLock, flags)
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 1f0ffabbde56..9030aeaf0f88 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -939,7 +939,8 @@ static int __init init_jfs_fs(void)
jfs_inode_cachep =
kmem_cache_create_usercopy("jfs_ip", sizeof(struct jfs_inode_info),
0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
- offsetof(struct jfs_inode_info, i_inline), IDATASIZE,
+ offsetof(struct jfs_inode_info, i_inline_all),
+ sizeof_field(struct jfs_inode_info, i_inline_all),
init_once);
if (jfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index 90d255fbdd9b..87aac4c72c37 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -160,7 +160,7 @@ int kernel_read_file_from_path_initns(const char *path, loff_t offset,
get_fs_root(init_task.fs, &root);
task_unlock(&init_task);
- file = file_open_root(root.dentry, root.mnt, path, O_RDONLY, 0);
+ file = file_open_root(&root, path, O_RDONLY, 0);
path_put(&root);
if (IS_ERR(file))
return PTR_ERR(file);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 7e0e62deab53..33166ec90a11 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -548,49 +548,6 @@ void kernfs_put(struct kernfs_node *kn)
}
EXPORT_SYMBOL_GPL(kernfs_put);
-static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
-{
- struct kernfs_node *kn;
-
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
- /* Always perform fresh lookup for negatives */
- if (d_really_is_negative(dentry))
- goto out_bad_unlocked;
-
- kn = kernfs_dentry_node(dentry);
- mutex_lock(&kernfs_mutex);
-
- /* The kernfs node has been deactivated */
- if (!kernfs_active(kn))
- goto out_bad;
-
- /* The kernfs node has been moved? */
- if (kernfs_dentry_node(dentry->d_parent) != kn->parent)
- goto out_bad;
-
- /* The kernfs node has been renamed */
- if (strcmp(dentry->d_name.name, kn->name) != 0)
- goto out_bad;
-
- /* The kernfs node has been moved to a different namespace */
- if (kn->parent && kernfs_ns_enabled(kn->parent) &&
- kernfs_info(dentry->d_sb)->ns != kn->ns)
- goto out_bad;
-
- mutex_unlock(&kernfs_mutex);
- return 1;
-out_bad:
- mutex_unlock(&kernfs_mutex);
-out_bad_unlocked:
- return 0;
-}
-
-const struct dentry_operations kernfs_dops = {
- .d_revalidate = kernfs_dop_revalidate,
-};
-
/**
* kernfs_node_from_dentry - determine kernfs_node associated with a dentry
* @dentry: the dentry in question
@@ -1073,6 +1030,49 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
return ERR_PTR(rc);
}
+static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct kernfs_node *kn;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ /* Always perform fresh lookup for negatives */
+ if (d_really_is_negative(dentry))
+ goto out_bad_unlocked;
+
+ kn = kernfs_dentry_node(dentry);
+ mutex_lock(&kernfs_mutex);
+
+ /* The kernfs node has been deactivated */
+ if (!kernfs_active(kn))
+ goto out_bad;
+
+ /* The kernfs node has been moved? */
+ if (kernfs_dentry_node(dentry->d_parent) != kn->parent)
+ goto out_bad;
+
+ /* The kernfs node has been renamed */
+ if (strcmp(dentry->d_name.name, kn->name) != 0)
+ goto out_bad;
+
+ /* The kernfs node has been moved to a different namespace */
+ if (kn->parent && kernfs_ns_enabled(kn->parent) &&
+ kernfs_info(dentry->d_sb)->ns != kn->ns)
+ goto out_bad;
+
+ mutex_unlock(&kernfs_mutex);
+ return 1;
+out_bad:
+ mutex_unlock(&kernfs_mutex);
+out_bad_unlocked:
+ return 0;
+}
+
+const struct dentry_operations kernfs_dops = {
+ .d_revalidate = kernfs_dop_revalidate,
+};
+
static struct dentry *kernfs_iop_lookup(struct inode *dir,
struct dentry *dentry,
unsigned int flags)
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 1a639e34847d..2de048f80eb8 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -766,6 +766,46 @@ static void __exit exit_nlm(void)
module_init(init_nlm);
module_exit(exit_nlm);
+/**
+ * nlmsvc_dispatch - Process an NLM Request
+ * @rqstp: incoming request
+ * @statp: pointer to location of accept_stat field in RPC Reply buffer
+ *
+ * Return values:
+ * %0: Processing complete; do not send a Reply
+ * %1: Processing complete; send Reply in rqstp->rq_res
+ */
+static int nlmsvc_dispatch(struct svc_rqst *rqstp, __be32 *statp)
+{
+ const struct svc_procedure *procp = rqstp->rq_procinfo;
+ struct kvec *argv = rqstp->rq_arg.head;
+ struct kvec *resv = rqstp->rq_res.head;
+
+ svcxdr_init_decode(rqstp);
+ if (!procp->pc_decode(rqstp, argv->iov_base))
+ goto out_decode_err;
+
+ *statp = procp->pc_func(rqstp);
+ if (*statp == rpc_drop_reply)
+ return 0;
+ if (*statp != rpc_success)
+ return 1;
+
+ svcxdr_init_encode(rqstp);
+ if (!procp->pc_encode(rqstp, resv->iov_base + resv->iov_len))
+ goto out_encode_err;
+
+ return 1;
+
+out_decode_err:
+ *statp = rpc_garbage_args;
+ return 1;
+
+out_encode_err:
+ *statp = rpc_system_err;
+ return 1;
+}
+
/*
* Define NLM program and procedures
*/
@@ -775,6 +815,7 @@ static const struct svc_version nlmsvc_version1 = {
.vs_nproc = 17,
.vs_proc = nlmsvc_procedures,
.vs_count = nlmsvc_version1_count,
+ .vs_dispatch = nlmsvc_dispatch,
.vs_xdrsize = NLMSVC_XDRSIZE,
};
static unsigned int nlmsvc_version3_count[24];
@@ -783,6 +824,7 @@ static const struct svc_version nlmsvc_version3 = {
.vs_nproc = 24,
.vs_proc = nlmsvc_procedures,
.vs_count = nlmsvc_version3_count,
+ .vs_dispatch = nlmsvc_dispatch,
.vs_xdrsize = NLMSVC_XDRSIZE,
};
#ifdef CONFIG_LOCKD_V4
@@ -792,6 +834,7 @@ static const struct svc_version nlmsvc_version4 = {
.vs_nproc = 24,
.vs_proc = nlmsvc_procedures4,
.vs_count = nlmsvc_version4_count,
+ .vs_dispatch = nlmsvc_dispatch,
.vs_xdrsize = NLMSVC_XDRSIZE,
};
#endif
diff --git a/fs/lockd/svcxdr.h b/fs/lockd/svcxdr.h
new file mode 100644
index 000000000000..c69a0bb76c94
--- /dev/null
+++ b/fs/lockd/svcxdr.h
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Encode/decode NLM basic data types
+ *
+ * Basic NLMv3 XDR data types are not defined in an IETF standards
+ * document. X/Open has a description of these data types that
+ * is useful. See Chapter 10 of "Protocols for Interworking:
+ * XNFS, Version 3W".
+ *
+ * Basic NLMv4 XDR data types are defined in Appendix II.1.4 of
+ * RFC 1813: "NFS Version 3 Protocol Specification".
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2020, Oracle and/or its affiliates.
+ */
+
+#ifndef _LOCKD_SVCXDR_H_
+#define _LOCKD_SVCXDR_H_
+
+static inline bool
+svcxdr_decode_stats(struct xdr_stream *xdr, __be32 *status)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, XDR_UNIT);
+ if (!p)
+ return false;
+ *status = *p;
+
+ return true;
+}
+
+static inline bool
+svcxdr_encode_stats(struct xdr_stream *xdr, __be32 status)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, XDR_UNIT);
+ if (!p)
+ return false;
+ *p = status;
+
+ return true;
+}
+
+static inline bool
+svcxdr_decode_string(struct xdr_stream *xdr, char **data, unsigned int *data_len)
+{
+ __be32 *p;
+ u32 len;
+
+ if (xdr_stream_decode_u32(xdr, &len) < 0)
+ return false;
+ if (len > NLM_MAXSTRLEN)
+ return false;
+ p = xdr_inline_decode(xdr, len);
+ if (!p)
+ return false;
+ *data_len = len;
+ *data = (char *)p;
+
+ return true;
+}
+
+/*
+ * NLM cookies are defined by specification to be a variable-length
+ * XDR opaque no longer than 1024 bytes. However, this implementation
+ * limits their length to 32 bytes, and treats zero-length cookies
+ * specially.
+ */
+static inline bool
+svcxdr_decode_cookie(struct xdr_stream *xdr, struct nlm_cookie *cookie)
+{
+ __be32 *p;
+ u32 len;
+
+ if (xdr_stream_decode_u32(xdr, &len) < 0)
+ return false;
+ if (len > NLM_MAXCOOKIELEN)
+ return false;
+ if (!len)
+ goto out_hpux;
+
+ p = xdr_inline_decode(xdr, len);
+ if (!p)
+ return false;
+ cookie->len = len;
+ memcpy(cookie->data, p, len);
+
+ return true;
+
+ /* apparently HPUX can return empty cookies */
+out_hpux:
+ cookie->len = 4;
+ memset(cookie->data, 0, 4);
+ return true;
+}
+
+static inline bool
+svcxdr_encode_cookie(struct xdr_stream *xdr, const struct nlm_cookie *cookie)
+{
+ __be32 *p;
+
+ if (xdr_stream_encode_u32(xdr, cookie->len) < 0)
+ return false;
+ p = xdr_reserve_space(xdr, cookie->len);
+ if (!p)
+ return false;
+ memcpy(p, cookie->data, cookie->len);
+
+ return true;
+}
+
+static inline bool
+svcxdr_decode_owner(struct xdr_stream *xdr, struct xdr_netobj *obj)
+{
+ __be32 *p;
+ u32 len;
+
+ if (xdr_stream_decode_u32(xdr, &len) < 0)
+ return false;
+ if (len > XDR_MAX_NETOBJ)
+ return false;
+ p = xdr_inline_decode(xdr, len);
+ if (!p)
+ return false;
+ obj->len = len;
+ obj->data = (u8 *)p;
+
+ return true;
+}
+
+static inline bool
+svcxdr_encode_owner(struct xdr_stream *xdr, const struct xdr_netobj *obj)
+{
+ unsigned int quadlen = XDR_QUADLEN(obj->len);
+ __be32 *p;
+
+ if (xdr_stream_encode_u32(xdr, obj->len) < 0)
+ return false;
+ p = xdr_reserve_space(xdr, obj->len);
+ if (!p)
+ return false;
+ p[quadlen - 1] = 0; /* XDR pad */
+ memcpy(p, obj->data, obj->len);
+
+ return true;
+}
+
+#endif /* _LOCKD_SVCXDR_H_ */
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 982629f7b120..9235e60b1769 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -19,7 +19,7 @@
#include <uapi/linux/nfs2.h>
-#define NLMDBG_FACILITY NLMDBG_XDR
+#include "svcxdr.h"
static inline loff_t
@@ -42,311 +42,323 @@ loff_t_to_s32(loff_t offset)
}
/*
- * XDR functions for basic NLM types
+ * NLM file handles are defined by specification to be a variable-length
+ * XDR opaque no longer than 1024 bytes. However, this implementation
+ * constrains their length to exactly the length of an NFSv2 file
+ * handle.
*/
-static __be32 *nlm_decode_cookie(__be32 *p, struct nlm_cookie *c)
+static bool
+svcxdr_decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)
{
- unsigned int len;
-
- len = ntohl(*p++);
-
- if(len==0)
- {
- c->len=4;
- memset(c->data, 0, 4); /* hockeypux brain damage */
- }
- else if(len<=NLM_MAXCOOKIELEN)
- {
- c->len=len;
- memcpy(c->data, p, len);
- p+=XDR_QUADLEN(len);
- }
- else
- {
- dprintk("lockd: bad cookie size %d (only cookies under "
- "%d bytes are supported.)\n",
- len, NLM_MAXCOOKIELEN);
- return NULL;
- }
- return p;
-}
-
-static inline __be32 *
-nlm_encode_cookie(__be32 *p, struct nlm_cookie *c)
-{
- *p++ = htonl(c->len);
- memcpy(p, c->data, c->len);
- p+=XDR_QUADLEN(c->len);
- return p;
-}
-
-static __be32 *
-nlm_decode_fh(__be32 *p, struct nfs_fh *f)
-{
- unsigned int len;
-
- if ((len = ntohl(*p++)) != NFS2_FHSIZE) {
- dprintk("lockd: bad fhandle size %d (should be %d)\n",
- len, NFS2_FHSIZE);
- return NULL;
- }
- f->size = NFS2_FHSIZE;
- memset(f->data, 0, sizeof(f->data));
- memcpy(f->data, p, NFS2_FHSIZE);
- return p + XDR_QUADLEN(NFS2_FHSIZE);
-}
-
-/*
- * Encode and decode owner handle
- */
-static inline __be32 *
-nlm_decode_oh(__be32 *p, struct xdr_netobj *oh)
-{
- return xdr_decode_netobj(p, oh);
-}
-
-static inline __be32 *
-nlm_encode_oh(__be32 *p, struct xdr_netobj *oh)
-{
- return xdr_encode_netobj(p, oh);
+ __be32 *p;
+ u32 len;
+
+ if (xdr_stream_decode_u32(xdr, &len) < 0)
+ return false;
+ if (len != NFS2_FHSIZE)
+ return false;
+
+ p = xdr_inline_decode(xdr, len);
+ if (!p)
+ return false;
+ fh->size = NFS2_FHSIZE;
+ memcpy(fh->data, p, len);
+ memset(fh->data + NFS2_FHSIZE, 0, sizeof(fh->data) - NFS2_FHSIZE);
+
+ return true;
}
-static __be32 *
-nlm_decode_lock(__be32 *p, struct nlm_lock *lock)
+static bool
+svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock)
{
- struct file_lock *fl = &lock->fl;
- s32 start, len, end;
-
- if (!(p = xdr_decode_string_inplace(p, &lock->caller,
- &lock->len,
- NLM_MAXSTRLEN))
- || !(p = nlm_decode_fh(p, &lock->fh))
- || !(p = nlm_decode_oh(p, &lock->oh)))
- return NULL;
- lock->svid = ntohl(*p++);
+ struct file_lock *fl = &lock->fl;
+ s32 start, len, end;
+
+ if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
+ return false;
+ if (!svcxdr_decode_fhandle(xdr, &lock->fh))
+ return false;
+ if (!svcxdr_decode_owner(xdr, &lock->oh))
+ return false;
+ if (xdr_stream_decode_u32(xdr, &lock->svid) < 0)
+ return false;
+ if (xdr_stream_decode_u32(xdr, &start) < 0)
+ return false;
+ if (xdr_stream_decode_u32(xdr, &len) < 0)
+ return false;
locks_init_lock(fl);
fl->fl_flags = FL_POSIX;
- fl->fl_type = F_RDLCK; /* as good as anything else */
- start = ntohl(*p++);
- len = ntohl(*p++);
+ fl->fl_type = F_RDLCK;
end = start + len - 1;
-
fl->fl_start = s32_to_loff_t(start);
-
if (len == 0 || end < 0)
fl->fl_end = OFFSET_MAX;
else
fl->fl_end = s32_to_loff_t(end);
- return p;
+
+ return true;
}
-/*
- * Encode result of a TEST/TEST_MSG call
- */
-static __be32 *
-nlm_encode_testres(__be32 *p, struct nlm_res *resp)
+static bool
+svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock)
{
- s32 start, len;
-
- if (!(p = nlm_encode_cookie(p, &resp->cookie)))
- return NULL;
- *p++ = resp->status;
-
- if (resp->status == nlm_lck_denied) {
- struct file_lock *fl = &resp->lock.fl;
-
- *p++ = (fl->fl_type == F_RDLCK)? xdr_zero : xdr_one;
- *p++ = htonl(resp->lock.svid);
-
- /* Encode owner handle. */
- if (!(p = xdr_encode_netobj(p, &resp->lock.oh)))
- return NULL;
+ const struct file_lock *fl = &lock->fl;
+ s32 start, len;
+
+ /* exclusive */
+ if (xdr_stream_encode_bool(xdr, fl->fl_type != F_RDLCK) < 0)
+ return false;
+ if (xdr_stream_encode_u32(xdr, lock->svid) < 0)
+ return false;
+ if (!svcxdr_encode_owner(xdr, &lock->oh))
+ return false;
+ start = loff_t_to_s32(fl->fl_start);
+ if (fl->fl_end == OFFSET_MAX)
+ len = 0;
+ else
+ len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
+ if (xdr_stream_encode_u32(xdr, start) < 0)
+ return false;
+ if (xdr_stream_encode_u32(xdr, len) < 0)
+ return false;
- start = loff_t_to_s32(fl->fl_start);
- if (fl->fl_end == OFFSET_MAX)
- len = 0;
- else
- len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
+ return true;
+}
- *p++ = htonl(start);
- *p++ = htonl(len);
+static bool
+svcxdr_encode_testrply(struct xdr_stream *xdr, const struct nlm_res *resp)
+{
+ if (!svcxdr_encode_stats(xdr, resp->status))
+ return false;
+ switch (resp->status) {
+ case nlm_lck_denied:
+ if (!svcxdr_encode_holder(xdr, &resp->lock))
+ return false;
}
- return p;
+ return true;
}
/*
- * First, the server side XDR functions
+ * Decode Call arguments
*/
+
+int
+nlmsvc_decode_void(struct svc_rqst *rqstp, __be32 *p)
+{
+ return 1;
+}
+
int
nlmsvc_decode_testargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
- u32 exclusive;
+ u32 exclusive;
- if (!(p = nlm_decode_cookie(p, &argp->cookie)))
+ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
return 0;
-
- exclusive = ntohl(*p++);
- if (!(p = nlm_decode_lock(p, &argp->lock)))
+ if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
+ return 0;
+ if (!svcxdr_decode_lock(xdr, &argp->lock))
return 0;
if (exclusive)
argp->lock.fl.fl_type = F_WRLCK;
- return xdr_argsize_check(rqstp, p);
-}
-
-int
-nlmsvc_encode_testres(struct svc_rqst *rqstp, __be32 *p)
-{
- struct nlm_res *resp = rqstp->rq_resp;
-
- if (!(p = nlm_encode_testres(p, resp)))
- return 0;
- return xdr_ressize_check(rqstp, p);
+ return 1;
}
int
nlmsvc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
- u32 exclusive;
+ u32 exclusive;
- if (!(p = nlm_decode_cookie(p, &argp->cookie)))
+ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
+ return 0;
+ if (xdr_stream_decode_bool(xdr, &argp->block) < 0)
return 0;
- argp->block = ntohl(*p++);
- exclusive = ntohl(*p++);
- if (!(p = nlm_decode_lock(p, &argp->lock)))
+ if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
+ return 0;
+ if (!svcxdr_decode_lock(xdr, &argp->lock))
return 0;
if (exclusive)
argp->lock.fl.fl_type = F_WRLCK;
- argp->reclaim = ntohl(*p++);
- argp->state = ntohl(*p++);
+ if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0)
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
+ return 0;
argp->monitor = 1; /* monitor client by default */
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
int
nlmsvc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
- u32 exclusive;
+ u32 exclusive;
- if (!(p = nlm_decode_cookie(p, &argp->cookie)))
+ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
+ return 0;
+ if (xdr_stream_decode_bool(xdr, &argp->block) < 0)
return 0;
- argp->block = ntohl(*p++);
- exclusive = ntohl(*p++);
- if (!(p = nlm_decode_lock(p, &argp->lock)))
+ if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
+ return 0;
+ if (!svcxdr_decode_lock(xdr, &argp->lock))
return 0;
if (exclusive)
argp->lock.fl.fl_type = F_WRLCK;
- return xdr_argsize_check(rqstp, p);
+
+ return 1;
}
int
nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
- if (!(p = nlm_decode_cookie(p, &argp->cookie))
- || !(p = nlm_decode_lock(p, &argp->lock)))
+ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
+ return 0;
+ if (!svcxdr_decode_lock(xdr, &argp->lock))
return 0;
argp->lock.fl.fl_type = F_UNLCK;
- return xdr_argsize_check(rqstp, p);
+
+ return 1;
}
int
-nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
+nlmsvc_decode_res(struct svc_rqst *rqstp, __be32 *p)
{
- struct nlm_args *argp = rqstp->rq_argp;
- struct nlm_lock *lock = &argp->lock;
-
- memset(lock, 0, sizeof(*lock));
- locks_init_lock(&lock->fl);
- lock->svid = ~(u32) 0;
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
+ struct nlm_res *resp = rqstp->rq_argp;
- if (!(p = nlm_decode_cookie(p, &argp->cookie))
- || !(p = xdr_decode_string_inplace(p, &lock->caller,
- &lock->len, NLM_MAXSTRLEN))
- || !(p = nlm_decode_fh(p, &lock->fh))
- || !(p = nlm_decode_oh(p, &lock->oh)))
+ if (!svcxdr_decode_cookie(xdr, &resp->cookie))
+ return 0;
+ if (!svcxdr_decode_stats(xdr, &resp->status))
return 0;
- argp->fsm_mode = ntohl(*p++);
- argp->fsm_access = ntohl(*p++);
- return xdr_argsize_check(rqstp, p);
+
+ return 1;
}
int
-nlmsvc_encode_shareres(struct svc_rqst *rqstp, __be32 *p)
+nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p)
{
- struct nlm_res *resp = rqstp->rq_resp;
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
+ struct nlm_reboot *argp = rqstp->rq_argp;
+ u32 len;
- if (!(p = nlm_encode_cookie(p, &resp->cookie)))
+ if (xdr_stream_decode_u32(xdr, &len) < 0)
+ return 0;
+ if (len > SM_MAXSTRLEN)
+ return 0;
+ p = xdr_inline_decode(xdr, len);
+ if (!p)
+ return 0;
+ argp->len = len;
+ argp->mon = (char *)p;
+ if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
+ return 0;
+ p = xdr_inline_decode(xdr, SM_PRIV_SIZE);
+ if (!p)
return 0;
- *p++ = resp->status;
- *p++ = xdr_zero; /* sequence argument */
- return xdr_ressize_check(rqstp, p);
+ memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
+
+ return 1;
}
int
-nlmsvc_encode_res(struct svc_rqst *rqstp, __be32 *p)
+nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
{
- struct nlm_res *resp = rqstp->rq_resp;
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
+ struct nlm_args *argp = rqstp->rq_argp;
+ struct nlm_lock *lock = &argp->lock;
- if (!(p = nlm_encode_cookie(p, &resp->cookie)))
+ memset(lock, 0, sizeof(*lock));
+ locks_init_lock(&lock->fl);
+ lock->svid = ~(u32)0;
+
+ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
+ return 0;
+ if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
+ return 0;
+ if (!svcxdr_decode_fhandle(xdr, &lock->fh))
+ return 0;
+ if (!svcxdr_decode_owner(xdr, &lock->oh))
+ return 0;
+ /* XXX: Range checks are missing in the original code */
+ if (xdr_stream_decode_u32(xdr, &argp->fsm_mode) < 0)
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &argp->fsm_access) < 0)
return 0;
- *p++ = resp->status;
- return xdr_ressize_check(rqstp, p);
+
+ return 1;
}
int
nlmsvc_decode_notify(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
struct nlm_lock *lock = &argp->lock;
- if (!(p = xdr_decode_string_inplace(p, &lock->caller,
- &lock->len, NLM_MAXSTRLEN)))
+ if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
return 0;
- argp->state = ntohl(*p++);
- return xdr_argsize_check(rqstp, p);
+
+ return 1;
}
+
+/*
+ * Encode Reply results
+ */
+
int
-nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p)
+nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p)
{
- struct nlm_reboot *argp = rqstp->rq_argp;
-
- if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
- return 0;
- argp->state = ntohl(*p++);
- memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
- p += XDR_QUADLEN(SM_PRIV_SIZE);
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
int
-nlmsvc_decode_res(struct svc_rqst *rqstp, __be32 *p)
+nlmsvc_encode_testres(struct svc_rqst *rqstp, __be32 *p)
{
- struct nlm_res *resp = rqstp->rq_argp;
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
+ struct nlm_res *resp = rqstp->rq_resp;
- if (!(p = nlm_decode_cookie(p, &resp->cookie)))
- return 0;
- resp->status = *p++;
- return xdr_argsize_check(rqstp, p);
+ return svcxdr_encode_cookie(xdr, &resp->cookie) &&
+ svcxdr_encode_testrply(xdr, resp);
}
int
-nlmsvc_decode_void(struct svc_rqst *rqstp, __be32 *p)
+nlmsvc_encode_res(struct svc_rqst *rqstp, __be32 *p)
{
- return xdr_argsize_check(rqstp, p);
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
+ struct nlm_res *resp = rqstp->rq_resp;
+
+ return svcxdr_encode_cookie(xdr, &resp->cookie) &&
+ svcxdr_encode_stats(xdr, resp->status);
}
int
-nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p)
+nlmsvc_encode_shareres(struct svc_rqst *rqstp, __be32 *p)
{
- return xdr_ressize_check(rqstp, p);
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
+ struct nlm_res *resp = rqstp->rq_resp;
+
+ if (!svcxdr_encode_cookie(xdr, &resp->cookie))
+ return 0;
+ if (!svcxdr_encode_stats(xdr, resp->status))
+ return 0;
+ /* sequence */
+ if (xdr_stream_encode_u32(xdr, 0) < 0)
+ return 0;
+
+ return 1;
}
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 5fa9f48a9dba..98e957e4566c 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -18,7 +18,7 @@
#include <linux/sunrpc/stats.h>
#include <linux/lockd/lockd.h>
-#define NLMDBG_FACILITY NLMDBG_XDR
+#include "svcxdr.h"
static inline loff_t
s64_to_loff_t(__s64 offset)
@@ -41,309 +41,322 @@ loff_t_to_s64(loff_t offset)
}
/*
- * XDR functions for basic NLM types
+ * NLM file handles are defined by specification to be a variable-length
+ * XDR opaque no longer than 1024 bytes. However, this implementation
+ * limits their length to the size of an NFSv3 file handle.
*/
-static __be32 *
-nlm4_decode_cookie(__be32 *p, struct nlm_cookie *c)
+static bool
+svcxdr_decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)
{
- unsigned int len;
-
- len = ntohl(*p++);
-
- if(len==0)
- {
- c->len=4;
- memset(c->data, 0, 4); /* hockeypux brain damage */
- }
- else if(len<=NLM_MAXCOOKIELEN)
- {
- c->len=len;
- memcpy(c->data, p, len);
- p+=XDR_QUADLEN(len);
- }
- else
- {
- dprintk("lockd: bad cookie size %d (only cookies under "
- "%d bytes are supported.)\n",
- len, NLM_MAXCOOKIELEN);
- return NULL;
- }
- return p;
-}
-
-static __be32 *
-nlm4_encode_cookie(__be32 *p, struct nlm_cookie *c)
-{
- *p++ = htonl(c->len);
- memcpy(p, c->data, c->len);
- p+=XDR_QUADLEN(c->len);
- return p;
-}
-
-static __be32 *
-nlm4_decode_fh(__be32 *p, struct nfs_fh *f)
-{
- memset(f->data, 0, sizeof(f->data));
- f->size = ntohl(*p++);
- if (f->size > NFS_MAXFHSIZE) {
- dprintk("lockd: bad fhandle size %d (should be <=%d)\n",
- f->size, NFS_MAXFHSIZE);
- return NULL;
- }
- memcpy(f->data, p, f->size);
- return p + XDR_QUADLEN(f->size);
-}
-
-/*
- * Encode and decode owner handle
- */
-static __be32 *
-nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh)
-{
- return xdr_decode_netobj(p, oh);
+ __be32 *p;
+ u32 len;
+
+ if (xdr_stream_decode_u32(xdr, &len) < 0)
+ return false;
+ if (len > NFS_MAXFHSIZE)
+ return false;
+
+ p = xdr_inline_decode(xdr, len);
+ if (!p)
+ return false;
+ fh->size = len;
+ memcpy(fh->data, p, len);
+ memset(fh->data + len, 0, sizeof(fh->data) - len);
+
+ return true;
}
-static __be32 *
-nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
+static bool
+svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock)
{
- struct file_lock *fl = &lock->fl;
- __u64 len, start;
- __s64 end;
-
- if (!(p = xdr_decode_string_inplace(p, &lock->caller,
- &lock->len, NLM_MAXSTRLEN))
- || !(p = nlm4_decode_fh(p, &lock->fh))
- || !(p = nlm4_decode_oh(p, &lock->oh)))
- return NULL;
- lock->svid = ntohl(*p++);
+ struct file_lock *fl = &lock->fl;
+ u64 len, start;
+ s64 end;
+
+ if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
+ return false;
+ if (!svcxdr_decode_fhandle(xdr, &lock->fh))
+ return false;
+ if (!svcxdr_decode_owner(xdr, &lock->oh))
+ return false;
+ if (xdr_stream_decode_u32(xdr, &lock->svid) < 0)
+ return false;
+ if (xdr_stream_decode_u64(xdr, &start) < 0)
+ return false;
+ if (xdr_stream_decode_u64(xdr, &len) < 0)
+ return false;
locks_init_lock(fl);
fl->fl_flags = FL_POSIX;
- fl->fl_type = F_RDLCK; /* as good as anything else */
- p = xdr_decode_hyper(p, &start);
- p = xdr_decode_hyper(p, &len);
+ fl->fl_type = F_RDLCK;
end = start + len - 1;
-
fl->fl_start = s64_to_loff_t(start);
-
if (len == 0 || end < 0)
fl->fl_end = OFFSET_MAX;
else
fl->fl_end = s64_to_loff_t(end);
- return p;
+
+ return true;
}
-/*
- * Encode result of a TEST/TEST_MSG call
- */
-static __be32 *
-nlm4_encode_testres(__be32 *p, struct nlm_res *resp)
+static bool
+svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock)
+{
+ const struct file_lock *fl = &lock->fl;
+ s64 start, len;
+
+ /* exclusive */
+ if (xdr_stream_encode_bool(xdr, fl->fl_type != F_RDLCK) < 0)
+ return false;
+ if (xdr_stream_encode_u32(xdr, lock->svid) < 0)
+ return false;
+ if (!svcxdr_encode_owner(xdr, &lock->oh))
+ return false;
+ start = loff_t_to_s64(fl->fl_start);
+ if (fl->fl_end == OFFSET_MAX)
+ len = 0;
+ else
+ len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
+ if (xdr_stream_encode_u64(xdr, start) < 0)
+ return false;
+ if (xdr_stream_encode_u64(xdr, len) < 0)
+ return false;
+
+ return true;
+}
+
+static bool
+svcxdr_encode_testrply(struct xdr_stream *xdr, const struct nlm_res *resp)
{
- s64 start, len;
-
- dprintk("xdr: before encode_testres (p %p resp %p)\n", p, resp);
- if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
- return NULL;
- *p++ = resp->status;
-
- if (resp->status == nlm_lck_denied) {
- struct file_lock *fl = &resp->lock.fl;
-
- *p++ = (fl->fl_type == F_RDLCK)? xdr_zero : xdr_one;
- *p++ = htonl(resp->lock.svid);
-
- /* Encode owner handle. */
- if (!(p = xdr_encode_netobj(p, &resp->lock.oh)))
- return NULL;
-
- start = loff_t_to_s64(fl->fl_start);
- if (fl->fl_end == OFFSET_MAX)
- len = 0;
- else
- len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
-
- p = xdr_encode_hyper(p, start);
- p = xdr_encode_hyper(p, len);
- dprintk("xdr: encode_testres (status %u pid %d type %d start %Ld end %Ld)\n",
- resp->status, (int)resp->lock.svid, fl->fl_type,
- (long long)fl->fl_start, (long long)fl->fl_end);
+ if (!svcxdr_encode_stats(xdr, resp->status))
+ return false;
+ switch (resp->status) {
+ case nlm_lck_denied:
+ if (!svcxdr_encode_holder(xdr, &resp->lock))
+ return false;
}
- dprintk("xdr: after encode_testres (p %p resp %p)\n", p, resp);
- return p;
+ return true;
}
/*
- * First, the server side XDR functions
+ * Decode Call arguments
*/
+
+int
+nlm4svc_decode_void(struct svc_rqst *rqstp, __be32 *p)
+{
+ return 1;
+}
+
int
nlm4svc_decode_testargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
- u32 exclusive;
+ u32 exclusive;
- if (!(p = nlm4_decode_cookie(p, &argp->cookie)))
+ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
return 0;
-
- exclusive = ntohl(*p++);
- if (!(p = nlm4_decode_lock(p, &argp->lock)))
+ if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
+ return 0;
+ if (!svcxdr_decode_lock(xdr, &argp->lock))
return 0;
if (exclusive)
argp->lock.fl.fl_type = F_WRLCK;
- return xdr_argsize_check(rqstp, p);
-}
-
-int
-nlm4svc_encode_testres(struct svc_rqst *rqstp, __be32 *p)
-{
- struct nlm_res *resp = rqstp->rq_resp;
-
- if (!(p = nlm4_encode_testres(p, resp)))
- return 0;
- return xdr_ressize_check(rqstp, p);
+ return 1;
}
int
nlm4svc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
- u32 exclusive;
+ u32 exclusive;
- if (!(p = nlm4_decode_cookie(p, &argp->cookie)))
+ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
+ return 0;
+ if (xdr_stream_decode_bool(xdr, &argp->block) < 0)
return 0;
- argp->block = ntohl(*p++);
- exclusive = ntohl(*p++);
- if (!(p = nlm4_decode_lock(p, &argp->lock)))
+ if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
+ return 0;
+ if (!svcxdr_decode_lock(xdr, &argp->lock))
return 0;
if (exclusive)
argp->lock.fl.fl_type = F_WRLCK;
- argp->reclaim = ntohl(*p++);
- argp->state = ntohl(*p++);
+ if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0)
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
+ return 0;
argp->monitor = 1; /* monitor client by default */
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
int
nlm4svc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
- u32 exclusive;
+ u32 exclusive;
- if (!(p = nlm4_decode_cookie(p, &argp->cookie)))
+ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
+ return 0;
+ if (xdr_stream_decode_bool(xdr, &argp->block) < 0)
return 0;
- argp->block = ntohl(*p++);
- exclusive = ntohl(*p++);
- if (!(p = nlm4_decode_lock(p, &argp->lock)))
+ if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
+ return 0;
+ if (!svcxdr_decode_lock(xdr, &argp->lock))
return 0;
if (exclusive)
argp->lock.fl.fl_type = F_WRLCK;
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
int
nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
- if (!(p = nlm4_decode_cookie(p, &argp->cookie))
- || !(p = nlm4_decode_lock(p, &argp->lock)))
+ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
+ return 0;
+ if (!svcxdr_decode_lock(xdr, &argp->lock))
return 0;
argp->lock.fl.fl_type = F_UNLCK;
- return xdr_argsize_check(rqstp, p);
+
+ return 1;
}
int
-nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
+nlm4svc_decode_res(struct svc_rqst *rqstp, __be32 *p)
{
- struct nlm_args *argp = rqstp->rq_argp;
- struct nlm_lock *lock = &argp->lock;
-
- memset(lock, 0, sizeof(*lock));
- locks_init_lock(&lock->fl);
- lock->svid = ~(u32) 0;
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
+ struct nlm_res *resp = rqstp->rq_argp;
- if (!(p = nlm4_decode_cookie(p, &argp->cookie))
- || !(p = xdr_decode_string_inplace(p, &lock->caller,
- &lock->len, NLM_MAXSTRLEN))
- || !(p = nlm4_decode_fh(p, &lock->fh))
- || !(p = nlm4_decode_oh(p, &lock->oh)))
+ if (!svcxdr_decode_cookie(xdr, &resp->cookie))
+ return 0;
+ if (!svcxdr_decode_stats(xdr, &resp->status))
return 0;
- argp->fsm_mode = ntohl(*p++);
- argp->fsm_access = ntohl(*p++);
- return xdr_argsize_check(rqstp, p);
+
+ return 1;
}
int
-nlm4svc_encode_shareres(struct svc_rqst *rqstp, __be32 *p)
+nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p)
{
- struct nlm_res *resp = rqstp->rq_resp;
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
+ struct nlm_reboot *argp = rqstp->rq_argp;
+ u32 len;
- if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
+ if (xdr_stream_decode_u32(xdr, &len) < 0)
return 0;
- *p++ = resp->status;
- *p++ = xdr_zero; /* sequence argument */
- return xdr_ressize_check(rqstp, p);
+ if (len > SM_MAXSTRLEN)
+ return 0;
+ p = xdr_inline_decode(xdr, len);
+ if (!p)
+ return 0;
+ argp->len = len;
+ argp->mon = (char *)p;
+ if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
+ return 0;
+ p = xdr_inline_decode(xdr, SM_PRIV_SIZE);
+ if (!p)
+ return 0;
+ memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
+
+ return 1;
}
int
-nlm4svc_encode_res(struct svc_rqst *rqstp, __be32 *p)
+nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
{
- struct nlm_res *resp = rqstp->rq_resp;
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
+ struct nlm_args *argp = rqstp->rq_argp;
+ struct nlm_lock *lock = &argp->lock;
+
+ memset(lock, 0, sizeof(*lock));
+ locks_init_lock(&lock->fl);
+ lock->svid = ~(u32)0;
- if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
+ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
return 0;
- *p++ = resp->status;
- return xdr_ressize_check(rqstp, p);
+ if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
+ return 0;
+ if (!svcxdr_decode_fhandle(xdr, &lock->fh))
+ return 0;
+ if (!svcxdr_decode_owner(xdr, &lock->oh))
+ return 0;
+ /* XXX: Range checks are missing in the original code */
+ if (xdr_stream_decode_u32(xdr, &argp->fsm_mode) < 0)
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &argp->fsm_access) < 0)
+ return 0;
+
+ return 1;
}
int
nlm4svc_decode_notify(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nlm_args *argp = rqstp->rq_argp;
struct nlm_lock *lock = &argp->lock;
- if (!(p = xdr_decode_string_inplace(p, &lock->caller,
- &lock->len, NLM_MAXSTRLEN)))
+ if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
return 0;
- argp->state = ntohl(*p++);
- return xdr_argsize_check(rqstp, p);
+
+ return 1;
}
+
+/*
+ * Encode Reply results
+ */
+
int
-nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p)
+nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p)
{
- struct nlm_reboot *argp = rqstp->rq_argp;
-
- if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
- return 0;
- argp->state = ntohl(*p++);
- memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
- p += XDR_QUADLEN(SM_PRIV_SIZE);
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
int
-nlm4svc_decode_res(struct svc_rqst *rqstp, __be32 *p)
+nlm4svc_encode_testres(struct svc_rqst *rqstp, __be32 *p)
{
- struct nlm_res *resp = rqstp->rq_argp;
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
+ struct nlm_res *resp = rqstp->rq_resp;
- if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
- return 0;
- resp->status = *p++;
- return xdr_argsize_check(rqstp, p);
+ return svcxdr_encode_cookie(xdr, &resp->cookie) &&
+ svcxdr_encode_testrply(xdr, resp);
}
int
-nlm4svc_decode_void(struct svc_rqst *rqstp, __be32 *p)
+nlm4svc_encode_res(struct svc_rqst *rqstp, __be32 *p)
{
- return xdr_argsize_check(rqstp, p);
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
+ struct nlm_res *resp = rqstp->rq_resp;
+
+ return svcxdr_encode_cookie(xdr, &resp->cookie) &&
+ svcxdr_encode_stats(xdr, resp->status);
}
int
-nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p)
+nlm4svc_encode_shareres(struct svc_rqst *rqstp, __be32 *p)
{
- return xdr_ressize_check(rqstp, p);
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
+ struct nlm_res *resp = rqstp->rq_resp;
+
+ if (!svcxdr_encode_cookie(xdr, &resp->cookie))
+ return 0;
+ if (!svcxdr_encode_stats(xdr, resp->status))
+ return 0;
+ /* sequence */
+ if (xdr_stream_encode_u32(xdr, 0) < 0)
+ return 0;
+
+ return 1;
}
diff --git a/fs/namei.c b/fs/namei.c
index 79b0ff9b151e..bf6d8a738c59 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -554,7 +554,7 @@ struct nameidata {
struct qstr last;
struct path root;
struct inode *inode; /* path.dentry.d_inode */
- unsigned int flags;
+ unsigned int flags, state;
unsigned seq, m_seq, r_seq;
int last_type;
unsigned depth;
@@ -573,10 +573,15 @@ struct nameidata {
umode_t dir_mode;
} __randomize_layout;
-static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
+#define ND_ROOT_PRESET 1
+#define ND_ROOT_GRABBED 2
+#define ND_JUMPED 4
+
+static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name)
{
struct nameidata *old = current->nameidata;
p->stack = p->internal;
+ p->depth = 0;
p->dfd = dfd;
p->name = name;
p->path.mnt = NULL;
@@ -586,6 +591,17 @@ static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
current->nameidata = p;
}
+static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name,
+ const struct path *root)
+{
+ __set_nameidata(p, dfd, name);
+ p->state = 0;
+ if (unlikely(root)) {
+ p->state = ND_ROOT_PRESET;
+ p->root = *root;
+ }
+}
+
static void restore_nameidata(void)
{
struct nameidata *now = current->nameidata, *old = now->saved;
@@ -645,9 +661,9 @@ static void terminate_walk(struct nameidata *nd)
path_put(&nd->path);
for (i = 0; i < nd->depth; i++)
path_put(&nd->stack[i].link);
- if (nd->flags & LOOKUP_ROOT_GRABBED) {
+ if (nd->state & ND_ROOT_GRABBED) {
path_put(&nd->root);
- nd->flags &= ~LOOKUP_ROOT_GRABBED;
+ nd->state &= ~ND_ROOT_GRABBED;
}
} else {
nd->flags &= ~LOOKUP_RCU;
@@ -710,9 +726,9 @@ static bool legitimize_root(struct nameidata *nd)
if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED))
return false;
/* Nothing to do if nd->root is zero or is managed by the VFS user. */
- if (!nd->root.mnt || (nd->flags & LOOKUP_ROOT))
+ if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET))
return true;
- nd->flags |= LOOKUP_ROOT_GRABBED;
+ nd->state |= ND_ROOT_GRABBED;
return legitimize_path(nd, &nd->root, nd->root_seq);
}
@@ -849,8 +865,9 @@ static int complete_walk(struct nameidata *nd)
* We don't want to zero nd->root for scoped-lookups or
* externally-managed nd->root.
*/
- if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED)))
- nd->root.mnt = NULL;
+ if (!(nd->state & ND_ROOT_PRESET))
+ if (!(nd->flags & LOOKUP_IS_SCOPED))
+ nd->root.mnt = NULL;
nd->flags &= ~LOOKUP_CACHED;
if (!try_to_unlazy(nd))
return -ECHILD;
@@ -877,7 +894,7 @@ static int complete_walk(struct nameidata *nd)
return -EXDEV;
}
- if (likely(!(nd->flags & LOOKUP_JUMPED)))
+ if (likely(!(nd->state & ND_JUMPED)))
return 0;
if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
@@ -915,7 +932,7 @@ static int set_root(struct nameidata *nd)
} while (read_seqcount_retry(&fs->seq, seq));
} else {
get_fs_root(fs, &nd->root);
- nd->flags |= LOOKUP_ROOT_GRABBED;
+ nd->state |= ND_ROOT_GRABBED;
}
return 0;
}
@@ -948,7 +965,7 @@ static int nd_jump_root(struct nameidata *nd)
path_get(&nd->path);
nd->inode = nd->path.dentry->d_inode;
}
- nd->flags |= LOOKUP_JUMPED;
+ nd->state |= ND_JUMPED;
return 0;
}
@@ -976,7 +993,7 @@ int nd_jump_link(struct path *path)
path_put(&nd->path);
nd->path = *path;
nd->inode = nd->path.dentry->d_inode;
- nd->flags |= LOOKUP_JUMPED;
+ nd->state |= ND_JUMPED;
return 0;
err:
@@ -1423,7 +1440,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
if (mounted) {
path->mnt = &mounted->mnt;
dentry = path->dentry = mounted->mnt.mnt_root;
- nd->flags |= LOOKUP_JUMPED;
+ nd->state |= ND_JUMPED;
*seqp = read_seqcount_begin(&dentry->d_seq);
*inode = dentry->d_inode;
/*
@@ -1468,7 +1485,7 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
if (unlikely(nd->flags & LOOKUP_NO_XDEV))
ret = -EXDEV;
else
- nd->flags |= LOOKUP_JUMPED;
+ nd->state |= ND_JUMPED;
}
if (unlikely(ret)) {
dput(path->dentry);
@@ -2219,7 +2236,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
case 2:
if (name[1] == '.') {
type = LAST_DOTDOT;
- nd->flags |= LOOKUP_JUMPED;
+ nd->state |= ND_JUMPED;
}
break;
case 1:
@@ -2227,7 +2244,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
}
if (likely(type == LAST_NORM)) {
struct dentry *parent = nd->path.dentry;
- nd->flags &= ~LOOKUP_JUMPED;
+ nd->state &= ~ND_JUMPED;
if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
struct qstr this = { { .hash_len = hash_len }, .name = name };
err = parent->d_op->d_hash(parent, &this);
@@ -2301,14 +2318,14 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
if (flags & LOOKUP_RCU)
rcu_read_lock();
- nd->flags = flags | LOOKUP_JUMPED;
- nd->depth = 0;
+ nd->flags = flags;
+ nd->state |= ND_JUMPED;
nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
smp_rmb();
- if (flags & LOOKUP_ROOT) {
+ if (nd->state & ND_ROOT_PRESET) {
struct dentry *root = nd->root.dentry;
struct inode *inode = root->d_inode;
if (*s && unlikely(!d_can_lookup(root)))
@@ -2383,7 +2400,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
nd->root_seq = nd->seq;
} else {
path_get(&nd->root);
- nd->flags |= LOOKUP_ROOT_GRABBED;
+ nd->state |= ND_ROOT_GRABBED;
}
}
return s;
@@ -2422,7 +2439,7 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path
;
if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
err = handle_lookup_down(nd);
- nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please...
+ nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please...
}
if (!err)
err = complete_walk(nd);
@@ -2446,11 +2463,7 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags,
struct nameidata nd;
if (IS_ERR(name))
return PTR_ERR(name);
- if (unlikely(root)) {
- nd.root = *root;
- flags |= LOOKUP_ROOT;
- }
- set_nameidata(&nd, dfd, name);
+ set_nameidata(&nd, dfd, name, root);
retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
if (unlikely(retval == -ECHILD))
retval = path_lookupat(&nd, flags, path);
@@ -2491,7 +2504,7 @@ static struct filename *filename_parentat(int dfd, struct filename *name,
if (IS_ERR(name))
return name;
- set_nameidata(&nd, dfd, name);
+ set_nameidata(&nd, dfd, name, NULL);
retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
if (unlikely(retval == -ECHILD))
retval = path_parentat(&nd, flags, parent);
@@ -3517,7 +3530,7 @@ struct file *do_filp_open(int dfd, struct filename *pathname,
int flags = op->lookup_flags;
struct file *filp;
- set_nameidata(&nd, dfd, pathname);
+ set_nameidata(&nd, dfd, pathname, NULL);
filp = path_openat(&nd, op, flags | LOOKUP_RCU);
if (unlikely(filp == ERR_PTR(-ECHILD)))
filp = path_openat(&nd, op, flags);
@@ -3527,25 +3540,22 @@ struct file *do_filp_open(int dfd, struct filename *pathname,
return filp;
}
-struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
+struct file *do_file_open_root(const struct path *root,
const char *name, const struct open_flags *op)
{
struct nameidata nd;
struct file *file;
struct filename *filename;
- int flags = op->lookup_flags | LOOKUP_ROOT;
-
- nd.root.mnt = mnt;
- nd.root.dentry = dentry;
+ int flags = op->lookup_flags;
- if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
+ if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN)
return ERR_PTR(-ELOOP);
filename = getname_kernel(name);
if (IS_ERR(filename))
return ERR_CAST(filename);
- set_nameidata(&nd, -1, filename);
+ set_nameidata(&nd, -1, filename, root);
file = path_openat(&nd, op, flags | LOOKUP_RCU);
if (unlikely(file == ERR_PTR(-ECHILD)))
file = path_openat(&nd, op, flags);
diff --git a/fs/namespace.c b/fs/namespace.c
index 2279473d0d6f..97adcb5ab5d5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1942,6 +1942,20 @@ void drop_collected_mounts(struct vfsmount *mnt)
namespace_unlock();
}
+static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+{
+ struct mount *child;
+
+ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
+ if (!is_subdir(child->mnt_mountpoint, dentry))
+ continue;
+
+ if (child->mnt.mnt_flags & MNT_LOCKED)
+ return true;
+ }
+ return false;
+}
+
/**
* clone_private_mount - create a private clone of a path
* @path: path to clone
@@ -1957,10 +1971,19 @@ struct vfsmount *clone_private_mount(const struct path *path)
struct mount *old_mnt = real_mount(path->mnt);
struct mount *new_mnt;
+ down_read(&namespace_sem);
if (IS_MNT_UNBINDABLE(old_mnt))
- return ERR_PTR(-EINVAL);
+ goto invalid;
+
+ if (!check_mnt(old_mnt))
+ goto invalid;
+
+ if (has_locked_children(old_mnt, path->dentry))
+ goto invalid;
new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
+ up_read(&namespace_sem);
+
if (IS_ERR(new_mnt))
return ERR_CAST(new_mnt);
@@ -1968,6 +1991,10 @@ struct vfsmount *clone_private_mount(const struct path *path)
new_mnt->mnt_ns = MNT_NS_INTERNAL;
return &new_mnt->mnt;
+
+invalid:
+ up_read(&namespace_sem);
+ return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(clone_private_mount);
@@ -2319,19 +2346,6 @@ static int do_change_type(struct path *path, int ms_flags)
return err;
}
-static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
-{
- struct mount *child;
- list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
- if (!is_subdir(child->mnt_mountpoint, dentry))
- continue;
-
- if (child->mnt.mnt_flags & MNT_LOCKED)
- return true;
- }
- return false;
-}
-
static struct mount *__do_loopback(struct path *old_path, int recurse)
{
struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index e6ec6f09ac6e..11118398f495 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -75,6 +75,13 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
}
+static void nfs_mark_return_delegation(struct nfs_server *server,
+ struct nfs_delegation *delegation)
+{
+ set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+ set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+}
+
static bool
nfs4_is_valid_delegation(const struct nfs_delegation *delegation,
fmode_t flags)
@@ -293,6 +300,7 @@ nfs_start_delegation_return_locked(struct nfs_inode *nfsi)
goto out;
spin_lock(&delegation->lock);
if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
+ clear_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags);
/* Refcount matched in nfs_end_delegation_return() */
ret = nfs_get_delegation(delegation);
}
@@ -314,16 +322,17 @@ nfs_start_delegation_return(struct nfs_inode *nfsi)
return delegation;
}
-static void
-nfs_abort_delegation_return(struct nfs_delegation *delegation,
- struct nfs_client *clp)
+static void nfs_abort_delegation_return(struct nfs_delegation *delegation,
+ struct nfs_client *clp, int err)
{
spin_lock(&delegation->lock);
clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
- set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+ if (err == -EAGAIN) {
+ set_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags);
+ set_bit(NFS4CLNT_DELEGRETURN_DELAYED, &clp->cl_state);
+ }
spin_unlock(&delegation->lock);
- set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
}
static struct nfs_delegation *
@@ -521,11 +530,18 @@ out:
static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync)
{
struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+ unsigned int mode = O_WRONLY | O_RDWR;
int err = 0;
if (delegation == NULL)
return 0;
- do {
+
+ if (!issync)
+ mode |= O_NONBLOCK;
+ /* Recall of any remaining application leases */
+ err = break_lease(inode, mode);
+
+ while (err == 0) {
if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
break;
err = nfs_delegation_claim_opens(inode, &delegation->stateid,
@@ -536,10 +552,10 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation
* Guard against state recovery
*/
err = nfs4_wait_clnt_recover(clp);
- } while (err == 0);
+ }
if (err) {
- nfs_abort_delegation_return(delegation, clp);
+ nfs_abort_delegation_return(delegation, clp, err);
goto out;
}
@@ -568,6 +584,7 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
if (ret)
clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags) ||
test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
ret = false;
@@ -647,6 +664,38 @@ out:
return err;
}
+static bool nfs_server_clear_delayed_delegations(struct nfs_server *server)
+{
+ struct nfs_delegation *d;
+ bool ret = false;
+
+ list_for_each_entry_rcu (d, &server->delegations, super_list) {
+ if (!test_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags))
+ continue;
+ nfs_mark_return_delegation(server, d);
+ clear_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags);
+ ret = true;
+ }
+ return ret;
+}
+
+static bool nfs_client_clear_delayed_delegations(struct nfs_client *clp)
+{
+ struct nfs_server *server;
+ bool ret = false;
+
+ if (!test_and_clear_bit(NFS4CLNT_DELEGRETURN_DELAYED, &clp->cl_state))
+ goto out;
+ rcu_read_lock();
+ list_for_each_entry_rcu (server, &clp->cl_superblocks, client_link) {
+ if (nfs_server_clear_delayed_delegations(server))
+ ret = true;
+ }
+ rcu_read_unlock();
+out:
+ return ret;
+}
+
/**
* nfs_client_return_marked_delegations - return previously marked delegations
* @clp: nfs_client to process
@@ -659,8 +708,14 @@ out:
*/
int nfs_client_return_marked_delegations(struct nfs_client *clp)
{
- return nfs_client_for_each_server(clp,
- nfs_server_return_marked_delegations, NULL);
+ int err = nfs_client_for_each_server(
+ clp, nfs_server_return_marked_delegations, NULL);
+ if (err)
+ return err;
+ /* If a return was delayed, sleep to prevent hard looping */
+ if (nfs_client_clear_delayed_delegations(clp))
+ ssleep(1);
+ return 0;
}
/**
@@ -698,13 +753,14 @@ int nfs4_inode_return_delegation(struct inode *inode)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_delegation *delegation;
- int err = 0;
- nfs_wb_all(inode);
delegation = nfs_start_delegation_return(nfsi);
+ /* Synchronous recall of any application leases */
+ break_lease(inode, O_WRONLY | O_RDWR);
+ nfs_wb_all(inode);
if (delegation != NULL)
- err = nfs_end_delegation_return(inode, delegation, 1);
- return err;
+ return nfs_end_delegation_return(inode, delegation, 1);
+ return 0;
}
/**
@@ -775,13 +831,6 @@ static void nfs_mark_return_if_closed_delegation(struct nfs_server *server,
set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
}
-static void nfs_mark_return_delegation(struct nfs_server *server,
- struct nfs_delegation *delegation)
-{
- set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
- set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
-}
-
static bool nfs_server_mark_return_all_delegations(struct nfs_server *server)
{
struct nfs_delegation *delegation;
@@ -1010,6 +1059,9 @@ int nfs_async_inode_return_delegation(struct inode *inode,
nfs_mark_return_delegation(server, delegation);
rcu_read_unlock();
+ /* If there are any application leases or delegations, recall them */
+ break_lease(inode, O_WRONLY | O_RDWR | O_NONBLOCK);
+
nfs_delegation_run_state_manager(clp);
return 0;
out_enoent:
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index c19b4fd20781..1c378992b7c0 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -36,6 +36,7 @@ enum {
NFS_DELEGATION_REVOKED,
NFS_DELEGATION_TEST_EXPIRED,
NFS_DELEGATION_INODE_FREEING,
+ NFS_DELEGATION_RETURN_DELAYED,
};
int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 2d30a4da49fa..2e894fec036b 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -700,8 +700,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
{
struct nfs_direct_req *dreq = hdr->dreq;
struct nfs_commit_info cinfo;
- bool request_commit = false;
struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+ int flags = NFS_ODIRECT_DONE;
nfs_init_cinfo_from_dreq(&cinfo, dreq);
@@ -713,15 +713,9 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
nfs_direct_count_bytes(dreq, hdr);
if (hdr->good_bytes != 0 && nfs_write_need_commit(hdr)) {
- switch (dreq->flags) {
- case 0:
+ if (!dreq->flags)
dreq->flags = NFS_ODIRECT_DO_COMMIT;
- request_commit = true;
- break;
- case NFS_ODIRECT_RESCHED_WRITES:
- case NFS_ODIRECT_DO_COMMIT:
- request_commit = true;
- }
+ flags = dreq->flags;
}
spin_unlock(&dreq->lock);
@@ -729,12 +723,15 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
req = nfs_list_entry(hdr->pages.next);
nfs_list_remove_request(req);
- if (request_commit) {
+ if (flags == NFS_ODIRECT_DO_COMMIT) {
kref_get(&req->wb_kref);
memcpy(&req->wb_verf, &hdr->verf.verifier,
sizeof(req->wb_verf));
nfs_mark_request_commit(req, hdr->lseg, &cinfo,
hdr->ds_commit_idx);
+ } else if (flags == NFS_ODIRECT_RESCHED_WRITES) {
+ kref_get(&req->wb_kref);
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
}
nfs_unlock_and_release_request(req);
}
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index c4c021c6ebbd..d743629e05e1 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -385,12 +385,15 @@ static void nfs_readpage_from_fscache_complete(struct page *page,
"NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n",
page, context, error);
- /* if the read completes with an error, we just unlock the page and let
- * the VM reissue the readpage */
- if (!error) {
+ /*
+ * If the read completes with an error, mark the page with PG_checked,
+ * unlock the page, and let the VM reissue the readpage.
+ */
+ if (!error)
SetPageUptodate(page);
- unlock_page(page);
- }
+ else
+ SetPageChecked(page);
+ unlock_page(page);
}
/*
@@ -405,6 +408,11 @@ int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,
"NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
nfs_i_fscache(inode), page, page->index, page->flags, inode);
+ if (PageChecked(page)) {
+ ClearPageChecked(page);
+ return 1;
+ }
+
ret = fscache_read_or_alloc_page(nfs_i_fscache(inode),
page,
nfs_readpage_from_fscache_complete,
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index aaeeb4659bff..59355c106ece 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -67,7 +67,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
int nfs_get_root(struct super_block *s, struct fs_context *fc)
{
struct nfs_fs_context *ctx = nfs_fc2context(fc);
- struct nfs_server *server = NFS_SB(s);
+ struct nfs_server *server = NFS_SB(s), *clone_server;
struct nfs_fsinfo fsinfo;
struct dentry *root;
struct inode *inode;
@@ -127,7 +127,7 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc)
}
spin_unlock(&root->d_lock);
fc->root = root;
- if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL)
+ if (server->caps & NFS_CAP_SECURITY_LABEL)
kflags |= SECURITY_LSM_NATIVE_LABELS;
if (ctx->clone_data.sb) {
if (d_inode(fc->root)->i_fop != &nfs_dir_operations) {
@@ -137,15 +137,19 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc)
/* clone lsm security options from the parent to the new sb */
error = security_sb_clone_mnt_opts(ctx->clone_data.sb,
s, kflags, &kflags_out);
+ if (error)
+ goto error_splat_root;
+ clone_server = NFS_SB(ctx->clone_data.sb);
+ server->has_sec_mnt_opts = clone_server->has_sec_mnt_opts;
} else {
error = security_sb_set_mnt_opts(s, fc->security,
kflags, &kflags_out);
}
if (error)
goto error_splat_root;
- if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL &&
+ if (server->caps & NFS_CAP_SECURITY_LABEL &&
!(kflags_out & SECURITY_LSM_NATIVE_LABELS))
- NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL;
+ server->caps &= ~NFS_CAP_SECURITY_LABEL;
nfs_setsecurity(inode, fsinfo.fattr, fsinfo.fattr->label);
error = 0;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 529c4099f482..853213b3a209 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1101,6 +1101,7 @@ EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
{
filp->private_data = get_nfs_open_context(ctx);
+ set_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags);
if (list_empty(&ctx->list))
nfs_inode_attach_open_context(ctx);
}
@@ -1120,6 +1121,8 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, const struct
continue;
if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode)
continue;
+ if (!test_bit(NFS_CONTEXT_FILE_OPEN, &pos->flags))
+ continue;
ctx = get_nfs_open_context(pos);
if (ctx)
break;
@@ -1135,6 +1138,7 @@ void nfs_file_clear_open_context(struct file *filp)
if (ctx) {
struct inode *inode = d_inode(ctx->dentry);
+ clear_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags);
/*
* We fatal error on write before. Try to writeback
* every page again.
@@ -2055,35 +2059,33 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
| NFS_INO_INVALID_OTHER;
if (S_ISDIR(inode->i_mode))
nfs_force_lookup_revalidate(inode);
+ attr_changed = true;
dprintk("NFS: change_attr change on server for file %s/%ld\n",
inode->i_sb->s_id,
inode->i_ino);
} else if (!have_delegation)
nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER;
inode_set_iversion_raw(inode, fattr->change_attr);
- attr_changed = true;
}
} else {
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_CHANGE;
- cache_revalidated = false;
+ if (!have_delegation ||
+ (nfsi->cache_validity & NFS_INO_INVALID_CHANGE) != 0)
+ cache_revalidated = false;
}
- if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
+ if (fattr->valid & NFS_ATTR_FATTR_MTIME)
inode->i_mtime = fattr->mtime;
- } else if (fattr_supported & NFS_ATTR_FATTR_MTIME) {
+ else if (fattr_supported & NFS_ATTR_FATTR_MTIME)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_MTIME;
- cache_revalidated = false;
- }
- if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
+ if (fattr->valid & NFS_ATTR_FATTR_CTIME)
inode->i_ctime = fattr->ctime;
- } else if (fattr_supported & NFS_ATTR_FATTR_CTIME) {
+ else if (fattr_supported & NFS_ATTR_FATTR_CTIME)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_CTIME;
- cache_revalidated = false;
- }
/* Check if our cached file size is stale */
if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
@@ -2096,7 +2098,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
i_size_write(inode, new_isize);
if (!have_writers)
invalid |= NFS_INO_INVALID_DATA;
- attr_changed = true;
}
dprintk("NFS: isize change on server for file %s/%ld "
"(%Ld to %Ld)\n",
@@ -2111,19 +2112,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
fattr->du.nfs3.used = 0;
fattr->valid |= NFS_ATTR_FATTR_SPACE_USED;
}
- } else {
+ } else
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_SIZE;
- cache_revalidated = false;
- }
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
inode->i_atime = fattr->atime;
- else if (fattr_supported & NFS_ATTR_FATTR_ATIME) {
+ else if (fattr_supported & NFS_ATTR_FATTR_ATIME)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_ATIME;
- cache_revalidated = false;
- }
if (fattr->valid & NFS_ATTR_FATTR_MODE) {
if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
@@ -2132,71 +2129,55 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_mode = newmode;
invalid |= NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL;
- attr_changed = true;
}
- } else if (fattr_supported & NFS_ATTR_FATTR_MODE) {
+ } else if (fattr_supported & NFS_ATTR_FATTR_MODE)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_MODE;
- cache_revalidated = false;
- }
if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
if (!uid_eq(inode->i_uid, fattr->uid)) {
invalid |= NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL;
inode->i_uid = fattr->uid;
- attr_changed = true;
}
- } else if (fattr_supported & NFS_ATTR_FATTR_OWNER) {
+ } else if (fattr_supported & NFS_ATTR_FATTR_OWNER)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_OTHER;
- cache_revalidated = false;
- }
if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
if (!gid_eq(inode->i_gid, fattr->gid)) {
invalid |= NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL;
inode->i_gid = fattr->gid;
- attr_changed = true;
}
- } else if (fattr_supported & NFS_ATTR_FATTR_GROUP) {
+ } else if (fattr_supported & NFS_ATTR_FATTR_GROUP)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_OTHER;
- cache_revalidated = false;
- }
if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
if (inode->i_nlink != fattr->nlink) {
if (S_ISDIR(inode->i_mode))
invalid |= NFS_INO_INVALID_DATA;
set_nlink(inode, fattr->nlink);
- attr_changed = true;
}
- } else if (fattr_supported & NFS_ATTR_FATTR_NLINK) {
+ } else if (fattr_supported & NFS_ATTR_FATTR_NLINK)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_NLINK;
- cache_revalidated = false;
- }
if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
/*
* report the blocks in 512byte units
*/
inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
- } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED) {
+ } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_BLOCKS;
- cache_revalidated = false;
- }
- if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) {
+ if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
inode->i_blocks = fattr->du.nfs2.blocks;
- } else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED) {
+ else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_BLOCKS;
- cache_revalidated = false;
- }
/* Update attrtimeo value if we're out of the unstable period */
if (attr_changed) {
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 5c4e23abc345..2299446b3b89 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -385,7 +385,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
break;
case NFS3_CREATE_UNCHECKED:
- goto out;
+ goto out_release_acls;
}
nfs_fattr_init(data->res.dir_attr);
nfs_fattr_init(data->res.fattr);
@@ -751,7 +751,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
break;
default:
status = -EINVAL;
- goto out;
+ goto out_release_acls;
}
d_alias = nfs3_do_create(dir, dentry, data);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 543d916f79ab..ba78df4b13d9 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -45,6 +45,7 @@ enum nfs4_client_state {
NFS4CLNT_RECALL_RUNNING,
NFS4CLNT_RECALL_ANY_LAYOUT_READ,
NFS4CLNT_RECALL_ANY_LAYOUT_RW,
+ NFS4CLNT_DELEGRETURN_DELAYED,
};
#define NFS4_RENEW_TIMEOUT 0x01
@@ -322,7 +323,8 @@ extern int update_open_stateid(struct nfs4_state *state,
const nfs4_stateid *open_stateid,
const nfs4_stateid *deleg_stateid,
fmode_t fmode);
-
+extern int nfs4_proc_setlease(struct file *file, long arg,
+ struct file_lock **lease, void **priv);
extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
struct nfs_fsinfo *fsinfo);
extern void nfs4_update_changeattr(struct inode *dir,
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 42719384e25f..28431acd1230 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -197,8 +197,11 @@ void nfs40_shutdown_client(struct nfs_client *clp)
struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
{
- int err;
+ char buf[INET6_ADDRSTRLEN + 1];
+ const char *ip_addr = cl_init->ip_addr;
struct nfs_client *clp = nfs_alloc_client(cl_init);
+ int err;
+
if (IS_ERR(clp))
return clp;
@@ -222,6 +225,44 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
init_waitqueue_head(&clp->cl_lock_waitq);
#endif
INIT_LIST_HEAD(&clp->pending_cb_stateids);
+
+ if (cl_init->minorversion != 0)
+ __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags);
+ __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
+ __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
+
+ /*
+ * Set up the connection to the server before we add add to the
+ * global list.
+ */
+ err = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I);
+ if (err == -EINVAL)
+ err = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
+ if (err < 0)
+ goto error;
+
+ /* If no clientaddr= option was specified, find a usable cb address */
+ if (ip_addr == NULL) {
+ struct sockaddr_storage cb_addr;
+ struct sockaddr *sap = (struct sockaddr *)&cb_addr;
+
+ err = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr));
+ if (err < 0)
+ goto error;
+ err = rpc_ntop(sap, buf, sizeof(buf));
+ if (err < 0)
+ goto error;
+ ip_addr = (const char *)buf;
+ }
+ strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
+
+ err = nfs_idmap_new(clp);
+ if (err < 0) {
+ dprintk("%s: failed to create idmapper. Error = %d\n",
+ __func__, err);
+ goto error;
+ }
+ __set_bit(NFS_CS_IDMAP, &clp->cl_res_state);
return clp;
error:
@@ -372,8 +413,6 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
struct nfs_client *nfs4_init_client(struct nfs_client *clp,
const struct nfs_client_initdata *cl_init)
{
- char buf[INET6_ADDRSTRLEN + 1];
- const char *ip_addr = cl_init->ip_addr;
struct nfs_client *old;
int error;
@@ -381,43 +420,6 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
/* the client is initialised already */
return clp;
- /* Check NFS protocol revision and initialize RPC op vector */
- clp->rpc_ops = &nfs_v4_clientops;
-
- if (clp->cl_minorversion != 0)
- __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags);
- __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
- __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
-
- error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I);
- if (error == -EINVAL)
- error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
- if (error < 0)
- goto error;
-
- /* If no clientaddr= option was specified, find a usable cb address */
- if (ip_addr == NULL) {
- struct sockaddr_storage cb_addr;
- struct sockaddr *sap = (struct sockaddr *)&cb_addr;
-
- error = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr));
- if (error < 0)
- goto error;
- error = rpc_ntop(sap, buf, sizeof(buf));
- if (error < 0)
- goto error;
- ip_addr = (const char *)buf;
- }
- strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
-
- error = nfs_idmap_new(clp);
- if (error < 0) {
- dprintk("%s: failed to create idmapper. Error = %d\n",
- __func__, error);
- goto error;
- }
- __set_bit(NFS_CS_IDMAP, &clp->cl_res_state);
-
error = nfs4_init_client_minor_version(clp);
if (error < 0)
goto error;
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index a1e5c6b85ded..c820de58a661 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -435,6 +435,12 @@ void nfs42_ssc_unregister_ops(void)
}
#endif /* CONFIG_NFS_V4_2 */
+static int nfs4_setlease(struct file *file, long arg, struct file_lock **lease,
+ void **priv)
+{
+ return nfs4_proc_setlease(file, arg, lease, priv);
+}
+
const struct file_operations nfs4_file_operations = {
.read_iter = nfs_file_read,
.write_iter = nfs_file_write,
@@ -448,7 +454,7 @@ const struct file_operations nfs4_file_operations = {
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.check_flags = nfs_check_flags,
- .setlease = simple_nosetlease,
+ .setlease = nfs4_setlease,
#ifdef CONFIG_NFS_V4_2
.copy_file_range = nfs4_copy_file_range,
.llseek = nfs4_file_llseek,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e653654c10bc..e1214bb6b7ee 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1155,7 +1155,11 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
struct nfs4_sequence_args *args,
struct nfs4_sequence_res *res)
{
- return nfs4_do_call_sync(clnt, server, msg, args, res, 0);
+ unsigned short task_flags = 0;
+
+ if (server->nfs_client->cl_minorversion)
+ task_flags = RPC_TASK_MOVEABLE;
+ return nfs4_do_call_sync(clnt, server, msg, args, res, task_flags);
}
@@ -1205,12 +1209,12 @@ nfs4_update_changeattr_locked(struct inode *inode,
u64 change_attr = inode_peek_iversion_raw(inode);
cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME;
+ if (S_ISDIR(inode->i_mode))
+ cache_validity |= NFS_INO_INVALID_DATA;
switch (NFS_SERVER(inode)->change_attr_type) {
case NFS4_CHANGE_TYPE_IS_UNDEFINED:
- break;
- case NFS4_CHANGE_TYPE_IS_TIME_METADATA:
- if ((s64)(change_attr - cinfo->after) > 0)
+ if (cinfo->after == change_attr)
goto out;
break;
default:
@@ -1218,24 +1222,21 @@ nfs4_update_changeattr_locked(struct inode *inode,
goto out;
}
- if (cinfo->atomic && cinfo->before == change_attr) {
- nfsi->attrtimeo_timestamp = jiffies;
- } else {
- if (S_ISDIR(inode->i_mode)) {
- cache_validity |= NFS_INO_INVALID_DATA;
+ inode_set_iversion_raw(inode, cinfo->after);
+ if (!cinfo->atomic || cinfo->before != change_attr) {
+ if (S_ISDIR(inode->i_mode))
nfs_force_lookup_revalidate(inode);
- } else {
- if (!NFS_PROTO(inode)->have_delegation(inode,
- FMODE_READ))
- cache_validity |= NFS_INO_REVAL_PAGECACHE;
- }
- if (cinfo->before != change_attr)
- cache_validity |= NFS_INO_INVALID_ACCESS |
- NFS_INO_INVALID_ACL |
- NFS_INO_INVALID_XATTR;
+ if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ cache_validity |=
+ NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL |
+ NFS_INO_INVALID_SIZE | NFS_INO_INVALID_OTHER |
+ NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK |
+ NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR |
+ NFS_INO_REVAL_PAGECACHE;
+ nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
}
- inode_set_iversion_raw(inode, cinfo->after);
+ nfsi->attrtimeo_timestamp = jiffies;
nfsi->read_cache_jiffies = timestamp;
nfsi->attr_gencount = nfs_inc_attr_generation_counter();
nfsi->cache_validity &= ~NFS_INO_INVALID_CHANGE;
@@ -2569,6 +2570,9 @@ static int nfs4_run_open_task(struct nfs4_opendata *data,
};
int status;
+ if (server->nfs_client->cl_minorversion)
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
kref_get(&data->kref);
data->rpc_done = false;
data->rpc_status = 0;
@@ -3749,6 +3753,9 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
};
int status = -ENOMEM;
+ if (server->nfs_client->cl_minorversion)
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP,
&task_setup_data.rpc_client, &msg);
@@ -4188,6 +4195,9 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
};
unsigned short task_flags = 0;
+ if (nfs4_has_session(server->nfs_client))
+ task_flags = RPC_TASK_MOVEABLE;
+
/* Is this is an attribute revalidation, subject to softreval? */
if (inode && (server->flags & NFS_MOUNT_SOFTREVAL))
task_flags |= RPC_TASK_TIMEOUT;
@@ -4307,6 +4317,9 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
};
unsigned short task_flags = 0;
+ if (server->nfs_client->cl_minorversion)
+ task_flags = RPC_TASK_MOVEABLE;
+
/* Is this is an attribute revalidation, subject to softreval? */
if (nfs_lookup_is_soft_revalidate(dentry))
task_flags |= RPC_TASK_TIMEOUT;
@@ -6538,7 +6551,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
.rpc_client = server->client,
.rpc_message = &msg,
.callback_ops = &nfs4_delegreturn_ops,
- .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT | RPC_TASK_MOVEABLE,
};
int status = 0;
@@ -6856,6 +6869,11 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
.workqueue = nfsiod_workqueue,
.flags = RPC_TASK_ASYNC,
};
+ struct nfs_client *client =
+ NFS_SERVER(lsp->ls_state->inode)->nfs_client;
+
+ if (client->cl_minorversion)
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client,
NFS_SP4_MACH_CRED_CLEANUP, &task_setup_data.rpc_client, &msg);
@@ -7130,6 +7148,10 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
.flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
int ret;
+ struct nfs_client *client = NFS_SERVER(state->inode)->nfs_client;
+
+ if (client->cl_minorversion)
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
dprintk("%s: begin!\n", __func__);
data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
@@ -7438,6 +7460,43 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
return nfs4_retry_setlk(state, cmd, request);
}
+static int nfs4_delete_lease(struct file *file, void **priv)
+{
+ return generic_setlease(file, F_UNLCK, NULL, priv);
+}
+
+static int nfs4_add_lease(struct file *file, long arg, struct file_lock **lease,
+ void **priv)
+{
+ struct inode *inode = file_inode(file);
+ fmode_t type = arg == F_RDLCK ? FMODE_READ : FMODE_WRITE;
+ int ret;
+
+ /* No delegation, no lease */
+ if (!nfs4_have_delegation(inode, type))
+ return -EAGAIN;
+ ret = generic_setlease(file, arg, lease, priv);
+ if (ret || nfs4_have_delegation(inode, type))
+ return ret;
+ /* We raced with a delegation return */
+ nfs4_delete_lease(file, priv);
+ return -EAGAIN;
+}
+
+int nfs4_proc_setlease(struct file *file, long arg, struct file_lock **lease,
+ void **priv)
+{
+ switch (arg) {
+ case F_RDLCK:
+ case F_WRLCK:
+ return nfs4_add_lease(file, arg, lease, priv);
+ case F_UNLCK:
+ return nfs4_delete_lease(file, priv);
+ default:
+ return -EINVAL;
+ }
+}
+
int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid)
{
struct nfs_server *server = NFS_SERVER(state->inode);
@@ -9186,7 +9245,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
.rpc_client = clp->cl_rpcclient,
.rpc_message = &msg,
.callback_ops = &nfs41_sequence_ops,
- .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT | RPC_TASK_MOVEABLE,
};
struct rpc_task *ret;
@@ -9385,7 +9444,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
{
struct inode *inode = lgp->args.inode;
struct nfs_server *server = NFS_SERVER(inode);
- struct pnfs_layout_hdr *lo;
+ struct pnfs_layout_hdr *lo = lgp->lo;
int nfs4err = task->tk_status;
int err, status = 0;
LIST_HEAD(head);
@@ -9437,7 +9496,6 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
case -NFS4ERR_BAD_STATEID:
exception->timeout = 0;
spin_lock(&inode->i_lock);
- lo = NFS_I(inode)->layout;
/* If the open stateid was bad, then recover it. */
if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
!nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) {
@@ -9509,7 +9567,8 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
.rpc_message = &msg,
.callback_ops = &nfs4_layoutget_call_ops,
.callback_data = lgp,
- .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF |
+ RPC_TASK_MOVEABLE,
};
struct pnfs_layout_segment *lseg = NULL;
struct nfs4_exception exception = {
@@ -9520,9 +9579,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
dprintk("--> %s\n", __func__);
- /* nfs4_layoutget_release calls pnfs_put_layout_hdr */
- pnfs_get_layout_hdr(NFS_I(inode)->layout);
-
nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0);
task = rpc_run_task(&task_setup_data);
@@ -9650,6 +9706,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
.rpc_message = &msg,
.callback_ops = &nfs4_layoutreturn_call_ops,
.callback_data = lrp,
+ .flags = RPC_TASK_MOVEABLE,
};
int status = 0;
@@ -9804,6 +9861,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
.rpc_message = &msg,
.callback_ops = &nfs4_layoutcommit_ops,
.callback_data = data,
+ .flags = RPC_TASK_MOVEABLE,
};
struct rpc_task *task;
int status = 0;
@@ -10131,7 +10189,7 @@ static int nfs41_free_stateid(struct nfs_server *server,
.rpc_client = server->client,
.rpc_message = &msg,
.callback_ops = &nfs41_free_stateid_ops,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_MOVEABLE,
};
struct nfs_free_stateid_data *data;
struct rpc_task *task;
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 2ef75caad6da..7a2567aa2b86 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -625,7 +625,7 @@ TRACE_EVENT(nfs4_state_mgr,
TP_fast_assign(
__entry->state = clp->cl_state;
- __assign_str(hostname, clp->cl_hostname)
+ __assign_str(hostname, clp->cl_hostname);
),
TP_printk(
@@ -1637,7 +1637,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
__entry->fileid = 0;
__entry->dev = 0;
}
- __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown")
+ __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown");
),
TP_printk(
@@ -1694,7 +1694,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event,
__entry->fileid = 0;
__entry->dev = 0;
}
- __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown")
+ __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown");
__entry->stateid_seq =
be32_to_cpu(stateid->seqid);
__entry->stateid_hash =
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index ccef43e02b48..8a224871be74 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -280,8 +280,6 @@ TRACE_DEFINE_ENUM(LOOKUP_OPEN);
TRACE_DEFINE_ENUM(LOOKUP_CREATE);
TRACE_DEFINE_ENUM(LOOKUP_EXCL);
TRACE_DEFINE_ENUM(LOOKUP_RENAME_TARGET);
-TRACE_DEFINE_ENUM(LOOKUP_JUMPED);
-TRACE_DEFINE_ENUM(LOOKUP_ROOT);
TRACE_DEFINE_ENUM(LOOKUP_EMPTY);
TRACE_DEFINE_ENUM(LOOKUP_DOWN);
@@ -297,8 +295,6 @@ TRACE_DEFINE_ENUM(LOOKUP_DOWN);
{ LOOKUP_CREATE, "CREATE" }, \
{ LOOKUP_EXCL, "EXCL" }, \
{ LOOKUP_RENAME_TARGET, "RENAME_TARGET" }, \
- { LOOKUP_JUMPED, "JUMPED" }, \
- { LOOKUP_ROOT, "ROOT" }, \
{ LOOKUP_EMPTY, "EMPTY" }, \
{ LOOKUP_DOWN, "DOWN" })
@@ -1427,8 +1423,8 @@ DECLARE_EVENT_CLASS(nfs_xdr_event,
__entry->version = task->tk_client->cl_vers;
__entry->error = error;
__assign_str(program,
- task->tk_client->cl_program->name)
- __assign_str(procedure, task->tk_msg.rpc_proc->p_name)
+ task->tk_client->cl_program->name);
+ __assign_str(procedure, task->tk_msg.rpc_proc->p_name);
),
TP_printk(
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index cf9cc62ec48e..cc232d1f16f2 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -954,6 +954,7 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
{
struct nfs_pgio_header *hdr;
int ret;
+ unsigned short task_flags = 0;
hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
if (!hdr) {
@@ -962,14 +963,17 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
}
nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
ret = nfs_generic_pgio(desc, hdr);
- if (ret == 0)
+ if (ret == 0) {
+ if (NFS_SERVER(hdr->inode)->nfs_client->cl_minorversion)
+ task_flags = RPC_TASK_MOVEABLE;
ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
hdr,
hdr->cred,
NFS_PROTO(hdr->inode),
desc->pg_rpc_callops,
desc->pg_ioflags,
- RPC_TASK_CRED_NOREF);
+ RPC_TASK_CRED_NOREF | task_flags);
+ }
return ret;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 2c01ee805306..ef14ea0b6ab8 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -966,10 +966,8 @@ void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
const struct cred *cred, bool update_barrier)
{
- u32 oldseq, newseq, new_barrier = 0;
-
- oldseq = be32_to_cpu(lo->plh_stateid.seqid);
- newseq = be32_to_cpu(new->seqid);
+ u32 oldseq = be32_to_cpu(lo->plh_stateid.seqid);
+ u32 newseq = be32_to_cpu(new->seqid);
if (!pnfs_layout_is_valid(lo)) {
pnfs_set_layout_cred(lo, cred);
@@ -979,19 +977,21 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
return;
}
- if (pnfs_seqid_is_newer(newseq, oldseq)) {
+
+ if (pnfs_seqid_is_newer(newseq, oldseq))
nfs4_stateid_copy(&lo->plh_stateid, new);
- /*
- * Because of wraparound, we want to keep the barrier
- * "close" to the current seqids.
- */
- new_barrier = newseq - atomic_read(&lo->plh_outstanding);
- }
- if (update_barrier)
- new_barrier = be32_to_cpu(new->seqid);
- else if (new_barrier == 0)
+
+ if (update_barrier) {
+ pnfs_barrier_update(lo, newseq);
return;
- pnfs_barrier_update(lo, new_barrier);
+ }
+ /*
+ * Because of wraparound, we want to keep the barrier
+ * "close" to the current seqids. We really only want to
+ * get here from a layoutget call.
+ */
+ if (atomic_read(&lo->plh_outstanding) == 1)
+ pnfs_barrier_update(lo, be32_to_cpu(lo->plh_stateid.seqid));
}
static bool
@@ -1128,8 +1128,7 @@ void pnfs_layoutget_free(struct nfs4_layoutget *lgp)
size_t max_pages = lgp->args.layout.pglen / PAGE_SIZE;
nfs4_free_pages(lgp->args.layout.pages, max_pages);
- if (lgp->args.inode)
- pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout);
+ pnfs_put_layout_hdr(lgp->lo);
put_nfs_open_context(lgp->args.ctx);
kfree(lgp);
}
@@ -2014,7 +2013,7 @@ lookup_again:
* If the layout segment list is empty, but there are outstanding
* layoutget calls, then they might be subject to a layoutrecall.
*/
- if (list_empty(&lo->plh_segs) &&
+ if ((list_empty(&lo->plh_segs) || !pnfs_layout_is_valid(lo)) &&
atomic_read(&lo->plh_outstanding) != 0) {
spin_unlock(&ino->i_lock);
lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding,
@@ -2124,6 +2123,9 @@ lookup_again:
goto out_put_layout_hdr;
}
+ lgp->lo = lo;
+ pnfs_get_layout_hdr(lo);
+
lseg = nfs4_proc_layoutget(lgp, &timeout);
trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
@@ -2255,6 +2257,7 @@ static void _lgopen_prepare_attached(struct nfs4_opendata *data,
pnfs_put_layout_hdr(lo);
return;
}
+ lgp->lo = lo;
data->lgp = lgp;
data->o_arg.lg_args = &lgp->args;
data->o_res.lg_res = &lgp->res;
@@ -2263,6 +2266,7 @@ static void _lgopen_prepare_attached(struct nfs4_opendata *data,
static void _lgopen_prepare_floating(struct nfs4_opendata *data,
struct nfs_open_context *ctx)
{
+ struct inode *ino = data->dentry->d_inode;
struct pnfs_layout_range rng = {
.iomode = (data->o_arg.fmode & FMODE_WRITE) ?
IOMODE_RW: IOMODE_READ,
@@ -2271,7 +2275,7 @@ static void _lgopen_prepare_floating(struct nfs4_opendata *data,
};
struct nfs4_layoutget *lgp;
- lgp = pnfs_alloc_init_layoutget_args(NULL, ctx, &current_stateid,
+ lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid,
&rng, GFP_KERNEL);
if (!lgp)
return;
@@ -2291,6 +2295,8 @@ void pnfs_lgopen_prepare(struct nfs4_opendata *data,
/* Could check on max_ops, but currently hardcoded high enough */
if (!nfs_server_capable(data->dir->d_inode, NFS_CAP_LGOPEN))
return;
+ if (data->lgp)
+ return;
if (data->state)
_lgopen_prepare_attached(data, ctx);
else
@@ -2330,13 +2336,13 @@ void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp,
}
return;
}
- if (!lgp->args.inode) {
+ if (!lgp->lo) {
lo = _pnfs_grab_empty_layout(ino, ctx);
if (!lo)
return;
- lgp->args.inode = ino;
+ lgp->lo = lo;
} else
- lo = NFS_I(lgp->args.inode)->layout;
+ lo = lgp->lo;
lseg = pnfs_layout_process(lgp);
if (!IS_ERR(lseg)) {
@@ -2349,11 +2355,9 @@ void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp,
void nfs4_lgopen_release(struct nfs4_layoutget *lgp)
{
if (lgp != NULL) {
- struct inode *inode = lgp->args.inode;
- if (inode) {
- struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
- pnfs_clear_first_layoutget(lo);
- nfs_layoutget_end(lo);
+ if (lgp->lo) {
+ pnfs_clear_first_layoutget(lgp->lo);
+ nfs_layoutget_end(lgp->lo);
}
pnfs_layoutget_free(lgp);
}
@@ -2362,7 +2366,7 @@ void nfs4_lgopen_release(struct nfs4_layoutget *lgp)
struct pnfs_layout_segment *
pnfs_layout_process(struct nfs4_layoutget *lgp)
{
- struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
+ struct pnfs_layout_hdr *lo = lgp->lo;
struct nfs4_layoutget_res *res = &lgp->res;
struct pnfs_layout_segment *lseg;
struct inode *ino = lo->plh_inode;
@@ -2390,11 +2394,13 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
goto out_forget;
}
+ if (!pnfs_layout_is_valid(lo) && !pnfs_is_first_layoutget(lo))
+ goto out_forget;
+
if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
/* existing state ID, make sure the sequence number matches. */
if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
- if (!pnfs_layout_is_valid(lo) &&
- pnfs_is_first_layoutget(lo))
+ if (!pnfs_layout_is_valid(lo))
lo->plh_barrier = 0;
dprintk("%s forget reply due to sequence\n", __func__);
goto out_forget;
@@ -2413,8 +2419,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
goto out_forget;
} else {
/* We have a completely new layout */
- if (!pnfs_is_first_layoutget(lo))
- goto out_forget;
pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true);
}
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 49d3389bd813..cf19914fec81 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -805,19 +805,16 @@ out:
}
EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add);
-static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
+static int nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
{
might_sleep();
- wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING,
- TASK_KILLABLE);
+ return wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, TASK_KILLABLE);
}
static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
{
smp_mb__before_atomic();
- clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
- smp_mb__after_atomic();
- wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
+ clear_and_wake_up_bit(NFS4DS_CONNECTING, &ds->ds_state);
}
static struct nfs_client *(*get_v3_ds_connect)(
@@ -858,7 +855,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr);
if (!load_v3_ds_connect())
- goto out;
+ return -EPROTONOSUPPORT;
list_for_each_entry(da, &ds->ds_addrs, da_node) {
dprintk("%s: DS %s: trying address %s\n",
@@ -993,30 +990,33 @@ int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
{
int err;
-again:
- err = 0;
- if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
- if (version == 3) {
- err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo,
- retrans);
- } else if (version == 4) {
- err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo,
- retrans, minor_version);
- } else {
- dprintk("%s: unsupported DS version %d\n", __func__,
- version);
- err = -EPROTONOSUPPORT;
- }
+ do {
+ err = nfs4_wait_ds_connect(ds);
+ if (err || ds->ds_clp)
+ goto out;
+ if (nfs4_test_deviceid_unavailable(devid))
+ return -ENODEV;
+ } while (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) != 0);
- nfs4_clear_ds_conn_bit(ds);
- } else {
- nfs4_wait_ds_connect(ds);
+ if (ds->ds_clp)
+ goto connect_done;
- /* what was waited on didn't connect AND didn't mark unavail */
- if (!ds->ds_clp && !nfs4_test_deviceid_unavailable(devid))
- goto again;
+ switch (version) {
+ case 3:
+ err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo, retrans);
+ break;
+ case 4:
+ err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo, retrans,
+ minor_version);
+ break;
+ default:
+ dprintk("%s: unsupported DS version %d\n", __func__, version);
+ err = -EPROTONOSUPPORT;
}
+connect_done:
+ nfs4_clear_ds_conn_bit(ds);
+out:
/*
* At this point the ds->ds_clp should be ready, but it might have
* hit an error.
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index d2b6dce1f99f..9f39e0a1a38b 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -74,8 +74,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
}
EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
-static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio,
- struct inode *inode)
+static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio)
{
struct nfs_pgio_mirror *pgm;
unsigned long npages;
@@ -86,9 +85,9 @@ static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio,
WARN_ON_ONCE(pgio->pg_mirror_count != 1);
pgm = &pgio->pg_mirrors[0];
- NFS_I(inode)->read_io += pgm->pg_bytes_written;
+ NFS_I(pgio->pg_inode)->read_io += pgm->pg_bytes_written;
npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> PAGE_SHIFT;
- nfs_add_stats(inode, NFSIOS_READPAGES, npages);
+ nfs_add_stats(pgio->pg_inode, NFSIOS_READPAGES, npages);
}
@@ -363,22 +362,23 @@ int nfs_readpage(struct file *file, struct page *page)
} else
desc.ctx = get_nfs_open_context(nfs_file_open_context(file));
+ xchg(&desc.ctx->error, 0);
if (!IS_SYNC(inode)) {
ret = nfs_readpage_from_fscache(desc.ctx, inode, page);
if (ret == 0)
- goto out;
+ goto out_wait;
}
- xchg(&desc.ctx->error, 0);
nfs_pageio_init_read(&desc.pgio, inode, false,
&nfs_async_read_completion_ops);
ret = readpage_async_filler(&desc, page);
+ if (ret)
+ goto out;
- if (!ret)
- nfs_pageio_complete_read(&desc.pgio, inode);
-
+ nfs_pageio_complete_read(&desc.pgio);
ret = desc.pgio.pg_error < 0 ? desc.pgio.pg_error : 0;
+out_wait:
if (!ret) {
ret = wait_on_page_locked_killable(page);
if (!PageUptodate(page) && !ret)
@@ -430,7 +430,7 @@ int nfs_readpages(struct file *file, struct address_space *mapping,
ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
- nfs_pageio_complete_read(&desc.pgio, inode);
+ nfs_pageio_complete_read(&desc.pgio);
read_complete:
put_nfs_open_context(desc.ctx);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3bf82178166a..eae9bf114041 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1810,6 +1810,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
struct nfs_commit_info *cinfo)
{
struct nfs_commit_data *data;
+ unsigned short task_flags = 0;
/* another commit raced with us */
if (list_empty(head))
@@ -1820,8 +1821,11 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
/* Set up the argument struct */
nfs_init_commit(data, head, NULL, cinfo);
atomic_inc(&cinfo->mds->rpcs_out);
+ if (NFS_SERVER(inode)->nfs_client->cl_minorversion)
+ task_flags = RPC_TASK_MOVEABLE;
return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
- data->mds_ops, how, RPC_TASK_CRED_NOREF);
+ data->mds_ops, how,
+ RPC_TASK_CRED_NOREF | task_flags);
}
/*
diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c
index 26f2a50eceac..edec45831585 100644
--- a/fs/nfs_common/grace.c
+++ b/fs/nfs_common/grace.c
@@ -82,6 +82,7 @@ __state_in_grace(struct net *net, bool open)
/**
* locks_in_grace
+ * @net: network namespace
*
* Lock managers call this function to determine when it is OK for them
* to answer ordinary lock requests, and when they should accept only
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 1058659a8d31..c99dee99a3c1 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -236,7 +236,7 @@ again:
if (!buf)
return -ENOMEM;
- rq = blk_get_request(q, REQ_OP_SCSI_IN, 0);
+ rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
if (IS_ERR(rq)) {
error = -ENOMEM;
goto out_free_buf;
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index a75abeb1e698..935c1028c217 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -176,6 +176,12 @@ struct nfsd_net {
unsigned int longest_chain_cachesize;
struct shrinker nfsd_reply_cache_shrinker;
+
+ /* tracking server-to-server copy mounts */
+ spinlock_t nfsd_ssc_lock;
+ struct list_head nfsd_ssc_mount_list;
+ wait_queue_head_t nfsd_ssc_waitq;
+
/* utsname taken from the process that starts the server */
char nfsd_name[UNX_MAXNODENAME+1];
};
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index a1591feeea22..5dfe7644a517 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -172,7 +172,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
struct nfsd3_getaclres *resp = rqstp->rq_resp;
struct dentry *dentry = resp->fh.fh_dentry;
struct kvec *head = rqstp->rq_res.head;
- struct inode *inode = d_inode(dentry);
+ struct inode *inode;
unsigned int base;
int n;
int w;
@@ -181,6 +181,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
return 0;
switch (resp->status) {
case nfs_ok:
+ inode = d_inode(dentry);
if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
return 0;
if (xdr_stream_encode_u32(xdr, resp->mask) < 0)
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7325592b456e..0f8b10f363e7 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -915,10 +915,8 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
args.authflavor = clp->cl_cred.cr_flavor;
clp->cl_cb_ident = conn->cb_ident;
} else {
- if (!conn->cb_xprt) {
- trace_nfsd_cb_setup_err(clp, -EINVAL);
+ if (!conn->cb_xprt)
return -EINVAL;
- }
clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
clp->cl_cb_session = ses;
args.bc_xprt = conn->cb_xprt;
@@ -941,37 +939,43 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
}
clp->cl_cb_client = client;
clp->cl_cb_cred = cred;
- trace_nfsd_cb_setup(clp);
+ rcu_read_lock();
+ trace_nfsd_cb_setup(clp, rpc_peeraddr2str(client, RPC_DISPLAY_NETID),
+ args.authflavor);
+ rcu_read_unlock();
return 0;
}
+static void nfsd4_mark_cb_state(struct nfs4_client *clp, int newstate)
+{
+ if (clp->cl_cb_state != newstate) {
+ clp->cl_cb_state = newstate;
+ trace_nfsd_cb_state(clp);
+ }
+}
+
static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
{
if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
return;
- clp->cl_cb_state = NFSD4_CB_DOWN;
- trace_nfsd_cb_state(clp);
+ nfsd4_mark_cb_state(clp, NFSD4_CB_DOWN);
}
static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
{
if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
return;
- clp->cl_cb_state = NFSD4_CB_FAULT;
- trace_nfsd_cb_state(clp);
+ nfsd4_mark_cb_state(clp, NFSD4_CB_FAULT);
}
static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
{
struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
- trace_nfsd_cb_done(clp, task->tk_status);
if (task->tk_status)
nfsd4_mark_cb_down(clp, task->tk_status);
- else {
- clp->cl_cb_state = NFSD4_CB_UP;
- trace_nfsd_cb_state(clp);
- }
+ else
+ nfsd4_mark_cb_state(clp, NFSD4_CB_UP);
}
static void nfsd4_cb_probe_release(void *calldata)
@@ -995,8 +999,8 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
*/
void nfsd4_probe_callback(struct nfs4_client *clp)
{
- clp->cl_cb_state = NFSD4_CB_UNKNOWN;
- trace_nfsd_cb_state(clp);
+ trace_nfsd_cb_probe(clp);
+ nfsd4_mark_cb_state(clp, NFSD4_CB_UNKNOWN);
set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);
nfsd4_run_cb(&clp->cl_cb_null);
}
@@ -1009,11 +1013,10 @@ void nfsd4_probe_callback_sync(struct nfs4_client *clp)
void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
{
- clp->cl_cb_state = NFSD4_CB_UNKNOWN;
+ nfsd4_mark_cb_state(clp, NFSD4_CB_UNKNOWN);
spin_lock(&clp->cl_lock);
memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
spin_unlock(&clp->cl_lock);
- trace_nfsd_cb_state(clp);
}
/*
@@ -1170,8 +1173,6 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
struct nfsd4_callback *cb = calldata;
struct nfs4_client *clp = cb->cb_clp;
- trace_nfsd_cb_done(clp, task->tk_status);
-
if (!nfsd4_cb_sequence_done(task, cb))
return;
@@ -1231,6 +1232,9 @@ void nfsd4_destroy_callback_queue(void)
/* must be called under the state lock */
void nfsd4_shutdown_callback(struct nfs4_client *clp)
{
+ if (clp->cl_cb_state != NFSD4_CB_UNKNOWN)
+ trace_nfsd_cb_shutdown(clp);
+
set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags);
/*
* Note this won't actually result in a null callback;
@@ -1276,7 +1280,6 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
* kill the old client:
*/
if (clp->cl_cb_client) {
- trace_nfsd_cb_shutdown(clp);
rpc_shutdown_client(clp->cl_cb_client);
clp->cl_cb_client = NULL;
put_cred(clp->cl_cb_cred);
@@ -1322,8 +1325,6 @@ nfsd4_run_cb_work(struct work_struct *work)
struct rpc_clnt *clnt;
int flags;
- trace_nfsd_cb_work(clp, cb->cb_msg.rpc_proc->p_name);
-
if (cb->cb_need_restart) {
cb->cb_need_restart = false;
} else {
@@ -1345,7 +1346,7 @@ nfsd4_run_cb_work(struct work_struct *work)
* Don't send probe messages for 4.1 or later.
*/
if (!cb->cb_ops && clp->cl_minorversion) {
- clp->cl_cb_state = NFSD4_CB_UP;
+ nfsd4_mark_cb_state(clp, NFSD4_CB_UP);
nfsd41_destroy_cb(cb);
return;
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index f4ce93d7f26e..486c5dba4b65 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -55,6 +55,13 @@ module_param(inter_copy_offload_enable, bool, 0644);
MODULE_PARM_DESC(inter_copy_offload_enable,
"Enable inter server to server copy offload. Default: false");
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+static int nfsd4_ssc_umount_timeout = 900000; /* default to 15 mins */
+module_param(nfsd4_ssc_umount_timeout, int, 0644);
+MODULE_PARM_DESC(nfsd4_ssc_umount_timeout,
+ "idle msecs before unmount export from source server");
+#endif
+
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
#include <linux/security.h>
@@ -1166,6 +1173,81 @@ extern void nfs_sb_deactive(struct super_block *sb);
#define NFSD42_INTERSSC_MOUNTOPS "vers=4.2,addr=%s,sec=sys"
/*
+ * setup a work entry in the ssc delayed unmount list.
+ */
+static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr,
+ struct nfsd4_ssc_umount_item **retwork, struct vfsmount **ss_mnt)
+{
+ struct nfsd4_ssc_umount_item *ni = 0;
+ struct nfsd4_ssc_umount_item *work = NULL;
+ struct nfsd4_ssc_umount_item *tmp;
+ DEFINE_WAIT(wait);
+
+ *ss_mnt = NULL;
+ *retwork = NULL;
+ work = kzalloc(sizeof(*work), GFP_KERNEL);
+try_again:
+ spin_lock(&nn->nfsd_ssc_lock);
+ list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) {
+ if (strncmp(ni->nsui_ipaddr, ipaddr, sizeof(ni->nsui_ipaddr)))
+ continue;
+ /* found a match */
+ if (ni->nsui_busy) {
+ /* wait - and try again */
+ prepare_to_wait(&nn->nfsd_ssc_waitq, &wait,
+ TASK_INTERRUPTIBLE);
+ spin_unlock(&nn->nfsd_ssc_lock);
+
+ /* allow 20secs for mount/unmount for now - revisit */
+ if (signal_pending(current) ||
+ (schedule_timeout(20*HZ) == 0)) {
+ kfree(work);
+ return nfserr_eagain;
+ }
+ finish_wait(&nn->nfsd_ssc_waitq, &wait);
+ goto try_again;
+ }
+ *ss_mnt = ni->nsui_vfsmount;
+ refcount_inc(&ni->nsui_refcnt);
+ spin_unlock(&nn->nfsd_ssc_lock);
+ kfree(work);
+
+ /* return vfsmount in ss_mnt */
+ return 0;
+ }
+ if (work) {
+ strncpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr));
+ refcount_set(&work->nsui_refcnt, 2);
+ work->nsui_busy = true;
+ list_add_tail(&work->nsui_list, &nn->nfsd_ssc_mount_list);
+ *retwork = work;
+ }
+ spin_unlock(&nn->nfsd_ssc_lock);
+ return 0;
+}
+
+static void nfsd4_ssc_update_dul_work(struct nfsd_net *nn,
+ struct nfsd4_ssc_umount_item *work, struct vfsmount *ss_mnt)
+{
+ /* set nsui_vfsmount, clear busy flag and wakeup waiters */
+ spin_lock(&nn->nfsd_ssc_lock);
+ work->nsui_vfsmount = ss_mnt;
+ work->nsui_busy = false;
+ wake_up_all(&nn->nfsd_ssc_waitq);
+ spin_unlock(&nn->nfsd_ssc_lock);
+}
+
+static void nfsd4_ssc_cancel_dul_work(struct nfsd_net *nn,
+ struct nfsd4_ssc_umount_item *work)
+{
+ spin_lock(&nn->nfsd_ssc_lock);
+ list_del(&work->nsui_list);
+ wake_up_all(&nn->nfsd_ssc_waitq);
+ spin_unlock(&nn->nfsd_ssc_lock);
+ kfree(work);
+}
+
+/*
* Support one copy source server for now.
*/
static __be32
@@ -1181,6 +1263,8 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
char *ipaddr, *dev_name, *raw_data;
int len, raw_len;
__be32 status = nfserr_inval;
+ struct nfsd4_ssc_umount_item *work = NULL;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
naddr = &nss->u.nl4_addr;
tmp_addrlen = rpc_uaddr2sockaddr(SVC_NET(rqstp), naddr->addr,
@@ -1229,12 +1313,24 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
goto out_free_rawdata;
snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep);
+ status = nfsd4_ssc_setup_dul(nn, ipaddr, &work, &ss_mnt);
+ if (status)
+ goto out_free_devname;
+ if (ss_mnt)
+ goto out_done;
+
/* Use an 'internal' mount: SB_KERNMOUNT -> MNT_INTERNAL */
ss_mnt = vfs_kern_mount(type, SB_KERNMOUNT, dev_name, raw_data);
module_put(type->owner);
- if (IS_ERR(ss_mnt))
+ if (IS_ERR(ss_mnt)) {
+ status = nfserr_nodev;
+ if (work)
+ nfsd4_ssc_cancel_dul_work(nn, work);
goto out_free_devname;
-
+ }
+ if (work)
+ nfsd4_ssc_update_dul_work(nn, work, ss_mnt);
+out_done:
status = 0;
*mount = ss_mnt;
@@ -1301,10 +1397,42 @@ static void
nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
struct nfsd_file *dst)
{
+ bool found = false;
+ long timeout;
+ struct nfsd4_ssc_umount_item *tmp;
+ struct nfsd4_ssc_umount_item *ni = NULL;
+ struct nfsd_net *nn = net_generic(dst->nf_net, nfsd_net_id);
+
nfs42_ssc_close(src->nf_file);
- fput(src->nf_file);
nfsd_file_put(dst);
- mntput(ss_mnt);
+ fput(src->nf_file);
+
+ if (!nn) {
+ mntput(ss_mnt);
+ return;
+ }
+ spin_lock(&nn->nfsd_ssc_lock);
+ timeout = msecs_to_jiffies(nfsd4_ssc_umount_timeout);
+ list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) {
+ if (ni->nsui_vfsmount->mnt_sb == ss_mnt->mnt_sb) {
+ list_del(&ni->nsui_list);
+ /*
+ * vfsmount can be shared by multiple exports,
+ * decrement refcnt. If the count drops to 1 it
+ * will be unmounted when nsui_expire expires.
+ */
+ refcount_dec(&ni->nsui_refcnt);
+ ni->nsui_expire = jiffies + timeout;
+ list_add_tail(&ni->nsui_list, &nn->nfsd_ssc_mount_list);
+ found = true;
+ break;
+ }
+ }
+ spin_unlock(&nn->nfsd_ssc_lock);
+ if (!found) {
+ mntput(ss_mnt);
+ return;
+ }
}
#else /* CONFIG_NFSD_V4_2_INTER_SSC */
@@ -1375,7 +1503,8 @@ static const struct nfsd4_callback_ops nfsd4_cb_offload_ops = {
static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync)
{
- copy->cp_res.wr_stable_how = NFS_UNSTABLE;
+ copy->cp_res.wr_stable_how =
+ copy->committed ? NFS_FILE_SYNC : NFS_UNSTABLE;
copy->cp_synchronous = sync;
gen_boot_verifier(&copy->cp_res.wr_verifier, copy->cp_clp->net);
}
@@ -1386,6 +1515,7 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
u64 bytes_total = copy->cp_count;
u64 src_pos = copy->cp_src_pos;
u64 dst_pos = copy->cp_dst_pos;
+ __be32 status;
/* See RFC 7862 p.67: */
if (bytes_total == 0)
@@ -1403,6 +1533,16 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
src_pos += bytes_copied;
dst_pos += bytes_copied;
} while (bytes_total > 0 && !copy->cp_synchronous);
+ /* for a non-zero asynchronous copy do a commit of data */
+ if (!copy->cp_synchronous && copy->cp_res.wr_bytes_written > 0) {
+ down_write(&copy->nf_dst->nf_rwsem);
+ status = vfs_fsync_range(copy->nf_dst->nf_file,
+ copy->cp_dst_pos,
+ copy->cp_res.wr_bytes_written, 0);
+ up_write(&copy->nf_dst->nf_rwsem);
+ if (!status)
+ copy->committed = true;
+ }
return bytes_copied;
}
@@ -1497,6 +1637,8 @@ do_callback:
memcpy(&cb_copy->fh, &copy->fh, sizeof(copy->fh));
nfsd4_init_cb(&cb_copy->cp_cb, cb_copy->cp_clp,
&nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD);
+ trace_nfsd_cb_offload(copy->cp_clp, &copy->cp_res.cb_stateid,
+ &copy->fh, copy->cp_count, copy->nfserr);
nfsd4_run_cb(&cb_copy->cp_cb);
out:
if (!copy->cp_intra)
@@ -3232,7 +3374,7 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
{
struct nfsd4_compoundres *resp = rqstp->rq_resp;
struct nfsd4_compoundargs *argp = rqstp->rq_argp;
- struct nfsd4_op *this = &argp->ops[resp->opcnt - 1];
+ struct nfsd4_op *this;
struct nfsd4_compound_state *cstate = &resp->cstate;
struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow;
u32 opiter;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b517a8794400..fa67ecd5fe63 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -44,6 +44,7 @@
#include <linux/jhash.h>
#include <linux/string_helpers.h>
#include <linux/fsnotify.h>
+#include <linux/nfs_ssc.h>
#include "xdr4.h"
#include "xdr4cb.h"
#include "vfs.h"
@@ -1745,6 +1746,8 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u)
struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user);
struct nfs4_client *clp = c->cn_session->se_client;
+ trace_nfsd_cb_lost(clp);
+
spin_lock(&clp->cl_lock);
if (!list_empty(&c->cn_persession)) {
list_del(&c->cn_persession);
@@ -2351,10 +2354,25 @@ static struct nfs4_client *get_nfsdfs_clp(struct inode *inode)
static void seq_quote_mem(struct seq_file *m, char *data, int len)
{
seq_printf(m, "\"");
- seq_escape_mem_ascii(m, data, len);
+ seq_escape_mem(m, data, len, ESCAPE_HEX | ESCAPE_NAP | ESCAPE_APPEND, "\"\\");
seq_printf(m, "\"");
}
+static const char *cb_state2str(int state)
+{
+ switch (state) {
+ case NFSD4_CB_UP:
+ return "UP";
+ case NFSD4_CB_UNKNOWN:
+ return "UNKNOWN";
+ case NFSD4_CB_DOWN:
+ return "DOWN";
+ case NFSD4_CB_FAULT:
+ return "FAULT";
+ }
+ return "UNDEFINED";
+}
+
static int client_info_show(struct seq_file *m, void *v)
{
struct inode *inode = m->private;
@@ -2383,6 +2401,8 @@ static int client_info_show(struct seq_file *m, void *v)
seq_printf(m, "\nImplementation time: [%lld, %ld]\n",
clp->cl_nii_time.tv_sec, clp->cl_nii_time.tv_nsec);
}
+ seq_printf(m, "callback state: %s\n", cb_state2str(clp->cl_cb_state));
+ seq_printf(m, "callback address: %pISpc\n", &clp->cl_cb_conn.cb_addr);
drop_client(clp);
return 0;
@@ -2665,6 +2685,8 @@ static void force_expire_client(struct nfs4_client *clp)
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
bool already_expired;
+ trace_nfsd_clid_admin_expired(&clp->cl_clientid);
+
spin_lock(&clp->cl_lock);
clp->cl_time = 0;
spin_unlock(&clp->cl_lock);
@@ -2816,14 +2838,11 @@ move_to_confirmed(struct nfs4_client *clp)
lockdep_assert_held(&nn->client_lock);
- dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]);
rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
add_clp_to_name_tree(clp, &nn->conf_name_tree);
- if (!test_and_set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags) &&
- clp->cl_nfsd_dentry &&
- clp->cl_nfsd_info_dentry)
- fsnotify_dentry(clp->cl_nfsd_info_dentry, FS_MODIFY);
+ set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
+ trace_nfsd_clid_confirmed(&clp->cl_clientid);
renew_client_locked(clp);
}
@@ -3176,20 +3195,24 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
/* case 6 */
exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
+ trace_nfsd_clid_confirmed_r(conf);
goto out_copy;
}
if (!creds_match) { /* case 3 */
if (client_has_state(conf)) {
status = nfserr_clid_inuse;
+ trace_nfsd_clid_cred_mismatch(conf, rqstp);
goto out;
}
goto out_new;
}
if (verfs_match) { /* case 2 */
conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R;
+ trace_nfsd_clid_confirmed_r(conf);
goto out_copy;
}
/* case 5, client reboot */
+ trace_nfsd_clid_verf_mismatch(conf, rqstp, &verf);
conf = NULL;
goto out_new;
}
@@ -3199,16 +3222,19 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
- unconf = find_unconfirmed_client_by_name(&exid->clname, nn);
+ unconf = find_unconfirmed_client_by_name(&exid->clname, nn);
if (unconf) /* case 4, possible retry or client restart */
unhash_client_locked(unconf);
- /* case 1 (normal case) */
+ /* case 1, new owner ID */
+ trace_nfsd_clid_fresh(new);
+
out_new:
if (conf) {
status = mark_client_expired_locked(conf);
if (status)
goto out;
+ trace_nfsd_clid_replaced(&conf->cl_clientid);
}
new->cl_minorversion = cstate->minorversion;
new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0];
@@ -3232,8 +3258,10 @@ out:
out_nolock:
if (new)
expire_client(new);
- if (unconf)
+ if (unconf) {
+ trace_nfsd_clid_expire_unconf(&unconf->cl_clientid);
expire_client(unconf);
+ }
return status;
}
@@ -3425,9 +3453,10 @@ nfsd4_create_session(struct svc_rqst *rqstp,
goto out_free_conn;
}
} else if (unconf) {
+ status = nfserr_clid_inuse;
if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
!rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
- status = nfserr_clid_inuse;
+ trace_nfsd_clid_cred_mismatch(unconf, rqstp);
goto out_free_conn;
}
status = nfserr_wrong_cred;
@@ -3447,6 +3476,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
old = NULL;
goto out_free_conn;
}
+ trace_nfsd_clid_replaced(&old->cl_clientid);
}
move_to_confirmed(unconf);
conf = unconf;
@@ -3471,6 +3501,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
/* cache solo and embedded create sessions under the client_lock */
nfsd4_cache_create_session(cr_ses, cs_slot, status);
spin_unlock(&nn->client_lock);
+ if (conf == unconf)
+ fsnotify_dentry(conf->cl_nfsd_info_dentry, FS_MODIFY);
/* init connection and backchannel */
nfsd4_init_conn(rqstp, conn, new);
nfsd4_put_session(new);
@@ -3904,6 +3936,7 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp,
status = nfserr_wrong_cred;
goto out;
}
+ trace_nfsd_clid_destroyed(&clp->cl_clientid);
unhash_client_locked(clp);
out:
spin_unlock(&nn->client_lock);
@@ -3946,6 +3979,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp,
goto out;
status = nfs_ok;
+ trace_nfsd_clid_reclaim_complete(&clp->cl_clientid);
nfsd4_client_record_create(clp);
inc_reclaim_complete(clp);
out:
@@ -3967,27 +4001,29 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
new = create_client(clname, rqstp, &clverifier);
if (new == NULL)
return nfserr_jukebox;
- /* Cases below refer to rfc 3530 section 14.2.33: */
spin_lock(&nn->client_lock);
conf = find_confirmed_client_by_name(&clname, nn);
if (conf && client_has_state(conf)) {
- /* case 0: */
status = nfserr_clid_inuse;
if (clp_used_exchangeid(conf))
goto out;
if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
- trace_nfsd_clid_inuse_err(conf);
+ trace_nfsd_clid_cred_mismatch(conf, rqstp);
goto out;
}
}
unconf = find_unconfirmed_client_by_name(&clname, nn);
if (unconf)
unhash_client_locked(unconf);
- /* We need to handle only case 1: probable callback update */
- if (conf && same_verf(&conf->cl_verifier, &clverifier)) {
- copy_clid(new, conf);
- gen_confirm(new, nn);
- }
+ if (conf) {
+ if (same_verf(&conf->cl_verifier, &clverifier)) {
+ copy_clid(new, conf);
+ gen_confirm(new, nn);
+ } else
+ trace_nfsd_clid_verf_mismatch(conf, rqstp,
+ &clverifier);
+ } else
+ trace_nfsd_clid_fresh(new);
new->cl_minorversion = 0;
gen_callback(new, setclid, rqstp);
add_to_unconfirmed(new);
@@ -4000,12 +4036,13 @@ out:
spin_unlock(&nn->client_lock);
if (new)
free_client(new);
- if (unconf)
+ if (unconf) {
+ trace_nfsd_clid_expire_unconf(&unconf->cl_clientid);
expire_client(unconf);
+ }
return status;
}
-
__be32
nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
@@ -4034,25 +4071,27 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
* Nevertheless, RFC 7530 recommends INUSE for this case:
*/
status = nfserr_clid_inuse;
- if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred))
+ if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred)) {
+ trace_nfsd_clid_cred_mismatch(unconf, rqstp);
goto out;
- if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred))
+ }
+ if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
+ trace_nfsd_clid_cred_mismatch(conf, rqstp);
goto out;
- /* cases below refer to rfc 3530 section 14.2.34: */
+ }
if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) {
if (conf && same_verf(&confirm, &conf->cl_confirm)) {
- /* case 2: probable retransmit */
status = nfs_ok;
- } else /* case 4: client hasn't noticed we rebooted yet? */
+ } else
status = nfserr_stale_clientid;
goto out;
}
status = nfs_ok;
- if (conf) { /* case 1: callback update */
+ if (conf) {
old = unconf;
unhash_client_locked(old);
nfsd4_change_callback(conf, &unconf->cl_cb_conn);
- } else { /* case 3: normal case; new or rebooted client */
+ } else {
old = find_confirmed_client_by_name(&unconf->cl_name, nn);
if (old) {
status = nfserr_clid_inuse;
@@ -4065,12 +4104,15 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
old = NULL;
goto out;
}
+ trace_nfsd_clid_replaced(&old->cl_clientid);
}
move_to_confirmed(unconf);
conf = unconf;
}
get_client_locked(conf);
spin_unlock(&nn->client_lock);
+ if (conf == unconf)
+ fsnotify_dentry(conf->cl_nfsd_info_dentry, FS_MODIFY);
nfsd4_probe_callback(conf);
spin_lock(&nn->client_lock);
put_client_renew_locked(conf);
@@ -4618,7 +4660,7 @@ nfsd_break_deleg_cb(struct file_lock *fl)
struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
struct nfs4_file *fp = dp->dl_stid.sc_file;
- trace_nfsd_deleg_break(&dp->dl_stid.sc_stateid);
+ trace_nfsd_cb_recall(&dp->dl_stid);
/*
* We don't want the locks code to timeout the lease for us;
@@ -5457,6 +5499,69 @@ static bool state_expired(struct laundry_time *lt, time64_t last_refresh)
return false;
}
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+void nfsd4_ssc_init_umount_work(struct nfsd_net *nn)
+{
+ spin_lock_init(&nn->nfsd_ssc_lock);
+ INIT_LIST_HEAD(&nn->nfsd_ssc_mount_list);
+ init_waitqueue_head(&nn->nfsd_ssc_waitq);
+}
+EXPORT_SYMBOL_GPL(nfsd4_ssc_init_umount_work);
+
+/*
+ * This is called when nfsd is being shutdown, after all inter_ssc
+ * cleanup were done, to destroy the ssc delayed unmount list.
+ */
+static void nfsd4_ssc_shutdown_umount(struct nfsd_net *nn)
+{
+ struct nfsd4_ssc_umount_item *ni = NULL;
+ struct nfsd4_ssc_umount_item *tmp;
+
+ spin_lock(&nn->nfsd_ssc_lock);
+ list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) {
+ list_del(&ni->nsui_list);
+ spin_unlock(&nn->nfsd_ssc_lock);
+ mntput(ni->nsui_vfsmount);
+ kfree(ni);
+ spin_lock(&nn->nfsd_ssc_lock);
+ }
+ spin_unlock(&nn->nfsd_ssc_lock);
+}
+
+static void nfsd4_ssc_expire_umount(struct nfsd_net *nn)
+{
+ bool do_wakeup = false;
+ struct nfsd4_ssc_umount_item *ni = 0;
+ struct nfsd4_ssc_umount_item *tmp;
+
+ spin_lock(&nn->nfsd_ssc_lock);
+ list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) {
+ if (time_after(jiffies, ni->nsui_expire)) {
+ if (refcount_read(&ni->nsui_refcnt) > 1)
+ continue;
+
+ /* mark being unmount */
+ ni->nsui_busy = true;
+ spin_unlock(&nn->nfsd_ssc_lock);
+ mntput(ni->nsui_vfsmount);
+ spin_lock(&nn->nfsd_ssc_lock);
+
+ /* waiters need to start from begin of list */
+ list_del(&ni->nsui_list);
+ kfree(ni);
+
+ /* wakeup ssc_connect waiters */
+ do_wakeup = true;
+ continue;
+ }
+ break;
+ }
+ if (do_wakeup)
+ wake_up_all(&nn->nfsd_ssc_waitq);
+ spin_unlock(&nn->nfsd_ssc_lock);
+}
+#endif
+
static time64_t
nfs4_laundromat(struct nfsd_net *nn)
{
@@ -5495,10 +5600,8 @@ nfs4_laundromat(struct nfsd_net *nn)
clp = list_entry(pos, struct nfs4_client, cl_lru);
if (!state_expired(&lt, clp->cl_time))
break;
- if (mark_client_expired_locked(clp)) {
- trace_nfsd_clid_expired(&clp->cl_clientid);
+ if (mark_client_expired_locked(clp))
continue;
- }
list_add(&clp->cl_lru, &reaplist);
}
spin_unlock(&nn->client_lock);
@@ -5568,6 +5671,10 @@ nfs4_laundromat(struct nfsd_net *nn)
list_del_init(&nbl->nbl_lru);
free_blocked_lock(nbl);
}
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+ /* service the server-to-server copy delayed unmount list */
+ nfsd4_ssc_expire_umount(nn);
+#endif
out:
return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
}
@@ -6430,8 +6537,10 @@ nfsd4_lm_notify(struct file_lock *fl)
}
spin_unlock(&nn->blocked_locks_lock);
- if (queue)
+ if (queue) {
+ trace_nfsd_cb_notify_lock(lo, nbl);
nfsd4_run_cb(&nbl->nbl_cb);
+ }
}
static const struct lock_manager_operations nfsd_posix_mng_ops = {
@@ -7229,7 +7338,6 @@ nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash,
unsigned int strhashval;
struct nfs4_client_reclaim *crp;
- trace_nfsd_clid_reclaim(nn, name.len, name.data);
crp = alloc_reclaim();
if (crp) {
strhashval = clientstr_hashval(name);
@@ -7279,8 +7387,6 @@ nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn)
unsigned int strhashval;
struct nfs4_client_reclaim *crp = NULL;
- trace_nfsd_clid_find(nn, name.len, name.data);
-
strhashval = clientstr_hashval(name);
list_for_each_entry(crp, &nn->reclaim_str_hashtbl[strhashval], cr_strhash) {
if (compare_blob(&crp->cr_name, &name) == 0) {
@@ -7486,6 +7592,9 @@ nfs4_state_shutdown_net(struct net *net)
nfsd4_client_tracking_exit(net);
nfs4_state_destroy_net(net);
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+ nfsd4_ssc_shutdown_umount(nn);
+#endif
}
void
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 14dbfa75059d..9664303afdaf 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -484,6 +484,10 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval)
extern int nfsd4_is_junction(struct dentry *dentry);
extern int register_cld_notifier(void);
extern void unregister_cld_notifier(void);
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn);
+#endif
+
#else /* CONFIG_NFSD_V4 */
static inline int nfsd4_is_junction(struct dentry *dentry)
{
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index aff2cda5c6c3..6106697adc04 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -225,15 +225,12 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
* returns a crc32 hash for the filehandle that is compatible with
* the one displayed by "wireshark".
*/
-
-static inline u32
-knfsd_fh_hash(struct knfsd_fh *fh)
+static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
{
return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size);
}
#else
-static inline u32
-knfsd_fh_hash(struct knfsd_fh *fh)
+static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
{
return 0;
}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index dd5d69921676..ccb59e91011b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -403,6 +403,9 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred)
if (ret)
goto out_filecache;
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+ nfsd4_ssc_init_umount_work(nn);
+#endif
nn->nfsd_net_up = true;
return 0;
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 27a93ebd1d80..adaec43548d1 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -408,7 +408,6 @@ TRACE_EVENT(nfsd_dirent,
__entry->ino = ino;
__entry->len = namlen;
memcpy(__get_str(name), name, namlen);
- __assign_str(name, name);
),
TP_printk("fh_hash=0x%08x ino=%llu name=%.*s",
__entry->fh_hash, __entry->ino,
@@ -459,7 +458,6 @@ DEFINE_STATEID_EVENT(layout_recall_release);
DEFINE_STATEID_EVENT(open);
DEFINE_STATEID_EVENT(deleg_read);
-DEFINE_STATEID_EVENT(deleg_break);
DEFINE_STATEID_EVENT(deleg_recall);
DECLARE_EVENT_CLASS(nfsd_stateseqid_class,
@@ -511,7 +509,12 @@ DEFINE_EVENT(nfsd_clientid_class, nfsd_clid_##name, \
TP_PROTO(const clientid_t *clid), \
TP_ARGS(clid))
-DEFINE_CLIENTID_EVENT(expired);
+DEFINE_CLIENTID_EVENT(expire_unconf);
+DEFINE_CLIENTID_EVENT(reclaim_complete);
+DEFINE_CLIENTID_EVENT(confirmed);
+DEFINE_CLIENTID_EVENT(destroyed);
+DEFINE_CLIENTID_EVENT(admin_expired);
+DEFINE_CLIENTID_EVENT(replaced);
DEFINE_CLIENTID_EVENT(purged);
DEFINE_CLIENTID_EVENT(renew);
DEFINE_CLIENTID_EVENT(stale);
@@ -536,58 +539,102 @@ DEFINE_EVENT(nfsd_net_class, nfsd_##name, \
DEFINE_NET_EVENT(grace_start);
DEFINE_NET_EVENT(grace_complete);
-DECLARE_EVENT_CLASS(nfsd_clid_class,
- TP_PROTO(const struct nfsd_net *nn,
- unsigned int namelen,
- const unsigned char *namedata),
- TP_ARGS(nn, namelen, namedata),
+TRACE_EVENT(nfsd_clid_cred_mismatch,
+ TP_PROTO(
+ const struct nfs4_client *clp,
+ const struct svc_rqst *rqstp
+ ),
+ TP_ARGS(clp, rqstp),
TP_STRUCT__entry(
- __field(unsigned long long, boot_time)
- __field(unsigned int, namelen)
- __dynamic_array(unsigned char, name, namelen)
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(unsigned long, cl_flavor)
+ __field(unsigned long, new_flavor)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
),
TP_fast_assign(
- __entry->boot_time = nn->boot_time;
- __entry->namelen = namelen;
- memcpy(__get_dynamic_array(name), namedata, namelen);
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ __entry->cl_flavor = clp->cl_cred.cr_flavor;
+ __entry->new_flavor = rqstp->rq_cred.cr_flavor;
+ memcpy(__entry->addr, &rqstp->rq_xprt->xpt_remote,
+ sizeof(struct sockaddr_in6));
),
- TP_printk("boot_time=%16llx nfs4_clientid=%.*s",
- __entry->boot_time, __entry->namelen, __get_str(name))
+ TP_printk("client %08x:%08x flavor=%s, conflict=%s from addr=%pISpc",
+ __entry->cl_boot, __entry->cl_id,
+ show_nfsd_authflavor(__entry->cl_flavor),
+ show_nfsd_authflavor(__entry->new_flavor), __entry->addr
+ )
)
-#define DEFINE_CLID_EVENT(name) \
-DEFINE_EVENT(nfsd_clid_class, nfsd_clid_##name, \
- TP_PROTO(const struct nfsd_net *nn, \
- unsigned int namelen, \
- const unsigned char *namedata), \
- TP_ARGS(nn, namelen, namedata))
-
-DEFINE_CLID_EVENT(find);
-DEFINE_CLID_EVENT(reclaim);
+TRACE_EVENT(nfsd_clid_verf_mismatch,
+ TP_PROTO(
+ const struct nfs4_client *clp,
+ const struct svc_rqst *rqstp,
+ const nfs4_verifier *verf
+ ),
+ TP_ARGS(clp, rqstp, verf),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __array(unsigned char, cl_verifier, NFS4_VERIFIER_SIZE)
+ __array(unsigned char, new_verifier, NFS4_VERIFIER_SIZE)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ memcpy(__entry->cl_verifier, (void *)&clp->cl_verifier,
+ NFS4_VERIFIER_SIZE);
+ memcpy(__entry->new_verifier, (void *)verf,
+ NFS4_VERIFIER_SIZE);
+ memcpy(__entry->addr, &rqstp->rq_xprt->xpt_remote,
+ sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("client %08x:%08x verf=0x%s, updated=0x%s from addr=%pISpc",
+ __entry->cl_boot, __entry->cl_id,
+ __print_hex_str(__entry->cl_verifier, NFS4_VERIFIER_SIZE),
+ __print_hex_str(__entry->new_verifier, NFS4_VERIFIER_SIZE),
+ __entry->addr
+ )
+);
-TRACE_EVENT(nfsd_clid_inuse_err,
+DECLARE_EVENT_CLASS(nfsd_clid_class,
TP_PROTO(const struct nfs4_client *clp),
TP_ARGS(clp),
TP_STRUCT__entry(
__field(u32, cl_boot)
__field(u32, cl_id)
__array(unsigned char, addr, sizeof(struct sockaddr_in6))
- __field(unsigned int, namelen)
- __dynamic_array(unsigned char, name, clp->cl_name.len)
+ __field(unsigned long, flavor)
+ __array(unsigned char, verifier, NFS4_VERIFIER_SIZE)
+ __dynamic_array(char, name, clp->cl_name.len + 1)
),
TP_fast_assign(
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
memcpy(__entry->addr, &clp->cl_addr,
sizeof(struct sockaddr_in6));
- __entry->namelen = clp->cl_name.len;
- memcpy(__get_dynamic_array(name), clp->cl_name.data,
- clp->cl_name.len);
- ),
- TP_printk("nfs4_clientid %.*s already in use by %pISpc, client %08x:%08x",
- __entry->namelen, __get_str(name), __entry->addr,
+ __entry->flavor = clp->cl_cred.cr_flavor;
+ memcpy(__entry->verifier, (void *)&clp->cl_verifier,
+ NFS4_VERIFIER_SIZE);
+ memcpy(__get_str(name), clp->cl_name.data, clp->cl_name.len);
+ __get_str(name)[clp->cl_name.len] = '\0';
+ ),
+ TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x",
+ __entry->addr, __get_str(name),
+ __print_hex_str(__entry->verifier, NFS4_VERIFIER_SIZE),
+ show_nfsd_authflavor(__entry->flavor),
__entry->cl_boot, __entry->cl_id)
-)
+);
+
+#define DEFINE_CLID_EVENT(name) \
+DEFINE_EVENT(nfsd_clid_class, nfsd_clid_##name, \
+ TP_PROTO(const struct nfs4_client *clp), \
+ TP_ARGS(clp))
+
+DEFINE_CLID_EVENT(fresh);
+DEFINE_CLID_EVENT(confirmed_r);
/*
* from fs/nfsd/filecache.h
@@ -809,9 +856,9 @@ TRACE_EVENT(nfsd_cb_args,
memcpy(__entry->addr, &conn->cb_addr,
sizeof(struct sockaddr_in6));
),
- TP_printk("client %08x:%08x callback addr=%pISpc prog=%u ident=%u",
- __entry->cl_boot, __entry->cl_id,
- __entry->addr, __entry->prog, __entry->ident)
+ TP_printk("addr=%pISpc client %08x:%08x prog=%u ident=%u",
+ __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __entry->prog, __entry->ident)
);
TRACE_EVENT(nfsd_cb_nodelegs,
@@ -828,11 +875,6 @@ TRACE_EVENT(nfsd_cb_nodelegs,
TP_printk("client %08x:%08x", __entry->cl_boot, __entry->cl_id)
)
-TRACE_DEFINE_ENUM(NFSD4_CB_UP);
-TRACE_DEFINE_ENUM(NFSD4_CB_UNKNOWN);
-TRACE_DEFINE_ENUM(NFSD4_CB_DOWN);
-TRACE_DEFINE_ENUM(NFSD4_CB_FAULT);
-
#define show_cb_state(val) \
__print_symbolic(val, \
{ NFSD4_CB_UP, "UP" }, \
@@ -866,10 +908,53 @@ DEFINE_EVENT(nfsd_cb_class, nfsd_cb_##name, \
TP_PROTO(const struct nfs4_client *clp), \
TP_ARGS(clp))
-DEFINE_NFSD_CB_EVENT(setup);
DEFINE_NFSD_CB_EVENT(state);
+DEFINE_NFSD_CB_EVENT(probe);
+DEFINE_NFSD_CB_EVENT(lost);
DEFINE_NFSD_CB_EVENT(shutdown);
+TRACE_DEFINE_ENUM(RPC_AUTH_NULL);
+TRACE_DEFINE_ENUM(RPC_AUTH_UNIX);
+TRACE_DEFINE_ENUM(RPC_AUTH_GSS);
+TRACE_DEFINE_ENUM(RPC_AUTH_GSS_KRB5);
+TRACE_DEFINE_ENUM(RPC_AUTH_GSS_KRB5I);
+TRACE_DEFINE_ENUM(RPC_AUTH_GSS_KRB5P);
+
+#define show_nfsd_authflavor(val) \
+ __print_symbolic(val, \
+ { RPC_AUTH_NULL, "none" }, \
+ { RPC_AUTH_UNIX, "sys" }, \
+ { RPC_AUTH_GSS, "gss" }, \
+ { RPC_AUTH_GSS_KRB5, "krb5" }, \
+ { RPC_AUTH_GSS_KRB5I, "krb5i" }, \
+ { RPC_AUTH_GSS_KRB5P, "krb5p" })
+
+TRACE_EVENT(nfsd_cb_setup,
+ TP_PROTO(const struct nfs4_client *clp,
+ const char *netid,
+ rpc_authflavor_t authflavor
+ ),
+ TP_ARGS(clp, netid, authflavor),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(unsigned long, authflavor)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __array(unsigned char, netid, 8)
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ strlcpy(__entry->netid, netid, sizeof(__entry->netid));
+ __entry->authflavor = authflavor;
+ memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("addr=%pISpc client %08x:%08x proto=%s flavor=%s",
+ __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __entry->netid, show_nfsd_authflavor(__entry->authflavor))
+);
+
TRACE_EVENT(nfsd_cb_setup_err,
TP_PROTO(
const struct nfs4_client *clp,
@@ -893,52 +978,97 @@ TRACE_EVENT(nfsd_cb_setup_err,
__entry->addr, __entry->cl_boot, __entry->cl_id, __entry->error)
);
-TRACE_EVENT(nfsd_cb_work,
+TRACE_EVENT(nfsd_cb_recall,
TP_PROTO(
- const struct nfs4_client *clp,
- const char *procedure
+ const struct nfs4_stid *stid
),
- TP_ARGS(clp, procedure),
+ TP_ARGS(stid),
TP_STRUCT__entry(
__field(u32, cl_boot)
__field(u32, cl_id)
- __string(procedure, procedure)
+ __field(u32, si_id)
+ __field(u32, si_generation)
__array(unsigned char, addr, sizeof(struct sockaddr_in6))
),
TP_fast_assign(
+ const stateid_t *stp = &stid->sc_stateid;
+ const struct nfs4_client *clp = stid->sc_client;
+
+ __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
+ __entry->cl_id = stp->si_opaque.so_clid.cl_id;
+ __entry->si_id = stp->si_opaque.so_id;
+ __entry->si_generation = stp->si_generation;
+ if (clp)
+ memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
+ sizeof(struct sockaddr_in6));
+ else
+ memset(__entry->addr, 0, sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x",
+ __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __entry->si_id, __entry->si_generation)
+);
+
+TRACE_EVENT(nfsd_cb_notify_lock,
+ TP_PROTO(
+ const struct nfs4_lockowner *lo,
+ const struct nfsd4_blocked_lock *nbl
+ ),
+ TP_ARGS(lo, nbl),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(u32, fh_hash)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+ const struct nfs4_client *clp = lo->lo_owner.so_client;
+
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
- __assign_str(procedure, procedure)
+ __entry->fh_hash = knfsd_fh_hash(&nbl->nbl_fh);
memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
sizeof(struct sockaddr_in6));
),
- TP_printk("addr=%pISpc client %08x:%08x procedure=%s",
+ TP_printk("addr=%pISpc client %08x:%08x fh_hash=0x%08x",
__entry->addr, __entry->cl_boot, __entry->cl_id,
- __get_str(procedure))
+ __entry->fh_hash)
);
-TRACE_EVENT(nfsd_cb_done,
+TRACE_EVENT(nfsd_cb_offload,
TP_PROTO(
const struct nfs4_client *clp,
- int status
+ const stateid_t *stp,
+ const struct knfsd_fh *fh,
+ u64 count,
+ __be32 status
),
- TP_ARGS(clp, status),
+ TP_ARGS(clp, stp, fh, count, status),
TP_STRUCT__entry(
__field(u32, cl_boot)
__field(u32, cl_id)
+ __field(u32, si_id)
+ __field(u32, si_generation)
+ __field(u32, fh_hash)
__field(int, status)
+ __field(u64, count)
__array(unsigned char, addr, sizeof(struct sockaddr_in6))
),
TP_fast_assign(
- __entry->cl_boot = clp->cl_clientid.cl_boot;
- __entry->cl_id = clp->cl_clientid.cl_id;
- __entry->status = status;
+ __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
+ __entry->cl_id = stp->si_opaque.so_clid.cl_id;
+ __entry->si_id = stp->si_opaque.so_id;
+ __entry->si_generation = stp->si_generation;
+ __entry->fh_hash = knfsd_fh_hash(fh);
+ __entry->status = be32_to_cpu(status);
+ __entry->count = count;
memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
sizeof(struct sockaddr_in6));
),
- TP_printk("addr=%pISpc client %08x:%08x status=%d",
+ TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x fh_hash=0x%08x count=%llu status=%d",
__entry->addr, __entry->cl_boot, __entry->cl_id,
- __entry->status)
+ __entry->si_id, __entry->si_generation,
+ __entry->fh_hash, __entry->count, __entry->status)
);
#endif /* _NFSD_TRACE_H */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 15adf1f6ab21..a224a5e23cc1 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1123,6 +1123,19 @@ out:
}
#ifdef CONFIG_NFSD_V3
+static int
+nfsd_filemap_write_and_wait_range(struct nfsd_file *nf, loff_t offset,
+ loff_t end)
+{
+ struct address_space *mapping = nf->nf_file->f_mapping;
+ int ret = filemap_fdatawrite_range(mapping, offset, end);
+
+ if (ret)
+ return ret;
+ filemap_fdatawait_range_keep_errors(mapping, offset, end);
+ return 0;
+}
+
/*
* Commit all pending writes to stable storage.
*
@@ -1153,10 +1166,11 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (err)
goto out;
if (EX_ISSYNC(fhp->fh_export)) {
- int err2;
+ int err2 = nfsd_filemap_write_and_wait_range(nf, offset, end);
down_write(&nf->nf_rwsem);
- err2 = vfs_fsync_range(nf->nf_file, offset, end, 0);
+ if (!err2)
+ err2 = vfs_fsync_range(nf->nf_file, offset, end, 0);
switch (err2) {
case 0:
nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net,
@@ -1613,9 +1627,9 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
host_err = vfs_symlink(&init_user_ns, d_inode(dentry), dnew, path);
err = nfserrno(host_err);
+ fh_unlock(fhp);
if (!err)
err = nfserrno(commit_metadata(fhp));
- fh_unlock(fhp);
fh_drop_write(fhp);
@@ -1680,6 +1694,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
if (d_really_is_negative(dold))
goto out_dput;
host_err = vfs_link(dold, &init_user_ns, dirp, dnew, NULL);
+ fh_unlock(ffhp);
if (!host_err) {
err = nfserrno(commit_metadata(ffhp));
if (!err)
@@ -1859,6 +1874,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
{
struct dentry *dentry, *rdentry;
struct inode *dirp;
+ struct inode *rinode;
__be32 err;
int host_err;
@@ -1887,6 +1903,8 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
host_err = -ENOENT;
goto out_drop_write;
}
+ rinode = d_inode(rdentry);
+ ihold(rinode);
if (!type)
type = d_inode(rdentry)->i_mode & S_IFMT;
@@ -1899,9 +1917,11 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
host_err = vfs_rmdir(&init_user_ns, dirp, rdentry);
}
+ fh_unlock(fhp);
if (!host_err)
host_err = commit_metadata(fhp);
dput(rdentry);
+ iput(rinode); /* truncate the inode here */
out_drop_write:
fh_drop_write(fhp);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index a7c425254fee..3e4052e3bd50 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -567,6 +567,7 @@ struct nfsd4_copy {
struct vfsmount *ss_mnt;
struct nfs_fh c_fh;
nfs4_stateid stateid;
+ bool committed;
};
struct nfsd4_seek {
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index f42ab57201e7..ab9ec073330f 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -738,7 +738,6 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
if (ptr2 != ptr + cnt || ++cnt == maxblocks)
goto end;
index++;
- continue;
}
if (level == maxlevel)
break;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 64864fb40b40..28b67cb9458d 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -54,22 +54,27 @@ static int fanotify_max_queued_events __read_mostly;
#include <linux/sysctl.h>
+static long ft_zero = 0;
+static long ft_int_max = INT_MAX;
+
struct ctl_table fanotify_table[] = {
{
.procname = "max_user_groups",
.data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(long),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &ft_zero,
+ .extra2 = &ft_int_max,
},
{
.procname = "max_user_marks",
.data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(long),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &ft_zero,
+ .extra2 = &ft_int_max,
},
{
.procname = "max_queued_events",
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 98f61b31745a..62051247f6d2 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -55,22 +55,27 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
#include <linux/sysctl.h>
+static long it_zero = 0;
+static long it_int_max = INT_MAX;
+
struct ctl_table inotify_table[] = {
{
.procname = "max_user_instances",
.data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(long),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &it_zero,
+ .extra2 = &it_int_max,
},
{
.procname = "max_user_watches",
.data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(long),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &it_zero,
+ .extra2 = &it_int_max,
},
{
.procname = "max_queued_events",
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index e5aab265dff1..ab4f3362466d 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1684,20 +1684,17 @@ static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
{
struct page **last_page = pages + nr_pages;
size_t total = 0;
- struct iov_iter data = *i;
unsigned len, copied;
do {
len = PAGE_SIZE - ofs;
if (len > bytes)
len = bytes;
- copied = iov_iter_copy_from_user_atomic(*pages, &data, ofs,
- len);
+ copied = copy_page_from_iter_atomic(*pages, ofs, len, i);
total += copied;
bytes -= copied;
if (!bytes)
break;
- iov_iter_advance(&data, copied);
if (copied < len)
goto err;
ofs = 0;
@@ -1866,34 +1863,24 @@ again:
if (likely(copied == bytes)) {
status = ntfs_commit_pages_after_write(pages, do_pages,
pos, bytes);
- if (!status)
- status = bytes;
}
do {
unlock_page(pages[--do_pages]);
put_page(pages[do_pages]);
} while (do_pages);
- if (unlikely(status < 0))
+ if (unlikely(status < 0)) {
+ iov_iter_revert(i, copied);
break;
- copied = status;
+ }
cond_resched();
- if (unlikely(!copied)) {
- size_t sc;
-
- /*
- * We failed to copy anything. Fall back to single
- * segment length write.
- *
- * This is needed to avoid possible livelock in the
- * case that all segments in the iov cannot be copied
- * at once without a pagefault.
- */
- sc = iov_iter_single_seg_count(i);
- if (bytes > sc)
- bytes = sc;
+ if (unlikely(copied < bytes)) {
+ iov_iter_revert(i, copied);
+ if (copied)
+ bytes = copied;
+ else if (bytes > PAGE_SIZE - ofs)
+ bytes = PAGE_SIZE - ofs;
goto again;
}
- iov_iter_advance(i, copied);
pos += copied;
written += copied;
balance_dirty_pages_ratelimited(mapping);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 775657943057..54d7843c0211 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1529,6 +1529,45 @@ static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
}
}
+/*
+ * zero out partial blocks of one cluster.
+ *
+ * start: file offset where zero starts, will be made upper block aligned.
+ * len: it will be trimmed to the end of current cluster if "start + len"
+ * is bigger than it.
+ */
+static int ocfs2_zeroout_partial_cluster(struct inode *inode,
+ u64 start, u64 len)
+{
+ int ret;
+ u64 start_block, end_block, nr_blocks;
+ u64 p_block, offset;
+ u32 cluster, p_cluster, nr_clusters;
+ struct super_block *sb = inode->i_sb;
+ u64 end = ocfs2_align_bytes_to_clusters(sb, start);
+
+ if (start + len < end)
+ end = start + len;
+
+ start_block = ocfs2_blocks_for_bytes(sb, start);
+ end_block = ocfs2_blocks_for_bytes(sb, end);
+ nr_blocks = end_block - start_block;
+ if (!nr_blocks)
+ return 0;
+
+ cluster = ocfs2_bytes_to_clusters(sb, start);
+ ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
+ &nr_clusters, NULL);
+ if (ret)
+ return ret;
+ if (!p_cluster)
+ return 0;
+
+ offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
+ p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
+ return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
+}
+
static int ocfs2_zero_partial_clusters(struct inode *inode,
u64 start, u64 len)
{
@@ -1538,6 +1577,7 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
unsigned int csize = osb->s_clustersize;
handle_t *handle;
+ loff_t isize = i_size_read(inode);
/*
* The "start" and "end" values are NOT necessarily part of
@@ -1558,6 +1598,26 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
goto out;
+ /* No page cache for EOF blocks, issue zero out to disk. */
+ if (end > isize) {
+ /*
+ * zeroout eof blocks in last cluster starting from
+ * "isize" even "start" > "isize" because it is
+ * complicated to zeroout just at "start" as "start"
+ * may be not aligned with block size, buffer write
+ * would be required to do that, but out of eof buffer
+ * write is not supported.
+ */
+ ret = ocfs2_zeroout_partial_cluster(inode, isize,
+ end - isize);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ if (start >= isize)
+ goto out;
+ end = isize;
+ }
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
@@ -1856,45 +1916,6 @@ out:
}
/*
- * zero out partial blocks of one cluster.
- *
- * start: file offset where zero starts, will be made upper block aligned.
- * len: it will be trimmed to the end of current cluster if "start + len"
- * is bigger than it.
- */
-static int ocfs2_zeroout_partial_cluster(struct inode *inode,
- u64 start, u64 len)
-{
- int ret;
- u64 start_block, end_block, nr_blocks;
- u64 p_block, offset;
- u32 cluster, p_cluster, nr_clusters;
- struct super_block *sb = inode->i_sb;
- u64 end = ocfs2_align_bytes_to_clusters(sb, start);
-
- if (start + len < end)
- end = start + len;
-
- start_block = ocfs2_blocks_for_bytes(sb, start);
- end_block = ocfs2_blocks_for_bytes(sb, end);
- nr_blocks = end_block - start_block;
- if (!nr_blocks)
- return 0;
-
- cluster = ocfs2_bytes_to_clusters(sb, start);
- ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
- &nr_clusters, NULL);
- if (ret)
- return ret;
- if (!p_cluster)
- return 0;
-
- offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
- p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
- return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
-}
-
-/*
* Parts of this function taken from xfs_change_file_space()
*/
static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
@@ -1935,7 +1956,6 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
goto out_inode_unlock;
}
- orig_isize = i_size_read(inode);
switch (sr->l_whence) {
case 0: /*SEEK_SET*/
break;
@@ -1943,7 +1963,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
sr->l_start += f_pos;
break;
case 2: /*SEEK_END*/
- sr->l_start += orig_isize;
+ sr->l_start += i_size_read(inode);
break;
default:
ret = -EINVAL;
@@ -1998,6 +2018,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
ret = -EINVAL;
}
+ orig_isize = i_size_read(inode);
/* zeroout eof blocks in the cluster. */
if (!ret && change_size && orig_isize < size) {
ret = ocfs2_zeroout_partial_cluster(inode, orig_isize,
diff --git a/fs/open.c b/fs/open.c
index 53bc0573c0ec..94bef26ff1b6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -852,8 +852,17 @@ static int do_dentry_open(struct file *f,
* XXX: Huge page cache doesn't support writing yet. Drop all page
* cache for this file before processing writes.
*/
- if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping))
- truncate_pagecache(inode, 0);
+ if (f->f_mode & FMODE_WRITE) {
+ /*
+ * Paired with smp_mb() in collapse_file() to ensure nr_thps
+ * is up to date and the update to i_writecount by
+ * get_write_access() is visible. Ensures subsequent insertion
+ * of THPs into the page cache will fail.
+ */
+ smp_mb();
+ if (filemap_nr_thps(inode->i_mapping))
+ truncate_pagecache(inode, 0);
+ }
return 0;
@@ -1164,7 +1173,7 @@ struct file *filp_open(const char *filename, int flags, umode_t mode)
}
EXPORT_SYMBOL(filp_open);
-struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
+struct file *file_open_root(const struct path *root,
const char *filename, int flags, umode_t mode)
{
struct open_flags op;
@@ -1172,7 +1181,7 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
int err = build_open_flags(&how, &op);
if (err)
return ERR_PTR(err);
- return do_file_open_root(dentry, mnt, filename, &op);
+ return do_file_open_root(root, filename, &op);
}
EXPORT_SYMBOL(file_open_root);
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 6bf35a0d61f3..16ac617df7d7 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -249,8 +249,7 @@ static void orangefs_readahead(struct readahead_control *rac)
{
loff_t offset;
struct iov_iter iter;
- struct file *file = rac->file;
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = rac->mapping->host;
struct xarray *i_pages;
struct page *page;
loff_t new_start = readahead_pos(rac);
@@ -269,14 +268,14 @@ static void orangefs_readahead(struct readahead_control *rac)
readahead_expand(rac, new_start, new_len);
offset = readahead_pos(rac);
- i_pages = &file->f_mapping->i_pages;
+ i_pages = &rac->mapping->i_pages;
iov_iter_xarray(&iter, READ, i_pages, offset, readahead_length(rac));
/* read in the pages. */
if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode,
&offset, &iter, readahead_length(rac),
- inode->i_size, NULL, NULL, file)) < 0)
+ inode->i_size, NULL, NULL, rac->file)) < 0)
gossip_debug(GOSSIP_FILE_DEBUG,
"%s: wait_for_direct_io failed. \n", __func__);
else
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index ee5efdc35cc1..2f2e430461b2 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -209,7 +209,7 @@ static int orangefs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = (sector_t) new_op->downcall.resp.statfs.blocks_avail;
buf->f_files = (sector_t) new_op->downcall.resp.statfs.files_total;
buf->f_ffree = (sector_t) new_op->downcall.resp.statfs.files_avail;
- buf->f_frsize = sb->s_blocksize;
+ buf->f_frsize = 0;
out_op_release:
op_release(new_op);
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 41ebf52f1bbc..ebde05c9cf62 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -392,6 +392,7 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected,
*/
take_dentry_name_snapshot(&name, real);
this = lookup_one_len(name.name.name, connected, name.name.len);
+ release_dentry_name_snapshot(&name);
err = PTR_ERR(this);
if (IS_ERR(this)) {
goto fail;
@@ -406,7 +407,6 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected,
}
out:
- release_dentry_name_snapshot(&name);
dput(parent);
inode_unlock(dir);
return this;
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 4d53d3b7e5fe..d081faa55e83 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -392,6 +392,51 @@ out_unlock:
return ret;
}
+/*
+ * Calling iter_file_splice_write() directly from overlay's f_op may deadlock
+ * due to lock order inversion between pipe->mutex in iter_file_splice_write()
+ * and file_start_write(real.file) in ovl_write_iter().
+ *
+ * So do everything ovl_write_iter() does and call iter_file_splice_write() on
+ * the real file.
+ */
+static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags)
+{
+ struct fd real;
+ const struct cred *old_cred;
+ struct inode *inode = file_inode(out);
+ struct inode *realinode = ovl_inode_real(inode);
+ ssize_t ret;
+
+ inode_lock(inode);
+ /* Update mode */
+ ovl_copyattr(realinode, inode);
+ ret = file_remove_privs(out);
+ if (ret)
+ goto out_unlock;
+
+ ret = ovl_real_fdget(out, &real);
+ if (ret)
+ goto out_unlock;
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ file_start_write(real.file);
+
+ ret = iter_file_splice_write(pipe, real.file, ppos, len, flags);
+
+ file_end_write(real.file);
+ /* Update size */
+ ovl_copyattr(realinode, inode);
+ revert_creds(old_cred);
+ fdput(real);
+
+out_unlock:
+ inode_unlock(inode);
+
+ return ret;
+}
+
static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct fd real;
@@ -603,7 +648,7 @@ const struct file_operations ovl_file_operations = {
.fadvise = ovl_fadvise,
.flush = ovl_flush,
.splice_read = generic_file_splice_read,
- .splice_write = iter_file_splice_write,
+ .splice_write = ovl_splice_write,
.copy_file_range = ovl_copy_file_range,
.remap_file_range = ovl_remap_file_range,
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index e8ad2c2c77dd..150fdf3bc68d 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -481,6 +481,8 @@ static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p)
}
this = lookup_one_len(p->name, dir, p->len);
if (IS_ERR_OR_NULL(this) || !this->d_inode) {
+ /* Mark a stale entry */
+ p->is_whiteout = true;
if (IS_ERR(this)) {
err = PTR_ERR(this);
this = NULL;
@@ -776,6 +778,9 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
if (err)
goto out;
}
+ }
+ /* ovl_cache_update_ino() sets is_whiteout on stale entry */
+ if (!p->is_whiteout) {
if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
break;
}
diff --git a/fs/pipe.c b/fs/pipe.c
index bfd946a9ad01..678dee2a8228 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -32,6 +32,21 @@
#include "internal.h"
/*
+ * New pipe buffers will be restricted to this size while the user is exceeding
+ * their pipe buffer quota. The general pipe use case needs at least two
+ * buffers: one for data yet to be read, and one for new data. If this is less
+ * than two, then a write to a non-empty pipe may block even if the pipe is not
+ * full. This can occur with GNU make jobserver or similar uses of pipes as
+ * semaphores: multiple processes may be waiting to write tokens back to the
+ * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/.
+ *
+ * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their
+ * own risk, namely: pipe writes to non-full pipes may block until the pipe is
+ * emptied.
+ */
+#define PIPE_MIN_DEF_BUFFERS 2
+
+/*
* The max size that a non-root user is allowed to grow the pipe. Can
* be set by root in /proc/sys/fs/pipe-max-size
*/
@@ -429,14 +444,11 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
#endif
/*
- * Only wake up if the pipe started out empty, since
- * otherwise there should be no readers waiting.
- *
* If it wasn't empty we try to merge new data into
* the last buffer.
*
* That naturally merges small writes, but it also
- * page-aligs the rest of the writes for large writes
+ * page-aligns the rest of the writes for large writes
* spanning multiple pages.
*/
head = pipe->head;
@@ -575,8 +587,11 @@ out:
* This is particularly important for small writes, because of
* how (for example) the GNU make jobserver uses small writes to
* wake up pending jobs
+ *
+ * Epoll nonsensically wants a wakeup whether the pipe
+ * was already empty or not.
*/
- if (was_empty) {
+ if (was_empty || pipe->poll_usage) {
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
@@ -639,6 +654,9 @@ pipe_poll(struct file *filp, poll_table *wait)
struct pipe_inode_info *pipe = filp->private_data;
unsigned int head, tail;
+ /* Epoll has some historical nasty semantics, this enables them */
+ pipe->poll_usage = 1;
+
/*
* Reading pipe state only -- no need for acquiring the semaphore.
*
@@ -781,8 +799,8 @@ struct pipe_inode_info *alloc_pipe_info(void)
user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
- user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
- pipe_bufs = 1;
+ user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
+ pipe_bufs = PIPE_MIN_DEF_BUFFERS;
}
if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9cbd915025ad..e5b5f7709d48 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -854,7 +854,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
while (count > 0) {
- int this_len = min_t(int, count, PAGE_SIZE);
+ size_t this_len = min_t(size_t, count, PAGE_SIZE);
if (write && copy_from_user(page, buf, this_len)) {
copied = -EFAULT;
@@ -3172,7 +3172,7 @@ static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
- DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
@@ -3517,7 +3517,7 @@ static const struct inode_operations proc_tid_comm_inode_operations = {
*/
static const struct pid_entry tid_base_stuff[] = {
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
- DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
index ad31ec4ad627..6d8d4bf20837 100644
--- a/fs/proc/bootconfig.c
+++ b/fs/proc/bootconfig.c
@@ -49,7 +49,7 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size)
else
q = '"';
ret = snprintf(dst, rest(dst, end), "%c%s%c%s",
- q, val, q, vnode->next ? ", " : "\n");
+ q, val, q, xbc_node_is_array(vnode) ? ", " : "\n");
if (ret < 0)
goto out;
dst += ret;
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 07fc4fad2602..172c86270b31 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -6,6 +6,7 @@
#include <linux/fdtable.h>
#include <linux/namei.h>
#include <linux/pid.h>
+#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/file.h>
#include <linux/seq_file.h>
@@ -53,9 +54,10 @@ static int seq_show(struct seq_file *m, void *v)
if (ret)
return ret;
- seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n",
+ seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\nino:\t%lu\n",
(long long)file->f_pos, f_flags,
- real_mount(file->f_path.mnt)->mnt_id);
+ real_mount(file->f_path.mnt)->mnt_id,
+ file_inode(file)->i_ino);
/* show_fd_locks() never deferences files so a stale value is safe */
show_fd_locks(m, file, files);
@@ -72,6 +74,18 @@ out:
static int seq_fdinfo_open(struct inode *inode, struct file *file)
{
+ bool allowed = false;
+ struct task_struct *task = get_proc_task(inode);
+
+ if (!task)
+ return -ESRCH;
+
+ allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
+ put_task_struct(task);
+
+ if (!allowed)
+ return -EACCES;
+
return single_open(file, seq_show, inode);
}
@@ -308,7 +322,7 @@ static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry,
struct proc_inode *ei;
struct inode *inode;
- inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUSR);
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUGO);
if (!inode)
return ERR_PTR(-ENOENT);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 4d2e64e9016c..982e694aae77 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -313,6 +313,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
{
char *buf = file->private_data;
size_t phdrs_offset, notes_offset, data_offset;
+ size_t page_offline_frozen = 1;
size_t phdrs_len, notes_len;
struct kcore_list *m;
size_t tsz;
@@ -322,6 +323,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
int ret = 0;
down_read(&kclist_lock);
+ /*
+ * Don't race against drivers that set PageOffline() and expect no
+ * further page access.
+ */
+ page_offline_freeze();
get_kcore_size(&nphdr, &phdrs_len, &notes_len, &data_offset);
phdrs_offset = sizeof(struct elfhdr);
@@ -380,11 +386,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
phdr->p_type = PT_LOAD;
phdr->p_flags = PF_R | PF_W | PF_X;
phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset;
- if (m->type == KCORE_REMAP)
- phdr->p_vaddr = (size_t)m->vaddr;
- else
- phdr->p_vaddr = (size_t)m->addr;
- if (m->type == KCORE_RAM || m->type == KCORE_REMAP)
+ phdr->p_vaddr = (size_t)m->addr;
+ if (m->type == KCORE_RAM)
phdr->p_paddr = __pa(m->addr);
else if (m->type == KCORE_TEXT)
phdr->p_paddr = __pa_symbol(m->addr);
@@ -468,6 +471,9 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
m = NULL;
while (buflen) {
+ struct page *page;
+ unsigned long pfn;
+
/*
* If this is the first iteration or the address is not within
* the previous entry, search for a matching entry.
@@ -480,31 +486,57 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
}
}
+ if (page_offline_frozen++ % MAX_ORDER_NR_PAGES == 0) {
+ page_offline_thaw();
+ cond_resched();
+ page_offline_freeze();
+ }
+
if (&m->list == &kclist_head) {
if (clear_user(buffer, tsz)) {
ret = -EFAULT;
goto out;
}
m = NULL; /* skip the list anchor */
- } else if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) {
- if (clear_user(buffer, tsz)) {
- ret = -EFAULT;
- goto out;
- }
- } else if (m->type == KCORE_VMALLOC) {
+ goto skip;
+ }
+
+ switch (m->type) {
+ case KCORE_VMALLOC:
vread(buf, (char *)start, tsz);
/* we have to zero-fill user buffer even if no read */
if (copy_to_user(buffer, buf, tsz)) {
ret = -EFAULT;
goto out;
}
- } else if (m->type == KCORE_USER) {
+ break;
+ case KCORE_USER:
/* User page is handled prior to normal kernel page: */
if (copy_to_user(buffer, (char *)start, tsz)) {
ret = -EFAULT;
goto out;
}
- } else {
+ break;
+ case KCORE_RAM:
+ pfn = __pa(start) >> PAGE_SHIFT;
+ page = pfn_to_online_page(pfn);
+
+ /*
+ * Don't read offline sections, logically offline pages
+ * (e.g., inflated in a balloon), hwpoisoned pages,
+ * and explicitly excluded physical ranges.
+ */
+ if (!page || PageOffline(page) ||
+ is_page_hwpoison(page) || !pfn_is_ram(pfn)) {
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ break;
+ }
+ fallthrough;
+ case KCORE_VMEMMAP:
+ case KCORE_TEXT:
if (kern_addr_valid(start)) {
/*
* Using bounce buffer to bypass the
@@ -528,7 +560,15 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
goto out;
}
}
+ break;
+ default:
+ pr_warn_once("Unhandled KCORE type: %d\n", m->type);
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
}
+skip:
buflen -= tsz;
*fpos += tsz;
buffer += tsz;
@@ -537,6 +577,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
}
out:
+ page_offline_thaw();
up_read(&kclist_lock);
if (ret)
return ret;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index dea0f5ee540c..5d66faecd4ef 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1807,7 +1807,7 @@ static int process_sysctl_arg(char *param, char *val,
panic("%s: Failed to allocate path for %s\n", __func__, param);
strreplace(path, '.', '/');
- file = file_open_root((*proc_mnt)->mnt_root, *proc_mnt, path, O_WRONLY, 0);
+ file = file_open_root_mnt(*proc_mnt, path, O_WRONLY, 0);
if (IS_ERR(file)) {
err = PTR_ERR(file);
if (err == -ENOENT)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 66965ad88d8b..eb97468dfe4c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -514,10 +514,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
} else {
mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
}
- } else if (is_migration_entry(swpent))
- page = migration_entry_to_page(swpent);
- else if (is_device_private_entry(swpent))
- page = device_private_entry_to_page(swpent);
+ } else if (is_pfn_swap_entry(swpent))
+ page = pfn_swap_entry_to_page(swpent);
} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
&& pte_none(*pte))) {
page = xa_load(&vma->vm_file->f_mapping->i_pages,
@@ -549,7 +547,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
swp_entry_t entry = pmd_to_swp_entry(*pmd);
if (is_migration_entry(entry))
- page = migration_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
}
if (IS_ERR_OR_NULL(page))
return;
@@ -694,10 +692,8 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
} else if (is_swap_pte(*pte)) {
swp_entry_t swpent = pte_to_swp_entry(*pte);
- if (is_migration_entry(swpent))
- page = migration_entry_to_page(swpent);
- else if (is_device_private_entry(swpent))
- page = device_private_entry_to_page(swpent);
+ if (is_pfn_swap_entry(swpent))
+ page = pfn_swap_entry_to_page(swpent);
}
if (page) {
int mapcount = page_mapcount(page);
@@ -832,7 +828,7 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
seq_printf(m, "THPeligible: %d\n",
- transparent_hugepage_enabled(vma));
+ transparent_hugepage_active(vma));
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
@@ -1302,6 +1298,7 @@ struct pagemapread {
#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
#define PM_SOFT_DIRTY BIT_ULL(55)
#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
+#define PM_UFFD_WP BIT_ULL(57)
#define PM_FILE BIT_ULL(61)
#define PM_SWAP BIT_ULL(62)
#define PM_PRESENT BIT_ULL(63)
@@ -1375,20 +1372,21 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
page = vm_normal_page(vma, addr, pte);
if (pte_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
+ if (pte_uffd_wp(pte))
+ flags |= PM_UFFD_WP;
} else if (is_swap_pte(pte)) {
swp_entry_t entry;
if (pte_swp_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
+ if (pte_swp_uffd_wp(pte))
+ flags |= PM_UFFD_WP;
entry = pte_to_swp_entry(pte);
if (pm->show_pfn)
frame = swp_type(entry) |
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
flags |= PM_SWAP;
- if (is_migration_entry(entry))
- page = migration_entry_to_page(entry);
-
- if (is_device_private_entry(entry))
- page = device_private_entry_to_page(entry);
+ if (is_pfn_swap_entry(entry))
+ page = pfn_swap_entry_to_page(entry);
}
if (page && !PageAnon(page))
@@ -1426,6 +1424,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
flags |= PM_PRESENT;
if (pmd_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
+ if (pmd_uffd_wp(pmd))
+ flags |= PM_UFFD_WP;
if (pm->show_pfn)
frame = pmd_pfn(pmd) +
((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1444,8 +1444,10 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
flags |= PM_SWAP;
if (pmd_swp_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
+ if (pmd_swp_uffd_wp(pmd))
+ flags |= PM_UFFD_WP;
VM_BUG_ON(!is_pmd_migration_entry(pmd));
- page = migration_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
}
#endif
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 476a7ff49482..ef42729216d1 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -387,6 +387,24 @@ void pathrelse(struct treepath *search_path)
search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
}
+static int has_valid_deh_location(struct buffer_head *bh, struct item_head *ih)
+{
+ struct reiserfs_de_head *deh;
+ int i;
+
+ deh = B_I_DEH(bh, ih);
+ for (i = 0; i < ih_entry_count(ih); i++) {
+ if (deh_location(&deh[i]) > ih_item_len(ih)) {
+ reiserfs_warning(NULL, "reiserfs-5094",
+ "directory entry location seems wrong %h",
+ &deh[i]);
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
{
struct block_head *blkh;
@@ -454,11 +472,14 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
"(second one): %h", ih);
return 0;
}
- if (is_direntry_le_ih(ih) && (ih_item_len(ih) < (ih_entry_count(ih) * IH_SIZE))) {
- reiserfs_warning(NULL, "reiserfs-5093",
- "item entry count seems wrong %h",
- ih);
- return 0;
+ if (is_direntry_le_ih(ih)) {
+ if (ih_item_len(ih) < (ih_entry_count(ih) * IH_SIZE)) {
+ reiserfs_warning(NULL, "reiserfs-5093",
+ "item entry count seems wrong %h",
+ ih);
+ return 0;
+ }
+ return has_valid_deh_location(bh, ih);
}
prev_location = ih_location(ih);
}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3ffafc73acf0..58481f8d63d5 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2082,6 +2082,14 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
unlock_new_inode(root_inode);
}
+ if (!S_ISDIR(root_inode->i_mode) || !inode_get_bytes(root_inode) ||
+ !root_inode->i_size) {
+ SWARN(silent, s, "", "corrupt root inode, run fsck");
+ iput(root_inode);
+ errval = -EUCLEAN;
+ goto error;
+ }
+
s->s_root = d_make_root(root_inode);
if (!s->s_root)
goto error;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 5059248f2d64..4a2cda04d3e2 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -32,6 +32,9 @@ static void seq_set_overflow(struct seq_file *m)
static void *seq_buf_alloc(unsigned long size)
{
+ if (unlikely(size > MAX_RW_COUNT))
+ return NULL;
+
return kvmalloc(size, GFP_KERNEL_ACCOUNT);
}
@@ -356,6 +359,31 @@ int seq_release(struct inode *inode, struct file *file)
EXPORT_SYMBOL(seq_release);
/**
+ * seq_escape_mem - print data into buffer, escaping some characters
+ * @m: target buffer
+ * @src: source buffer
+ * @len: size of source buffer
+ * @flags: flags to pass to string_escape_mem()
+ * @esc: set of characters that need escaping
+ *
+ * Puts data into buffer, replacing each occurrence of character from
+ * given class (defined by @flags and @esc) with printable escaped sequence.
+ *
+ * Use seq_has_overflowed() to check for errors.
+ */
+void seq_escape_mem(struct seq_file *m, const char *src, size_t len,
+ unsigned int flags, const char *esc)
+{
+ char *buf;
+ size_t size = seq_get_buf(m, &buf);
+ int ret;
+
+ ret = string_escape_mem(src, len, buf, size, flags, esc);
+ seq_commit(m, ret < size ? ret : -1);
+}
+EXPORT_SYMBOL(seq_escape_mem);
+
+/**
* seq_escape - print string into buffer, escaping some characters
* @m: target buffer
* @s: string
@@ -367,26 +395,10 @@ EXPORT_SYMBOL(seq_release);
*/
void seq_escape(struct seq_file *m, const char *s, const char *esc)
{
- char *buf;
- size_t size = seq_get_buf(m, &buf);
- int ret;
-
- ret = string_escape_str(s, buf, size, ESCAPE_OCTAL, esc);
- seq_commit(m, ret < size ? ret : -1);
+ seq_escape_str(m, s, ESCAPE_OCTAL, esc);
}
EXPORT_SYMBOL(seq_escape);
-void seq_escape_mem_ascii(struct seq_file *m, const char *src, size_t isz)
-{
- char *buf;
- size_t size = seq_get_buf(m, &buf);
- int ret;
-
- ret = string_escape_mem_ascii(src, isz, buf, size);
- seq_commit(m, ret < size ? ret : -1);
-}
-EXPORT_SYMBOL(seq_escape_mem_ascii);
-
void seq_vprintf(struct seq_file *m, const char *f, va_list args)
{
int len;
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 1bbb9fe661b1..fc718f6178f2 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2824,7 +2824,7 @@ void dbg_debugfs_init_fs(struct ubifs_info *c)
n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME,
c->vi.ubi_num, c->vi.vol_id);
- if (n == UBIFS_DFS_DIR_LEN) {
+ if (n > UBIFS_DFS_DIR_LEN) {
/* The array size is too small */
return;
}
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 5bd8482e660a..7c61d0ec0159 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1337,7 +1337,10 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_release;
}
+ spin_lock(&whiteout->i_lock);
whiteout->i_state |= I_LINKABLE;
+ spin_unlock(&whiteout->i_lock);
+
whiteout_ui = ubifs_inode(whiteout);
whiteout_ui->data = dev;
whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0));
@@ -1430,7 +1433,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
inc_nlink(whiteout);
mark_inode_dirty(whiteout);
+
+ spin_lock(&whiteout->i_lock);
whiteout->i_state &= ~I_LINKABLE;
+ spin_unlock(&whiteout->i_lock);
+
iput(whiteout);
}
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 2857e64d673d..8ea680dba61e 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -882,6 +882,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
struct ubifs_dent_node *xent, *pxent = NULL;
if (ui->xattr_cnt > ubifs_xattr_max_cnt(c)) {
+ err = -EPERM;
ubifs_err(c, "Cannot delete inode, it has too much xattrs!");
goto out_release;
}
@@ -1431,7 +1432,7 @@ out_free:
/**
* truncate_data_node - re-compress/encrypt a truncated data node.
* @c: UBIFS file-system description object
- * @inode: inode which referes to the data node
+ * @inode: inode which refers to the data node
* @block: data block number
* @dn: data node to re-compress
* @new_len: new length
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 0df9a3dd0aaa..7adc37c10b6a 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -37,7 +37,7 @@ int ubifs_compare_master_node(struct ubifs_info *c, void *m1, void *m2)
return ret;
/*
- * Do not compare the embedded HMAC aswell which also must be different
+ * Do not compare the embedded HMAC as well which also must be different
* due to the different common node header.
*/
behind = hmac_offs + UBIFS_MAX_HMAC_LEN;
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 382a54c82930..5260d3e531bb 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -296,7 +296,7 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
* @b: second replay entry
*
* This is a comparios function for 'list_sort()' which compares 2 replay
- * entries @a and @b by comparing their sequence numer. Returns %1 if @a has
+ * entries @a and @b by comparing their sequence number. Returns %1 if @a has
* greater sequence number and %-1 otherwise.
*/
static int replay_entries_cmp(void *priv, const struct list_head *a,
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 7b572e1414ba..f0fb25727d96 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -275,6 +275,7 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb)
memset((void *)ui + sizeof(struct inode), 0,
sizeof(struct ubifs_inode) - sizeof(struct inode));
mutex_init(&ui->ui_mutex);
+ init_rwsem(&ui->xattr_sem);
spin_lock_init(&ui->ui_lock);
return &ui->vfs_inode;
};
@@ -2060,7 +2061,7 @@ const struct super_operations ubifs_super_operations = {
* @mode: UBI volume open mode
*
* The primary method of mounting UBIFS is by specifying the UBI volume
- * character device node path. However, UBIFS may also be mounted withoug any
+ * character device node path. However, UBIFS may also be mounted without any
* character device node using one of the following methods:
*
* o ubiX_Y - mount UBI device number X, volume Y;
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 234be1c4dc87..58c92c96ecef 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -930,7 +930,7 @@ static int write_index(struct ubifs_info *c)
* flag cleared before %COW_ZNODE. Specifically, it matters in
* the 'dirty_cow_znode()' function. This is the reason for the
* first barrier. Also, we want the bit changes to be seen to
- * other threads ASAP, to avoid unnecesarry copying, which is
+ * other threads ASAP, to avoid unnecessary copying, which is
* the reason for the second barrier.
*/
clear_bit(DIRTY_ZNODE, &znode->flags);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index b65c599a386a..c38066ce9ab0 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -356,6 +356,7 @@ struct ubifs_gced_idx_leb {
* @ui_mutex: serializes inode write-back with the rest of VFS operations,
* serializes "clean <-> dirty" state changes, serializes bulk-read,
* protects @dirty, @bulk_read, @ui_size, and @xattr_size
+ * @xattr_sem: serilizes write operations (remove|set|create) on xattr
* @ui_lock: protects @synced_i_size
* @synced_i_size: synchronized size of inode, i.e. the value of inode size
* currently stored on the flash; used only for regular file
@@ -409,6 +410,7 @@ struct ubifs_inode {
unsigned int bulk_read:1;
unsigned int compr_type:2;
struct mutex ui_mutex;
+ struct rw_semaphore xattr_sem;
spinlock_t ui_lock;
loff_t synced_i_size;
loff_t ui_size;
@@ -912,7 +914,7 @@ struct ubifs_budget_req {
* @rb: rb-tree node of rb-tree of orphans sorted by inode number
* @list: list head of list of orphans in order added
* @new_list: list head of list of orphans added since the last commit
- * @child_list: list of xattr childs if this orphan hosts xattrs, list head
+ * @child_list: list of xattr children if this orphan hosts xattrs, list head
* if this orphan is a xattr, not used otherwise.
* @cnext: next orphan to commit
* @dnext: next orphan to delete
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 6b1e9830b274..e4f193eae4b2 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -208,13 +208,11 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
err = -ENOMEM;
goto out_free;
}
- mutex_lock(&ui->ui_mutex);
kfree(ui->data);
ui->data = buf;
inode->i_size = ui->ui_size = size;
old_size = ui->data_len;
ui->data_len = size;
- mutex_unlock(&ui->ui_mutex);
mutex_lock(&host_ui->ui_mutex);
host->i_ctime = current_time(host);
@@ -285,6 +283,7 @@ int ubifs_xattr_set(struct inode *host, const char *name, const void *value,
if (!xent)
return -ENOMEM;
+ down_write(&ubifs_inode(host)->xattr_sem);
/*
* The extended attribute entries are stored in LNC, so multiple
* look-ups do not involve reading the flash.
@@ -319,6 +318,7 @@ int ubifs_xattr_set(struct inode *host, const char *name, const void *value,
iput(inode);
out_free:
+ up_write(&ubifs_inode(host)->xattr_sem);
kfree(xent);
return err;
}
@@ -341,25 +341,25 @@ ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf,
if (!xent)
return -ENOMEM;
+ down_read(&ubifs_inode(host)->xattr_sem);
xent_key_init(c, &key, host->i_ino, &nm);
err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
if (err) {
if (err == -ENOENT)
err = -ENODATA;
- goto out_unlock;
+ goto out_cleanup;
}
inode = iget_xattr(c, le64_to_cpu(xent->inum));
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- goto out_unlock;
+ goto out_cleanup;
}
ui = ubifs_inode(inode);
ubifs_assert(c, inode->i_size == ui->data_len);
ubifs_assert(c, ubifs_inode(host)->xattr_size > ui->data_len);
- mutex_lock(&ui->ui_mutex);
if (buf) {
/* If @buf is %NULL we are supposed to return the length */
if (ui->data_len > size) {
@@ -372,9 +372,9 @@ ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf,
err = ui->data_len;
out_iput:
- mutex_unlock(&ui->ui_mutex);
iput(inode);
-out_unlock:
+out_cleanup:
+ up_read(&ubifs_inode(host)->xattr_sem);
kfree(xent);
return err;
}
@@ -406,16 +406,21 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
dbg_gen("ino %lu ('%pd'), buffer size %zd", host->i_ino,
dentry, size);
+ down_read(&host_ui->xattr_sem);
len = host_ui->xattr_names + host_ui->xattr_cnt;
- if (!buffer)
+ if (!buffer) {
/*
* We should return the minimum buffer size which will fit a
* null-terminated list of all the extended attribute names.
*/
- return len;
+ err = len;
+ goto out_err;
+ }
- if (len > size)
- return -ERANGE;
+ if (len > size) {
+ err = -ERANGE;
+ goto out_err;
+ }
lowest_xent_key(c, &key, host->i_ino);
while (1) {
@@ -437,8 +442,9 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
pxent = xent;
key_read(c, &xent->key, &key);
}
-
kfree(pxent);
+ up_read(&host_ui->xattr_sem);
+
if (err != -ENOENT) {
ubifs_err(c, "cannot find next direntry, error %d", err);
return err;
@@ -446,6 +452,10 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
ubifs_assert(c, written <= size);
return written;
+
+out_err:
+ up_read(&host_ui->xattr_sem);
+ return err;
}
static int remove_xattr(struct ubifs_info *c, struct inode *host,
@@ -504,6 +514,7 @@ int ubifs_purge_xattrs(struct inode *host)
ubifs_warn(c, "inode %lu has too many xattrs, doing a non-atomic deletion",
host->i_ino);
+ down_write(&ubifs_inode(host)->xattr_sem);
lowest_xent_key(c, &key, host->i_ino);
while (1) {
xent = ubifs_tnc_next_ent(c, &key, &nm);
@@ -523,7 +534,7 @@ int ubifs_purge_xattrs(struct inode *host)
ubifs_ro_mode(c, err);
kfree(pxent);
kfree(xent);
- return err;
+ goto out_err;
}
ubifs_assert(c, ubifs_inode(xino)->xattr);
@@ -535,7 +546,7 @@ int ubifs_purge_xattrs(struct inode *host)
kfree(xent);
iput(xino);
ubifs_err(c, "cannot remove xattr, error %d", err);
- return err;
+ goto out_err;
}
iput(xino);
@@ -544,14 +555,19 @@ int ubifs_purge_xattrs(struct inode *host)
pxent = xent;
key_read(c, &xent->key, &key);
}
-
kfree(pxent);
+ up_write(&ubifs_inode(host)->xattr_sem);
+
if (err != -ENOENT) {
ubifs_err(c, "cannot find next direntry, error %d", err);
return err;
}
return 0;
+
+out_err:
+ up_write(&ubifs_inode(host)->xattr_sem);
+ return err;
}
/**
@@ -594,6 +610,7 @@ static int ubifs_xattr_remove(struct inode *host, const char *name)
if (!xent)
return -ENOMEM;
+ down_write(&ubifs_inode(host)->xattr_sem);
xent_key_init(c, &key, host->i_ino, &nm);
err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
if (err) {
@@ -618,6 +635,7 @@ static int ubifs_xattr_remove(struct inode *host, const char *name)
iput(inode);
out_free:
+ up_write(&ubifs_inode(host)->xattr_sem);
kfree(xent);
return err;
}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index dd7a6c62b56f..5c2d806e6ae5 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1236,23 +1236,21 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
}
static __always_inline int validate_range(struct mm_struct *mm,
- __u64 *start, __u64 len)
+ __u64 start, __u64 len)
{
__u64 task_size = mm->task_size;
- *start = untagged_addr(*start);
-
- if (*start & ~PAGE_MASK)
+ if (start & ~PAGE_MASK)
return -EINVAL;
if (len & ~PAGE_MASK)
return -EINVAL;
if (!len)
return -EINVAL;
- if (*start < mmap_min_addr)
+ if (start < mmap_min_addr)
return -EINVAL;
- if (*start >= task_size)
+ if (start >= task_size)
return -EINVAL;
- if (len > task_size - *start)
+ if (len > task_size - start)
return -EINVAL;
return 0;
}
@@ -1267,8 +1265,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
}
if (vm_flags & VM_UFFD_MINOR) {
- /* FIXME: Add minor fault interception for shmem. */
- if (!is_vm_hugetlb_page(vma))
+ if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma)))
return false;
}
@@ -1304,8 +1301,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vm_flags = 0;
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
vm_flags |= VM_UFFD_MISSING;
- if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
+ if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+ goto out;
+#endif
vm_flags |= VM_UFFD_WP;
+ }
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
goto out;
@@ -1313,7 +1314,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vm_flags |= VM_UFFD_MINOR;
}
- ret = validate_range(mm, &uffdio_register.range.start,
+ ret = validate_range(mm, uffdio_register.range.start,
uffdio_register.range.len);
if (ret)
goto out;
@@ -1519,7 +1520,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
goto out;
- ret = validate_range(mm, &uffdio_unregister.start,
+ ret = validate_range(mm, uffdio_unregister.start,
uffdio_unregister.len);
if (ret)
goto out;
@@ -1668,7 +1669,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
goto out;
- ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len);
+ ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
if (ret)
goto out;
@@ -1708,7 +1709,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
sizeof(uffdio_copy)-sizeof(__s64)))
goto out;
- ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len);
+ ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
if (ret)
goto out;
/*
@@ -1765,7 +1766,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
sizeof(uffdio_zeropage)-sizeof(__s64)))
goto out;
- ret = validate_range(ctx->mm, &uffdio_zeropage.range.start,
+ ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
uffdio_zeropage.range.len);
if (ret)
goto out;
@@ -1815,7 +1816,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
sizeof(struct uffdio_writeprotect)))
return -EFAULT;
- ret = validate_range(ctx->mm, &uffdio_wp.range.start,
+ ret = validate_range(ctx->mm, uffdio_wp.range.start,
uffdio_wp.range.len);
if (ret)
return ret;
@@ -1863,7 +1864,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
sizeof(uffdio_continue) - (sizeof(__s64))))
goto out;
- ret = validate_range(ctx->mm, &uffdio_continue.range.start,
+ ret = validate_range(ctx->mm, uffdio_continue.range.start,
uffdio_continue.range.len);
if (ret)
goto out;
@@ -1941,7 +1942,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
/* report all available features and ioctls to userland */
uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
- uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
+ uffdio_api.features &=
+ ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
+#endif
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+ uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
#endif
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c
index eac6788fc6cf..c4769a9396c5 100644
--- a/fs/vboxsf/dir.c
+++ b/fs/vboxsf/dir.c
@@ -253,7 +253,7 @@ static int vboxsf_dir_instantiate(struct inode *parent, struct dentry *dentry,
}
static int vboxsf_dir_create(struct inode *parent, struct dentry *dentry,
- umode_t mode, int is_dir)
+ umode_t mode, bool is_dir, bool excl, u64 *handle_ret)
{
struct vboxsf_inode *sf_parent_i = VBOXSF_I(parent);
struct vboxsf_sbi *sbi = VBOXSF_SBI(parent->i_sb);
@@ -261,10 +261,12 @@ static int vboxsf_dir_create(struct inode *parent, struct dentry *dentry,
int err;
params.handle = SHFL_HANDLE_NIL;
- params.create_flags = SHFL_CF_ACT_CREATE_IF_NEW |
- SHFL_CF_ACT_FAIL_IF_EXISTS |
- SHFL_CF_ACCESS_READWRITE |
- (is_dir ? SHFL_CF_DIRECTORY : 0);
+ params.create_flags = SHFL_CF_ACT_CREATE_IF_NEW | SHFL_CF_ACCESS_READWRITE;
+ if (is_dir)
+ params.create_flags |= SHFL_CF_DIRECTORY;
+ if (excl)
+ params.create_flags |= SHFL_CF_ACT_FAIL_IF_EXISTS;
+
params.info.attr.mode = (mode & 0777) |
(is_dir ? SHFL_TYPE_DIRECTORY : SHFL_TYPE_FILE);
params.info.attr.additional = SHFLFSOBJATTRADD_NOTHING;
@@ -276,30 +278,81 @@ static int vboxsf_dir_create(struct inode *parent, struct dentry *dentry,
if (params.result != SHFL_FILE_CREATED)
return -EPERM;
- vboxsf_close(sbi->root, params.handle);
-
err = vboxsf_dir_instantiate(parent, dentry, &params.info);
if (err)
- return err;
+ goto out;
/* parent directory access/change time changed */
sf_parent_i->force_restat = 1;
- return 0;
+out:
+ if (err == 0 && handle_ret)
+ *handle_ret = params.handle;
+ else
+ vboxsf_close(sbi->root, params.handle);
+
+ return err;
}
static int vboxsf_dir_mkfile(struct user_namespace *mnt_userns,
struct inode *parent, struct dentry *dentry,
umode_t mode, bool excl)
{
- return vboxsf_dir_create(parent, dentry, mode, 0);
+ return vboxsf_dir_create(parent, dentry, mode, false, excl, NULL);
}
static int vboxsf_dir_mkdir(struct user_namespace *mnt_userns,
struct inode *parent, struct dentry *dentry,
umode_t mode)
{
- return vboxsf_dir_create(parent, dentry, mode, 1);
+ return vboxsf_dir_create(parent, dentry, mode, true, true, NULL);
+}
+
+static int vboxsf_dir_atomic_open(struct inode *parent, struct dentry *dentry,
+ struct file *file, unsigned int flags, umode_t mode)
+{
+ struct vboxsf_sbi *sbi = VBOXSF_SBI(parent->i_sb);
+ struct vboxsf_handle *sf_handle;
+ struct dentry *res = NULL;
+ u64 handle;
+ int err;
+
+ if (d_in_lookup(dentry)) {
+ res = vboxsf_dir_lookup(parent, dentry, 0);
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+
+ if (res)
+ dentry = res;
+ }
+
+ /* Only creates */
+ if (!(flags & O_CREAT) || d_really_is_positive(dentry))
+ return finish_no_open(file, res);
+
+ err = vboxsf_dir_create(parent, dentry, mode, false, flags & O_EXCL, &handle);
+ if (err)
+ goto out;
+
+ sf_handle = vboxsf_create_sf_handle(d_inode(dentry), handle, SHFL_CF_ACCESS_READWRITE);
+ if (IS_ERR(sf_handle)) {
+ vboxsf_close(sbi->root, handle);
+ err = PTR_ERR(sf_handle);
+ goto out;
+ }
+
+ err = finish_open(file, dentry, generic_file_open);
+ if (err) {
+ /* This also closes the handle passed to vboxsf_create_sf_handle() */
+ vboxsf_release_sf_handle(d_inode(dentry), sf_handle);
+ goto out;
+ }
+
+ file->private_data = sf_handle;
+ file->f_mode |= FMODE_CREATED;
+out:
+ dput(res);
+ return err;
}
static int vboxsf_dir_unlink(struct inode *parent, struct dentry *dentry)
@@ -422,6 +475,7 @@ const struct inode_operations vboxsf_dir_iops = {
.lookup = vboxsf_dir_lookup,
.create = vboxsf_dir_mkfile,
.mkdir = vboxsf_dir_mkdir,
+ .atomic_open = vboxsf_dir_atomic_open,
.rmdir = vboxsf_dir_unlink,
.unlink = vboxsf_dir_unlink,
.rename = vboxsf_dir_rename,
diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c
index c4ab5996d97a..864c2fad23be 100644
--- a/fs/vboxsf/file.c
+++ b/fs/vboxsf/file.c
@@ -20,17 +20,39 @@ struct vboxsf_handle {
struct list_head head;
};
-static int vboxsf_file_open(struct inode *inode, struct file *file)
+struct vboxsf_handle *vboxsf_create_sf_handle(struct inode *inode,
+ u64 handle, u32 access_flags)
{
struct vboxsf_inode *sf_i = VBOXSF_I(inode);
- struct shfl_createparms params = {};
struct vboxsf_handle *sf_handle;
- u32 access_flags = 0;
- int err;
sf_handle = kmalloc(sizeof(*sf_handle), GFP_KERNEL);
if (!sf_handle)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
+
+ /* the host may have given us different attr then requested */
+ sf_i->force_restat = 1;
+
+ /* init our handle struct and add it to the inode's handles list */
+ sf_handle->handle = handle;
+ sf_handle->root = VBOXSF_SBI(inode->i_sb)->root;
+ sf_handle->access_flags = access_flags;
+ kref_init(&sf_handle->refcount);
+
+ mutex_lock(&sf_i->handle_list_mutex);
+ list_add(&sf_handle->head, &sf_i->handle_list);
+ mutex_unlock(&sf_i->handle_list_mutex);
+
+ return sf_handle;
+}
+
+static int vboxsf_file_open(struct inode *inode, struct file *file)
+{
+ struct vboxsf_sbi *sbi = VBOXSF_SBI(inode->i_sb);
+ struct shfl_createparms params = {};
+ struct vboxsf_handle *sf_handle;
+ u32 access_flags = 0;
+ int err;
/*
* We check the value of params.handle afterwards to find out if
@@ -83,23 +105,14 @@ static int vboxsf_file_open(struct inode *inode, struct file *file)
err = vboxsf_create_at_dentry(file_dentry(file), &params);
if (err == 0 && params.handle == SHFL_HANDLE_NIL)
err = (params.result == SHFL_FILE_EXISTS) ? -EEXIST : -ENOENT;
- if (err) {
- kfree(sf_handle);
+ if (err)
return err;
- }
-
- /* the host may have given us different attr then requested */
- sf_i->force_restat = 1;
- /* init our handle struct and add it to the inode's handles list */
- sf_handle->handle = params.handle;
- sf_handle->root = VBOXSF_SBI(inode->i_sb)->root;
- sf_handle->access_flags = access_flags;
- kref_init(&sf_handle->refcount);
-
- mutex_lock(&sf_i->handle_list_mutex);
- list_add(&sf_handle->head, &sf_i->handle_list);
- mutex_unlock(&sf_i->handle_list_mutex);
+ sf_handle = vboxsf_create_sf_handle(inode, params.handle, access_flags);
+ if (IS_ERR(sf_handle)) {
+ vboxsf_close(sbi->root, params.handle);
+ return PTR_ERR(sf_handle);
+ }
file->private_data = sf_handle;
return 0;
@@ -114,22 +127,26 @@ static void vboxsf_handle_release(struct kref *refcount)
kfree(sf_handle);
}
-static int vboxsf_file_release(struct inode *inode, struct file *file)
+void vboxsf_release_sf_handle(struct inode *inode, struct vboxsf_handle *sf_handle)
{
struct vboxsf_inode *sf_i = VBOXSF_I(inode);
- struct vboxsf_handle *sf_handle = file->private_data;
+ mutex_lock(&sf_i->handle_list_mutex);
+ list_del(&sf_handle->head);
+ mutex_unlock(&sf_i->handle_list_mutex);
+
+ kref_put(&sf_handle->refcount, vboxsf_handle_release);
+}
+
+static int vboxsf_file_release(struct inode *inode, struct file *file)
+{
/*
* When a file is closed on our (the guest) side, we want any subsequent
* accesses done on the host side to see all changes done from our side.
*/
filemap_write_and_wait(inode->i_mapping);
- mutex_lock(&sf_i->handle_list_mutex);
- list_del(&sf_handle->head);
- mutex_unlock(&sf_i->handle_list_mutex);
-
- kref_put(&sf_handle->refcount, vboxsf_handle_release);
+ vboxsf_release_sf_handle(inode, file->private_data);
return 0;
}
diff --git a/fs/vboxsf/vfsmod.h b/fs/vboxsf/vfsmod.h
index 6a7a9cedebc6..9047befa66c5 100644
--- a/fs/vboxsf/vfsmod.h
+++ b/fs/vboxsf/vfsmod.h
@@ -18,6 +18,8 @@
#define VBOXSF_SBI(sb) ((struct vboxsf_sbi *)(sb)->s_fs_info)
#define VBOXSF_I(i) container_of(i, struct vboxsf_inode, vfs_inode)
+struct vboxsf_handle;
+
struct vboxsf_options {
unsigned long ttl;
kuid_t uid;
@@ -80,6 +82,11 @@ extern const struct file_operations vboxsf_reg_fops;
extern const struct address_space_operations vboxsf_reg_aops;
extern const struct dentry_operations vboxsf_dentry_ops;
+/* from file.c */
+struct vboxsf_handle *vboxsf_create_sf_handle(struct inode *inode,
+ u64 handle, u32 access_flags);
+void vboxsf_release_sf_handle(struct inode *inode, struct vboxsf_handle *sf_handle);
+
/* from utils.c */
struct inode *vboxsf_new_inode(struct super_block *sb);
int vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index c68a36688474..ee9ec0c50bec 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -27,6 +27,276 @@
#include "xfs_defer.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+
+
+/*
+ * Passive reference counting access wrappers to the perag structures. If the
+ * per-ag structure is to be freed, the freeing code is responsible for cleaning
+ * up objects with passive references before freeing the structure. This is
+ * things like cached buffers.
+ */
+struct xfs_perag *
+xfs_perag_get(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+ int ref = 0;
+
+ rcu_read_lock();
+ pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+ if (pag) {
+ ASSERT(atomic_read(&pag->pag_ref) >= 0);
+ ref = atomic_inc_return(&pag->pag_ref);
+ }
+ rcu_read_unlock();
+ trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
+ return pag;
+}
+
+/*
+ * search from @first to find the next perag with the given tag set.
+ */
+struct xfs_perag *
+xfs_perag_get_tag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t first,
+ unsigned int tag)
+{
+ struct xfs_perag *pag;
+ int found;
+ int ref;
+
+ rcu_read_lock();
+ found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+ (void **)&pag, first, 1, tag);
+ if (found <= 0) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ ref = atomic_inc_return(&pag->pag_ref);
+ rcu_read_unlock();
+ trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
+ return pag;
+}
+
+void
+xfs_perag_put(
+ struct xfs_perag *pag)
+{
+ int ref;
+
+ ASSERT(atomic_read(&pag->pag_ref) > 0);
+ ref = atomic_dec_return(&pag->pag_ref);
+ trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
+}
+
+/*
+ * xfs_initialize_perag_data
+ *
+ * Read in each per-ag structure so we can count up the number of
+ * allocated inodes, free inodes and used filesystem blocks as this
+ * information is no longer persistent in the superblock. Once we have
+ * this information, write it into the in-core superblock structure.
+ */
+int
+xfs_initialize_perag_data(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agcount)
+{
+ xfs_agnumber_t index;
+ struct xfs_perag *pag;
+ struct xfs_sb *sbp = &mp->m_sb;
+ uint64_t ifree = 0;
+ uint64_t ialloc = 0;
+ uint64_t bfree = 0;
+ uint64_t bfreelst = 0;
+ uint64_t btree = 0;
+ uint64_t fdblocks;
+ int error = 0;
+
+ for (index = 0; index < agcount; index++) {
+ /*
+ * read the agf, then the agi. This gets us
+ * all the information we need and populates the
+ * per-ag structures for us.
+ */
+ error = xfs_alloc_pagf_init(mp, NULL, index, 0);
+ if (error)
+ return error;
+
+ error = xfs_ialloc_pagi_init(mp, NULL, index);
+ if (error)
+ return error;
+ pag = xfs_perag_get(mp, index);
+ ifree += pag->pagi_freecount;
+ ialloc += pag->pagi_count;
+ bfree += pag->pagf_freeblks;
+ bfreelst += pag->pagf_flcount;
+ btree += pag->pagf_btreeblks;
+ xfs_perag_put(pag);
+ }
+ fdblocks = bfree + bfreelst + btree;
+
+ /*
+ * If the new summary counts are obviously incorrect, fail the
+ * mount operation because that implies the AGFs are also corrupt.
+ * Clear FS_COUNTERS so that we don't unmount with a dirty log, which
+ * will prevent xfs_repair from fixing anything.
+ */
+ if (fdblocks > sbp->sb_dblocks || ifree > ialloc) {
+ xfs_alert(mp, "AGF corruption. Please run xfs_repair.");
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* Overwrite incore superblock counters with just-read data */
+ spin_lock(&mp->m_sb_lock);
+ sbp->sb_ifree = ifree;
+ sbp->sb_icount = ialloc;
+ sbp->sb_fdblocks = fdblocks;
+ spin_unlock(&mp->m_sb_lock);
+
+ xfs_reinit_percpu_counters(mp);
+out:
+ xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS);
+ return error;
+}
+
+STATIC void
+__xfs_free_perag(
+ struct rcu_head *head)
+{
+ struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
+
+ ASSERT(!delayed_work_pending(&pag->pag_blockgc_work));
+ ASSERT(atomic_read(&pag->pag_ref) == 0);
+ kmem_free(pag);
+}
+
+/*
+ * Free up the per-ag resources associated with the mount structure.
+ */
+void
+xfs_free_perag(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ spin_lock(&mp->m_perag_lock);
+ pag = radix_tree_delete(&mp->m_perag_tree, agno);
+ spin_unlock(&mp->m_perag_lock);
+ ASSERT(pag);
+ ASSERT(atomic_read(&pag->pag_ref) == 0);
+
+ cancel_delayed_work_sync(&pag->pag_blockgc_work);
+ xfs_iunlink_destroy(pag);
+ xfs_buf_hash_destroy(pag);
+
+ call_rcu(&pag->rcu_head, __xfs_free_perag);
+ }
+}
+
+int
+xfs_initialize_perag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agcount,
+ xfs_agnumber_t *maxagi)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t index;
+ xfs_agnumber_t first_initialised = NULLAGNUMBER;
+ int error;
+
+ /*
+ * Walk the current per-ag tree so we don't try to initialise AGs
+ * that already exist (growfs case). Allocate and insert all the
+ * AGs we don't find ready for initialisation.
+ */
+ for (index = 0; index < agcount; index++) {
+ pag = xfs_perag_get(mp, index);
+ if (pag) {
+ xfs_perag_put(pag);
+ continue;
+ }
+
+ pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
+ if (!pag) {
+ error = -ENOMEM;
+ goto out_unwind_new_pags;
+ }
+ pag->pag_agno = index;
+ pag->pag_mount = mp;
+
+ error = radix_tree_preload(GFP_NOFS);
+ if (error)
+ goto out_free_pag;
+
+ spin_lock(&mp->m_perag_lock);
+ if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
+ WARN_ON_ONCE(1);
+ spin_unlock(&mp->m_perag_lock);
+ radix_tree_preload_end();
+ error = -EEXIST;
+ goto out_free_pag;
+ }
+ spin_unlock(&mp->m_perag_lock);
+ radix_tree_preload_end();
+
+ /* Place kernel structure only init below this point. */
+ spin_lock_init(&pag->pag_ici_lock);
+ spin_lock_init(&pag->pagb_lock);
+ spin_lock_init(&pag->pag_state_lock);
+ INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
+ INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+ init_waitqueue_head(&pag->pagb_wait);
+ pag->pagb_count = 0;
+ pag->pagb_tree = RB_ROOT;
+
+ error = xfs_buf_hash_init(pag);
+ if (error)
+ goto out_remove_pag;
+
+ error = xfs_iunlink_init(pag);
+ if (error)
+ goto out_hash_destroy;
+
+ /* first new pag is fully initialized */
+ if (first_initialised == NULLAGNUMBER)
+ first_initialised = index;
+ }
+
+ index = xfs_set_inode_alloc(mp, agcount);
+
+ if (maxagi)
+ *maxagi = index;
+
+ mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
+ return 0;
+
+out_hash_destroy:
+ xfs_buf_hash_destroy(pag);
+out_remove_pag:
+ radix_tree_delete(&mp->m_perag_tree, index);
+out_free_pag:
+ kmem_free(pag);
+out_unwind_new_pags:
+ /* unwind any prior newly initialized pags */
+ for (index = first_initialised; index < agcount; index++) {
+ pag = radix_tree_delete(&mp->m_perag_tree, index);
+ if (!pag)
+ break;
+ xfs_buf_hash_destroy(pag);
+ xfs_iunlink_destroy(pag);
+ kmem_free(pag);
+ }
+ return error;
+}
static int
xfs_get_aghdr_buf(
@@ -43,7 +313,6 @@ xfs_get_aghdr_buf(
if (error)
return error;
- xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
bp->b_bn = blkno;
bp->b_maps[0].bm_bn = blkno;
bp->b_ops = ops;
@@ -510,6 +779,7 @@ xfs_ag_shrink_space(
struct xfs_buf *agibp, *agfbp;
struct xfs_agi *agi;
struct xfs_agf *agf;
+ xfs_agblock_t aglen;
int error, err2;
ASSERT(agno == mp->m_sb.sb_agcount - 1);
@@ -524,14 +794,22 @@ xfs_ag_shrink_space(
return error;
agf = agfbp->b_addr;
+ aglen = be32_to_cpu(agi->agi_length);
/* some extra paranoid checks before we shrink the ag */
if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length))
return -EFSCORRUPTED;
- if (delta >= agi->agi_length)
+ if (delta >= aglen)
return -EINVAL;
- args.fsbno = XFS_AGB_TO_FSB(mp, agno,
- be32_to_cpu(agi->agi_length) - delta);
+ args.fsbno = XFS_AGB_TO_FSB(mp, agno, aglen - delta);
+
+ /*
+ * Make sure that the last inode cluster cannot overlap with the new
+ * end of the AG, even if it's sparse.
+ */
+ error = xfs_ialloc_check_shrink(*tpp, agno, agibp, aglen - delta);
+ if (error)
+ return error;
/*
* Disable perag reservations so it doesn't cause the allocation request
@@ -646,7 +924,7 @@ xfs_ag_extend_space(
* XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that
* this doesn't actually exist in the rmap btree.
*/
- error = xfs_rmap_free(tp, bp, id->agno,
+ error = xfs_rmap_free(tp, bp, bp->b_pag,
be32_to_cpu(agf->agf_length) - len,
len, &XFS_RMAP_OINFO_SKIP_UPDATE);
if (error)
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 4535de1d88ea..4c6f9045baca 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -9,6 +9,142 @@
struct xfs_mount;
struct xfs_trans;
+struct xfs_perag;
+
+/*
+ * Per-ag infrastructure
+ */
+
+/* per-AG block reservation data structures*/
+struct xfs_ag_resv {
+ /* number of blocks originally reserved here */
+ xfs_extlen_t ar_orig_reserved;
+ /* number of blocks reserved here */
+ xfs_extlen_t ar_reserved;
+ /* number of blocks originally asked for */
+ xfs_extlen_t ar_asked;
+};
+
+/*
+ * Per-ag incore structure, copies of information in agf and agi, to improve the
+ * performance of allocation group selection.
+ */
+struct xfs_perag {
+ struct xfs_mount *pag_mount; /* owner filesystem */
+ xfs_agnumber_t pag_agno; /* AG this structure belongs to */
+ atomic_t pag_ref; /* perag reference count */
+ char pagf_init; /* this agf's entry is initialized */
+ char pagi_init; /* this agi's entry is initialized */
+ char pagf_metadata; /* the agf is preferred to be metadata */
+ char pagi_inodeok; /* The agi is ok for inodes */
+ uint8_t pagf_levels[XFS_BTNUM_AGF];
+ /* # of levels in bno & cnt btree */
+ bool pagf_agflreset; /* agfl requires reset before use */
+ uint32_t pagf_flcount; /* count of blocks in freelist */
+ xfs_extlen_t pagf_freeblks; /* total free blocks */
+ xfs_extlen_t pagf_longest; /* longest free space */
+ uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
+ xfs_agino_t pagi_freecount; /* number of free inodes */
+ xfs_agino_t pagi_count; /* number of allocated inodes */
+
+ /*
+ * Inode allocation search lookup optimisation.
+ * If the pagino matches, the search for new inodes
+ * doesn't need to search the near ones again straight away
+ */
+ xfs_agino_t pagl_pagino;
+ xfs_agino_t pagl_leftrec;
+ xfs_agino_t pagl_rightrec;
+
+ int pagb_count; /* pagb slots in use */
+ uint8_t pagf_refcount_level; /* recount btree height */
+
+ /* Blocks reserved for all kinds of metadata. */
+ struct xfs_ag_resv pag_meta_resv;
+ /* Blocks reserved for the reverse mapping btree. */
+ struct xfs_ag_resv pag_rmapbt_resv;
+
+ /* -- kernel only structures below this line -- */
+
+ /*
+ * Bitsets of per-ag metadata that have been checked and/or are sick.
+ * Callers should hold pag_state_lock before accessing this field.
+ */
+ uint16_t pag_checked;
+ uint16_t pag_sick;
+ spinlock_t pag_state_lock;
+
+ spinlock_t pagb_lock; /* lock for pagb_tree */
+ struct rb_root pagb_tree; /* ordered tree of busy extents */
+ unsigned int pagb_gen; /* generation count for pagb_tree */
+ wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */
+
+ atomic_t pagf_fstrms; /* # of filestreams active in this AG */
+
+ spinlock_t pag_ici_lock; /* incore inode cache lock */
+ struct radix_tree_root pag_ici_root; /* incore inode cache root */
+ int pag_ici_reclaimable; /* reclaimable inodes */
+ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
+
+ /* buffer cache index */
+ spinlock_t pag_buf_lock; /* lock for pag_buf_hash */
+ struct rhashtable pag_buf_hash;
+
+ /* for rcu-safe freeing */
+ struct rcu_head rcu_head;
+
+ /* background prealloc block trimming */
+ struct delayed_work pag_blockgc_work;
+
+ /*
+ * Unlinked inode information. This incore information reflects
+ * data stored in the AGI, so callers must hold the AGI buffer lock
+ * or have some other means to control concurrency.
+ */
+ struct rhashtable pagi_unlinked_hash;
+};
+
+int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount,
+ xfs_agnumber_t *maxagi);
+int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno);
+void xfs_free_perag(struct xfs_mount *mp);
+
+struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
+struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
+ unsigned int tag);
+void xfs_perag_put(struct xfs_perag *pag);
+
+/*
+ * Perag iteration APIs
+ *
+ * XXX: for_each_perag_range() usage really needs an iterator to clean up when
+ * we terminate at end_agno because we may have taken a reference to the perag
+ * beyond end_agno. Right now callers have to be careful to catch and clean that
+ * up themselves. This is not necessary for the callers of for_each_perag() and
+ * for_each_perag_from() because they terminate at sb_agcount where there are
+ * no perag structures in tree beyond end_agno.
+ */
+#define for_each_perag_range(mp, next_agno, end_agno, pag) \
+ for ((pag) = xfs_perag_get((mp), (next_agno)); \
+ (pag) != NULL && (next_agno) <= (end_agno); \
+ (next_agno) = (pag)->pag_agno + 1, \
+ xfs_perag_put(pag), \
+ (pag) = xfs_perag_get((mp), (next_agno)))
+
+#define for_each_perag_from(mp, next_agno, pag) \
+ for_each_perag_range((mp), (next_agno), (mp)->m_sb.sb_agcount, (pag))
+
+
+#define for_each_perag(mp, agno, pag) \
+ (agno) = 0; \
+ for_each_perag_from((mp), (agno), (pag))
+
+#define for_each_perag_tag(mp, agno, pag, tag) \
+ for ((agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \
+ (pag) != NULL; \
+ (agno) = (pag)->pag_agno + 1, \
+ xfs_perag_put(pag), \
+ (pag) = xfs_perag_get_tag((mp), (agno), (tag)))
struct aghdr_init_data {
/* per ag data */
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index e0d5cdc57cc2..2aa2b3484c28 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -19,7 +19,7 @@
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_sb.h"
+#include "xfs_ag.h"
#include "xfs_ag_resv.h"
/*
@@ -250,7 +250,6 @@ xfs_ag_resv_init(
struct xfs_trans *tp)
{
struct xfs_mount *mp = pag->pag_mount;
- xfs_agnumber_t agno = pag->pag_agno;
xfs_extlen_t ask;
xfs_extlen_t used;
int error = 0, error2;
@@ -260,11 +259,11 @@ xfs_ag_resv_init(
if (pag->pag_meta_resv.ar_asked == 0) {
ask = used = 0;
- error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask, &used);
+ error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask, &used);
if (error)
goto out;
- error = xfs_finobt_calc_reserves(mp, tp, agno, &ask, &used);
+ error = xfs_finobt_calc_reserves(mp, tp, pag, &ask, &used);
if (error)
goto out;
@@ -282,7 +281,7 @@ xfs_ag_resv_init(
mp->m_finobt_nores = true;
- error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask,
+ error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask,
&used);
if (error)
goto out;
@@ -300,7 +299,7 @@ xfs_ag_resv_init(
if (pag->pag_rmapbt_resv.ar_asked == 0) {
ask = used = 0;
- error = xfs_rmapbt_calc_reserves(mp, tp, agno, &ask, &used);
+ error = xfs_rmapbt_calc_reserves(mp, tp, pag, &ask, &used);
if (error)
goto out;
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h
index 8a8eb4bc48bb..b74b210008ea 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.h
+++ b/fs/xfs/libxfs/xfs_ag_resv.h
@@ -18,6 +18,21 @@ void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
struct xfs_trans *tp, xfs_extlen_t len);
+static inline struct xfs_ag_resv *
+xfs_perag_resv(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ return &pag->pag_meta_resv;
+ case XFS_AG_RESV_RMAPBT:
+ return &pag->pag_rmapbt_resv;
+ default:
+ return NULL;
+ }
+}
+
/*
* RMAPBT reservation accounting wrappers. Since rmapbt blocks are sourced from
* the AGFL, they are allocated one at a time and the reservation updates don't
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index af3d5f9271f6..6929157d8d6e 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -10,7 +10,6 @@
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
@@ -24,6 +23,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_bmap.h"
@@ -230,7 +230,7 @@ xfs_alloc_get_rec(
int *stat) /* output: success/failure */
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_ag.agno;
+ xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno;
union xfs_btree_rec *rec;
int error;
@@ -776,7 +776,7 @@ xfs_alloc_cur_setup(
*/
if (!acur->cnt)
acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->agno, XFS_BTNUM_CNT);
+ args->agbp, args->pag, XFS_BTNUM_CNT);
error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
if (error)
return error;
@@ -786,10 +786,10 @@ xfs_alloc_cur_setup(
*/
if (!acur->bnolt)
acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->agno, XFS_BTNUM_BNO);
+ args->agbp, args->pag, XFS_BTNUM_BNO);
if (!acur->bnogt)
acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->agno, XFS_BTNUM_BNO);
+ args->agbp, args->pag, XFS_BTNUM_BNO);
return i == 1 ? 0 : -ENOSPC;
}
@@ -1063,7 +1063,7 @@ xfs_alloc_ag_vextent_small(
if (fbno == NULLAGBLOCK)
goto out;
- xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
+ xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
(args->datatype & XFS_ALLOC_NOBUSY));
if (args->datatype & XFS_ALLOC_USERDATA) {
@@ -1089,7 +1089,7 @@ xfs_alloc_ag_vextent_small(
* If we're feeding an AGFL block to something that doesn't live in the
* free space, we need to clear out the OWN_AG rmap.
*/
- error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1,
+ error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
&XFS_RMAP_OINFO_AG);
if (error)
goto error;
@@ -1166,7 +1166,7 @@ xfs_alloc_ag_vextent(
/* if not file data, insert new block into the reverse map btree */
if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
- error = xfs_rmap_alloc(args->tp, args->agbp, args->agno,
+ error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
args->agbno, args->len, &args->oinfo);
if (error)
return error;
@@ -1178,7 +1178,7 @@ xfs_alloc_ag_vextent(
if (error)
return error;
- ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
+ ASSERT(!xfs_extent_busy_search(args->mp, args->pag,
args->agbno, args->len));
}
@@ -1217,7 +1217,7 @@ xfs_alloc_ag_vextent_exact(
* Allocate/initialize a cursor for the by-number freespace btree.
*/
bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_BNO);
+ args->pag, XFS_BTNUM_BNO);
/*
* Lookup bno and minlen in the btree (minlen is irrelevant, really).
@@ -1277,7 +1277,7 @@ xfs_alloc_ag_vextent_exact(
* Allocate/initialize a cursor for the by-size btree.
*/
cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_CNT);
+ args->pag, XFS_BTNUM_CNT);
ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
args->len, XFSA_FIXUP_BNO_OK);
@@ -1674,9 +1674,8 @@ restart:
* Allocate and initialize a cursor for the by-size btree.
*/
cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_CNT);
+ args->pag, XFS_BTNUM_CNT);
bno_cur = NULL;
- busy = false;
/*
* Look for an entry >= maxlen+alignment-1 blocks.
@@ -1837,7 +1836,7 @@ restart:
* Allocate and initialize a cursor for the by-block tree.
*/
bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_BNO);
+ args->pag, XFS_BTNUM_BNO);
if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
rbno, rlen, XFSA_FIXUP_CNT_OK)))
goto error0;
@@ -1896,12 +1895,13 @@ xfs_free_ag_extent(
int haveright; /* have a right neighbor */
int i;
int error;
+ struct xfs_perag *pag = agbp->b_pag;
bno_cur = cnt_cur = NULL;
mp = tp->t_mountp;
if (!xfs_rmap_should_skip_owner_update(oinfo)) {
- error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo);
+ error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
if (error)
goto error0;
}
@@ -1909,7 +1909,7 @@ xfs_free_ag_extent(
/*
* Allocate and initialize a cursor for the by-block btree.
*/
- bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
+ bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO);
/*
* Look for a neighboring block on the left (lower block numbers)
* that is contiguous with this space.
@@ -1979,7 +1979,7 @@ xfs_free_ag_extent(
/*
* Now allocate and initialize a cursor for the by-size tree.
*/
- cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
+ cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT);
/*
* Have both left and right contiguous neighbors.
* Merge all three into a single free block.
@@ -2490,7 +2490,7 @@ xfs_exact_minlen_extent_available(
int error = 0;
cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
- args->agno, XFS_BTNUM_CNT);
+ args->pag, XFS_BTNUM_CNT);
error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
if (error)
goto out;
@@ -2693,21 +2693,21 @@ out_no_agbp:
* Get a block from the freelist.
* Returns with the buffer for the block gotten.
*/
-int /* error */
+int
xfs_alloc_get_freelist(
- xfs_trans_t *tp, /* transaction pointer */
- struct xfs_buf *agbp, /* buffer containing the agf structure */
- xfs_agblock_t *bnop, /* block address retrieved from freelist */
- int btreeblk) /* destination is a AGF btree */
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agblock_t *bnop,
+ int btreeblk)
{
- struct xfs_agf *agf = agbp->b_addr;
- struct xfs_buf *agflbp;/* buffer for a.g. freelist structure */
- xfs_agblock_t bno; /* block number returned */
- __be32 *agfl_bno;
- int error;
- int logflags;
- xfs_mount_t *mp = tp->t_mountp;
- xfs_perag_t *pag; /* per allocation group data */
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_buf *agflbp;
+ xfs_agblock_t bno;
+ __be32 *agfl_bno;
+ int error;
+ int logflags;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
/*
* Freelist is empty, give up.
@@ -2817,20 +2817,20 @@ xfs_alloc_pagf_init(
/*
* Put the block on the freelist for the allocation group.
*/
-int /* error */
+int
xfs_alloc_put_freelist(
- xfs_trans_t *tp, /* transaction pointer */
- struct xfs_buf *agbp, /* buffer for a.g. freelist header */
- struct xfs_buf *agflbp,/* buffer for a.g. free block array */
- xfs_agblock_t bno, /* block being freed */
- int btreeblk) /* block came from a AGF btree */
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_buf *agflbp,
+ xfs_agblock_t bno,
+ int btreeblk)
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_agf *agf = agbp->b_addr;
- __be32 *blockp;/* pointer to array entry */
+ struct xfs_perag *pag;
+ __be32 *blockp;
int error;
int logflags;
- xfs_perag_t *pag; /* per allocation group data */
__be32 *agfl_bno;
int startoff;
@@ -3292,7 +3292,7 @@ error0:
int
xfs_free_extent_fix_freelist(
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
struct xfs_buf **agbp)
{
struct xfs_alloc_arg args;
@@ -3301,7 +3301,8 @@ xfs_free_extent_fix_freelist(
memset(&args, 0, sizeof(struct xfs_alloc_arg));
args.tp = tp;
args.mp = tp->t_mountp;
- args.agno = agno;
+ args.agno = pag->pag_agno;
+ args.pag = pag;
/*
* validate that the block number is legal - the enables us to detect
@@ -3310,17 +3311,12 @@ xfs_free_extent_fix_freelist(
if (args.agno >= args.mp->m_sb.sb_agcount)
return -EFSCORRUPTED;
- args.pag = xfs_perag_get(args.mp, args.agno);
- ASSERT(args.pag);
-
error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
if (error)
- goto out;
+ return error;
*agbp = args.agbp;
-out:
- xfs_perag_put(args.pag);
- return error;
+ return 0;
}
/*
@@ -3344,6 +3340,7 @@ __xfs_free_extent(
struct xfs_agf *agf;
int error;
unsigned int busy_flags = 0;
+ struct xfs_perag *pag;
ASSERT(len != 0);
ASSERT(type != XFS_AG_RESV_AGFL);
@@ -3352,33 +3349,37 @@ __xfs_free_extent(
XFS_ERRTAG_FREE_EXTENT))
return -EIO;
- error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
if (error)
- return error;
+ goto err;
agf = agbp->b_addr;
if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
error = -EFSCORRUPTED;
- goto err;
+ goto err_release;
}
/* validate the extent size is legal now we have the agf locked */
if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
error = -EFSCORRUPTED;
- goto err;
+ goto err_release;
}
error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
if (error)
- goto err;
+ goto err_release;
if (skip_discard)
busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
- xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags);
+ xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
+ xfs_perag_put(pag);
return 0;
-err:
+err_release:
xfs_trans_brelse(tp, agbp);
+err:
+ xfs_perag_put(pag);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index a4427c5775c2..e30900b6f8ba 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -214,7 +214,7 @@ int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp,
int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t,
struct xfs_buf *, struct xfs_owner_info *);
int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
-int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
+int xfs_free_extent_fix_freelist(struct xfs_trans *tp, struct xfs_perag *pag,
struct xfs_buf **agbp);
xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp);
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index a43e4c50e69b..6b363f78cfa2 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -9,7 +9,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
@@ -19,6 +18,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
+#include "xfs_ag.h"
STATIC struct xfs_btree_cur *
@@ -26,8 +26,7 @@ xfs_allocbt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_ag.agbp, cur->bc_ag.agno,
- cur->bc_btnum);
+ cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum);
}
STATIC void
@@ -39,13 +38,12 @@ xfs_allocbt_set_root(
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
int btnum = cur->bc_btnum;
- struct xfs_perag *pag = agbp->b_pag;
ASSERT(ptr->s != 0);
agf->agf_roots[btnum] = ptr->s;
be32_add_cpu(&agf->agf_levels[btnum], inc);
- pag->pagf_levels[btnum] += inc;
+ cur->bc_ag.pag->pagf_levels[btnum] += inc;
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}
@@ -72,7 +70,7 @@ xfs_allocbt_alloc_block(
}
atomic64_inc(&cur->bc_mp->m_allocbt_blks);
- xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false);
+ xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agbp->b_pag, bno, 1, false);
new->s = cpu_to_be32(bno);
@@ -86,7 +84,6 @@ xfs_allocbt_free_block(
struct xfs_buf *bp)
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
- struct xfs_agf *agf = agbp->b_addr;
xfs_agblock_t bno;
int error;
@@ -96,7 +93,7 @@ xfs_allocbt_free_block(
return error;
atomic64_dec(&cur->bc_mp->m_allocbt_blks);
- xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
+ xfs_extent_busy_insert(cur->bc_tp, agbp->b_pag, bno, 1,
XFS_EXTENT_BUSY_SKIP_DISCARD);
return 0;
}
@@ -225,7 +222,7 @@ xfs_allocbt_init_ptr_from_cur(
{
struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
ptr->s = agf->agf_roots[cur->bc_btnum];
}
@@ -473,7 +470,7 @@ STATIC struct xfs_btree_cur *
xfs_allocbt_init_common(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_btnum_t btnum)
{
struct xfs_btree_cur *cur;
@@ -486,6 +483,7 @@ xfs_allocbt_init_common(
cur->bc_mp = mp;
cur->bc_btnum = btnum;
cur->bc_blocklog = mp->m_sb.sb_blocklog;
+ cur->bc_ag.abt.active = false;
if (btnum == XFS_BTNUM_CNT) {
cur->bc_ops = &xfs_cntbt_ops;
@@ -496,8 +494,9 @@ xfs_allocbt_init_common(
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
}
- cur->bc_ag.agno = agno;
- cur->bc_ag.abt.active = false;
+ /* take a reference for the cursor */
+ atomic_inc(&pag->pag_ref);
+ cur->bc_ag.pag = pag;
if (xfs_sb_version_hascrc(&mp->m_sb))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
@@ -513,13 +512,13 @@ xfs_allocbt_init_cursor(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
struct xfs_buf *agbp, /* buffer for agf structure */
- xfs_agnumber_t agno, /* allocation group number */
+ struct xfs_perag *pag,
xfs_btnum_t btnum) /* btree identifier */
{
struct xfs_agf *agf = agbp->b_addr;
struct xfs_btree_cur *cur;
- cur = xfs_allocbt_init_common(mp, tp, agno, btnum);
+ cur = xfs_allocbt_init_common(mp, tp, pag, btnum);
if (btnum == XFS_BTNUM_CNT)
cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
else
@@ -535,12 +534,12 @@ struct xfs_btree_cur *
xfs_allocbt_stage_cursor(
struct xfs_mount *mp,
struct xbtree_afakeroot *afake,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_btnum_t btnum)
{
struct xfs_btree_cur *cur;
- cur = xfs_allocbt_init_common(mp, NULL, agno, btnum);
+ cur = xfs_allocbt_init_common(mp, NULL, pag, btnum);
xfs_btree_stage_afakeroot(cur, afake);
return cur;
}
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index a5b998e950fe..9eb4c667a6b8 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -13,6 +13,7 @@
struct xfs_buf;
struct xfs_btree_cur;
struct xfs_mount;
+struct xfs_perag;
struct xbtree_afakeroot;
/*
@@ -46,11 +47,11 @@ struct xbtree_afakeroot;
(maxrecs) * sizeof(xfs_alloc_key_t) + \
((index) - 1) * sizeof(xfs_alloc_ptr_t)))
-extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
- struct xfs_trans *, struct xfs_buf *,
- xfs_agnumber_t, xfs_btnum_t);
+extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_buf *bp,
+ struct xfs_perag *pag, xfs_btnum_t btnum);
struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp,
- struct xbtree_afakeroot *afake, xfs_agnumber_t agno,
+ struct xbtree_afakeroot *afake, struct xfs_perag *pag,
xfs_btnum_t btnum);
extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 96146f425e50..191d51725988 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -44,20 +44,27 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
* Internal routines when attribute list is one block.
*/
STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
-STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp);
+STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp);
/*
* Internal routines when attribute list is more than one block.
*/
STATIC int xfs_attr_node_get(xfs_da_args_t *args);
-STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
-STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
+STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args);
+STATIC int xfs_attr_node_addname(struct xfs_delattr_context *dac);
+STATIC int xfs_attr_node_addname_find_attr(struct xfs_delattr_context *dac);
+STATIC int xfs_attr_node_addname_clear_incomplete(
+ struct xfs_delattr_context *dac);
STATIC int xfs_attr_node_hasname(xfs_da_args_t *args,
struct xfs_da_state **state);
STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
+STATIC int xfs_attr_set_iter(struct xfs_delattr_context *dac,
+ struct xfs_buf **leaf_bp);
+STATIC int xfs_attr_node_removename(struct xfs_da_args *args,
+ struct xfs_da_state *state);
int
xfs_inode_hasattr(
@@ -237,27 +244,77 @@ xfs_attr_is_shortform(
}
/*
- * Attempts to set an attr in shortform, or converts short form to leaf form if
- * there is not enough room. If the attr is set, the transaction is committed
- * and set to NULL.
+ * Checks to see if a delayed attribute transaction should be rolled. If so,
+ * transaction is finished or rolled as needed.
*/
STATIC int
-xfs_attr_set_shortform(
- struct xfs_da_args *args,
- struct xfs_buf **leaf_bp)
+xfs_attr_trans_roll(
+ struct xfs_delattr_context *dac)
{
- struct xfs_inode *dp = args->dp;
- int error, error2 = 0;
+ struct xfs_da_args *args = dac->da_args;
+ int error;
+
+ if (dac->flags & XFS_DAC_DEFER_FINISH) {
+ /*
+ * The caller wants us to finish all the deferred ops so that we
+ * avoid pinning the log tail with a large number of deferred
+ * ops.
+ */
+ dac->flags &= ~XFS_DAC_DEFER_FINISH;
+ error = xfs_defer_finish(&args->trans);
+ } else
+ error = xfs_trans_roll_inode(&args->trans, args->dp);
+
+ return error;
+}
+
+/*
+ * Set the attribute specified in @args.
+ */
+int
+xfs_attr_set_args(
+ struct xfs_da_args *args)
+{
+ struct xfs_buf *leaf_bp = NULL;
+ int error = 0;
+ struct xfs_delattr_context dac = {
+ .da_args = args,
+ };
+
+ do {
+ error = xfs_attr_set_iter(&dac, &leaf_bp);
+ if (error != -EAGAIN)
+ break;
+
+ error = xfs_attr_trans_roll(&dac);
+ if (error) {
+ if (leaf_bp)
+ xfs_trans_brelse(args->trans, leaf_bp);
+ return error;
+ }
+ } while (true);
+
+ return error;
+}
+
+STATIC int
+xfs_attr_sf_addname(
+ struct xfs_delattr_context *dac,
+ struct xfs_buf **leaf_bp)
+{
+ struct xfs_da_args *args = dac->da_args;
+ struct xfs_inode *dp = args->dp;
+ int error = 0;
/*
* Try to add the attr to the attribute list in the inode.
*/
error = xfs_attr_try_sf_addname(dp, args);
- if (error != -ENOSPC) {
- error2 = xfs_trans_commit(args->trans);
- args->trans = NULL;
- return error ? error : error2;
- }
+
+ /* Should only be 0, -EEXIST or -ENOSPC */
+ if (error != -ENOSPC)
+ return error;
+
/*
* It won't fit in the shortform, transform to a leaf block. GROT:
* another possible req'mt for a double-split btree op.
@@ -269,85 +326,300 @@ xfs_attr_set_shortform(
/*
* Prevent the leaf buffer from being unlocked so that a concurrent AIL
* push cannot grab the half-baked leaf buffer and run into problems
- * with the write verifier. Once we're done rolling the transaction we
- * can release the hold and add the attr to the leaf.
+ * with the write verifier.
*/
xfs_trans_bhold(args->trans, *leaf_bp);
- error = xfs_defer_finish(&args->trans);
- xfs_trans_bhold_release(args->trans, *leaf_bp);
- if (error) {
- xfs_trans_brelse(args->trans, *leaf_bp);
- return error;
- }
- return 0;
+ /*
+ * We're still in XFS_DAS_UNINIT state here. We've converted
+ * the attr fork to leaf format and will restart with the leaf
+ * add.
+ */
+ dac->flags |= XFS_DAC_DEFER_FINISH;
+ return -EAGAIN;
}
/*
* Set the attribute specified in @args.
+ * This routine is meant to function as a delayed operation, and may return
+ * -EAGAIN when the transaction needs to be rolled. Calling functions will need
+ * to handle this, and recall the function until a successful error code is
+ * returned.
*/
int
-xfs_attr_set_args(
- struct xfs_da_args *args)
+xfs_attr_set_iter(
+ struct xfs_delattr_context *dac,
+ struct xfs_buf **leaf_bp)
{
- struct xfs_inode *dp = args->dp;
- struct xfs_buf *leaf_bp = NULL;
- int error = 0;
+ struct xfs_da_args *args = dac->da_args;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp = NULL;
+ int forkoff, error = 0;
+
+ /* State machine switch */
+ switch (dac->dela_state) {
+ case XFS_DAS_UNINIT:
+ /*
+ * If the fork is shortform, attempt to add the attr. If there
+ * is no space, this converts to leaf format and returns
+ * -EAGAIN with the leaf buffer held across the roll. The caller
+ * will deal with a transaction roll error, but otherwise
+ * release the hold once we return with a clean transaction.
+ */
+ if (xfs_attr_is_shortform(dp))
+ return xfs_attr_sf_addname(dac, leaf_bp);
+ if (*leaf_bp != NULL) {
+ xfs_trans_bhold_release(args->trans, *leaf_bp);
+ *leaf_bp = NULL;
+ }
- /*
- * If the attribute list is already in leaf format, jump straight to
- * leaf handling. Otherwise, try to add the attribute to the shortform
- * list; if there's no room then convert the list to leaf format and try
- * again.
- */
- if (xfs_attr_is_shortform(dp)) {
+ if (xfs_attr_is_leaf(dp)) {
+ error = xfs_attr_leaf_try_add(args, *leaf_bp);
+ if (error == -ENOSPC) {
+ error = xfs_attr3_leaf_to_node(args);
+ if (error)
+ return error;
+
+ /*
+ * Finish any deferred work items and roll the
+ * transaction once more. The goal here is to
+ * call node_addname with the inode and
+ * transaction in the same state (inode locked
+ * and joined, transaction clean) no matter how
+ * we got to this step.
+ *
+ * At this point, we are still in
+ * XFS_DAS_UNINIT, but when we come back, we'll
+ * be a node, so we'll fall down into the node
+ * handling code below
+ */
+ dac->flags |= XFS_DAC_DEFER_FINISH;
+ return -EAGAIN;
+ } else if (error) {
+ return error;
+ }
+
+ dac->dela_state = XFS_DAS_FOUND_LBLK;
+ } else {
+ error = xfs_attr_node_addname_find_attr(dac);
+ if (error)
+ return error;
+
+ error = xfs_attr_node_addname(dac);
+ if (error)
+ return error;
+
+ dac->dela_state = XFS_DAS_FOUND_NBLK;
+ }
+ return -EAGAIN;
+ case XFS_DAS_FOUND_LBLK:
+ /*
+ * If there was an out-of-line value, allocate the blocks we
+ * identified for its storage and copy the value. This is done
+ * after we create the attribute so that we don't overflow the
+ * maximum size of a transaction and/or hit a deadlock.
+ */
+
+ /* Open coded xfs_attr_rmtval_set without trans handling */
+ if ((dac->flags & XFS_DAC_LEAF_ADDNAME_INIT) == 0) {
+ dac->flags |= XFS_DAC_LEAF_ADDNAME_INIT;
+ if (args->rmtblkno > 0) {
+ error = xfs_attr_rmtval_find_space(dac);
+ if (error)
+ return error;
+ }
+ }
/*
- * If the attr was successfully set in shortform, the
- * transaction is committed and set to NULL. Otherwise, is it
- * converted from shortform to leaf, and the transaction is
- * retained.
+ * Repeat allocating remote blocks for the attr value until
+ * blkcnt drops to zero.
*/
- error = xfs_attr_set_shortform(args, &leaf_bp);
- if (error || !args->trans)
+ if (dac->blkcnt > 0) {
+ error = xfs_attr_rmtval_set_blk(dac);
+ if (error)
+ return error;
+ return -EAGAIN;
+ }
+
+ error = xfs_attr_rmtval_set_value(args);
+ if (error)
return error;
- }
- if (xfs_attr_is_leaf(dp)) {
- error = xfs_attr_leaf_addname(args);
- if (error != -ENOSPC)
+ /*
+ * If this is not a rename, clear the incomplete flag and we're
+ * done.
+ */
+ if (!(args->op_flags & XFS_DA_OP_RENAME)) {
+ if (args->rmtblkno > 0)
+ error = xfs_attr3_leaf_clearflag(args);
return error;
+ }
/*
- * Promote the attribute list to the Btree format.
+ * If this is an atomic rename operation, we must "flip" the
+ * incomplete flags on the "new" and "old" attribute/value pairs
+ * so that one disappears and one appears atomically. Then we
+ * must remove the "old" attribute/value pair.
+ *
+ * In a separate transaction, set the incomplete flag on the
+ * "old" attr and clear the incomplete flag on the "new" attr.
*/
- error = xfs_attr3_leaf_to_node(args);
+ error = xfs_attr3_leaf_flipflags(args);
if (error)
return error;
+ /*
+ * Commit the flag value change and start the next trans in
+ * series.
+ */
+ dac->dela_state = XFS_DAS_FLIP_LFLAG;
+ return -EAGAIN;
+ case XFS_DAS_FLIP_LFLAG:
+ /*
+ * Dismantle the "old" attribute/value pair by removing a
+ * "remote" value (if it exists).
+ */
+ xfs_attr_restore_rmt_blk(args);
+ error = xfs_attr_rmtval_invalidate(args);
+ if (error)
+ return error;
+
+ fallthrough;
+ case XFS_DAS_RM_LBLK:
+ /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */
+ dac->dela_state = XFS_DAS_RM_LBLK;
+ if (args->rmtblkno) {
+ error = __xfs_attr_rmtval_remove(dac);
+ if (error)
+ return error;
+
+ dac->dela_state = XFS_DAS_RD_LEAF;
+ return -EAGAIN;
+ }
+ fallthrough;
+ case XFS_DAS_RD_LEAF:
/*
- * Finish any deferred work items and roll the transaction once
- * more. The goal here is to call node_addname with the inode
- * and transaction in the same state (inode locked and joined,
- * transaction clean) no matter how we got to this step.
+ * This is the last step for leaf format. Read the block with
+ * the old attr, remove the old attr, check for shortform
+ * conversion and return.
*/
- error = xfs_defer_finish(&args->trans);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
+ &bp);
if (error)
return error;
+ xfs_attr3_leaf_remove(bp, args);
+
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff)
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
+
+ return error;
+
+ case XFS_DAS_FOUND_NBLK:
+ /*
+ * Find space for remote blocks and fall into the allocation
+ * state.
+ */
+ if (args->rmtblkno > 0) {
+ error = xfs_attr_rmtval_find_space(dac);
+ if (error)
+ return error;
+ }
+
+ fallthrough;
+ case XFS_DAS_ALLOC_NODE:
+ /*
+ * If there was an out-of-line value, allocate the blocks we
+ * identified for its storage and copy the value. This is done
+ * after we create the attribute so that we don't overflow the
+ * maximum size of a transaction and/or hit a deadlock.
+ */
+ dac->dela_state = XFS_DAS_ALLOC_NODE;
+ if (args->rmtblkno > 0) {
+ if (dac->blkcnt > 0) {
+ error = xfs_attr_rmtval_set_blk(dac);
+ if (error)
+ return error;
+ return -EAGAIN;
+ }
+
+ error = xfs_attr_rmtval_set_value(args);
+ if (error)
+ return error;
+ }
+
/*
- * Commit the current trans (including the inode) and
- * start a new one.
+ * If this was not a rename, clear the incomplete flag and we're
+ * done.
*/
- error = xfs_trans_roll_inode(&args->trans, dp);
+ if (!(args->op_flags & XFS_DA_OP_RENAME)) {
+ if (args->rmtblkno > 0)
+ error = xfs_attr3_leaf_clearflag(args);
+ goto out;
+ }
+
+ /*
+ * If this is an atomic rename operation, we must "flip" the
+ * incomplete flags on the "new" and "old" attribute/value pairs
+ * so that one disappears and one appears atomically. Then we
+ * must remove the "old" attribute/value pair.
+ *
+ * In a separate transaction, set the incomplete flag on the
+ * "old" attr and clear the incomplete flag on the "new" attr.
+ */
+ error = xfs_attr3_leaf_flipflags(args);
+ if (error)
+ goto out;
+ /*
+ * Commit the flag value change and start the next trans in
+ * series
+ */
+ dac->dela_state = XFS_DAS_FLIP_NFLAG;
+ return -EAGAIN;
+
+ case XFS_DAS_FLIP_NFLAG:
+ /*
+ * Dismantle the "old" attribute/value pair by removing a
+ * "remote" value (if it exists).
+ */
+ xfs_attr_restore_rmt_blk(args);
+
+ error = xfs_attr_rmtval_invalidate(args);
if (error)
return error;
- }
- error = xfs_attr_node_addname(args);
+ fallthrough;
+ case XFS_DAS_RM_NBLK:
+ /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */
+ dac->dela_state = XFS_DAS_RM_NBLK;
+ if (args->rmtblkno) {
+ error = __xfs_attr_rmtval_remove(dac);
+ if (error)
+ return error;
+
+ dac->dela_state = XFS_DAS_CLR_FLAG;
+ return -EAGAIN;
+ }
+
+ fallthrough;
+ case XFS_DAS_CLR_FLAG:
+ /*
+ * The last state for node format. Look up the old attr and
+ * remove it.
+ */
+ error = xfs_attr_node_addname_clear_incomplete(dac);
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+out:
return error;
}
+
/*
* Return EEXIST if attr is found, or ENOATTR if not
*/
@@ -382,16 +654,25 @@ xfs_has_attr(
*/
int
xfs_attr_remove_args(
- struct xfs_da_args *args)
+ struct xfs_da_args *args)
{
- if (!xfs_inode_hasattr(args->dp))
- return -ENOATTR;
+ int error;
+ struct xfs_delattr_context dac = {
+ .da_args = args,
+ };
- if (args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL)
- return xfs_attr_shortform_remove(args);
- if (xfs_attr_is_leaf(args->dp))
- return xfs_attr_leaf_removename(args);
- return xfs_attr_node_removename(args);
+ do {
+ error = xfs_attr_remove_iter(&dac);
+ if (error != -EAGAIN)
+ break;
+
+ error = xfs_attr_trans_roll(&dac);
+ if (error)
+ return error;
+
+ } while (true);
+
+ return error;
}
/*
@@ -559,7 +840,7 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
if (retval == -EEXIST) {
if (args->attr_flags & XATTR_CREATE)
return retval;
- retval = xfs_attr_shortform_remove(args);
+ retval = xfs_attr_sf_removename(args);
if (retval)
return retval;
/*
@@ -670,115 +951,6 @@ out_brelse:
return retval;
}
-
-/*
- * Add a name to the leaf attribute list structure
- *
- * This leaf block cannot have a "remote" value, we only call this routine
- * if bmap_one_block() says there is only one block (ie: no remote blks).
- */
-STATIC int
-xfs_attr_leaf_addname(
- struct xfs_da_args *args)
-{
- int error, forkoff;
- struct xfs_buf *bp = NULL;
- struct xfs_inode *dp = args->dp;
-
- trace_xfs_attr_leaf_addname(args);
-
- error = xfs_attr_leaf_try_add(args, bp);
- if (error)
- return error;
-
- /*
- * Commit the transaction that added the attr name so that
- * later routines can manage their own transactions.
- */
- error = xfs_trans_roll_inode(&args->trans, dp);
- if (error)
- return error;
-
- /*
- * If there was an out-of-line value, allocate the blocks we
- * identified for its storage and copy the value. This is done
- * after we create the attribute so that we don't overflow the
- * maximum size of a transaction and/or hit a deadlock.
- */
- if (args->rmtblkno > 0) {
- error = xfs_attr_rmtval_set(args);
- if (error)
- return error;
- }
-
- if (!(args->op_flags & XFS_DA_OP_RENAME)) {
- /*
- * Added a "remote" value, just clear the incomplete flag.
- */
- if (args->rmtblkno > 0)
- error = xfs_attr3_leaf_clearflag(args);
-
- return error;
- }
-
- /*
- * If this is an atomic rename operation, we must "flip" the incomplete
- * flags on the "new" and "old" attribute/value pairs so that one
- * disappears and one appears atomically. Then we must remove the "old"
- * attribute/value pair.
- *
- * In a separate transaction, set the incomplete flag on the "old" attr
- * and clear the incomplete flag on the "new" attr.
- */
-
- error = xfs_attr3_leaf_flipflags(args);
- if (error)
- return error;
- /*
- * Commit the flag value change and start the next trans in series.
- */
- error = xfs_trans_roll_inode(&args->trans, args->dp);
- if (error)
- return error;
-
- /*
- * Dismantle the "old" attribute/value pair by removing a "remote" value
- * (if it exists).
- */
- xfs_attr_restore_rmt_blk(args);
-
- if (args->rmtblkno) {
- error = xfs_attr_rmtval_invalidate(args);
- if (error)
- return error;
-
- error = xfs_attr_rmtval_remove(args);
- if (error)
- return error;
- }
-
- /*
- * Read in the block containing the "old" attr, then remove the "old"
- * attr from that block (neat, huh!)
- */
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
- &bp);
- if (error)
- return error;
-
- xfs_attr3_leaf_remove(bp, args);
-
- /*
- * If the result is small enough, shrink it all into the inode.
- */
- forkoff = xfs_attr_shortform_allfit(bp, dp);
- if (forkoff)
- error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
- /* bp is gone due to xfs_da_shrink_inode */
-
- return error;
-}
-
/*
* Return EEXIST if attr is found, or ENOATTR if not
*/
@@ -909,48 +1081,26 @@ xfs_attr_node_hasname(
* External routines when attribute list size > geo->blksize
*========================================================================*/
-/*
- * Add a name to a Btree-format attribute list.
- *
- * This will involve walking down the Btree, and may involve splitting
- * leaf nodes and even splitting intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- *
- * "Remote" attribute values confuse the issue and atomic rename operations
- * add a whole extra layer of confusion on top of that.
- */
STATIC int
-xfs_attr_node_addname(
- struct xfs_da_args *args)
+xfs_attr_node_addname_find_attr(
+ struct xfs_delattr_context *dac)
{
- struct xfs_da_state *state;
- struct xfs_da_state_blk *blk;
- struct xfs_inode *dp;
- int retval, error;
-
- trace_xfs_attr_node_addname(args);
+ struct xfs_da_args *args = dac->da_args;
+ int retval;
/*
- * Fill in bucket of arguments/results/context to carry around.
- */
- dp = args->dp;
-restart:
- /*
* Search to see if name already exists, and get back a pointer
* to where it should go.
*/
- error = 0;
- retval = xfs_attr_node_hasname(args, &state);
+ retval = xfs_attr_node_hasname(args, &dac->da_state);
if (retval != -ENOATTR && retval != -EEXIST)
- goto out;
+ return retval;
- blk = &state->path.blk[ state->path.active-1 ];
- ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
- goto out;
+ goto error;
if (retval == -EEXIST) {
if (args->attr_flags & XATTR_CREATE)
- goto out;
+ goto error;
trace_xfs_attr_node_replace(args);
@@ -968,8 +1118,44 @@ restart:
args->rmtvaluelen = 0;
}
- retval = xfs_attr3_leaf_add(blk->bp, state->args);
- if (retval == -ENOSPC) {
+ return 0;
+error:
+ if (dac->da_state)
+ xfs_da_state_free(dac->da_state);
+ return retval;
+}
+
+/*
+ * Add a name to a Btree-format attribute list.
+ *
+ * This will involve walking down the Btree, and may involve splitting
+ * leaf nodes and even splitting intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ *
+ * "Remote" attribute values confuse the issue and atomic rename operations
+ * add a whole extra layer of confusion on top of that.
+ *
+ * This routine is meant to function as a delayed operation, and may return
+ * -EAGAIN when the transaction needs to be rolled. Calling functions will need
+ * to handle this, and recall the function until a successful error code is
+ *returned.
+ */
+STATIC int
+xfs_attr_node_addname(
+ struct xfs_delattr_context *dac)
+{
+ struct xfs_da_args *args = dac->da_args;
+ struct xfs_da_state *state = dac->da_state;
+ struct xfs_da_state_blk *blk;
+ int error;
+
+ trace_xfs_attr_node_addname(args);
+
+ blk = &state->path.blk[state->path.active-1];
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+ error = xfs_attr3_leaf_add(blk->bp, state->args);
+ if (error == -ENOSPC) {
if (state->path.active == 1) {
/*
* Its really a single leaf node, but it had
@@ -981,19 +1167,16 @@ restart:
error = xfs_attr3_leaf_to_node(args);
if (error)
goto out;
- error = xfs_defer_finish(&args->trans);
- if (error)
- goto out;
/*
- * Commit the node conversion and start the next
- * trans in the chain.
+ * Now that we have converted the leaf to a node, we can
+ * roll the transaction, and try xfs_attr3_leaf_add
+ * again on re-entry. No need to set dela_state to do
+ * this. dela_state is still unset by this function at
+ * this point.
*/
- error = xfs_trans_roll_inode(&args->trans, dp);
- if (error)
- goto out;
-
- goto restart;
+ dac->flags |= XFS_DAC_DEFER_FINISH;
+ return -EAGAIN;
}
/*
@@ -1005,9 +1188,7 @@ restart:
error = xfs_da3_split(state);
if (error)
goto out;
- error = xfs_defer_finish(&args->trans);
- if (error)
- goto out;
+ dac->flags |= XFS_DAC_DEFER_FINISH;
} else {
/*
* Addition succeeded, update Btree hashvals.
@@ -1015,77 +1196,21 @@ restart:
xfs_da3_fixhashpath(state, &state->path);
}
- /*
- * Kill the state structure, we're done with it and need to
- * allow the buffers to come back later.
- */
- xfs_da_state_free(state);
- state = NULL;
-
- /*
- * Commit the leaf addition or btree split and start the next
- * trans in the chain.
- */
- error = xfs_trans_roll_inode(&args->trans, dp);
- if (error)
- goto out;
-
- /*
- * If there was an out-of-line value, allocate the blocks we
- * identified for its storage and copy the value. This is done
- * after we create the attribute so that we don't overflow the
- * maximum size of a transaction and/or hit a deadlock.
- */
- if (args->rmtblkno > 0) {
- error = xfs_attr_rmtval_set(args);
- if (error)
- return error;
- }
-
- if (!(args->op_flags & XFS_DA_OP_RENAME)) {
- /*
- * Added a "remote" value, just clear the incomplete flag.
- */
- if (args->rmtblkno > 0)
- error = xfs_attr3_leaf_clearflag(args);
- retval = error;
- goto out;
- }
-
- /*
- * If this is an atomic rename operation, we must "flip" the incomplete
- * flags on the "new" and "old" attribute/value pairs so that one
- * disappears and one appears atomically. Then we must remove the "old"
- * attribute/value pair.
- *
- * In a separate transaction, set the incomplete flag on the "old" attr
- * and clear the incomplete flag on the "new" attr.
- */
- error = xfs_attr3_leaf_flipflags(args);
- if (error)
- goto out;
- /*
- * Commit the flag value change and start the next trans in series
- */
- error = xfs_trans_roll_inode(&args->trans, args->dp);
- if (error)
- goto out;
-
- /*
- * Dismantle the "old" attribute/value pair by removing a "remote" value
- * (if it exists).
- */
- xfs_attr_restore_rmt_blk(args);
+out:
+ if (state)
+ xfs_da_state_free(state);
+ return error;
+}
- if (args->rmtblkno) {
- error = xfs_attr_rmtval_invalidate(args);
- if (error)
- return error;
- error = xfs_attr_rmtval_remove(args);
- if (error)
- return error;
- }
+STATIC int
+xfs_attr_node_addname_clear_incomplete(
+ struct xfs_delattr_context *dac)
+{
+ struct xfs_da_args *args = dac->da_args;
+ struct xfs_da_state *state = NULL;
+ int retval = 0;
+ int error = 0;
/*
* Re-find the "old" attribute entry after any split ops. The INCOMPLETE
@@ -1098,13 +1223,7 @@ restart:
if (error)
goto out;
- /*
- * Remove the name and update the hashvals in the tree.
- */
- blk = &state->path.blk[state->path.active-1];
- ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
- error = xfs_attr3_leaf_remove(blk->bp, args);
- xfs_da3_fixhashpath(state, &state->path);
+ error = xfs_attr_node_removename(args, state);
/*
* Check to see if the tree needs to be collapsed.
@@ -1190,14 +1309,16 @@ xfs_attr_leaf_mark_incomplete(
*/
STATIC
int xfs_attr_node_removename_setup(
- struct xfs_da_args *args,
- struct xfs_da_state **state)
+ struct xfs_delattr_context *dac)
{
- int error;
+ struct xfs_da_args *args = dac->da_args;
+ struct xfs_da_state **state = &dac->da_state;
+ int error;
error = xfs_attr_node_hasname(args, state);
if (error != -EEXIST)
return error;
+ error = 0;
ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL);
ASSERT((*state)->path.blk[(*state)->path.active - 1].magic ==
@@ -1206,97 +1327,164 @@ int xfs_attr_node_removename_setup(
if (args->rmtblkno > 0) {
error = xfs_attr_leaf_mark_incomplete(args, *state);
if (error)
- return error;
+ goto out;
- return xfs_attr_rmtval_invalidate(args);
+ error = xfs_attr_rmtval_invalidate(args);
}
+out:
+ if (error)
+ xfs_da_state_free(*state);
- return 0;
+ return error;
}
STATIC int
-xfs_attr_node_remove_rmt(
+xfs_attr_node_removename(
struct xfs_da_args *args,
struct xfs_da_state *state)
{
- int error = 0;
-
- error = xfs_attr_rmtval_remove(args);
- if (error)
- return error;
+ struct xfs_da_state_blk *blk;
+ int retval;
/*
- * Refill the state structure with buffers, the prior calls released our
- * buffers.
+ * Remove the name and update the hashvals in the tree.
*/
- return xfs_attr_refillstate(state);
+ blk = &state->path.blk[state->path.active-1];
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+ retval = xfs_attr3_leaf_remove(blk->bp, args);
+ xfs_da3_fixhashpath(state, &state->path);
+
+ return retval;
}
/*
- * Remove a name from a B-tree attribute list.
+ * Remove the attribute specified in @args.
*
* This will involve walking down the Btree, and may involve joining
* leaf nodes and even joining intermediate nodes up to and including
* the root node (a special case of an intermediate node).
+ *
+ * This routine is meant to function as either an in-line or delayed operation,
+ * and may return -EAGAIN when the transaction needs to be rolled. Calling
+ * functions will need to handle this, and call the function until a
+ * successful error code is returned.
*/
-STATIC int
-xfs_attr_node_removename(
- struct xfs_da_args *args)
+int
+xfs_attr_remove_iter(
+ struct xfs_delattr_context *dac)
{
- struct xfs_da_state *state;
- struct xfs_da_state_blk *blk;
- int retval, error;
- struct xfs_inode *dp = args->dp;
+ struct xfs_da_args *args = dac->da_args;
+ struct xfs_da_state *state = dac->da_state;
+ int retval, error = 0;
+ struct xfs_inode *dp = args->dp;
trace_xfs_attr_node_removename(args);
- error = xfs_attr_node_removename_setup(args, &state);
- if (error)
- goto out;
+ switch (dac->dela_state) {
+ case XFS_DAS_UNINIT:
+ if (!xfs_inode_hasattr(dp))
+ return -ENOATTR;
- /*
- * If there is an out-of-line value, de-allocate the blocks.
- * This is done before we remove the attribute so that we don't
- * overflow the maximum size of a transaction and/or hit a deadlock.
- */
- if (args->rmtblkno > 0) {
- error = xfs_attr_node_remove_rmt(args, state);
- if (error)
- goto out;
- }
+ /*
+ * Shortform or leaf formats don't require transaction rolls and
+ * thus state transitions. Call the right helper and return.
+ */
+ if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL)
+ return xfs_attr_sf_removename(args);
- /*
- * Remove the name and update the hashvals in the tree.
- */
- blk = &state->path.blk[ state->path.active-1 ];
- ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
- retval = xfs_attr3_leaf_remove(blk->bp, args);
- xfs_da3_fixhashpath(state, &state->path);
+ if (xfs_attr_is_leaf(dp))
+ return xfs_attr_leaf_removename(args);
- /*
- * Check to see if the tree needs to be collapsed.
- */
- if (retval && (state->path.active > 1)) {
- error = xfs_da3_join(state);
- if (error)
- goto out;
- error = xfs_defer_finish(&args->trans);
- if (error)
- goto out;
/*
- * Commit the Btree join operation and start a new trans.
+ * Node format may require transaction rolls. Set up the
+ * state context and fall into the state machine.
*/
- error = xfs_trans_roll_inode(&args->trans, dp);
- if (error)
- goto out;
- }
+ if (!dac->da_state) {
+ error = xfs_attr_node_removename_setup(dac);
+ if (error)
+ return error;
+ state = dac->da_state;
+ }
- /*
- * If the result is small enough, push it all into the inode.
- */
- if (xfs_attr_is_leaf(dp))
- error = xfs_attr_node_shrink(args, state);
+ fallthrough;
+ case XFS_DAS_RMTBLK:
+ dac->dela_state = XFS_DAS_RMTBLK;
+ /*
+ * If there is an out-of-line value, de-allocate the blocks.
+ * This is done before we remove the attribute so that we don't
+ * overflow the maximum size of a transaction and/or hit a
+ * deadlock.
+ */
+ if (args->rmtblkno > 0) {
+ /*
+ * May return -EAGAIN. Roll and repeat until all remote
+ * blocks are removed.
+ */
+ error = __xfs_attr_rmtval_remove(dac);
+ if (error == -EAGAIN)
+ return error;
+ else if (error)
+ goto out;
+
+ /*
+ * Refill the state structure with buffers (the prior
+ * calls released our buffers) and close out this
+ * transaction before proceeding.
+ */
+ ASSERT(args->rmtblkno == 0);
+ error = xfs_attr_refillstate(state);
+ if (error)
+ goto out;
+ dac->dela_state = XFS_DAS_RM_NAME;
+ dac->flags |= XFS_DAC_DEFER_FINISH;
+ return -EAGAIN;
+ }
+
+ fallthrough;
+ case XFS_DAS_RM_NAME:
+ /*
+ * If we came here fresh from a transaction roll, reattach all
+ * the buffers to the current transaction.
+ */
+ if (dac->dela_state == XFS_DAS_RM_NAME) {
+ error = xfs_attr_refillstate(state);
+ if (error)
+ goto out;
+ }
+
+ retval = xfs_attr_node_removename(args, state);
+
+ /*
+ * Check to see if the tree needs to be collapsed. If so, roll
+ * the transacton and fall into the shrink state.
+ */
+ if (retval && (state->path.active > 1)) {
+ error = xfs_da3_join(state);
+ if (error)
+ goto out;
+
+ dac->flags |= XFS_DAC_DEFER_FINISH;
+ dac->dela_state = XFS_DAS_RM_SHRINK;
+ return -EAGAIN;
+ }
+
+ fallthrough;
+ case XFS_DAS_RM_SHRINK:
+ /*
+ * If the result is small enough, push it all into the inode.
+ * This is our final state so it's safe to return a dirty
+ * transaction.
+ */
+ if (xfs_attr_is_leaf(dp))
+ error = xfs_attr_node_shrink(args, state);
+ ASSERT(error != -EAGAIN);
+ break;
+ default:
+ ASSERT(0);
+ error = -EINVAL;
+ goto out;
+ }
out:
if (state)
xfs_da_state_free(state);
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index 2b1f61987a9d..8de5d1d2733e 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -74,6 +74,406 @@ struct xfs_attr_list_context {
};
+/*
+ * ========================================================================
+ * Structure used to pass context around among the delayed routines.
+ * ========================================================================
+ */
+
+/*
+ * Below is a state machine diagram for attr remove operations. The XFS_DAS_*
+ * states indicate places where the function would return -EAGAIN, and then
+ * immediately resume from after being called by the calling function. States
+ * marked as a "subroutine state" indicate that they belong to a subroutine, and
+ * so the calling function needs to pass them back to that subroutine to allow
+ * it to finish where it left off. But they otherwise do not have a role in the
+ * calling function other than just passing through.
+ *
+ * xfs_attr_remove_iter()
+ * │
+ * v
+ * have attr to remove? ──n──> done
+ * │
+ * y
+ * │
+ * v
+ * are we short form? ──y──> xfs_attr_shortform_remove ──> done
+ * │
+ * n
+ * │
+ * V
+ * are we leaf form? ──y──> xfs_attr_leaf_removename ──> done
+ * │
+ * n
+ * │
+ * V
+ * ┌── need to setup state?
+ * │ │
+ * n y
+ * │ │
+ * │ v
+ * │ find attr and get state
+ * │ attr has remote blks? ──n─┐
+ * │ │ v
+ * │ │ find and invalidate
+ * │ y the remote blocks.
+ * │ │ mark attr incomplete
+ * │ ├────────────────┘
+ * └──────────┤
+ * │
+ * v
+ * Have remote blks to remove? ───y─────┐
+ * │ ^ remove the blks
+ * │ │ │
+ * │ │ v
+ * │ XFS_DAS_RMTBLK <─n── done?
+ * │ re-enter with │
+ * │ one less blk to y
+ * │ remove │
+ * │ V
+ * │ refill the state
+ * n │
+ * │ v
+ * │ XFS_DAS_RM_NAME
+ * │ │
+ * ├─────────────────────────┘
+ * │
+ * v
+ * remove leaf and
+ * update hash with
+ * xfs_attr_node_remove_cleanup
+ * │
+ * v
+ * need to
+ * shrink tree? ─n─┐
+ * │ │
+ * y │
+ * │ │
+ * v │
+ * join leaf │
+ * │ │
+ * v │
+ * XFS_DAS_RM_SHRINK │
+ * │ │
+ * v │
+ * do the shrink │
+ * │ │
+ * v │
+ * free state <──┘
+ * │
+ * v
+ * done
+ *
+ *
+ * Below is a state machine diagram for attr set operations.
+ *
+ * It seems the challenge with understanding this system comes from trying to
+ * absorb the state machine all at once, when really one should only be looking
+ * at it with in the context of a single function. Once a state sensitive
+ * function is called, the idea is that it "takes ownership" of the
+ * state machine. It isn't concerned with the states that may have belonged to
+ * it's calling parent. Only the states relevant to itself or any other
+ * subroutines there in. Once a calling function hands off the state machine to
+ * a subroutine, it needs to respect the simple rule that it doesn't "own" the
+ * state machine anymore, and it's the responsibility of that calling function
+ * to propagate the -EAGAIN back up the call stack. Upon reentry, it is
+ * committed to re-calling that subroutine until it returns something other than
+ * -EAGAIN. Once that subroutine signals completion (by returning anything other
+ * than -EAGAIN), the calling function can resume using the state machine.
+ *
+ * xfs_attr_set_iter()
+ * │
+ * v
+ * ┌─y─ has an attr fork?
+ * │ |
+ * │ n
+ * │ |
+ * │ V
+ * │ add a fork
+ * │ │
+ * └──────────┤
+ * │
+ * V
+ * ┌─── is shortform?
+ * │ │
+ * │ y
+ * │ │
+ * │ V
+ * │ xfs_attr_set_fmt
+ * │ |
+ * │ V
+ * │ xfs_attr_try_sf_addname
+ * │ │
+ * │ V
+ * │ had enough ──y──> done
+ * │ space?
+ * n │
+ * │ n
+ * │ │
+ * │ V
+ * │ transform to leaf
+ * │ │
+ * │ V
+ * │ hold the leaf buffer
+ * │ │
+ * │ V
+ * │ return -EAGAIN
+ * │ Re-enter in
+ * │ leaf form
+ * │
+ * └─> release leaf buffer
+ * if needed
+ * │
+ * V
+ * ┌───n── fork has
+ * │ only 1 blk?
+ * │ │
+ * │ y
+ * │ │
+ * │ v
+ * │ xfs_attr_leaf_try_add()
+ * │ │
+ * │ v
+ * │ had enough ──────────────y─────────────┐
+ * │ space? │
+ * │ │ │
+ * │ n │
+ * │ │ │
+ * │ v │
+ * │ return -EAGAIN │
+ * │ re-enter in │
+ * │ node form │
+ * │ │ │
+ * └──────────┤ │
+ * │ │
+ * V │
+ * xfs_attr_node_addname_find_attr │
+ * determines if this │
+ * is create or rename │
+ * find space to store attr │
+ * │ │
+ * v │
+ * xfs_attr_node_addname │
+ * │ │
+ * v │
+ * fits in a node leaf? ────n─────┐ │
+ * │ ^ v │
+ * │ │ single leaf node? │
+ * │ │ │ │ │
+ * y │ y n │
+ * │ │ │ │ │
+ * v │ v v │
+ * update │ grow the leaf split if │
+ * hashvals └── return -EAGAIN needed │
+ * │ retry leaf add │ │
+ * │ on reentry │ │
+ * ├────────────────────────────┘ │
+ * │ │
+ * v │
+ * need to alloc │
+ * ┌─y── or flip flag? │
+ * │ │ │
+ * │ n │
+ * │ │ │
+ * │ v │
+ * │ done │
+ * │ │
+ * │ │
+ * │ XFS_DAS_FOUND_LBLK <────────────────┘
+ * │ │
+ * │ V
+ * │ xfs_attr_leaf_addname()
+ * │ │
+ * │ v
+ * │ ┌──first time through?
+ * │ │ │
+ * │ │ y
+ * │ │ │
+ * │ n v
+ * │ │ if we have rmt blks
+ * │ │ find space for them
+ * │ │ │
+ * │ └──────────┤
+ * │ │
+ * │ v
+ * │ still have
+ * │ ┌─n─ blks to alloc? <──┐
+ * │ │ │ │
+ * │ │ y │
+ * │ │ │ │
+ * │ │ v │
+ * │ │ alloc one blk │
+ * │ │ return -EAGAIN ──┘
+ * │ │ re-enter with one
+ * │ │ less blk to alloc
+ * │ │
+ * │ │
+ * │ └───> set the rmt
+ * │ value
+ * │ │
+ * │ v
+ * │ was this
+ * │ a rename? ──n─┐
+ * │ │ │
+ * │ y │
+ * │ │ │
+ * │ v │
+ * │ flip incomplete │
+ * │ flag │
+ * │ │ │
+ * │ v │
+ * │ XFS_DAS_FLIP_LFLAG │
+ * │ │ │
+ * │ v │
+ * │ need to remove │
+ * │ old bks? ──n──┤
+ * │ │ │
+ * │ y │
+ * │ │ │
+ * │ V │
+ * │ remove │
+ * │ ┌───> old blks │
+ * │ │ │ │
+ * │ XFS_DAS_RM_LBLK │ │
+ * │ ^ │ │
+ * │ │ v │
+ * │ └──y── more to │
+ * │ remove? │
+ * │ │ │
+ * │ n │
+ * │ │ │
+ * │ v │
+ * │ XFS_DAS_RD_LEAF │
+ * │ │ │
+ * │ v │
+ * │ remove leaf │
+ * │ │ │
+ * │ v │
+ * │ shrink to sf │
+ * │ if needed │
+ * │ │ │
+ * │ v │
+ * │ done <──────┘
+ * │
+ * └──────> XFS_DAS_FOUND_NBLK
+ * │
+ * v
+ * ┌─────n── need to
+ * │ alloc blks?
+ * │ │
+ * │ y
+ * │ │
+ * │ v
+ * │ find space
+ * │ │
+ * │ v
+ * │ ┌─>XFS_DAS_ALLOC_NODE
+ * │ │ │
+ * │ │ v
+ * │ │ alloc blk
+ * │ │ │
+ * │ │ v
+ * │ └──y── need to alloc
+ * │ more blocks?
+ * │ │
+ * │ n
+ * │ │
+ * │ v
+ * │ set the rmt value
+ * │ │
+ * │ v
+ * │ was this
+ * └────────> a rename? ──n─┐
+ * │ │
+ * y │
+ * │ │
+ * v │
+ * flip incomplete │
+ * flag │
+ * │ │
+ * v │
+ * XFS_DAS_FLIP_NFLAG │
+ * │ │
+ * v │
+ * need to │
+ * remove blks? ─n──┤
+ * │ │
+ * y │
+ * │ │
+ * v │
+ * remove │
+ * ┌────────> old blks │
+ * │ │ │
+ * XFS_DAS_RM_NBLK │ │
+ * ^ │ │
+ * │ v │
+ * └──────y── more to │
+ * remove │
+ * │ │
+ * n │
+ * │ │
+ * v │
+ * XFS_DAS_CLR_FLAG │
+ * │ │
+ * v │
+ * clear flags │
+ * │ │
+ * ├──────────┘
+ * │
+ * v
+ * done
+ */
+
+/*
+ * Enum values for xfs_delattr_context.da_state
+ *
+ * These values are used by delayed attribute operations to keep track of where
+ * they were before they returned -EAGAIN. A return code of -EAGAIN signals the
+ * calling function to roll the transaction, and then call the subroutine to
+ * finish the operation. The enum is then used by the subroutine to jump back
+ * to where it was and resume executing where it left off.
+ */
+enum xfs_delattr_state {
+ XFS_DAS_UNINIT = 0, /* No state has been set yet */
+ XFS_DAS_RMTBLK, /* Removing remote blks */
+ XFS_DAS_RM_NAME, /* Remove attr name */
+ XFS_DAS_RM_SHRINK, /* We are shrinking the tree */
+ XFS_DAS_FOUND_LBLK, /* We found leaf blk for attr */
+ XFS_DAS_FOUND_NBLK, /* We found node blk for attr */
+ XFS_DAS_FLIP_LFLAG, /* Flipped leaf INCOMPLETE attr flag */
+ XFS_DAS_RM_LBLK, /* A rename is removing leaf blocks */
+ XFS_DAS_RD_LEAF, /* Read in the new leaf */
+ XFS_DAS_ALLOC_NODE, /* We are allocating node blocks */
+ XFS_DAS_FLIP_NFLAG, /* Flipped node INCOMPLETE attr flag */
+ XFS_DAS_RM_NBLK, /* A rename is removing node blocks */
+ XFS_DAS_CLR_FLAG, /* Clear incomplete flag */
+};
+
+/*
+ * Defines for xfs_delattr_context.flags
+ */
+#define XFS_DAC_DEFER_FINISH 0x01 /* finish the transaction */
+#define XFS_DAC_LEAF_ADDNAME_INIT 0x02 /* xfs_attr_leaf_addname init*/
+
+/*
+ * Context used for keeping track of delayed attribute operations
+ */
+struct xfs_delattr_context {
+ struct xfs_da_args *da_args;
+
+ /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */
+ struct xfs_bmbt_irec map;
+ xfs_dablk_t lblkno;
+ int blkcnt;
+
+ /* Used in xfs_attr_node_removename to roll through removing blocks */
+ struct xfs_da_state *da_state;
+
+ /* Used to keep track of current state of delayed operation */
+ unsigned int flags;
+ enum xfs_delattr_state dela_state;
+};
+
/*========================================================================
* Function prototypes for the kernel.
*========================================================================*/
@@ -92,6 +492,9 @@ int xfs_attr_set(struct xfs_da_args *args);
int xfs_attr_set_args(struct xfs_da_args *args);
int xfs_has_attr(struct xfs_da_args *args);
int xfs_attr_remove_args(struct xfs_da_args *args);
+int xfs_attr_remove_iter(struct xfs_delattr_context *dac);
bool xfs_attr_namecheck(const void *name, size_t length);
+void xfs_delattr_context_init(struct xfs_delattr_context *dac,
+ struct xfs_da_args *args);
#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 556184b63061..b910bd209949 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -19,14 +19,15 @@
#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
#include "xfs_attr_sf.h"
-#include "xfs_attr_remote.h"
#include "xfs_attr.h"
+#include "xfs_attr_remote.h"
#include "xfs_attr_leaf.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_buf_item.h"
#include "xfs_dir2.h"
#include "xfs_log.h"
+#include "xfs_ag.h"
/*
@@ -773,7 +774,7 @@ xfs_attr_fork_remove(
* Remove an attribute from the shortform attribute list structure.
*/
int
-xfs_attr_shortform_remove(
+xfs_attr_sf_removename(
struct xfs_da_args *args)
{
struct xfs_attr_shortform *sf;
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 9b1c59f40a26..efa757f1e912 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -51,7 +51,7 @@ int xfs_attr_shortform_lookup(struct xfs_da_args *args);
int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
int xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
struct xfs_buf **leaf_bp);
-int xfs_attr_shortform_remove(struct xfs_da_args *args);
+int xfs_attr_sf_removename(struct xfs_da_args *args);
int xfs_attr_sf_findname(struct xfs_da_args *args,
struct xfs_attr_sf_entry **sfep,
unsigned int *basep);
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 48d8e9caf86f..0c8bee3abc3b 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -439,9 +439,9 @@ xfs_attr_rmtval_get(
/*
* Find a "hole" in the attribute address space large enough for us to drop the
- * new attribute's value into
+ * new attributes value into
*/
-STATIC int
+int
xfs_attr_rmt_find_hole(
struct xfs_da_args *args)
{
@@ -468,7 +468,7 @@ xfs_attr_rmt_find_hole(
return 0;
}
-STATIC int
+int
xfs_attr_rmtval_set_value(
struct xfs_da_args *args)
{
@@ -562,69 +562,66 @@ xfs_attr_rmtval_stale(
}
/*
- * Write the value associated with an attribute into the out-of-line buffer
- * that we have defined for it.
+ * Find a hole for the attr and store it in the delayed attr context. This
+ * initializes the context to roll through allocating an attr extent for a
+ * delayed attr operation
*/
int
-xfs_attr_rmtval_set(
- struct xfs_da_args *args)
+xfs_attr_rmtval_find_space(
+ struct xfs_delattr_context *dac)
{
- struct xfs_inode *dp = args->dp;
- struct xfs_bmbt_irec map;
- xfs_dablk_t lblkno;
- int blkcnt;
- int nmap;
- int error;
+ struct xfs_da_args *args = dac->da_args;
+ struct xfs_bmbt_irec *map = &dac->map;
+ int error;
- trace_xfs_attr_rmtval_set(args);
+ dac->lblkno = 0;
+ dac->blkcnt = 0;
+ args->rmtblkcnt = 0;
+ args->rmtblkno = 0;
+ memset(map, 0, sizeof(struct xfs_bmbt_irec));
error = xfs_attr_rmt_find_hole(args);
if (error)
return error;
- blkcnt = args->rmtblkcnt;
- lblkno = (xfs_dablk_t)args->rmtblkno;
- /*
- * Roll through the "value", allocating blocks on disk as required.
- */
- while (blkcnt > 0) {
- /*
- * Allocate a single extent, up to the size of the value.
- *
- * Note that we have to consider this a data allocation as we
- * write the remote attribute without logging the contents.
- * Hence we must ensure that we aren't using blocks that are on
- * the busy list so that we don't overwrite blocks which have
- * recently been freed but their transactions are not yet
- * committed to disk. If we overwrite the contents of a busy
- * extent and then crash then the block may not contain the
- * correct metadata after log recovery occurs.
- */
- nmap = 1;
- error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
- blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map,
- &nmap);
- if (error)
- return error;
- error = xfs_defer_finish(&args->trans);
- if (error)
- return error;
+ dac->blkcnt = args->rmtblkcnt;
+ dac->lblkno = args->rmtblkno;
- ASSERT(nmap == 1);
- ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
- (map.br_startblock != HOLESTARTBLOCK));
- lblkno += map.br_blockcount;
- blkcnt -= map.br_blockcount;
+ return 0;
+}
- /*
- * Start the next trans in the chain.
- */
- error = xfs_trans_roll_inode(&args->trans, dp);
- if (error)
- return error;
- }
+/*
+ * Write one block of the value associated with an attribute into the
+ * out-of-line buffer that we have defined for it. This is similar to a subset
+ * of xfs_attr_rmtval_set, but records the current block to the delayed attr
+ * context, and leaves transaction handling to the caller.
+ */
+int
+xfs_attr_rmtval_set_blk(
+ struct xfs_delattr_context *dac)
+{
+ struct xfs_da_args *args = dac->da_args;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_bmbt_irec *map = &dac->map;
+ int nmap;
+ int error;
+
+ nmap = 1;
+ error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)dac->lblkno,
+ dac->blkcnt, XFS_BMAPI_ATTRFORK, args->total,
+ map, &nmap);
+ if (error)
+ return error;
+
+ ASSERT(nmap == 1);
+ ASSERT((map->br_startblock != DELAYSTARTBLOCK) &&
+ (map->br_startblock != HOLESTARTBLOCK));
- return xfs_attr_rmtval_set_value(args);
+ /* roll attribute extent map forwards */
+ dac->lblkno += map->br_blockcount;
+ dac->blkcnt -= map->br_blockcount;
+
+ return 0;
}
/*
@@ -669,47 +666,17 @@ xfs_attr_rmtval_invalidate(
}
/*
- * Remove the value associated with an attribute by deleting the
- * out-of-line buffer that it is stored on.
- */
-int
-xfs_attr_rmtval_remove(
- struct xfs_da_args *args)
-{
- int error;
- int retval;
-
- trace_xfs_attr_rmtval_remove(args);
-
- /*
- * Keep de-allocating extents until the remote-value region is gone.
- */
- do {
- retval = __xfs_attr_rmtval_remove(args);
- if (retval && retval != -EAGAIN)
- return retval;
-
- /*
- * Close out trans and start the next one in the chain.
- */
- error = xfs_trans_roll_inode(&args->trans, args->dp);
- if (error)
- return error;
- } while (retval == -EAGAIN);
-
- return 0;
-}
-
-/*
* Remove the value associated with an attribute by deleting the out-of-line
- * buffer that it is stored on. Returns EAGAIN for the caller to refresh the
- * transaction and re-call the function
+ * buffer that it is stored on. Returns -EAGAIN for the caller to refresh the
+ * transaction and re-call the function. Callers should keep calling this
+ * routine until it returns something other than -EAGAIN.
*/
int
__xfs_attr_rmtval_remove(
- struct xfs_da_args *args)
+ struct xfs_delattr_context *dac)
{
- int error, done;
+ struct xfs_da_args *args = dac->da_args;
+ int error, done;
/*
* Unmap value blocks for this attr.
@@ -719,12 +686,20 @@ __xfs_attr_rmtval_remove(
if (error)
return error;
- error = xfs_defer_finish(&args->trans);
- if (error)
- return error;
-
- if (!done)
+ /*
+ * We don't need an explicit state here to pick up where we left off. We
+ * can figure it out using the !done return code. The actual value of
+ * attr->xattri_dela_state may be some value reminiscent of the calling
+ * function, but it's value is irrelevant with in the context of this
+ * function. Once we are done here, the next state is set as needed by
+ * the parent
+ */
+ if (!done) {
+ dac->flags |= XFS_DAC_DEFER_FINISH;
return -EAGAIN;
+ }
- return error;
+ args->rmtblkno = 0;
+ args->rmtblkcnt = 0;
+ return 0;
}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
index 9eee615da156..61b85b918db8 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.h
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
@@ -9,10 +9,12 @@
int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
int xfs_attr_rmtval_get(struct xfs_da_args *args);
-int xfs_attr_rmtval_set(struct xfs_da_args *args);
-int xfs_attr_rmtval_remove(struct xfs_da_args *args);
int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
xfs_buf_flags_t incore_flags);
int xfs_attr_rmtval_invalidate(struct xfs_da_args *args);
-int __xfs_attr_rmtval_remove(struct xfs_da_args *args);
+int __xfs_attr_rmtval_remove(struct xfs_delattr_context *dac);
+int xfs_attr_rmt_find_hole(struct xfs_da_args *args);
+int xfs_attr_rmtval_set_value(struct xfs_da_args *args);
+int xfs_attr_rmtval_set_blk(struct xfs_delattr_context *dac);
+int xfs_attr_rmtval_find_space(struct xfs_delattr_context *dac);
#endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index a3e0e6f672d6..948092babb6a 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -31,6 +31,7 @@
#include "xfs_attr_leaf.h"
#include "xfs_filestream.h"
#include "xfs_rmap.h"
+#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_refcount.h"
#include "xfs_icache.h"
@@ -1028,7 +1029,7 @@ xfs_bmap_add_attrfork_local(
/*
* Set an inode attr fork offset based on the format of the data fork.
*/
-int
+static int
xfs_bmap_set_attrforkoff(
struct xfs_inode *ip,
int size,
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index f9a390ecfb1d..67641f669918 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -187,7 +187,6 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
-int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version);
void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp,
struct xfs_inode *ip, int whichfork);
void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno,
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 5b6fcb9b44e2..be74a6b53689 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -21,6 +21,7 @@
#include "xfs_alloc.h"
#include "xfs_log.h"
#include "xfs_btree_staging.h"
+#include "xfs_ag.h"
/*
* Cursor allocation zone.
@@ -215,7 +216,7 @@ xfs_btree_check_sptr(
{
if (level <= 0)
return false;
- return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.agno, agbno);
+ return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno);
}
/*
@@ -244,7 +245,7 @@ xfs_btree_check_ptr(
return 0;
xfs_err(cur->bc_mp,
"AG %u: Corrupt btree %d pointer at level %d index %d.",
- cur->bc_ag.agno, cur->bc_btnum,
+ cur->bc_ag.pag->pag_agno, cur->bc_btnum,
level, index);
}
@@ -376,6 +377,8 @@ xfs_btree_del_cursor(
XFS_FORCED_SHUTDOWN(cur->bc_mp));
if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
kmem_free(cur->bc_ops);
+ if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag)
+ xfs_perag_put(cur->bc_ag.pag);
kmem_cache_free(xfs_btree_cur_zone, cur);
}
@@ -885,13 +888,13 @@ xfs_btree_readahead_sblock(
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno,
+ xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
left, 1, cur->bc_ops->buf_ops);
rval++;
}
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno,
+ xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
right, 1, cur->bc_ops->buf_ops);
rval++;
}
@@ -949,7 +952,7 @@ xfs_btree_ptr_to_daddr(
*daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
} else {
agbno = be32_to_cpu(ptr->s);
- *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.agno,
+ *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno,
agbno);
}
@@ -1150,7 +1153,7 @@ xfs_btree_init_block_cur(
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
owner = cur->bc_ino.ip->i_ino;
else
- owner = cur->bc_ag.agno;
+ owner = cur->bc_ag.pag->pag_agno;
xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
cur->bc_btnum, level, numrecs,
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 10e50cbacacf..4dbdc659c396 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -11,6 +11,7 @@ struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
struct xfs_ifork;
+struct xfs_perag;
extern kmem_zone_t *xfs_btree_cur_zone;
@@ -180,11 +181,11 @@ union xfs_btree_irec {
/* Per-AG btree information. */
struct xfs_btree_cur_ag {
+ struct xfs_perag *pag;
union {
struct xfs_buf *agbp;
struct xbtree_afakeroot *afake; /* for staging cursor */
};
- xfs_agnumber_t agno;
union {
struct {
unsigned long nr_ops; /* # record updates */
@@ -231,6 +232,13 @@ typedef struct xfs_btree_cur
uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */
xfs_btnum_t bc_btnum; /* identifies which btree type */
int bc_statoff; /* offset of btre stats array */
+
+ /*
+ * Short btree pointers need an agno to be able to turn the pointers
+ * into physical addresses for IO, so the btree cursor switches between
+ * bc_ino and bc_ag based on whether XFS_BTREE_LONG_PTRS is set for the
+ * cursor.
+ */
union {
struct xfs_btree_cur_ag bc_ag;
struct xfs_btree_cur_ino bc_ino;
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index eefdb518fe64..aaf8805a82df 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -10,7 +10,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
@@ -27,6 +26,7 @@
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_rmap.h"
+#include "xfs_ag.h"
/*
* Lookup a record by ino in the btree given by cur.
@@ -105,7 +105,7 @@ xfs_inobt_get_rec(
int *stat)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_ag.agno;
+ xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno;
union xfs_btree_rec *rec;
int error;
uint64_t realfree;
@@ -172,18 +172,17 @@ xfs_inobt_insert(
struct xfs_mount *mp,
struct xfs_trans *tp,
struct xfs_buf *agbp,
+ struct xfs_perag *pag,
xfs_agino_t newino,
xfs_agino_t newlen,
xfs_btnum_t btnum)
{
struct xfs_btree_cur *cur;
- struct xfs_agi *agi = agbp->b_addr;
- xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
xfs_agino_t thisino;
int i;
int error;
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum);
for (thisino = newino;
thisino < newino + newlen;
@@ -215,10 +214,9 @@ xfs_inobt_insert(
* Verify that the number of free inodes in the AGI is correct.
*/
#ifdef DEBUG
-STATIC int
+static int
xfs_check_agi_freecount(
- struct xfs_btree_cur *cur,
- struct xfs_agi *agi)
+ struct xfs_btree_cur *cur)
{
if (cur->bc_nlevels == 1) {
xfs_inobt_rec_incore_t rec;
@@ -244,12 +242,12 @@ xfs_check_agi_freecount(
} while (i == 1);
if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
+ ASSERT(freecount == cur->bc_ag.pag->pagi_freecount);
}
return 0;
}
#else
-#define xfs_check_agi_freecount(cur, agi) 0
+#define xfs_check_agi_freecount(cur) 0
#endif
/*
@@ -520,18 +518,17 @@ xfs_inobt_insert_sprec(
struct xfs_mount *mp,
struct xfs_trans *tp,
struct xfs_buf *agbp,
+ struct xfs_perag *pag,
int btnum,
struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */
bool merge) /* merge or replace */
{
struct xfs_btree_cur *cur;
- struct xfs_agi *agi = agbp->b_addr;
- xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
int error;
int i;
struct xfs_inobt_rec_incore rec;
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum);
/* the new record is pre-aligned so we know where to look */
error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
@@ -578,14 +575,14 @@ xfs_inobt_insert_sprec(
goto error;
}
- trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino,
+ trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino,
rec.ir_holemask, nrec->ir_startino,
nrec->ir_holemask);
/* merge to nrec to output the updated record */
__xfs_inobt_rec_merge(nrec, &rec);
- trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino,
+ trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino,
nrec->ir_holemask);
error = xfs_inobt_rec_check_count(mp, nrec);
@@ -606,28 +603,28 @@ error:
}
/*
- * Allocate new inodes in the allocation group specified by agbp.
- * Returns 0 if inodes were allocated in this AG; 1 if there was no space
- * in this AG; or the usual negative error code.
+ * Allocate new inodes in the allocation group specified by agbp. Returns 0 if
+ * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so
+ * the caller knows it can try another AG, a hard -ENOSPC when over the maximum
+ * inode count threshold, or the usual negative error code for other errors.
*/
STATIC int
xfs_ialloc_ag_alloc(
struct xfs_trans *tp,
- struct xfs_buf *agbp)
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag)
{
struct xfs_agi *agi;
struct xfs_alloc_arg args;
- xfs_agnumber_t agno;
int error;
xfs_agino_t newino; /* new first inode's number */
xfs_agino_t newlen; /* new number of inodes */
int isaligned = 0; /* inode allocation at stripe */
/* unit boundary */
/* init. to full chunk */
- uint16_t allocmask = (uint16_t) -1;
struct xfs_inobt_rec_incore rec;
- struct xfs_perag *pag;
struct xfs_ino_geometry *igeo = M_IGEO(tp->t_mountp);
+ uint16_t allocmask = (uint16_t) -1;
int do_sparse = 0;
memset(&args, 0, sizeof(args));
@@ -660,14 +657,13 @@ xfs_ialloc_ag_alloc(
*/
agi = agbp->b_addr;
newino = be32_to_cpu(agi->agi_newino);
- agno = be32_to_cpu(agi->agi_seqno);
args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
igeo->ialloc_blks;
if (do_sparse)
goto sparse_alloc;
if (likely(newino != NULLAGINO &&
(args.agbno < be32_to_cpu(agi->agi_length)))) {
- args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
args.type = XFS_ALLOCTYPE_THIS_BNO;
args.prod = 1;
@@ -727,7 +723,7 @@ xfs_ialloc_ag_alloc(
* For now, just allocate blocks up front.
*/
args.agbno = be32_to_cpu(agi->agi_root);
- args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
/*
* Allocate a fixed-size extent of inodes.
*/
@@ -748,7 +744,7 @@ xfs_ialloc_ag_alloc(
if (isaligned && args.fsbno == NULLFSBLOCK) {
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.agbno = be32_to_cpu(agi->agi_root);
- args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
args.alignment = igeo->cluster_align;
if ((error = xfs_alloc_vextent(&args)))
return error;
@@ -764,7 +760,7 @@ xfs_ialloc_ag_alloc(
sparse_alloc:
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.agbno = be32_to_cpu(agi->agi_root);
- args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
args.alignment = args.mp->m_sb.sb_spino_align;
args.prod = 1;
@@ -796,7 +792,7 @@ sparse_alloc:
}
if (args.fsbno == NULLFSBLOCK)
- return 1;
+ return -EAGAIN;
ASSERT(args.len == args.minlen);
@@ -809,7 +805,7 @@ sparse_alloc:
* rather than a linear progression to prevent the next generation
* number from being easily guessable.
*/
- error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno,
+ error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno,
args.agbno, args.len, prandom_u32());
if (error)
@@ -836,12 +832,12 @@ sparse_alloc:
* if necessary. If a merge does occur, rec is updated to the
* merged record.
*/
- error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO,
- &rec, true);
+ error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag,
+ XFS_BTNUM_INO, &rec, true);
if (error == -EFSCORRUPTED) {
xfs_alert(args.mp,
"invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
- XFS_AGINO_TO_INO(args.mp, agno,
+ XFS_AGINO_TO_INO(args.mp, pag->pag_agno,
rec.ir_startino),
rec.ir_holemask, rec.ir_count);
xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
@@ -861,21 +857,20 @@ sparse_alloc:
* existing record with this one.
*/
if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
- error = xfs_inobt_insert_sprec(args.mp, tp, agbp,
- XFS_BTNUM_FINO, &rec,
- false);
+ error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag,
+ XFS_BTNUM_FINO, &rec, false);
if (error)
return error;
}
} else {
/* full chunk - insert new records to both btrees */
- error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+ error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino, newlen,
XFS_BTNUM_INO);
if (error)
return error;
if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
- error = xfs_inobt_insert(args.mp, tp, agbp, newino,
+ error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino,
newlen, XFS_BTNUM_FINO);
if (error)
return error;
@@ -887,7 +882,6 @@ sparse_alloc:
*/
be32_add_cpu(&agi->agi_count, newlen);
be32_add_cpu(&agi->agi_freecount, newlen);
- pag = agbp->b_pag;
pag->pagi_freecount += newlen;
pag->pagi_count += newlen;
agi->agi_newino = cpu_to_be32(newino);
@@ -905,139 +899,6 @@ sparse_alloc:
return 0;
}
-STATIC xfs_agnumber_t
-xfs_ialloc_next_ag(
- xfs_mount_t *mp)
-{
- xfs_agnumber_t agno;
-
- spin_lock(&mp->m_agirotor_lock);
- agno = mp->m_agirotor;
- if (++mp->m_agirotor >= mp->m_maxagi)
- mp->m_agirotor = 0;
- spin_unlock(&mp->m_agirotor_lock);
-
- return agno;
-}
-
-/*
- * Select an allocation group to look for a free inode in, based on the parent
- * inode and the mode. Return the allocation group buffer.
- */
-STATIC xfs_agnumber_t
-xfs_ialloc_ag_select(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_ino_t parent, /* parent directory inode number */
- umode_t mode) /* bits set to indicate file type */
-{
- xfs_agnumber_t agcount; /* number of ag's in the filesystem */
- xfs_agnumber_t agno; /* current ag number */
- int flags; /* alloc buffer locking flags */
- xfs_extlen_t ineed; /* blocks needed for inode allocation */
- xfs_extlen_t longest = 0; /* longest extent available */
- xfs_mount_t *mp; /* mount point structure */
- int needspace; /* file mode implies space allocated */
- xfs_perag_t *pag; /* per allocation group data */
- xfs_agnumber_t pagno; /* parent (starting) ag number */
- int error;
-
- /*
- * Files of these types need at least one block if length > 0
- * (and they won't fit in the inode, but that's hard to figure out).
- */
- needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
- mp = tp->t_mountp;
- agcount = mp->m_maxagi;
- if (S_ISDIR(mode))
- pagno = xfs_ialloc_next_ag(mp);
- else {
- pagno = XFS_INO_TO_AGNO(mp, parent);
- if (pagno >= agcount)
- pagno = 0;
- }
-
- ASSERT(pagno < agcount);
-
- /*
- * Loop through allocation groups, looking for one with a little
- * free space in it. Note we don't look for free inodes, exactly.
- * Instead, we include whether there is a need to allocate inodes
- * to mean that blocks must be allocated for them,
- * if none are currently free.
- */
- agno = pagno;
- flags = XFS_ALLOC_FLAG_TRYLOCK;
- for (;;) {
- pag = xfs_perag_get(mp, agno);
- if (!pag->pagi_inodeok) {
- xfs_ialloc_next_ag(mp);
- goto nextag;
- }
-
- if (!pag->pagi_init) {
- error = xfs_ialloc_pagi_init(mp, tp, agno);
- if (error)
- goto nextag;
- }
-
- if (pag->pagi_freecount) {
- xfs_perag_put(pag);
- return agno;
- }
-
- if (!pag->pagf_init) {
- error = xfs_alloc_pagf_init(mp, tp, agno, flags);
- if (error)
- goto nextag;
- }
-
- /*
- * Check that there is enough free space for the file plus a
- * chunk of inodes if we need to allocate some. If this is the
- * first pass across the AGs, take into account the potential
- * space needed for alignment of inode chunks when checking the
- * longest contiguous free space in the AG - this prevents us
- * from getting ENOSPC because we have free space larger than
- * ialloc_blks but alignment constraints prevent us from using
- * it.
- *
- * If we can't find an AG with space for full alignment slack to
- * be taken into account, we must be near ENOSPC in all AGs.
- * Hence we don't include alignment for the second pass and so
- * if we fail allocation due to alignment issues then it is most
- * likely a real ENOSPC condition.
- */
- ineed = M_IGEO(mp)->ialloc_min_blks;
- if (flags && ineed > 1)
- ineed += M_IGEO(mp)->cluster_align;
- longest = pag->pagf_longest;
- if (!longest)
- longest = pag->pagf_flcount > 0;
-
- if (pag->pagf_freeblks >= needspace + ineed &&
- longest >= ineed) {
- xfs_perag_put(pag);
- return agno;
- }
-nextag:
- xfs_perag_put(pag);
- /*
- * No point in iterating over the rest, if we're shutting
- * down.
- */
- if (XFS_FORCED_SHUTDOWN(mp))
- return NULLAGNUMBER;
- agno++;
- if (agno >= agcount)
- agno = 0;
- if (agno == pagno) {
- if (flags == 0)
- return NULLAGNUMBER;
- flags = 0;
- }
- }
-}
-
/*
* Try to retrieve the next record to the left/right from the current one.
*/
@@ -1123,15 +984,14 @@ STATIC int
xfs_dialloc_ag_inobt(
struct xfs_trans *tp,
struct xfs_buf *agbp,
+ struct xfs_perag *pag,
xfs_ino_t parent,
xfs_ino_t *inop)
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_agi *agi = agbp->b_addr;
- xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
- struct xfs_perag *pag = agbp->b_pag;
struct xfs_btree_cur *cur, *tcur;
struct xfs_inobt_rec_incore rec, trec;
xfs_ino_t ino;
@@ -1145,7 +1005,7 @@ xfs_dialloc_ag_inobt(
ASSERT(pag->pagi_freecount > 0);
restart_pagno:
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
/*
* If pagino is 0 (this is the root inode allocation) use newino.
* This must work because we've just allocated some.
@@ -1153,14 +1013,14 @@ xfs_dialloc_ag_inobt(
if (!pagino)
pagino = be32_to_cpu(agi->agi_newino);
- error = xfs_check_agi_freecount(cur, agi);
+ error = xfs_check_agi_freecount(cur);
if (error)
goto error0;
/*
* If in the same AG as the parent, try to get near the parent.
*/
- if (pagno == agno) {
+ if (pagno == pag->pag_agno) {
int doneleft; /* done, to the left */
int doneright; /* done, to the right */
@@ -1363,7 +1223,7 @@ alloc_inode:
ASSERT(offset < XFS_INODES_PER_CHUNK);
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
- ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+ ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
rec.ir_free &= ~XFS_INOBT_MASK(offset);
rec.ir_freecount--;
error = xfs_inobt_update(cur, &rec);
@@ -1373,7 +1233,7 @@ alloc_inode:
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
pag->pagi_freecount--;
- error = xfs_check_agi_freecount(cur, agi);
+ error = xfs_check_agi_freecount(cur);
if (error)
goto error0;
@@ -1568,16 +1428,16 @@ xfs_dialloc_ag_update_inobt(
* The caller selected an AG for us, and made sure that free inodes are
* available.
*/
-int
+static int
xfs_dialloc_ag(
struct xfs_trans *tp,
struct xfs_buf *agbp,
+ struct xfs_perag *pag,
xfs_ino_t parent,
xfs_ino_t *inop)
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_agi *agi = agbp->b_addr;
- xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
struct xfs_btree_cur *cur; /* finobt cursor */
@@ -1589,7 +1449,7 @@ xfs_dialloc_ag(
int i;
if (!xfs_sb_version_hasfinobt(&mp->m_sb))
- return xfs_dialloc_ag_inobt(tp, agbp, parent, inop);
+ return xfs_dialloc_ag_inobt(tp, agbp, pag, parent, inop);
/*
* If pagino is 0 (this is the root inode allocation) use newino.
@@ -1598,9 +1458,9 @@ xfs_dialloc_ag(
if (!pagino)
pagino = be32_to_cpu(agi->agi_newino);
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO);
- error = xfs_check_agi_freecount(cur, agi);
+ error = xfs_check_agi_freecount(cur);
if (error)
goto error_cur;
@@ -1609,7 +1469,7 @@ xfs_dialloc_ag(
* parent. If so, find the closest available inode to the parent. If
* not, consider the agi hint or find the first free inode in the AG.
*/
- if (agno == pagno)
+ if (pag->pag_agno == pagno)
error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
else
error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
@@ -1621,7 +1481,7 @@ xfs_dialloc_ag(
ASSERT(offset < XFS_INODES_PER_CHUNK);
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
- ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+ ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
/*
* Modify or remove the finobt record.
@@ -1641,9 +1501,9 @@ xfs_dialloc_ag(
* the original freecount. If all is well, make the equivalent update to
* the inobt using the finobt record and offset information.
*/
- icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+ icur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
- error = xfs_check_agi_freecount(icur, agi);
+ error = xfs_check_agi_freecount(icur);
if (error)
goto error_icur;
@@ -1657,14 +1517,14 @@ xfs_dialloc_ag(
*/
be32_add_cpu(&agi->agi_freecount, -1);
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
- agbp->b_pag->pagi_freecount--;
+ pag->pagi_freecount--;
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
- error = xfs_check_agi_freecount(icur, agi);
+ error = xfs_check_agi_freecount(icur);
if (error)
goto error_icur;
- error = xfs_check_agi_freecount(cur, agi);
+ error = xfs_check_agi_freecount(cur);
if (error)
goto error_icur;
@@ -1708,54 +1568,195 @@ xfs_dialloc_roll(
/* Re-attach the quota info that we detached from prev trx. */
tp->t_dqinfo = dqinfo;
+ /*
+ * Join the buffer even on commit error so that the buffer is released
+ * when the caller cancels the transaction and doesn't have to handle
+ * this error case specially.
+ */
+ xfs_trans_bjoin(tp, agibp);
*tpp = tp;
+ return error;
+}
+
+static xfs_agnumber_t
+xfs_ialloc_next_ag(
+ xfs_mount_t *mp)
+{
+ xfs_agnumber_t agno;
+
+ spin_lock(&mp->m_agirotor_lock);
+ agno = mp->m_agirotor;
+ if (++mp->m_agirotor >= mp->m_maxagi)
+ mp->m_agirotor = 0;
+ spin_unlock(&mp->m_agirotor_lock);
+
+ return agno;
+}
+
+static bool
+xfs_dialloc_good_ag(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ umode_t mode,
+ int flags,
+ bool ok_alloc)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_extlen_t ineed;
+ xfs_extlen_t longest = 0;
+ int needspace;
+ int error;
+
+ if (!pag->pagi_inodeok)
+ return false;
+
+ if (!pag->pagi_init) {
+ error = xfs_ialloc_pagi_init(mp, tp, pag->pag_agno);
+ if (error)
+ return false;
+ }
+
+ if (pag->pagi_freecount)
+ return true;
+ if (!ok_alloc)
+ return false;
+
+ if (!pag->pagf_init) {
+ error = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, flags);
+ if (error)
+ return false;
+ }
+
+ /*
+ * Check that there is enough free space for the file plus a chunk of
+ * inodes if we need to allocate some. If this is the first pass across
+ * the AGs, take into account the potential space needed for alignment
+ * of inode chunks when checking the longest contiguous free space in
+ * the AG - this prevents us from getting ENOSPC because we have free
+ * space larger than ialloc_blks but alignment constraints prevent us
+ * from using it.
+ *
+ * If we can't find an AG with space for full alignment slack to be
+ * taken into account, we must be near ENOSPC in all AGs. Hence we
+ * don't include alignment for the second pass and so if we fail
+ * allocation due to alignment issues then it is most likely a real
+ * ENOSPC condition.
+ *
+ * XXX(dgc): this calculation is now bogus thanks to the per-ag
+ * reservations that xfs_alloc_fix_freelist() now does via
+ * xfs_alloc_space_available(). When the AG fills up, pagf_freeblks will
+ * be more than large enough for the check below to succeed, but
+ * xfs_alloc_space_available() will fail because of the non-zero
+ * metadata reservation and hence we won't actually be able to allocate
+ * more inodes in this AG. We do soooo much unnecessary work near ENOSPC
+ * because of this.
+ */
+ ineed = M_IGEO(mp)->ialloc_min_blks;
+ if (flags && ineed > 1)
+ ineed += M_IGEO(mp)->cluster_align;
+ longest = pag->pagf_longest;
+ if (!longest)
+ longest = pag->pagf_flcount > 0;
+ needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
+
+ if (pag->pagf_freeblks < needspace + ineed || longest < ineed)
+ return false;
+ return true;
+}
+
+static int
+xfs_dialloc_try_ag(
+ struct xfs_trans **tpp,
+ struct xfs_perag *pag,
+ xfs_ino_t parent,
+ xfs_ino_t *new_ino,
+ bool ok_alloc)
+{
+ struct xfs_buf *agbp;
+ xfs_ino_t ino;
+ int error;
+
+ /*
+ * Then read in the AGI buffer and recheck with the AGI buffer
+ * lock held.
+ */
+ error = xfs_ialloc_read_agi(pag->pag_mount, *tpp, pag->pag_agno, &agbp);
if (error)
return error;
- xfs_trans_bjoin(tp, agibp);
- return 0;
+
+ if (!pag->pagi_freecount) {
+ if (!ok_alloc) {
+ error = -EAGAIN;
+ goto out_release;
+ }
+
+ error = xfs_ialloc_ag_alloc(*tpp, agbp, pag);
+ if (error < 0)
+ goto out_release;
+
+ /*
+ * We successfully allocated space for an inode cluster in this
+ * AG. Roll the transaction so that we can allocate one of the
+ * new inodes.
+ */
+ ASSERT(pag->pagi_freecount > 0);
+ error = xfs_dialloc_roll(tpp, agbp);
+ if (error)
+ goto out_release;
+ }
+
+ /* Allocate an inode in the found AG */
+ error = xfs_dialloc_ag(*tpp, agbp, pag, parent, &ino);
+ if (!error)
+ *new_ino = ino;
+ return error;
+
+out_release:
+ xfs_trans_brelse(*tpp, agbp);
+ return error;
}
/*
- * Select and prepare an AG for inode allocation.
+ * Allocate an on-disk inode.
*
* Mode is used to tell whether the new inode is a directory and hence where to
- * locate it.
- *
- * This function will ensure that the selected AG has free inodes available to
- * allocate from. The selected AGI will be returned locked to the caller, and it
- * will allocate more free inodes if required. If no free inodes are found or
- * can be allocated, no AGI will be returned.
+ * locate it. The on-disk inode that is allocated will be returned in @new_ino
+ * on success, otherwise an error will be set to indicate the failure (e.g.
+ * -ENOSPC).
*/
int
-xfs_dialloc_select_ag(
+xfs_dialloc(
struct xfs_trans **tpp,
xfs_ino_t parent,
umode_t mode,
- struct xfs_buf **IO_agbp)
+ xfs_ino_t *new_ino)
{
struct xfs_mount *mp = (*tpp)->t_mountp;
- struct xfs_buf *agbp;
xfs_agnumber_t agno;
- int error;
- bool noroom = false;
+ int error = 0;
xfs_agnumber_t start_agno;
struct xfs_perag *pag;
struct xfs_ino_geometry *igeo = M_IGEO(mp);
- bool okalloc = true;
-
- *IO_agbp = NULL;
+ bool ok_alloc = true;
+ int flags;
+ xfs_ino_t ino;
/*
- * We do not have an agbp, so select an initial allocation
- * group for inode allocation.
+ * Directories, symlinks, and regular files frequently allocate at least
+ * one block, so factor that potential expansion when we examine whether
+ * an AG has enough space for file creation.
*/
- start_agno = xfs_ialloc_ag_select(*tpp, parent, mode);
- if (start_agno == NULLAGNUMBER)
- return 0;
+ if (S_ISDIR(mode))
+ start_agno = xfs_ialloc_next_ag(mp);
+ else {
+ start_agno = XFS_INO_TO_AGNO(mp, parent);
+ if (start_agno >= mp->m_maxagi)
+ start_agno = 0;
+ }
/*
* If we have already hit the ceiling of inode blocks then clear
- * okalloc so we scan all available agi structures for a free
+ * ok_alloc so we scan all available agi structures for a free
* inode.
*
* Read rough value of mp->m_icount by percpu_counter_read_positive,
@@ -1764,8 +1765,7 @@ xfs_dialloc_select_ag(
if (igeo->maxicount &&
percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos
> igeo->maxicount) {
- noroom = true;
- okalloc = false;
+ ok_alloc = false;
}
/*
@@ -1774,82 +1774,36 @@ xfs_dialloc_select_ag(
* allocation groups upward, wrapping at the end.
*/
agno = start_agno;
+ flags = XFS_ALLOC_FLAG_TRYLOCK;
for (;;) {
pag = xfs_perag_get(mp, agno);
- if (!pag->pagi_inodeok) {
- xfs_ialloc_next_ag(mp);
- goto nextag;
- }
-
- if (!pag->pagi_init) {
- error = xfs_ialloc_pagi_init(mp, *tpp, agno);
- if (error)
+ if (xfs_dialloc_good_ag(*tpp, pag, mode, flags, ok_alloc)) {
+ error = xfs_dialloc_try_ag(tpp, pag, parent,
+ &ino, ok_alloc);
+ if (error != -EAGAIN)
break;
}
- /*
- * Do a first racy fast path check if this AG is usable.
- */
- if (!pag->pagi_freecount && !okalloc)
- goto nextag;
-
- /*
- * Then read in the AGI buffer and recheck with the AGI buffer
- * lock held.
- */
- error = xfs_ialloc_read_agi(mp, *tpp, agno, &agbp);
- if (error)
- break;
-
- if (pag->pagi_freecount) {
- xfs_perag_put(pag);
- goto found_ag;
- }
-
- if (!okalloc)
- goto nextag_relse_buffer;
-
- error = xfs_ialloc_ag_alloc(*tpp, agbp);
- if (error < 0) {
- xfs_trans_brelse(*tpp, agbp);
-
- if (error == -ENOSPC)
- error = 0;
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ error = -EFSCORRUPTED;
break;
}
-
- if (error == 0) {
- /*
- * We successfully allocated space for an inode cluster
- * in this AG. Roll the transaction so that we can
- * allocate one of the new inodes.
- */
- ASSERT(pag->pagi_freecount > 0);
- xfs_perag_put(pag);
-
- error = xfs_dialloc_roll(tpp, agbp);
- if (error) {
- xfs_buf_relse(agbp);
- return error;
+ if (++agno == mp->m_maxagi)
+ agno = 0;
+ if (agno == start_agno) {
+ if (!flags) {
+ error = -ENOSPC;
+ break;
}
- goto found_ag;
+ flags = 0;
}
-
-nextag_relse_buffer:
- xfs_trans_brelse(*tpp, agbp);
-nextag:
xfs_perag_put(pag);
- if (++agno == mp->m_sb.sb_agcount)
- agno = 0;
- if (agno == start_agno)
- return noroom ? -ENOSPC : 0;
}
+ if (!error)
+ *new_ino = ino;
xfs_perag_put(pag);
return error;
-found_ag:
- *IO_agbp = agbp;
- return 0;
}
/*
@@ -1935,12 +1889,12 @@ xfs_difree_inobt(
struct xfs_mount *mp,
struct xfs_trans *tp,
struct xfs_buf *agbp,
+ struct xfs_perag *pag,
xfs_agino_t agino,
struct xfs_icluster *xic,
struct xfs_inobt_rec_incore *orec)
{
struct xfs_agi *agi = agbp->b_addr;
- xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
struct xfs_btree_cur *cur;
struct xfs_inobt_rec_incore rec;
int ilen;
@@ -1954,9 +1908,9 @@ xfs_difree_inobt(
/*
* Initialize the cursor.
*/
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
- error = xfs_check_agi_freecount(cur, agi);
+ error = xfs_check_agi_freecount(cur);
if (error)
goto error0;
@@ -2005,7 +1959,8 @@ xfs_difree_inobt(
struct xfs_perag *pag = agbp->b_pag;
xic->deleted = true;
- xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+ xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
+ rec.ir_startino);
xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
/*
@@ -2028,7 +1983,7 @@ xfs_difree_inobt(
goto error0;
}
- xfs_difree_inode_chunk(tp, agno, &rec);
+ xfs_difree_inode_chunk(tp, pag->pag_agno, &rec);
} else {
xic->deleted = false;
@@ -2044,11 +1999,11 @@ xfs_difree_inobt(
*/
be32_add_cpu(&agi->agi_freecount, 1);
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
- agbp->b_pag->pagi_freecount++;
+ pag->pagi_freecount++;
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
}
- error = xfs_check_agi_freecount(cur, agi);
+ error = xfs_check_agi_freecount(cur);
if (error)
goto error0;
@@ -2069,18 +2024,17 @@ xfs_difree_finobt(
struct xfs_mount *mp,
struct xfs_trans *tp,
struct xfs_buf *agbp,
+ struct xfs_perag *pag,
xfs_agino_t agino,
struct xfs_inobt_rec_incore *ibtrec) /* inobt record */
{
- struct xfs_agi *agi = agbp->b_addr;
- xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
struct xfs_btree_cur *cur;
struct xfs_inobt_rec_incore rec;
int offset = agino - ibtrec->ir_startino;
int error;
int i;
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO);
error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
if (error)
@@ -2158,7 +2112,7 @@ xfs_difree_finobt(
}
out:
- error = xfs_check_agi_freecount(cur, agi);
+ error = xfs_check_agi_freecount(cur);
if (error)
goto error;
@@ -2178,36 +2132,33 @@ error:
*/
int
xfs_difree(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_ino_t inode, /* inode to be freed */
- struct xfs_icluster *xic) /* cluster info if deleted */
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_ino_t inode,
+ struct xfs_icluster *xic)
{
/* REFERENCED */
xfs_agblock_t agbno; /* block number containing inode */
struct xfs_buf *agbp; /* buffer for allocation group header */
xfs_agino_t agino; /* allocation group inode number */
- xfs_agnumber_t agno; /* allocation group number */
int error; /* error return value */
- struct xfs_mount *mp; /* mount structure for filesystem */
+ struct xfs_mount *mp = tp->t_mountp;
struct xfs_inobt_rec_incore rec;/* btree record */
- mp = tp->t_mountp;
-
/*
* Break up inode number into its components.
*/
- agno = XFS_INO_TO_AGNO(mp, inode);
- if (agno >= mp->m_sb.sb_agcount) {
- xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
- __func__, agno, mp->m_sb.sb_agcount);
+ if (pag->pag_agno != XFS_INO_TO_AGNO(mp, inode)) {
+ xfs_warn(mp, "%s: agno != pag->pag_agno (%d != %d).",
+ __func__, XFS_INO_TO_AGNO(mp, inode), pag->pag_agno);
ASSERT(0);
return -EINVAL;
}
agino = XFS_INO_TO_AGINO(mp, inode);
- if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
+ if (inode != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
__func__, (unsigned long long)inode,
- (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
+ (unsigned long long)XFS_AGINO_TO_INO(mp, pag->pag_agno, agino));
ASSERT(0);
return -EINVAL;
}
@@ -2221,7 +2172,7 @@ xfs_difree(
/*
* Get the allocation group header.
*/
- error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp);
if (error) {
xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
__func__, error);
@@ -2231,7 +2182,7 @@ xfs_difree(
/*
* Fix up the inode allocation btree.
*/
- error = xfs_difree_inobt(mp, tp, agbp, agino, xic, &rec);
+ error = xfs_difree_inobt(mp, tp, agbp, pag, agino, xic, &rec);
if (error)
goto error0;
@@ -2239,7 +2190,7 @@ xfs_difree(
* Fix up the free inode btree.
*/
if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
- error = xfs_difree_finobt(mp, tp, agbp, agino, &rec);
+ error = xfs_difree_finobt(mp, tp, agbp, pag, agino, &rec);
if (error)
goto error0;
}
@@ -2254,7 +2205,7 @@ STATIC int
xfs_imap_lookup(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_agino_t agino,
xfs_agblock_t agbno,
xfs_agblock_t *chunk_agbno,
@@ -2267,11 +2218,11 @@ xfs_imap_lookup(
int error;
int i;
- error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp);
if (error) {
xfs_alert(mp,
"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
- __func__, error, agno);
+ __func__, error, pag->pag_agno);
return error;
}
@@ -2281,7 +2232,7 @@ xfs_imap_lookup(
* we have a record, we need to ensure it contains the inode number
* we are looking up.
*/
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
if (!error) {
if (i)
@@ -2315,42 +2266,44 @@ xfs_imap_lookup(
*/
int
xfs_imap(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_ino_t ino, /* inode to locate */
- struct xfs_imap *imap, /* location map structure */
- uint flags) /* flags for inode btree lookup */
+ struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_ino_t ino, /* inode to locate */
+ struct xfs_imap *imap, /* location map structure */
+ uint flags) /* flags for inode btree lookup */
{
- xfs_agblock_t agbno; /* block number of inode in the alloc group */
- xfs_agino_t agino; /* inode number within alloc group */
- xfs_agnumber_t agno; /* allocation group number */
- xfs_agblock_t chunk_agbno; /* first block in inode chunk */
- xfs_agblock_t cluster_agbno; /* first block in inode cluster */
- int error; /* error code */
- int offset; /* index of inode in its buffer */
- xfs_agblock_t offset_agbno; /* blks from chunk start to inode */
+ xfs_agblock_t agbno; /* block number of inode in the alloc group */
+ xfs_agino_t agino; /* inode number within alloc group */
+ xfs_agblock_t chunk_agbno; /* first block in inode chunk */
+ xfs_agblock_t cluster_agbno; /* first block in inode cluster */
+ int error; /* error code */
+ int offset; /* index of inode in its buffer */
+ xfs_agblock_t offset_agbno; /* blks from chunk start to inode */
+ struct xfs_perag *pag;
ASSERT(ino != NULLFSINO);
/*
* Split up the inode number into its parts.
*/
- agno = XFS_INO_TO_AGNO(mp, ino);
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
agino = XFS_INO_TO_AGINO(mp, ino);
agbno = XFS_AGINO_TO_AGBNO(mp, agino);
- if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
- ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+ if (!pag || agbno >= mp->m_sb.sb_agblocks ||
+ ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
+ error = -EINVAL;
#ifdef DEBUG
/*
* Don't output diagnostic information for untrusted inodes
* as they can be invalid without implying corruption.
*/
if (flags & XFS_IGET_UNTRUSTED)
- return -EINVAL;
- if (agno >= mp->m_sb.sb_agcount) {
+ goto out_drop;
+ if (!pag) {
xfs_alert(mp,
"%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
- __func__, agno, mp->m_sb.sb_agcount);
+ __func__, XFS_INO_TO_AGNO(mp, ino),
+ mp->m_sb.sb_agcount);
}
if (agbno >= mp->m_sb.sb_agblocks) {
xfs_alert(mp,
@@ -2358,15 +2311,15 @@ xfs_imap(
__func__, (unsigned long long)agbno,
(unsigned long)mp->m_sb.sb_agblocks);
}
- if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+ if (pag && ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
xfs_alert(mp,
"%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
__func__, ino,
- XFS_AGINO_TO_INO(mp, agno, agino));
+ XFS_AGINO_TO_INO(mp, pag->pag_agno, agino));
}
xfs_stack_trace();
#endif /* DEBUG */
- return -EINVAL;
+ goto out_drop;
}
/*
@@ -2377,10 +2330,10 @@ xfs_imap(
* in all cases where an untrusted inode number is passed.
*/
if (flags & XFS_IGET_UNTRUSTED) {
- error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+ error = xfs_imap_lookup(mp, tp, pag, agino, agbno,
&chunk_agbno, &offset_agbno, flags);
if (error)
- return error;
+ goto out_drop;
goto out_map;
}
@@ -2392,11 +2345,12 @@ xfs_imap(
offset = XFS_INO_TO_OFFSET(mp, ino);
ASSERT(offset < mp->m_sb.sb_inopblock);
- imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+ imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, agbno);
imap->im_len = XFS_FSB_TO_BB(mp, 1);
imap->im_boffset = (unsigned short)(offset <<
mp->m_sb.sb_inodelog);
- return 0;
+ error = 0;
+ goto out_drop;
}
/*
@@ -2408,10 +2362,10 @@ xfs_imap(
offset_agbno = agbno & M_IGEO(mp)->inoalign_mask;
chunk_agbno = agbno - offset_agbno;
} else {
- error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+ error = xfs_imap_lookup(mp, tp, pag, agino, agbno,
&chunk_agbno, &offset_agbno, flags);
if (error)
- return error;
+ goto out_drop;
}
out_map:
@@ -2422,7 +2376,7 @@ out_map:
offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
XFS_INO_TO_OFFSET(mp, ino);
- imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
+ imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, cluster_agbno);
imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
@@ -2439,9 +2393,14 @@ out_map:
__func__, (unsigned long long) imap->im_blkno,
(unsigned long long) imap->im_len,
XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
- return -EINVAL;
+ error = -EINVAL;
+ goto out_drop;
}
- return 0;
+ error = 0;
+out_drop:
+ if (pag)
+ xfs_perag_put(pag);
+ return error;
}
/*
@@ -2969,3 +2928,58 @@ xfs_ialloc_calc_rootino(
return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno));
}
+
+/*
+ * Ensure there are not sparse inode clusters that cross the new EOAG.
+ *
+ * This is a no-op for non-spinode filesystems since clusters are always fully
+ * allocated and checking the bnobt suffices. However, a spinode filesystem
+ * could have a record where the upper inodes are free blocks. If those blocks
+ * were removed from the filesystem, the inode record would extend beyond EOAG,
+ * which will be flagged as corruption.
+ */
+int
+xfs_ialloc_check_shrink(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf *agibp,
+ xfs_agblock_t new_length)
+{
+ struct xfs_inobt_rec_incore rec;
+ struct xfs_btree_cur *cur;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ xfs_agino_t agino = XFS_AGB_TO_AGINO(mp, new_length);
+ int has;
+ int error;
+
+ if (!xfs_sb_version_hassparseinodes(&mp->m_sb))
+ return 0;
+
+ pag = xfs_perag_get(mp, agno);
+ cur = xfs_inobt_init_cursor(mp, tp, agibp, pag, XFS_BTNUM_INO);
+
+ /* Look up the inobt record that would correspond to the new EOFS. */
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has);
+ if (error || !has)
+ goto out;
+
+ error = xfs_inobt_get_rec(cur, &rec, &has);
+ if (error)
+ goto out;
+
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* If the record covers inodes that would be beyond EOFS, bail out. */
+ if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) {
+ error = -ENOSPC;
+ goto out;
+ }
+out:
+ xfs_btree_del_cursor(cur, error);
+ xfs_perag_put(pag);
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 3511086a7ae1..9a2112b4ad5e 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -33,42 +33,14 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
}
/*
- * Allocate an inode on disk.
- * Mode is used to tell whether the new inode will need space, and whether
- * it is a directory.
- *
- * There are two phases to inode allocation: selecting an AG and ensuring
- * that it contains free inodes, followed by allocating one of the free
- * inodes. xfs_dialloc_select_ag() does the former and returns a locked AGI
- * to the caller, ensuring that followup call to xfs_dialloc_ag() will
- * have free inodes to allocate from. xfs_dialloc_ag() will return the inode
- * number of the free inode we allocated.
+ * Allocate an inode on disk. Mode is used to tell whether the new inode will
+ * need space, and whether it is a directory.
*/
-int /* error */
-xfs_dialloc_select_ag(
- struct xfs_trans **tpp, /* double pointer of transaction */
- xfs_ino_t parent, /* parent inode (directory) */
- umode_t mode, /* mode bits for new inode */
- struct xfs_buf **IO_agbp);
-
-int
-xfs_dialloc_ag(
- struct xfs_trans *tp,
- struct xfs_buf *agbp,
- xfs_ino_t parent,
- xfs_ino_t *inop);
+int xfs_dialloc(struct xfs_trans **tpp, xfs_ino_t parent, umode_t mode,
+ xfs_ino_t *new_ino);
-/*
- * Free disk inode. Carefully avoids touching the incore inode, all
- * manipulations incore are the caller's responsibility.
- * The on-disk inode is not changed by this operation, only the
- * btree (free inode mask) is changed.
- */
-int /* error */
-xfs_difree(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_ino_t inode, /* inode to be freed */
- struct xfs_icluster *ifree); /* cluster info if deleted */
+int xfs_difree(struct xfs_trans *tp, struct xfs_perag *pag,
+ xfs_ino_t ino, struct xfs_icluster *ifree);
/*
* Return the location of the inode in imap, for mapping it into a buffer.
@@ -150,4 +122,7 @@ int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
void xfs_ialloc_setup_geometry(struct xfs_mount *mp);
xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit);
+int xfs_ialloc_check_shrink(struct xfs_trans *tp, xfs_agnumber_t agno,
+ struct xfs_buf *agibp, xfs_agblock_t new_length);
+
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 4c5831646bd9..823a038939f8 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -20,6 +20,7 @@
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_rmap.h"
+#include "xfs_ag.h"
STATIC int
xfs_inobt_get_minrecs(
@@ -34,8 +35,7 @@ xfs_inobt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_ag.agbp, cur->bc_ag.agno,
- cur->bc_btnum);
+ cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum);
}
STATIC void
@@ -102,7 +102,7 @@ __xfs_inobt_alloc_block(
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
args.oinfo = XFS_RMAP_OINFO_INOBT;
- args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_ag.agno, sbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_ag.pag->pag_agno, sbno);
args.minlen = 1;
args.maxlen = 1;
args.prod = 1;
@@ -235,7 +235,7 @@ xfs_inobt_init_ptr_from_cur(
{
struct xfs_agi *agi = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_ag.agno == be32_to_cpu(agi->agi_seqno));
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno));
ptr->s = agi->agi_root;
}
@@ -247,7 +247,7 @@ xfs_finobt_init_ptr_from_cur(
{
struct xfs_agi *agi = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_ag.agno == be32_to_cpu(agi->agi_seqno));
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno));
ptr->s = agi->agi_free_root;
}
@@ -427,7 +427,7 @@ static struct xfs_btree_cur *
xfs_inobt_init_common(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
+ struct xfs_perag *pag,
xfs_btnum_t btnum) /* ialloc or free ino btree */
{
struct xfs_btree_cur *cur;
@@ -449,7 +449,9 @@ xfs_inobt_init_common(
if (xfs_sb_version_hascrc(&mp->m_sb))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
- cur->bc_ag.agno = agno;
+ /* take a reference for the cursor */
+ atomic_inc(&pag->pag_ref);
+ cur->bc_ag.pag = pag;
return cur;
}
@@ -459,13 +461,13 @@ xfs_inobt_init_cursor(
struct xfs_mount *mp,
struct xfs_trans *tp,
struct xfs_buf *agbp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_btnum_t btnum)
{
struct xfs_btree_cur *cur;
struct xfs_agi *agi = agbp->b_addr;
- cur = xfs_inobt_init_common(mp, tp, agno, btnum);
+ cur = xfs_inobt_init_common(mp, tp, pag, btnum);
if (btnum == XFS_BTNUM_INO)
cur->bc_nlevels = be32_to_cpu(agi->agi_level);
else
@@ -479,12 +481,12 @@ struct xfs_btree_cur *
xfs_inobt_stage_cursor(
struct xfs_mount *mp,
struct xbtree_afakeroot *afake,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_btnum_t btnum)
{
struct xfs_btree_cur *cur;
- cur = xfs_inobt_init_common(mp, NULL, agno, btnum);
+ cur = xfs_inobt_init_common(mp, NULL, pag, btnum);
xfs_btree_stage_afakeroot(cur, afake);
return cur;
}
@@ -656,7 +658,7 @@ int
xfs_inobt_cur(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_btnum_t which,
struct xfs_btree_cur **curpp,
struct xfs_buf **agi_bpp)
@@ -667,11 +669,11 @@ xfs_inobt_cur(
ASSERT(*agi_bpp == NULL);
ASSERT(*curpp == NULL);
- error = xfs_ialloc_read_agi(mp, tp, agno, agi_bpp);
+ error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, agi_bpp);
if (error)
return error;
- cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, agno, which);
+ cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, pag, which);
*curpp = cur;
return 0;
}
@@ -680,7 +682,7 @@ static int
xfs_inobt_count_blocks(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_btnum_t btnum,
xfs_extlen_t *tree_blocks)
{
@@ -688,7 +690,7 @@ xfs_inobt_count_blocks(
struct xfs_btree_cur *cur = NULL;
int error;
- error = xfs_inobt_cur(mp, tp, agno, btnum, &cur, &agbp);
+ error = xfs_inobt_cur(mp, tp, pag, btnum, &cur, &agbp);
if (error)
return error;
@@ -704,14 +706,14 @@ static int
xfs_finobt_read_blocks(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_extlen_t *tree_blocks)
{
struct xfs_buf *agbp;
struct xfs_agi *agi;
int error;
- error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp);
if (error)
return error;
@@ -728,7 +730,7 @@ int
xfs_finobt_calc_reserves(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_extlen_t *ask,
xfs_extlen_t *used)
{
@@ -739,14 +741,14 @@ xfs_finobt_calc_reserves(
return 0;
if (xfs_sb_version_hasinobtcounts(&mp->m_sb))
- error = xfs_finobt_read_blocks(mp, tp, agno, &tree_len);
+ error = xfs_finobt_read_blocks(mp, tp, pag, &tree_len);
else
- error = xfs_inobt_count_blocks(mp, tp, agno, XFS_BTNUM_FINO,
+ error = xfs_inobt_count_blocks(mp, tp, pag, XFS_BTNUM_FINO,
&tree_len);
if (error)
return error;
- *ask += xfs_inobt_max_size(mp, agno);
+ *ask += xfs_inobt_max_size(mp, pag->pag_agno);
*used += tree_len;
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index 35bbd978c272..e530c82b2217 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -13,6 +13,7 @@
struct xfs_buf;
struct xfs_btree_cur;
struct xfs_mount;
+struct xfs_perag;
/*
* Btree block header size depends on a superblock flag.
@@ -45,11 +46,11 @@ struct xfs_mount;
(maxrecs) * sizeof(xfs_inobt_key_t) + \
((index) - 1) * sizeof(xfs_inobt_ptr_t)))
-extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
- struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t,
- xfs_btnum_t);
+extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_buf *agbp,
+ struct xfs_perag *pag, xfs_btnum_t btnum);
struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_mount *mp,
- struct xbtree_afakeroot *afake, xfs_agnumber_t agno,
+ struct xbtree_afakeroot *afake, struct xfs_perag *pag,
xfs_btnum_t btnum);
extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
@@ -64,11 +65,11 @@ int xfs_inobt_rec_check_count(struct xfs_mount *,
#endif /* DEBUG */
int xfs_finobt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
+ struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used);
extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, xfs_btnum_t btnum,
+ struct xfs_perag *pag, xfs_btnum_t btnum,
struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp);
void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur,
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index f3254a4f4cb4..84ea2e0af9f0 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -592,27 +592,31 @@ xfs_inode_validate_extsize(
/*
* This comment describes a historic gap in this verifier function.
*
- * On older kernels, the extent size hint verifier doesn't check that
- * the extent size hint is an integer multiple of the realtime extent
- * size on a directory with both RTINHERIT and EXTSZINHERIT flags set.
- * The verifier has always enforced the alignment rule for regular
- * files with the REALTIME flag set.
+ * For a directory with both RTINHERIT and EXTSZINHERIT flags set, this
+ * function has never checked that the extent size hint is an integer
+ * multiple of the realtime extent size. Since we allow users to set
+ * this combination on non-rt filesystems /and/ to change the rt
+ * extent size when adding a rt device to a filesystem, the net effect
+ * is that users can configure a filesystem anticipating one rt
+ * geometry and change their minds later. Directories do not use the
+ * extent size hint, so this is harmless for them.
*
* If a directory with a misaligned extent size hint is allowed to
* propagate that hint into a new regular realtime file, the result
* is that the inode cluster buffer verifier will trigger a corruption
- * shutdown the next time it is run.
+ * shutdown the next time it is run, because the verifier has always
+ * enforced the alignment rule for regular files.
*
- * Unfortunately, there could be filesystems with these misconfigured
- * directories in the wild, so we cannot add a check to this verifier
- * at this time because that will result a new source of directory
- * corruption errors when reading an existing filesystem. Instead, we
- * permit the misconfiguration to pass through the verifiers so that
- * callers of this function can correct and mitigate externally.
+ * Because we allow administrators to set a new rt extent size when
+ * adding a rt section, we cannot add a check to this verifier because
+ * that will result a new source of directory corruption errors when
+ * reading an existing filesystem. Instead, we rely on callers to
+ * decide when alignment checks are appropriate, and fix things up as
+ * needed.
*/
if (rt_flag)
- blocksize_bytes = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
+ blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
else
blocksize_bytes = mp->m_sb.sb_blocksize;
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 3e15ea29fb8d..2c5bcbc19264 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -34,9 +34,6 @@ typedef uint32_t xlog_tid_t;
#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */
#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */
#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */
-#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
- (log)->l_mp->m_sb.sb_logsunit)
-#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
#define XLOG_HEADER_SIZE 512
@@ -414,7 +411,16 @@ struct xfs_log_dinode {
/* start of the extended dinode, writable fields */
uint32_t di_crc; /* CRC of the inode */
uint64_t di_changecount; /* number of attribute changes */
- xfs_lsn_t di_lsn; /* flush sequence */
+
+ /*
+ * The LSN we write to this field during formatting is not a reflection
+ * of the current on-disk LSN. It should never be used for recovery
+ * sequencing, nor should it be recovered into the on-disk inode at all.
+ * See xlog_recover_inode_commit_pass2() and xfs_log_dinode_to_disk()
+ * for details.
+ */
+ xfs_lsn_t di_lsn;
+
uint64_t di_flags2; /* more random flags */
uint32_t di_cowextsize; /* basic cow extent size for file */
uint8_t di_pad2[12]; /* more padding for future expansion */
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 2037b9f23069..860a0c9801ba 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -22,6 +22,7 @@
#include "xfs_bit.h"
#include "xfs_refcount.h"
#include "xfs_rmap.h"
+#include "xfs_ag.h"
/* Allowable refcount adjustment amounts. */
enum xfs_refc_adjust_op {
@@ -46,7 +47,7 @@ xfs_refcount_lookup_le(
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno,
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno,
XFS_LOOKUP_LE);
cur->bc_rec.rc.rc_startblock = bno;
cur->bc_rec.rc.rc_blockcount = 0;
@@ -63,7 +64,7 @@ xfs_refcount_lookup_ge(
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno,
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno,
XFS_LOOKUP_GE);
cur->bc_rec.rc.rc_startblock = bno;
cur->bc_rec.rc.rc_blockcount = 0;
@@ -80,7 +81,7 @@ xfs_refcount_lookup_eq(
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno,
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno,
XFS_LOOKUP_LE);
cur->bc_rec.rc.rc_startblock = bno;
cur->bc_rec.rc.rc_blockcount = 0;
@@ -108,7 +109,7 @@ xfs_refcount_get_rec(
int *stat)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_ag.agno;
+ xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno;
union xfs_btree_rec *rec;
int error;
xfs_agblock_t realstart;
@@ -119,7 +120,7 @@ xfs_refcount_get_rec(
xfs_refcount_btrec_to_irec(rec, irec);
- agno = cur->bc_ag.agno;
+ agno = cur->bc_ag.pag->pag_agno;
if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
goto out_bad_rec;
@@ -144,7 +145,7 @@ xfs_refcount_get_rec(
if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT)
goto out_bad_rec;
- trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.agno, irec);
+ trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
return 0;
out_bad_rec:
@@ -169,14 +170,14 @@ xfs_refcount_update(
union xfs_btree_rec rec;
int error;
- trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.agno, irec);
+ trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock);
rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount);
rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount);
error = xfs_btree_update(cur, &rec);
if (error)
trace_xfs_refcount_update_error(cur->bc_mp,
- cur->bc_ag.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -193,7 +194,7 @@ xfs_refcount_insert(
{
int error;
- trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.agno, irec);
+ trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
cur->bc_rec.rc.rc_startblock = irec->rc_startblock;
cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount;
cur->bc_rec.rc.rc_refcount = irec->rc_refcount;
@@ -208,7 +209,7 @@ xfs_refcount_insert(
out_error:
if (error)
trace_xfs_refcount_insert_error(cur->bc_mp,
- cur->bc_ag.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -234,7 +235,7 @@ xfs_refcount_delete(
error = -EFSCORRUPTED;
goto out_error;
}
- trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.agno, &irec);
+ trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.pag->pag_agno, &irec);
error = xfs_btree_delete(cur, i);
if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
error = -EFSCORRUPTED;
@@ -246,7 +247,7 @@ xfs_refcount_delete(
out_error:
if (error)
trace_xfs_refcount_delete_error(cur->bc_mp,
- cur->bc_ag.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -366,7 +367,7 @@ xfs_refcount_split_extent(
return 0;
*shape_changed = true;
- trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.agno,
+ trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
&rcext, agbno);
/* Establish the right extent. */
@@ -391,7 +392,7 @@ xfs_refcount_split_extent(
out_error:
trace_xfs_refcount_split_extent_error(cur->bc_mp,
- cur->bc_ag.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -411,7 +412,7 @@ xfs_refcount_merge_center_extents(
int found_rec;
trace_xfs_refcount_merge_center_extents(cur->bc_mp,
- cur->bc_ag.agno, left, center, right);
+ cur->bc_ag.pag->pag_agno, left, center, right);
/*
* Make sure the center and right extents are not in the btree.
@@ -468,7 +469,7 @@ xfs_refcount_merge_center_extents(
out_error:
trace_xfs_refcount_merge_center_extents_error(cur->bc_mp,
- cur->bc_ag.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -487,7 +488,7 @@ xfs_refcount_merge_left_extent(
int found_rec;
trace_xfs_refcount_merge_left_extent(cur->bc_mp,
- cur->bc_ag.agno, left, cleft);
+ cur->bc_ag.pag->pag_agno, left, cleft);
/* If the extent at agbno (cleft) wasn't synthesized, remove it. */
if (cleft->rc_refcount > 1) {
@@ -530,7 +531,7 @@ xfs_refcount_merge_left_extent(
out_error:
trace_xfs_refcount_merge_left_extent_error(cur->bc_mp,
- cur->bc_ag.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -548,7 +549,7 @@ xfs_refcount_merge_right_extent(
int found_rec;
trace_xfs_refcount_merge_right_extent(cur->bc_mp,
- cur->bc_ag.agno, cright, right);
+ cur->bc_ag.pag->pag_agno, cright, right);
/*
* If the extent ending at agbno+aglen (cright) wasn't synthesized,
@@ -594,7 +595,7 @@ xfs_refcount_merge_right_extent(
out_error:
trace_xfs_refcount_merge_right_extent_error(cur->bc_mp,
- cur->bc_ag.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -679,13 +680,13 @@ xfs_refcount_find_left_extents(
cleft->rc_blockcount = aglen;
cleft->rc_refcount = 1;
}
- trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.agno,
+ trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
left, cleft, agbno);
return error;
out_error:
trace_xfs_refcount_find_left_extent_error(cur->bc_mp,
- cur->bc_ag.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -768,13 +769,13 @@ xfs_refcount_find_right_extents(
cright->rc_blockcount = aglen;
cright->rc_refcount = 1;
}
- trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.agno,
+ trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
cright, right, agbno + aglen);
return error;
out_error:
trace_xfs_refcount_find_right_extent_error(cur->bc_mp,
- cur->bc_ag.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -952,7 +953,7 @@ xfs_refcount_adjust_extents(
ext.rc_startblock - *agbno);
tmp.rc_refcount = 1 + adj;
trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_ag.agno, &tmp);
+ cur->bc_ag.pag->pag_agno, &tmp);
/*
* Either cover the hole (increment) or
@@ -971,7 +972,7 @@ xfs_refcount_adjust_extents(
cur->bc_ag.refc.nr_ops++;
} else {
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
- cur->bc_ag.agno,
+ cur->bc_ag.pag->pag_agno,
tmp.rc_startblock);
xfs_bmap_add_free(cur->bc_tp, fsbno,
tmp.rc_blockcount, oinfo);
@@ -998,7 +999,7 @@ xfs_refcount_adjust_extents(
goto skip;
ext.rc_refcount += adj;
trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_ag.agno, &ext);
+ cur->bc_ag.pag->pag_agno, &ext);
if (ext.rc_refcount > 1) {
error = xfs_refcount_update(cur, &ext);
if (error)
@@ -1016,7 +1017,7 @@ xfs_refcount_adjust_extents(
goto advloop;
} else {
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
- cur->bc_ag.agno,
+ cur->bc_ag.pag->pag_agno,
ext.rc_startblock);
xfs_bmap_add_free(cur->bc_tp, fsbno, ext.rc_blockcount,
oinfo);
@@ -1035,7 +1036,7 @@ advloop:
return error;
out_error:
trace_xfs_refcount_modify_extent_error(cur->bc_mp,
- cur->bc_ag.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -1057,10 +1058,10 @@ xfs_refcount_adjust(
*new_agbno = agbno;
*new_aglen = aglen;
if (adj == XFS_REFCOUNT_ADJUST_INCREASE)
- trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.agno,
+ trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.pag->pag_agno,
agbno, aglen);
else
- trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.agno,
+ trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.pag->pag_agno,
agbno, aglen);
/*
@@ -1099,7 +1100,7 @@ xfs_refcount_adjust(
return 0;
out_error:
- trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.agno,
+ trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.pag->pag_agno,
error, _RET_IP_);
return error;
}
@@ -1142,30 +1143,30 @@ xfs_refcount_finish_one(
struct xfs_btree_cur *rcur;
struct xfs_buf *agbp = NULL;
int error = 0;
- xfs_agnumber_t agno;
xfs_agblock_t bno;
xfs_agblock_t new_agbno;
unsigned long nr_ops = 0;
int shape_changes = 0;
+ struct xfs_perag *pag;
- agno = XFS_FSB_TO_AGNO(mp, startblock);
- ASSERT(agno != NULLAGNUMBER);
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock));
bno = XFS_FSB_TO_AGBNO(mp, startblock);
trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock),
type, XFS_FSB_TO_AGBNO(mp, startblock),
blockcount);
- if (XFS_TEST_ERROR(false, mp,
- XFS_ERRTAG_REFCOUNT_FINISH_ONE))
- return -EIO;
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) {
+ error = -EIO;
+ goto out_drop;
+ }
/*
* If we haven't gotten a cursor or the cursor AG doesn't match
* the startblock, get one now.
*/
rcur = *pcur;
- if (rcur != NULL && rcur->bc_ag.agno != agno) {
+ if (rcur != NULL && rcur->bc_ag.pag != pag) {
nr_ops = rcur->bc_ag.refc.nr_ops;
shape_changes = rcur->bc_ag.refc.shape_changes;
xfs_refcount_finish_one_cleanup(tp, rcur, 0);
@@ -1173,12 +1174,12 @@ xfs_refcount_finish_one(
*pcur = NULL;
}
if (rcur == NULL) {
- error = xfs_alloc_read_agf(tp->t_mountp, tp, agno,
+ error = xfs_alloc_read_agf(tp->t_mountp, tp, pag->pag_agno,
XFS_ALLOC_FLAG_FREEING, &agbp);
if (error)
- return error;
+ goto out_drop;
- rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);
+ rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
rcur->bc_ag.refc.nr_ops = nr_ops;
rcur->bc_ag.refc.shape_changes = shape_changes;
}
@@ -1188,12 +1189,12 @@ xfs_refcount_finish_one(
case XFS_REFCOUNT_INCREASE:
error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
new_len, XFS_REFCOUNT_ADJUST_INCREASE, NULL);
- *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno);
+ *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
break;
case XFS_REFCOUNT_DECREASE:
error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
new_len, XFS_REFCOUNT_ADJUST_DECREASE, NULL);
- *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno);
+ *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
break;
case XFS_REFCOUNT_ALLOC_COW:
*new_fsb = startblock + blockcount;
@@ -1210,8 +1211,10 @@ xfs_refcount_finish_one(
error = -EFSCORRUPTED;
}
if (!error && *new_len > 0)
- trace_xfs_refcount_finish_one_leftover(mp, agno, type,
+ trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, type,
bno, blockcount, new_agbno, *new_len);
+out_drop:
+ xfs_perag_put(pag);
return error;
}
@@ -1294,7 +1297,7 @@ xfs_refcount_find_shared(
int have;
int error;
- trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.agno,
+ trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.pag->pag_agno,
agbno, aglen);
/* By default, skip the whole range */
@@ -1374,12 +1377,12 @@ xfs_refcount_find_shared(
done:
trace_xfs_refcount_find_shared_result(cur->bc_mp,
- cur->bc_ag.agno, *fbno, *flen);
+ cur->bc_ag.pag->pag_agno, *fbno, *flen);
out_error:
if (error)
trace_xfs_refcount_find_shared_error(cur->bc_mp,
- cur->bc_ag.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -1476,7 +1479,7 @@ xfs_refcount_adjust_cow_extents(
tmp.rc_blockcount = aglen;
tmp.rc_refcount = 1;
trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_ag.agno, &tmp);
+ cur->bc_ag.pag->pag_agno, &tmp);
error = xfs_refcount_insert(cur, &tmp,
&found_tmp);
@@ -1504,7 +1507,7 @@ xfs_refcount_adjust_cow_extents(
ext.rc_refcount = 0;
trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_ag.agno, &ext);
+ cur->bc_ag.pag->pag_agno, &ext);
error = xfs_refcount_delete(cur, &found_rec);
if (error)
goto out_error;
@@ -1520,7 +1523,7 @@ xfs_refcount_adjust_cow_extents(
return error;
out_error:
trace_xfs_refcount_modify_extent_error(cur->bc_mp,
- cur->bc_ag.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -1566,7 +1569,7 @@ xfs_refcount_adjust_cow(
return 0;
out_error:
- trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.agno,
+ trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.pag->pag_agno,
error, _RET_IP_);
return error;
}
@@ -1580,7 +1583,7 @@ __xfs_refcount_cow_alloc(
xfs_agblock_t agbno,
xfs_extlen_t aglen)
{
- trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.agno,
+ trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.pag->pag_agno,
agbno, aglen);
/* Add refcount btree reservation */
@@ -1597,7 +1600,7 @@ __xfs_refcount_cow_free(
xfs_agblock_t agbno,
xfs_extlen_t aglen)
{
- trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.agno,
+ trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.pag->pag_agno,
agbno, aglen);
/* Remove refcount btree reservation */
@@ -1672,7 +1675,7 @@ xfs_refcount_recover_extent(
int
xfs_refcount_recover_cow_leftovers(
struct xfs_mount *mp,
- xfs_agnumber_t agno)
+ struct xfs_perag *pag)
{
struct xfs_trans *tp;
struct xfs_btree_cur *cur;
@@ -1704,10 +1707,10 @@ xfs_refcount_recover_cow_leftovers(
if (error)
return error;
- error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+ error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp);
if (error)
goto out_trans;
- cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);
+ cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
/* Find all the leftover CoW staging extents. */
memset(&low, 0, sizeof(low));
@@ -1729,11 +1732,12 @@ xfs_refcount_recover_cow_leftovers(
if (error)
goto out_free;
- trace_xfs_refcount_recover_extent(mp, agno, &rr->rr_rrec);
+ trace_xfs_refcount_recover_extent(mp, pag->pag_agno,
+ &rr->rr_rrec);
/* Free the orphan record */
agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START;
- fsb = XFS_AGB_TO_FSB(mp, agno, agbno);
+ fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno);
xfs_refcount_free_cow_extent(tp, fsb,
rr->rr_rrec.rc_blockcount);
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 209795539c8d..9f6e9aae4da0 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -6,6 +6,13 @@
#ifndef __XFS_REFCOUNT_H__
#define __XFS_REFCOUNT_H__
+struct xfs_trans;
+struct xfs_mount;
+struct xfs_perag;
+struct xfs_btree_cur;
+struct xfs_bmbt_irec;
+struct xfs_refcount_irec;
+
extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur,
xfs_agblock_t bno, int *stat);
extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur,
@@ -50,7 +57,7 @@ void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
xfs_extlen_t len);
extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
- xfs_agnumber_t agno);
+ struct xfs_perag *pag);
/*
* While we're adjusting the refcounts records of an extent, we have
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index a6ac60ae9421..92d336c17e83 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -9,7 +9,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
@@ -20,13 +19,14 @@
#include "xfs_trans.h"
#include "xfs_bit.h"
#include "xfs_rmap.h"
+#include "xfs_ag.h"
static struct xfs_btree_cur *
xfs_refcountbt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_ag.agbp, cur->bc_ag.agno);
+ cur->bc_ag.agbp, cur->bc_ag.pag);
}
STATIC void
@@ -65,7 +65,7 @@ xfs_refcountbt_alloc_block(
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno,
+ args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno,
xfs_refc_block(args.mp));
args.oinfo = XFS_RMAP_OINFO_REFC;
args.minlen = args.maxlen = args.prod = 1;
@@ -74,13 +74,13 @@ xfs_refcountbt_alloc_block(
error = xfs_alloc_vextent(&args);
if (error)
goto out_error;
- trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.agno,
+ trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.pag->pag_agno,
args.agbno, 1);
if (args.fsbno == NULLFSBLOCK) {
*stat = 0;
return 0;
}
- ASSERT(args.agno == cur->bc_ag.agno);
+ ASSERT(args.agno == cur->bc_ag.pag->pag_agno);
ASSERT(args.len == 1);
new->s = cpu_to_be32(args.agbno);
@@ -105,7 +105,7 @@ xfs_refcountbt_free_block(
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
int error;
- trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.agno,
+ trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno,
XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
be32_add_cpu(&agf->agf_refcount_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
@@ -170,7 +170,7 @@ xfs_refcountbt_init_ptr_from_cur(
{
struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
ptr->s = agf->agf_refcount_root;
}
@@ -316,12 +316,11 @@ static struct xfs_btree_cur *
xfs_refcountbt_init_common(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_agnumber_t agno)
+ struct xfs_perag *pag)
{
struct xfs_btree_cur *cur;
- ASSERT(agno != NULLAGNUMBER);
- ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(pag->pag_agno < mp->m_sb.sb_agcount);
cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
cur->bc_tp = tp;
@@ -330,9 +329,12 @@ xfs_refcountbt_init_common(
cur->bc_blocklog = mp->m_sb.sb_blocklog;
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
- cur->bc_ag.agno = agno;
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+ /* take a reference for the cursor */
+ atomic_inc(&pag->pag_ref);
+ cur->bc_ag.pag = pag;
+
cur->bc_ag.refc.nr_ops = 0;
cur->bc_ag.refc.shape_changes = 0;
cur->bc_ops = &xfs_refcountbt_ops;
@@ -345,12 +347,12 @@ xfs_refcountbt_init_cursor(
struct xfs_mount *mp,
struct xfs_trans *tp,
struct xfs_buf *agbp,
- xfs_agnumber_t agno)
+ struct xfs_perag *pag)
{
struct xfs_agf *agf = agbp->b_addr;
struct xfs_btree_cur *cur;
- cur = xfs_refcountbt_init_common(mp, tp, agno);
+ cur = xfs_refcountbt_init_common(mp, tp, pag);
cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
cur->bc_ag.agbp = agbp;
return cur;
@@ -361,11 +363,11 @@ struct xfs_btree_cur *
xfs_refcountbt_stage_cursor(
struct xfs_mount *mp,
struct xbtree_afakeroot *afake,
- xfs_agnumber_t agno)
+ struct xfs_perag *pag)
{
struct xfs_btree_cur *cur;
- cur = xfs_refcountbt_init_common(mp, NULL, agno);
+ cur = xfs_refcountbt_init_common(mp, NULL, pag);
xfs_btree_stage_afakeroot(cur, afake);
return cur;
}
@@ -450,7 +452,7 @@ int
xfs_refcountbt_calc_reserves(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_extlen_t *ask,
xfs_extlen_t *used)
{
@@ -463,8 +465,7 @@ xfs_refcountbt_calc_reserves(
if (!xfs_sb_version_hasreflink(&mp->m_sb))
return 0;
-
- error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+ error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp);
if (error)
return error;
@@ -479,7 +480,7 @@ xfs_refcountbt_calc_reserves(
* expansion. We therefore can pretend the space isn't there.
*/
if (mp->m_sb.sb_logstart &&
- XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
+ XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno)
agblocks -= mp->m_sb.sb_logblocks;
*ask += xfs_refcountbt_max_size(mp, agblocks);
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
index 69dc515db671..bd9ed9e1e41f 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.h
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -13,6 +13,7 @@
struct xfs_buf;
struct xfs_btree_cur;
struct xfs_mount;
+struct xfs_perag;
struct xbtree_afakeroot;
/*
@@ -46,9 +47,9 @@ struct xbtree_afakeroot;
extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *agbp,
- xfs_agnumber_t agno);
+ struct xfs_perag *pag);
struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp,
- struct xbtree_afakeroot *afake, xfs_agnumber_t agno);
+ struct xbtree_afakeroot *afake, struct xfs_perag *pag);
extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf);
extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
@@ -58,7 +59,7 @@ extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp,
xfs_agblock_t agblocks);
extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp,
- struct xfs_trans *tp, xfs_agnumber_t agno, xfs_extlen_t *ask,
+ struct xfs_trans *tp, struct xfs_perag *pag, xfs_extlen_t *ask,
xfs_extlen_t *used);
void xfs_refcountbt_commit_staged_btree(struct xfs_btree_cur *cur,
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 10e0cf9949a2..d1dfad0204e3 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -11,6 +11,7 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
+#include "xfs_sb.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
@@ -21,6 +22,7 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_inode.h"
+#include "xfs_ag.h"
/*
* Lookup the first record less than or equal to [bno, len, owner, offset]
@@ -79,7 +81,7 @@ xfs_rmap_update(
union xfs_btree_rec rec;
int error;
- trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.agno,
+ trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.pag->pag_agno,
irec->rm_startblock, irec->rm_blockcount,
irec->rm_owner, irec->rm_offset, irec->rm_flags);
@@ -91,7 +93,7 @@ xfs_rmap_update(
error = xfs_btree_update(cur, &rec);
if (error)
trace_xfs_rmap_update_error(cur->bc_mp,
- cur->bc_ag.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -107,7 +109,7 @@ xfs_rmap_insert(
int i;
int error;
- trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.agno, agbno,
+ trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno,
len, owner, offset, flags);
error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
@@ -133,7 +135,7 @@ xfs_rmap_insert(
done:
if (error)
trace_xfs_rmap_insert_error(rcur->bc_mp,
- rcur->bc_ag.agno, error, _RET_IP_);
+ rcur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -149,7 +151,7 @@ xfs_rmap_delete(
int i;
int error;
- trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.agno, agbno,
+ trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno,
len, owner, offset, flags);
error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
@@ -170,7 +172,7 @@ xfs_rmap_delete(
done:
if (error)
trace_xfs_rmap_delete_error(rcur->bc_mp,
- rcur->bc_ag.agno, error, _RET_IP_);
+ rcur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -197,7 +199,7 @@ xfs_rmap_get_rec(
int *stat)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_ag.agno;
+ xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno;
union xfs_btree_rec *rec;
int error;
@@ -260,7 +262,7 @@ xfs_rmap_find_left_neighbor_helper(
struct xfs_find_left_neighbor_info *info = priv;
trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp,
- cur->bc_ag.agno, rec->rm_startblock,
+ cur->bc_ag.pag->pag_agno, rec->rm_startblock,
rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
rec->rm_flags);
@@ -312,7 +314,7 @@ xfs_rmap_find_left_neighbor(
info.stat = stat;
trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp,
- cur->bc_ag.agno, bno, 0, owner, offset, flags);
+ cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags);
error = xfs_rmap_query_range(cur, &info.high, &info.high,
xfs_rmap_find_left_neighbor_helper, &info);
@@ -320,7 +322,7 @@ xfs_rmap_find_left_neighbor(
error = 0;
if (*stat)
trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
- cur->bc_ag.agno, irec->rm_startblock,
+ cur->bc_ag.pag->pag_agno, irec->rm_startblock,
irec->rm_blockcount, irec->rm_owner,
irec->rm_offset, irec->rm_flags);
return error;
@@ -336,7 +338,7 @@ xfs_rmap_lookup_le_range_helper(
struct xfs_find_left_neighbor_info *info = priv;
trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp,
- cur->bc_ag.agno, rec->rm_startblock,
+ cur->bc_ag.pag->pag_agno, rec->rm_startblock,
rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
rec->rm_flags);
@@ -385,14 +387,14 @@ xfs_rmap_lookup_le_range(
info.stat = stat;
trace_xfs_rmap_lookup_le_range(cur->bc_mp,
- cur->bc_ag.agno, bno, 0, owner, offset, flags);
+ cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags);
error = xfs_rmap_query_range(cur, &info.high, &info.high,
xfs_rmap_lookup_le_range_helper, &info);
if (error == -ECANCELED)
error = 0;
if (*stat)
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_ag.agno, irec->rm_startblock,
+ cur->bc_ag.pag->pag_agno, irec->rm_startblock,
irec->rm_blockcount, irec->rm_owner,
irec->rm_offset, irec->rm_flags);
return error;
@@ -498,7 +500,7 @@ xfs_rmap_unmap(
(flags & XFS_RMAP_BMBT_BLOCK);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_unmap(mp, cur->bc_ag.agno, bno, len,
+ trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
/*
@@ -522,7 +524,7 @@ xfs_rmap_unmap(
goto out_error;
}
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_ag.agno, ltrec.rm_startblock,
+ cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
ltrec.rm_blockcount, ltrec.rm_owner,
ltrec.rm_offset, ltrec.rm_flags);
ltoff = ltrec.rm_offset;
@@ -588,7 +590,7 @@ xfs_rmap_unmap(
if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
/* exact match, simply remove the record from rmap tree */
- trace_xfs_rmap_delete(mp, cur->bc_ag.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
ltrec.rm_startblock, ltrec.rm_blockcount,
ltrec.rm_owner, ltrec.rm_offset,
ltrec.rm_flags);
@@ -666,7 +668,7 @@ xfs_rmap_unmap(
else
cur->bc_rec.r.rm_offset = offset + len;
cur->bc_rec.r.rm_flags = flags;
- trace_xfs_rmap_insert(mp, cur->bc_ag.agno,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno,
cur->bc_rec.r.rm_startblock,
cur->bc_rec.r.rm_blockcount,
cur->bc_rec.r.rm_owner,
@@ -678,11 +680,11 @@ xfs_rmap_unmap(
}
out_done:
- trace_xfs_rmap_unmap_done(mp, cur->bc_ag.agno, bno, len,
+ trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
out_error:
if (error)
- trace_xfs_rmap_unmap_error(mp, cur->bc_ag.agno,
+ trace_xfs_rmap_unmap_error(mp, cur->bc_ag.pag->pag_agno,
error, _RET_IP_);
return error;
}
@@ -694,7 +696,7 @@ int
xfs_rmap_free(
struct xfs_trans *tp,
struct xfs_buf *agbp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_agblock_t bno,
xfs_extlen_t len,
const struct xfs_owner_info *oinfo)
@@ -706,7 +708,7 @@ xfs_rmap_free(
if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
return 0;
- cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno);
+ cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
error = xfs_rmap_unmap(cur, bno, len, false, oinfo);
@@ -773,7 +775,7 @@ xfs_rmap_map(
(flags & XFS_RMAP_BMBT_BLOCK);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_map(mp, cur->bc_ag.agno, bno, len,
+ trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
ASSERT(!xfs_rmap_should_skip_owner_update(oinfo));
@@ -795,7 +797,7 @@ xfs_rmap_map(
goto out_error;
}
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_ag.agno, ltrec.rm_startblock,
+ cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
ltrec.rm_blockcount, ltrec.rm_owner,
ltrec.rm_offset, ltrec.rm_flags);
@@ -831,7 +833,7 @@ xfs_rmap_map(
goto out_error;
}
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_ag.agno, gtrec.rm_startblock,
+ cur->bc_ag.pag->pag_agno, gtrec.rm_startblock,
gtrec.rm_blockcount, gtrec.rm_owner,
gtrec.rm_offset, gtrec.rm_flags);
if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
@@ -870,7 +872,7 @@ xfs_rmap_map(
* result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr|
*/
ltrec.rm_blockcount += gtrec.rm_blockcount;
- trace_xfs_rmap_delete(mp, cur->bc_ag.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
gtrec.rm_startblock,
gtrec.rm_blockcount,
gtrec.rm_owner,
@@ -921,7 +923,7 @@ xfs_rmap_map(
cur->bc_rec.r.rm_owner = owner;
cur->bc_rec.r.rm_offset = offset;
cur->bc_rec.r.rm_flags = flags;
- trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len,
owner, offset, flags);
error = xfs_btree_insert(cur, &i);
if (error)
@@ -932,11 +934,11 @@ xfs_rmap_map(
}
}
- trace_xfs_rmap_map_done(mp, cur->bc_ag.agno, bno, len,
+ trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
out_error:
if (error)
- trace_xfs_rmap_map_error(mp, cur->bc_ag.agno,
+ trace_xfs_rmap_map_error(mp, cur->bc_ag.pag->pag_agno,
error, _RET_IP_);
return error;
}
@@ -948,7 +950,7 @@ int
xfs_rmap_alloc(
struct xfs_trans *tp,
struct xfs_buf *agbp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_agblock_t bno,
xfs_extlen_t len,
const struct xfs_owner_info *oinfo)
@@ -960,7 +962,7 @@ xfs_rmap_alloc(
if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
return 0;
- cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno);
+ cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
error = xfs_rmap_map(cur, bno, len, false, oinfo);
xfs_btree_del_cursor(cur, error);
@@ -1010,7 +1012,7 @@ xfs_rmap_convert(
(flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
new_endoff = offset + len;
- trace_xfs_rmap_convert(mp, cur->bc_ag.agno, bno, len,
+ trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
/*
@@ -1034,7 +1036,7 @@ xfs_rmap_convert(
goto done;
}
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_ag.agno, PREV.rm_startblock,
+ cur->bc_ag.pag->pag_agno, PREV.rm_startblock,
PREV.rm_blockcount, PREV.rm_owner,
PREV.rm_offset, PREV.rm_flags);
@@ -1076,7 +1078,7 @@ xfs_rmap_convert(
goto done;
}
trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
- cur->bc_ag.agno, LEFT.rm_startblock,
+ cur->bc_ag.pag->pag_agno, LEFT.rm_startblock,
LEFT.rm_blockcount, LEFT.rm_owner,
LEFT.rm_offset, LEFT.rm_flags);
if (LEFT.rm_startblock + LEFT.rm_blockcount == bno &&
@@ -1114,7 +1116,7 @@ xfs_rmap_convert(
goto done;
}
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_ag.agno, RIGHT.rm_startblock,
+ cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock,
RIGHT.rm_blockcount, RIGHT.rm_owner,
RIGHT.rm_offset, RIGHT.rm_flags);
if (bno + len == RIGHT.rm_startblock &&
@@ -1132,7 +1134,7 @@ xfs_rmap_convert(
RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
state &= ~RMAP_RIGHT_CONTIG;
- trace_xfs_rmap_convert_state(mp, cur->bc_ag.agno, state,
+ trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state,
_RET_IP_);
/* reset the cursor back to PREV */
@@ -1162,7 +1164,7 @@ xfs_rmap_convert(
error = -EFSCORRUPTED;
goto done;
}
- trace_xfs_rmap_delete(mp, cur->bc_ag.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
RIGHT.rm_startblock, RIGHT.rm_blockcount,
RIGHT.rm_owner, RIGHT.rm_offset,
RIGHT.rm_flags);
@@ -1180,7 +1182,7 @@ xfs_rmap_convert(
error = -EFSCORRUPTED;
goto done;
}
- trace_xfs_rmap_delete(mp, cur->bc_ag.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
PREV.rm_startblock, PREV.rm_blockcount,
PREV.rm_owner, PREV.rm_offset,
PREV.rm_flags);
@@ -1210,7 +1212,7 @@ xfs_rmap_convert(
* Setting all of a previous oldext extent to newext.
* The left neighbor is contiguous, the right is not.
*/
- trace_xfs_rmap_delete(mp, cur->bc_ag.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
PREV.rm_startblock, PREV.rm_blockcount,
PREV.rm_owner, PREV.rm_offset,
PREV.rm_flags);
@@ -1247,7 +1249,7 @@ xfs_rmap_convert(
error = -EFSCORRUPTED;
goto done;
}
- trace_xfs_rmap_delete(mp, cur->bc_ag.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
RIGHT.rm_startblock, RIGHT.rm_blockcount,
RIGHT.rm_owner, RIGHT.rm_offset,
RIGHT.rm_flags);
@@ -1326,7 +1328,7 @@ xfs_rmap_convert(
NEW.rm_blockcount = len;
NEW.rm_flags = newext;
cur->bc_rec.r = NEW;
- trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno,
len, owner, offset, newext);
error = xfs_btree_insert(cur, &i);
if (error)
@@ -1383,7 +1385,7 @@ xfs_rmap_convert(
NEW.rm_blockcount = len;
NEW.rm_flags = newext;
cur->bc_rec.r = NEW;
- trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno,
len, owner, offset, newext);
error = xfs_btree_insert(cur, &i);
if (error)
@@ -1414,7 +1416,7 @@ xfs_rmap_convert(
NEW = PREV;
NEW.rm_blockcount = offset - PREV.rm_offset;
cur->bc_rec.r = NEW;
- trace_xfs_rmap_insert(mp, cur->bc_ag.agno,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno,
NEW.rm_startblock, NEW.rm_blockcount,
NEW.rm_owner, NEW.rm_offset,
NEW.rm_flags);
@@ -1441,7 +1443,7 @@ xfs_rmap_convert(
/* new middle extent - newext */
cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN;
cur->bc_rec.r.rm_flags |= newext;
- trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len,
owner, offset, newext);
error = xfs_btree_insert(cur, &i);
if (error)
@@ -1465,12 +1467,12 @@ xfs_rmap_convert(
ASSERT(0);
}
- trace_xfs_rmap_convert_done(mp, cur->bc_ag.agno, bno, len,
+ trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
done:
if (error)
trace_xfs_rmap_convert_error(cur->bc_mp,
- cur->bc_ag.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -1506,7 +1508,7 @@ xfs_rmap_convert_shared(
(flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
new_endoff = offset + len;
- trace_xfs_rmap_convert(mp, cur->bc_ag.agno, bno, len,
+ trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
/*
@@ -1573,7 +1575,7 @@ xfs_rmap_convert_shared(
goto done;
}
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_ag.agno, RIGHT.rm_startblock,
+ cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock,
RIGHT.rm_blockcount, RIGHT.rm_owner,
RIGHT.rm_offset, RIGHT.rm_flags);
if (xfs_rmap_is_mergeable(&RIGHT, owner, newext))
@@ -1589,7 +1591,7 @@ xfs_rmap_convert_shared(
RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
state &= ~RMAP_RIGHT_CONTIG;
- trace_xfs_rmap_convert_state(mp, cur->bc_ag.agno, state,
+ trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state,
_RET_IP_);
/*
* Switch out based on the FILLING and CONTIG state bits.
@@ -1880,12 +1882,12 @@ xfs_rmap_convert_shared(
ASSERT(0);
}
- trace_xfs_rmap_convert_done(mp, cur->bc_ag.agno, bno, len,
+ trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
done:
if (error)
trace_xfs_rmap_convert_error(cur->bc_mp,
- cur->bc_ag.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -1923,7 +1925,7 @@ xfs_rmap_unmap_shared(
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_unmap(mp, cur->bc_ag.agno, bno, len,
+ trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
/*
@@ -2072,12 +2074,12 @@ xfs_rmap_unmap_shared(
goto out_error;
}
- trace_xfs_rmap_unmap_done(mp, cur->bc_ag.agno, bno, len,
+ trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
out_error:
if (error)
trace_xfs_rmap_unmap_error(cur->bc_mp,
- cur->bc_ag.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -2112,7 +2114,7 @@ xfs_rmap_map_shared(
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_map(mp, cur->bc_ag.agno, bno, len,
+ trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
/* Is there a left record that abuts our range? */
@@ -2138,7 +2140,7 @@ xfs_rmap_map_shared(
goto out_error;
}
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_ag.agno, gtrec.rm_startblock,
+ cur->bc_ag.pag->pag_agno, gtrec.rm_startblock,
gtrec.rm_blockcount, gtrec.rm_owner,
gtrec.rm_offset, gtrec.rm_flags);
@@ -2231,12 +2233,12 @@ xfs_rmap_map_shared(
goto out_error;
}
- trace_xfs_rmap_map_done(mp, cur->bc_ag.agno, bno, len,
+ trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
out_error:
if (error)
trace_xfs_rmap_map_error(cur->bc_mp,
- cur->bc_ag.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -2362,31 +2364,32 @@ xfs_rmap_finish_one(
struct xfs_btree_cur **pcur)
{
struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
struct xfs_btree_cur *rcur;
struct xfs_buf *agbp = NULL;
int error = 0;
- xfs_agnumber_t agno;
struct xfs_owner_info oinfo;
xfs_agblock_t bno;
bool unwritten;
- agno = XFS_FSB_TO_AGNO(mp, startblock);
- ASSERT(agno != NULLAGNUMBER);
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock));
bno = XFS_FSB_TO_AGBNO(mp, startblock);
- trace_xfs_rmap_deferred(mp, agno, type, bno, owner, whichfork,
+ trace_xfs_rmap_deferred(mp, pag->pag_agno, type, bno, owner, whichfork,
startoff, blockcount, state);
- if (XFS_TEST_ERROR(false, mp,
- XFS_ERRTAG_RMAP_FINISH_ONE))
- return -EIO;
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) {
+ error = -EIO;
+ goto out_drop;
+ }
+
/*
* If we haven't gotten a cursor or the cursor AG doesn't match
* the startblock, get one now.
*/
rcur = *pcur;
- if (rcur != NULL && rcur->bc_ag.agno != agno) {
+ if (rcur != NULL && rcur->bc_ag.pag != pag) {
xfs_rmap_finish_one_cleanup(tp, rcur, 0);
rcur = NULL;
*pcur = NULL;
@@ -2397,13 +2400,15 @@ xfs_rmap_finish_one(
* rmapbt, because a shape change could cause us to
* allocate blocks.
*/
- error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
+ error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
if (error)
- return error;
- if (XFS_IS_CORRUPT(tp->t_mountp, !agbp))
- return -EFSCORRUPTED;
+ goto out_drop;
+ if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) {
+ error = -EFSCORRUPTED;
+ goto out_drop;
+ }
- rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno);
+ rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
}
*pcur = rcur;
@@ -2441,6 +2446,8 @@ xfs_rmap_finish_one(
ASSERT(0);
error = -EFSCORRUPTED;
}
+out_drop:
+ xfs_perag_put(pag);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index abe633403fd1..f2423cf7f1e2 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -6,6 +6,8 @@
#ifndef __XFS_RMAP_H__
#define __XFS_RMAP_H__
+struct xfs_perag;
+
static inline void
xfs_rmap_ino_bmbt_owner(
struct xfs_owner_info *oi,
@@ -113,10 +115,10 @@ xfs_owner_info_pack(
}
int xfs_rmap_alloc(struct xfs_trans *tp, struct xfs_buf *agbp,
- xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
+ struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len,
const struct xfs_owner_info *oinfo);
int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp,
- xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
+ struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len,
const struct xfs_owner_info *oinfo);
int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 9f5bcbd834c3..f29bc71b9950 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -9,7 +9,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
@@ -20,6 +19,7 @@
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
+#include "xfs_ag.h"
#include "xfs_ag_resv.h"
/*
@@ -52,7 +52,7 @@ xfs_rmapbt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_ag.agbp, cur->bc_ag.agno);
+ cur->bc_ag.agbp, cur->bc_ag.pag);
}
STATIC void
@@ -64,13 +64,12 @@ xfs_rmapbt_set_root(
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
int btnum = cur->bc_btnum;
- struct xfs_perag *pag = agbp->b_pag;
ASSERT(ptr->s != 0);
agf->agf_roots[btnum] = ptr->s;
be32_add_cpu(&agf->agf_levels[btnum], inc);
- pag->pagf_levels[btnum] += inc;
+ cur->bc_ag.pag->pagf_levels[btnum] += inc;
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}
@@ -84,6 +83,7 @@ xfs_rmapbt_alloc_block(
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_perag *pag = cur->bc_ag.pag;
int error;
xfs_agblock_t bno;
@@ -93,21 +93,19 @@ xfs_rmapbt_alloc_block(
if (error)
return error;
- trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_ag.agno,
- bno, 1);
+ trace_xfs_rmapbt_alloc_block(cur->bc_mp, pag->pag_agno, bno, 1);
if (bno == NULLAGBLOCK) {
*stat = 0;
return 0;
}
- xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1,
- false);
+ xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false);
new->s = cpu_to_be32(bno);
be32_add_cpu(&agf->agf_rmap_blocks, 1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
- xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_ag.agno);
+ xfs_ag_resv_rmapbt_alloc(cur->bc_mp, pag->pag_agno);
*stat = 1;
return 0;
@@ -120,12 +118,12 @@ xfs_rmapbt_free_block(
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
- struct xfs_perag *pag;
+ struct xfs_perag *pag = cur->bc_ag.pag;
xfs_agblock_t bno;
int error;
bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
- trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_ag.agno,
+ trace_xfs_rmapbt_free_block(cur->bc_mp, pag->pag_agno,
bno, 1);
be32_add_cpu(&agf->agf_rmap_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
@@ -133,10 +131,9 @@ xfs_rmapbt_free_block(
if (error)
return error;
- xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
+ xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1,
XFS_EXTENT_BUSY_SKIP_DISCARD);
- pag = cur->bc_ag.agbp->b_pag;
xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
return 0;
}
@@ -215,7 +212,7 @@ xfs_rmapbt_init_ptr_from_cur(
{
struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
ptr->s = agf->agf_roots[cur->bc_btnum];
}
@@ -450,7 +447,7 @@ static struct xfs_btree_cur *
xfs_rmapbt_init_common(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_agnumber_t agno)
+ struct xfs_perag *pag)
{
struct xfs_btree_cur *cur;
@@ -462,9 +459,12 @@ xfs_rmapbt_init_common(
cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
cur->bc_blocklog = mp->m_sb.sb_blocklog;
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
- cur->bc_ag.agno = agno;
cur->bc_ops = &xfs_rmapbt_ops;
+ /* take a reference for the cursor */
+ atomic_inc(&pag->pag_ref);
+ cur->bc_ag.pag = pag;
+
return cur;
}
@@ -474,12 +474,12 @@ xfs_rmapbt_init_cursor(
struct xfs_mount *mp,
struct xfs_trans *tp,
struct xfs_buf *agbp,
- xfs_agnumber_t agno)
+ struct xfs_perag *pag)
{
struct xfs_agf *agf = agbp->b_addr;
struct xfs_btree_cur *cur;
- cur = xfs_rmapbt_init_common(mp, tp, agno);
+ cur = xfs_rmapbt_init_common(mp, tp, pag);
cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
cur->bc_ag.agbp = agbp;
return cur;
@@ -490,11 +490,11 @@ struct xfs_btree_cur *
xfs_rmapbt_stage_cursor(
struct xfs_mount *mp,
struct xbtree_afakeroot *afake,
- xfs_agnumber_t agno)
+ struct xfs_perag *pag)
{
struct xfs_btree_cur *cur;
- cur = xfs_rmapbt_init_common(mp, NULL, agno);
+ cur = xfs_rmapbt_init_common(mp, NULL, pag);
xfs_btree_stage_afakeroot(cur, afake);
return cur;
}
@@ -596,7 +596,7 @@ int
xfs_rmapbt_calc_reserves(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_extlen_t *ask,
xfs_extlen_t *used)
{
@@ -609,7 +609,7 @@ xfs_rmapbt_calc_reserves(
if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
return 0;
- error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+ error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp);
if (error)
return error;
@@ -624,7 +624,7 @@ xfs_rmapbt_calc_reserves(
* expansion. We therefore can pretend the space isn't there.
*/
if (mp->m_sb.sb_logstart &&
- XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
+ XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno)
agblocks -= mp->m_sb.sb_logblocks;
/* Reserve 1% of the AG or enough for 1 block per record. */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index 115c3455a734..88d8d18788a2 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -43,9 +43,9 @@ struct xbtree_afakeroot;
struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *bp,
- xfs_agnumber_t agno);
+ struct xfs_perag *pag);
struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp,
- struct xbtree_afakeroot *afake, xfs_agnumber_t agno);
+ struct xbtree_afakeroot *afake, struct xfs_perag *pag);
void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, struct xfs_buf *agbp);
int xfs_rmapbt_maxrecs(int blocklen, int leaf);
@@ -57,6 +57,6 @@ extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp,
xfs_agblock_t agblocks);
extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
+ struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used);
#endif /* __XFS_RMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index dfbbcbd448c1..04f5386446db 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -15,7 +15,6 @@
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
-#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_bmap_btree.h"
@@ -25,72 +24,12 @@
#include "xfs_refcount_btree.h"
#include "xfs_da_format.h"
#include "xfs_health.h"
+#include "xfs_ag.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
*/
-/*
- * Reference counting access wrappers to the perag structures.
- * Because we never free per-ag structures, the only thing we
- * have to protect against changes is the tree structure itself.
- */
-struct xfs_perag *
-xfs_perag_get(
- struct xfs_mount *mp,
- xfs_agnumber_t agno)
-{
- struct xfs_perag *pag;
- int ref = 0;
-
- rcu_read_lock();
- pag = radix_tree_lookup(&mp->m_perag_tree, agno);
- if (pag) {
- ASSERT(atomic_read(&pag->pag_ref) >= 0);
- ref = atomic_inc_return(&pag->pag_ref);
- }
- rcu_read_unlock();
- trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
- return pag;
-}
-
-/*
- * search from @first to find the next perag with the given tag set.
- */
-struct xfs_perag *
-xfs_perag_get_tag(
- struct xfs_mount *mp,
- xfs_agnumber_t first,
- int tag)
-{
- struct xfs_perag *pag;
- int found;
- int ref;
-
- rcu_read_lock();
- found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
- (void **)&pag, first, 1, tag);
- if (found <= 0) {
- rcu_read_unlock();
- return NULL;
- }
- ref = atomic_inc_return(&pag->pag_ref);
- rcu_read_unlock();
- trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
- return pag;
-}
-
-void
-xfs_perag_put(
- struct xfs_perag *pag)
-{
- int ref;
-
- ASSERT(atomic_read(&pag->pag_ref) > 0);
- ref = atomic_dec_return(&pag->pag_ref);
- trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
-}
-
/* Check all the superblock fields we care about when reading one in. */
STATIC int
xfs_validate_sb_read(
@@ -842,78 +781,6 @@ xfs_sb_mount_common(
}
/*
- * xfs_initialize_perag_data
- *
- * Read in each per-ag structure so we can count up the number of
- * allocated inodes, free inodes and used filesystem blocks as this
- * information is no longer persistent in the superblock. Once we have
- * this information, write it into the in-core superblock structure.
- */
-int
-xfs_initialize_perag_data(
- struct xfs_mount *mp,
- xfs_agnumber_t agcount)
-{
- xfs_agnumber_t index;
- xfs_perag_t *pag;
- xfs_sb_t *sbp = &mp->m_sb;
- uint64_t ifree = 0;
- uint64_t ialloc = 0;
- uint64_t bfree = 0;
- uint64_t bfreelst = 0;
- uint64_t btree = 0;
- uint64_t fdblocks;
- int error = 0;
-
- for (index = 0; index < agcount; index++) {
- /*
- * read the agf, then the agi. This gets us
- * all the information we need and populates the
- * per-ag structures for us.
- */
- error = xfs_alloc_pagf_init(mp, NULL, index, 0);
- if (error)
- return error;
-
- error = xfs_ialloc_pagi_init(mp, NULL, index);
- if (error)
- return error;
- pag = xfs_perag_get(mp, index);
- ifree += pag->pagi_freecount;
- ialloc += pag->pagi_count;
- bfree += pag->pagf_freeblks;
- bfreelst += pag->pagf_flcount;
- btree += pag->pagf_btreeblks;
- xfs_perag_put(pag);
- }
- fdblocks = bfree + bfreelst + btree;
-
- /*
- * If the new summary counts are obviously incorrect, fail the
- * mount operation because that implies the AGFs are also corrupt.
- * Clear FS_COUNTERS so that we don't unmount with a dirty log, which
- * will prevent xfs_repair from fixing anything.
- */
- if (fdblocks > sbp->sb_dblocks || ifree > ialloc) {
- xfs_alert(mp, "AGF corruption. Please run xfs_repair.");
- error = -EFSCORRUPTED;
- goto out;
- }
-
- /* Overwrite incore superblock counters with just-read data */
- spin_lock(&mp->m_sb_lock);
- sbp->sb_ifree = ifree;
- sbp->sb_icount = ialloc;
- sbp->sb_fdblocks = fdblocks;
- spin_unlock(&mp->m_sb_lock);
-
- xfs_reinit_percpu_counters(mp);
-out:
- xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS);
- return error;
-}
-
-/*
* xfs_log_sb() can be used to copy arbitrary changes to the in-core superblock
* into the superblock buffer to be logged. It does not provide the higher
* level of locking that is needed to protect the in-core superblock from
@@ -989,17 +856,18 @@ int
xfs_update_secondary_sbs(
struct xfs_mount *mp)
{
- xfs_agnumber_t agno;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno = 1;
int saved_error = 0;
int error = 0;
LIST_HEAD (buffer_list);
/* update secondary superblocks. */
- for (agno = 1; agno < mp->m_sb.sb_agcount; agno++) {
+ for_each_perag_from(mp, agno, pag) {
struct xfs_buf *bp;
error = xfs_buf_get(mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno, XFS_SB_DADDR),
+ XFS_AG_DADDR(mp, pag->pag_agno, XFS_SB_DADDR),
XFS_FSS_TO_BB(mp, 1), &bp);
/*
* If we get an error reading or writing alternate superblocks,
@@ -1011,7 +879,7 @@ xfs_update_secondary_sbs(
if (error) {
xfs_warn(mp,
"error allocating secondary superblock for ag %d",
- agno);
+ pag->pag_agno);
if (!saved_error)
saved_error = error;
continue;
@@ -1032,7 +900,7 @@ xfs_update_secondary_sbs(
if (error) {
xfs_warn(mp,
"write error %d updating a secondary superblock near ag %d",
- error, agno);
+ error, pag->pag_agno);
if (!saved_error)
saved_error = error;
continue;
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index f79f9dc632b6..0c1602d9b53d 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -13,15 +13,6 @@ struct xfs_trans;
struct xfs_fsop_geom;
struct xfs_perag;
-/*
- * perag get/put wrappers for ref counting
- */
-extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t);
-extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
- int tag);
-extern void xfs_perag_put(struct xfs_perag *pag);
-extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
-
extern void xfs_log_sb(struct xfs_trans *tp);
extern int xfs_sync_sb(struct xfs_mount *mp, bool wait);
extern int xfs_sync_sb_buf(struct xfs_mount *mp);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 782fdd08f759..25c4cab58851 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -22,30 +22,26 @@ struct xfs_inode;
* Buffer verifier operations are widely used, including userspace tools
*/
extern const struct xfs_buf_ops xfs_agf_buf_ops;
-extern const struct xfs_buf_ops xfs_agi_buf_ops;
-extern const struct xfs_buf_ops xfs_agf_buf_ops;
extern const struct xfs_buf_ops xfs_agfl_buf_ops;
-extern const struct xfs_buf_ops xfs_bnobt_buf_ops;
-extern const struct xfs_buf_ops xfs_cntbt_buf_ops;
-extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
-extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
+extern const struct xfs_buf_ops xfs_bnobt_buf_ops;
+extern const struct xfs_buf_ops xfs_cntbt_buf_ops;
extern const struct xfs_buf_ops xfs_da3_node_buf_ops;
extern const struct xfs_buf_ops xfs_dquot_buf_ops;
-extern const struct xfs_buf_ops xfs_symlink_buf_ops;
-extern const struct xfs_buf_ops xfs_agi_buf_ops;
-extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops;
extern const struct xfs_buf_ops xfs_finobt_buf_ops;
+extern const struct xfs_buf_ops xfs_inobt_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
-extern const struct xfs_buf_ops xfs_dquot_buf_ops;
-extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops;
+extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
+extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
+extern const struct xfs_buf_ops xfs_rtbuf_ops;
extern const struct xfs_buf_ops xfs_sb_buf_ops;
extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
-extern const struct xfs_buf_ops xfs_rtbuf_ops;
/* log size calculation functions */
int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 8d595a5c4abd..16f723ebe8dd 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -143,16 +143,14 @@ xfs_trans_log_inode(
}
/*
- * Inode verifiers on older kernels don't check that the extent size
- * hint is an integer multiple of the rt extent size on a directory
- * with both rtinherit and extszinherit flags set. If we're logging a
- * directory that is misconfigured in this way, clear the hint.
+ * Inode verifiers do not check that the extent size hint is an integer
+ * multiple of the rt extent size on a directory with both rtinherit
+ * and extszinherit flags set. If we're logging a directory that is
+ * misconfigured in this way, clear the hint.
*/
if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
(ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
(ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
- xfs_info_once(ip->i_mount,
- "Correcting misaligned extent size hint in inode 0x%llx.", ip->i_ino);
ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
XFS_DIFLAG_EXTSZINHERIT);
ip->i_extsize = 0;
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index 04801362e1a7..e8f4abee7892 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -11,6 +11,7 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
+#include "xfs_ag.h"
/* Find the size of the AG, in blocks. */
inline xfs_agblock_t
@@ -222,12 +223,13 @@ xfs_icount_range(
unsigned long long *max)
{
unsigned long long nr_inos = 0;
+ struct xfs_perag *pag;
xfs_agnumber_t agno;
/* root, rtbitmap, rtsum all live in the first chunk */
*min = XFS_INODES_PER_CHUNK;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ for_each_perag(mp, agno, pag) {
xfs_agino_t first, last;
xfs_agino_range(mp, agno, &first, &last);
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 064bd6e8c922..0870ef6f933d 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -21,6 +21,7 @@ typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */
typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */
typedef int64_t xfs_lsn_t; /* log sequence number */
+typedef int64_t xfs_csn_t; /* CIL sequence number */
typedef uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
typedef uint32_t xfs_dahash_t; /* dir/attr hash value */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index f96e84793cc9..be1a7e1e65f7 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -14,6 +14,7 @@
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_rmap.h"
+#include "xfs_ag.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 23690f824ffa..e95f8c98f0f7 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -20,6 +20,7 @@
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
+#include "xfs_ag.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -245,8 +246,8 @@ xrep_agf_calc_from_btrees(
int error;
/* Update the AGF counters from the bnobt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
- XFS_BTNUM_BNO);
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
+ sc->sa.pag, XFS_BTNUM_BNO);
error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa);
if (error)
goto err;
@@ -259,8 +260,8 @@ xrep_agf_calc_from_btrees(
agf->agf_longest = cpu_to_be32(raa.longest);
/* Update the AGF counters from the cntbt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
- XFS_BTNUM_CNT);
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
+ sc->sa.pag, XFS_BTNUM_CNT);
error = xfs_btree_count_blocks(cur, &blocks);
if (error)
goto err;
@@ -268,7 +269,7 @@ xrep_agf_calc_from_btrees(
btreeblks += blocks - 1;
/* Update the AGF counters from the rmapbt. */
- cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
+ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xfs_btree_count_blocks(cur, &blocks);
if (error)
goto err;
@@ -281,7 +282,7 @@ xrep_agf_calc_from_btrees(
/* Update the AGF counters from the refcountbt. */
if (xfs_sb_version_hasreflink(&mp->m_sb)) {
cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp,
- sc->sa.agno);
+ sc->sa.pag);
error = xfs_btree_count_blocks(cur, &blocks);
if (error)
goto err;
@@ -453,7 +454,7 @@ xrep_agfl_walk_rmap(
/* Record all the OWN_AG blocks. */
if (rec->rm_owner == XFS_RMAP_OWN_AG) {
- fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno,
+ fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno,
rec->rm_startblock);
error = xbitmap_set(ra->freesp, fsb, rec->rm_blockcount);
if (error)
@@ -489,23 +490,23 @@ xrep_agfl_collect_blocks(
xbitmap_init(&ra.agmetablocks);
/* Find all space used by the free space btrees & rmapbt. */
- cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
+ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra);
if (error)
goto err;
xfs_btree_del_cursor(cur, error);
/* Find all blocks currently being used by the bnobt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
- XFS_BTNUM_BNO);
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
+ sc->sa.pag, XFS_BTNUM_BNO);
error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
if (error)
goto err;
xfs_btree_del_cursor(cur, error);
/* Find all blocks currently being used by the cntbt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
- XFS_BTNUM_CNT);
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
+ sc->sa.pag, XFS_BTNUM_CNT);
error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
if (error)
goto err;
@@ -805,8 +806,8 @@ xrep_agi_calc_from_btrees(
xfs_agino_t freecount;
int error;
- cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, sc->sa.agno,
- XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp,
+ sc->sa.pag, XFS_BTNUM_INO);
error = xfs_ialloc_count_inodes(cur, &count, &freecount);
if (error)
goto err;
@@ -827,8 +828,8 @@ xrep_agi_calc_from_btrees(
xfs_sb_version_hasinobtcounts(&mp->m_sb)) {
xfs_agblock_t blocks;
- cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, sc->sa.agno,
- XFS_BTNUM_FINO);
+ cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp,
+ sc->sa.pag, XFS_BTNUM_FINO);
error = xfs_btree_count_blocks(cur, &blocks);
if (error)
goto err;
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 2720bd7fe53b..d5741980094a 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -15,6 +15,7 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
+#include "xfs_ag.h"
/*
* Set us up to scrub free space btrees.
@@ -93,7 +94,7 @@ xchk_allocbt_rec(
union xfs_btree_rec *rec)
{
struct xfs_mount *mp = bs->cur->bc_mp;
- xfs_agnumber_t agno = bs->cur->bc_ag.agno;
+ xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
xfs_agblock_t bno;
xfs_extlen_t len;
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 77d5c4a0f09f..1d146c9d9de1 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -22,6 +22,7 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
+#include "xfs_ag.h"
/* Set us up with an inode's bmap. */
int
@@ -514,7 +515,7 @@ xchk_bmap_check_rmap(
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
rec->rm_offset);
if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp,
- cur->bc_ag.agno, rec->rm_startblock))
+ cur->bc_ag.pag->pag_agno, rec->rm_startblock))
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
rec->rm_offset);
if (irec.br_blockcount > rec->rm_blockcount)
@@ -544,18 +545,18 @@ STATIC int
xchk_bmap_check_ag_rmaps(
struct xfs_scrub *sc,
int whichfork,
- xfs_agnumber_t agno)
+ struct xfs_perag *pag)
{
struct xchk_bmap_check_rmap_info sbcri;
struct xfs_btree_cur *cur;
struct xfs_buf *agf;
int error;
- error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf);
+ error = xfs_alloc_read_agf(sc->mp, sc->tp, pag->pag_agno, 0, &agf);
if (error)
return error;
- cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf, agno);
+ cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf, pag);
sbcri.sc = sc;
sbcri.whichfork = whichfork;
@@ -575,6 +576,7 @@ xchk_bmap_check_rmaps(
int whichfork)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork);
+ struct xfs_perag *pag;
xfs_agnumber_t agno;
bool zero_size;
int error;
@@ -607,15 +609,16 @@ xchk_bmap_check_rmaps(
(zero_size || ifp->if_nextents > 0))
return 0;
- for (agno = 0; agno < sc->mp->m_sb.sb_agcount; agno++) {
- error = xchk_bmap_check_ag_rmaps(sc, whichfork, agno);
+ for_each_perag(sc->mp, agno, pag) {
+ error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag);
if (error)
- return error;
+ break;
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
break;
}
-
- return 0;
+ if (pag)
+ xfs_perag_put(pag);
+ return error;
}
/*
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 6cc92291c46f..8558ca05e11d 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -12,7 +12,6 @@
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_alloc.h"
@@ -26,6 +25,7 @@
#include "xfs_trans_priv.h"
#include "xfs_attr.h"
#include "xfs_reflink.h"
+#include "xfs_ag.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -460,49 +460,48 @@ xchk_ag_btcur_init(
struct xchk_ag *sa)
{
struct xfs_mount *mp = sc->mp;
- xfs_agnumber_t agno = sa->agno;
xchk_perag_get(sc->mp, sa);
if (sa->agf_bp &&
xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) {
/* Set up a bnobt cursor for cross-referencing. */
sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
- agno, XFS_BTNUM_BNO);
+ sa->pag, XFS_BTNUM_BNO);
}
if (sa->agf_bp &&
xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) {
/* Set up a cntbt cursor for cross-referencing. */
sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
- agno, XFS_BTNUM_CNT);
+ sa->pag, XFS_BTNUM_CNT);
}
/* Set up a inobt cursor for cross-referencing. */
if (sa->agi_bp &&
xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) {
sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
- agno, XFS_BTNUM_INO);
+ sa->pag, XFS_BTNUM_INO);
}
/* Set up a finobt cursor for cross-referencing. */
if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb) &&
xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) {
sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
- agno, XFS_BTNUM_FINO);
+ sa->pag, XFS_BTNUM_FINO);
}
/* Set up a rmapbt cursor for cross-referencing. */
if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb) &&
xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) {
sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
- agno);
+ sa->pag);
}
/* Set up a refcountbt cursor for cross-referencing. */
if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb) &&
xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) {
sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
- sa->agf_bp, agno);
+ sa->agf_bp, sa->pag);
}
}
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index f1d1a8c58853..fd7941e04ae1 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -9,11 +9,11 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_sb.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_health.h"
#include "xfs_btree.h"
+#include "xfs_ag.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -71,11 +71,11 @@ xchk_fscount_warmup(
xfs_agnumber_t agno;
int error = 0;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- pag = xfs_perag_get(mp, agno);
-
+ for_each_perag(mp, agno, pag) {
+ if (xchk_should_terminate(sc, &error))
+ break;
if (pag->pagi_init && pag->pagf_init)
- goto next_loop_perag;
+ continue;
/* Lock both AG headers. */
error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp);
@@ -89,21 +89,15 @@ xchk_fscount_warmup(
* These are supposed to be initialized by the header read
* function.
*/
- error = -EFSCORRUPTED;
- if (!pag->pagi_init || !pag->pagf_init)
+ if (!pag->pagi_init || !pag->pagf_init) {
+ error = -EFSCORRUPTED;
break;
+ }
xfs_buf_relse(agf_bp);
agf_bp = NULL;
xfs_buf_relse(agi_bp);
agi_bp = NULL;
-next_loop_perag:
- xfs_perag_put(pag);
- pag = NULL;
- error = 0;
-
- if (xchk_should_terminate(sc, &error))
- break;
}
if (agf_bp)
@@ -196,13 +190,14 @@ retry:
fsc->ifree = 0;
fsc->fdblocks = 0;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- pag = xfs_perag_get(mp, agno);
+ for_each_perag(mp, agno, pag) {
+ if (xchk_should_terminate(sc, &error))
+ break;
/* This somehow got unset since the warmup? */
if (!pag->pagi_init || !pag->pagf_init) {
- xfs_perag_put(pag);
- return -EFSCORRUPTED;
+ error = -EFSCORRUPTED;
+ break;
}
/* Count all the inodes */
@@ -216,10 +211,8 @@ retry:
fsc->fdblocks += pag->pagf_btreeblks;
} else {
error = xchk_fscount_btreeblks(sc, fsc, agno);
- if (error) {
- xfs_perag_put(pag);
+ if (error)
break;
- }
}
/*
@@ -229,12 +222,9 @@ retry:
fsc->fdblocks -= pag->pag_meta_resv.ar_reserved;
fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved;
- xfs_perag_put(pag);
-
- if (xchk_should_terminate(sc, &error))
- break;
}
-
+ if (pag)
+ xfs_perag_put(pag);
if (error)
return error;
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 3de59b5c2ce6..2e61df3bca83 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -8,7 +8,7 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_btree.h"
-#include "xfs_sb.h"
+#include "xfs_ag.h"
#include "xfs_health.h"
#include "scrub/scrub.h"
#include "scrub/health.h"
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 8d9f3fb0cd22..30e568596b79 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -21,6 +21,7 @@
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
+#include "xfs_ag.h"
/*
* Set us up to scrub inode btrees.
@@ -103,7 +104,7 @@ xchk_iallocbt_chunk(
xfs_extlen_t len)
{
struct xfs_mount *mp = bs->cur->bc_mp;
- xfs_agnumber_t agno = bs->cur->bc_ag.agno;
+ xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
xfs_agblock_t bno;
bno = XFS_AGINO_TO_AGBNO(mp, agino);
@@ -163,7 +164,7 @@ xchk_iallocbt_check_cluster_ifree(
* the record, compute which fs inode we're talking about.
*/
agino = irec->ir_startino + irec_ino;
- fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.agno, agino);
+ fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.pag->pag_agno, agino);
irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino));
if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
@@ -213,7 +214,7 @@ xchk_iallocbt_check_cluster(
struct xfs_mount *mp = bs->cur->bc_mp;
struct xfs_buf *cluster_bp;
unsigned int nr_inodes;
- xfs_agnumber_t agno = bs->cur->bc_ag.agno;
+ xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
xfs_agblock_t agbno;
unsigned int cluster_index;
uint16_t cluster_mask = 0;
@@ -423,7 +424,7 @@ xchk_iallocbt_rec(
struct xchk_iallocbt *iabt = bs->private;
struct xfs_inobt_rec_incore irec;
uint64_t holes;
- xfs_agnumber_t agno = bs->cur->bc_ag.agno;
+ xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
xfs_agino_t agino;
xfs_extlen_t len;
int holecount;
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 61f90b2c9430..76fbc7ca4cec 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -73,11 +73,25 @@ xchk_inode_extsize(
uint16_t flags)
{
xfs_failaddr_t fa;
+ uint32_t value = be32_to_cpu(dip->di_extsize);
- fa = xfs_inode_validate_extsize(sc->mp, be32_to_cpu(dip->di_extsize),
- mode, flags);
+ fa = xfs_inode_validate_extsize(sc->mp, value, mode, flags);
if (fa)
xchk_ino_set_corrupt(sc, ino);
+
+ /*
+ * XFS allows a sysadmin to change the rt extent size when adding a rt
+ * section to a filesystem after formatting. If there are any
+ * directories with extszinherit and rtinherit set, the hint could
+ * become misaligned with the new rextsize. The verifier doesn't check
+ * this, because we allow rtinherit directories even without an rt
+ * device. Flag this as an administrative warning since we will clean
+ * this up eventually.
+ */
+ if ((flags & XFS_DIFLAG_RTINHERIT) &&
+ (flags & XFS_DIFLAG_EXTSZINHERIT) &&
+ value % sc->mp->m_sb.sb_rextsize > 0)
+ xchk_ino_set_warning(sc, ino);
}
/*
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 744530a66c0c..7014b7408bad 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -13,6 +13,7 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
+#include "xfs_ag.h"
/*
* Set us up to scrub reference count btrees.
@@ -333,7 +334,7 @@ xchk_refcountbt_rec(
{
struct xfs_mount *mp = bs->cur->bc_mp;
xfs_agblock_t *cow_blocks = bs->private;
- xfs_agnumber_t agno = bs->cur->bc_ag.agno;
+ xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
xfs_agblock_t bno;
xfs_extlen_t len;
xfs_nlink_t refcount;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index b8202dd08939..ebb0e245aa72 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -22,6 +22,7 @@
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
+#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_quota.h"
#include "scrub/scrub.h"
@@ -303,7 +304,7 @@ xrep_alloc_ag_block(
return error;
if (bno == NULLAGBLOCK)
return -ENOSPC;
- xfs_extent_busy_reuse(sc->mp, sc->sa.agno, bno,
+ xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno,
1, false);
*fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno);
if (resv == XFS_AG_RESV_RMAPBT)
@@ -508,7 +509,7 @@ xrep_put_freelist(
* create an rmap for the block prior to merging it or else other
* parts will break.
*/
- error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
+ error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
&XFS_RMAP_OINFO_AG);
if (error)
return error;
@@ -518,7 +519,7 @@ xrep_put_freelist(
agbno, 0);
if (error)
return error;
- xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1,
+ xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
XFS_EXTENT_BUSY_SKIP_DISCARD);
return 0;
@@ -554,7 +555,7 @@ xrep_reap_block(
} else {
agf_bp = sc->sa.agf_bp;
}
- cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno);
+ cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.pag);
/* Can we find any other rmappings? */
error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
@@ -576,7 +577,8 @@ xrep_reap_block(
* to run xfs_repair.
*/
if (has_other_rmap)
- error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1, oinfo);
+ error = xfs_rmap_free(sc->tp, agf_bp, sc->sa.pag, agbno,
+ 1, oinfo);
else if (resv == XFS_AG_RESV_AGFL)
error = xrep_put_freelist(sc, agbno);
else
@@ -891,7 +893,7 @@ xrep_find_ag_btree_roots(
fab->height = 0;
}
- cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
+ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xfs_rmap_query_all(cur, xrep_findroot_rmap, &ri);
xfs_btree_del_cursor(cur, error);
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index a4f17477c5d1..fc306573f0ac 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -15,6 +15,7 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
+#include "xfs_ag.h"
/*
* Set us up to scrub reverse mapping btrees.
@@ -91,7 +92,7 @@ xchk_rmapbt_rec(
{
struct xfs_mount *mp = bs->cur->bc_mp;
struct xfs_rmap_irec irec;
- xfs_agnumber_t agno = bs->cur->bc_ag.agno;
+ xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
bool non_inode;
bool is_unwritten;
bool is_bmbt;
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 2c6c248be823..03882a605a3c 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -13,6 +13,7 @@
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "scrub/scrub.h"
+#include "xfs_ag.h"
/* Figure out which block the btree cursor was pointing to. */
static inline xfs_fsblock_t
@@ -26,7 +27,7 @@ xchk_btree_cur_fsbno(
cur->bc_flags & XFS_BTREE_LONG_PTRS)
return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino);
else if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS))
- return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, 0);
+ return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, 0);
return NULLFSBLOCK;
}
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index bfad669e6b2f..aaa7e66c42d7 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -15,10 +15,10 @@
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
+#include "xfs_attr.h"
#include "xfs_attr_remote.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
-#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
#include "xfs_quota.h"
#include "xfs_dir2.h"
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
index 17f36db2f792..667e297f59b1 100644
--- a/fs/xfs/xfs_bio_io.c
+++ b/fs/xfs/xfs_bio_io.c
@@ -9,6 +9,41 @@ static inline unsigned int bio_max_vecs(unsigned int count)
return bio_max_segs(howmany(count, PAGE_SIZE));
}
+static void
+xfs_flush_bdev_async_endio(
+ struct bio *bio)
+{
+ complete(bio->bi_private);
+}
+
+/*
+ * Submit a request for an async cache flush to run. If the request queue does
+ * not require flush operations, just skip it altogether. If the caller needs
+ * to wait for the flush completion at a later point in time, they must supply a
+ * valid completion. This will be signalled when the flush completes. The
+ * caller never sees the bio that is issued here.
+ */
+void
+xfs_flush_bdev_async(
+ struct bio *bio,
+ struct block_device *bdev,
+ struct completion *done)
+{
+ struct request_queue *q = bdev->bd_disk->queue;
+
+ if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
+ complete(done);
+ return;
+ }
+
+ bio_init(bio, NULL, 0);
+ bio_set_dev(bio, bdev);
+ bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+ bio->bi_private = done;
+ bio->bi_end_io = xfs_flush_bdev_async_endio;
+
+ submit_bio(bio);
+}
int
xfs_rw_bdev(
struct block_device *bdev,
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 2988efa97e14..213a97a921bb 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -945,7 +945,7 @@ xfs_flush_unmap_range(
xfs_off_t rounding, start, end;
int error;
- rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
+ rounding = max_t(xfs_off_t, mp->m_sb.sb_blocksize, PAGE_SIZE);
start = round_down(offset, rounding);
end = round_up(offset + len, rounding) - 1;
@@ -1053,9 +1053,9 @@ xfs_prepare_shift(
* extent (after split) during the shift and corrupt the file. Start
* with the block just prior to the start to stabilize the boundary.
*/
- offset = round_down(offset, 1 << mp->m_sb.sb_blocklog);
+ offset = round_down(offset, mp->m_sb.sb_blocksize);
if (offset)
- offset -= (1 << mp->m_sb.sb_blocklog);
+ offset -= mp->m_sb.sb_blocksize;
/*
* Writeback and invalidate cache for the remainder of the file as we're
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 592800c8852f..8ff42b3585e0 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -10,7 +10,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_trace.h"
#include "xfs_log.h"
@@ -19,12 +18,10 @@
#include "xfs_buf_item.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
+#include "xfs_ag.h"
static kmem_zone_t *xfs_buf_zone;
-#define xb_to_gfp(flags) \
- ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
-
/*
* Locking orders
*
@@ -79,7 +76,7 @@ static inline int
xfs_buf_vmap_len(
struct xfs_buf *bp)
{
- return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
+ return (bp->b_page_count * PAGE_SIZE);
}
/*
@@ -272,51 +269,30 @@ _xfs_buf_alloc(
return 0;
}
-/*
- * Allocate a page array capable of holding a specified number
- * of pages, and point the page buf at it.
- */
-STATIC int
-_xfs_buf_get_pages(
- struct xfs_buf *bp,
- int page_count)
+static void
+xfs_buf_free_pages(
+ struct xfs_buf *bp)
{
- /* Make sure that we have a page list */
- if (bp->b_pages == NULL) {
- bp->b_page_count = page_count;
- if (page_count <= XB_PAGES) {
- bp->b_pages = bp->b_page_array;
- } else {
- bp->b_pages = kmem_alloc(sizeof(struct page *) *
- page_count, KM_NOFS);
- if (bp->b_pages == NULL)
- return -ENOMEM;
- }
- memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
+ uint i;
+
+ ASSERT(bp->b_flags & _XBF_PAGES);
+
+ if (xfs_buf_is_vmapped(bp))
+ vm_unmap_ram(bp->b_addr, bp->b_page_count);
+
+ for (i = 0; i < bp->b_page_count; i++) {
+ if (bp->b_pages[i])
+ __free_page(bp->b_pages[i]);
}
- return 0;
-}
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += bp->b_page_count;
-/*
- * Frees b_pages if it was allocated.
- */
-STATIC void
-_xfs_buf_free_pages(
- struct xfs_buf *bp)
-{
- if (bp->b_pages != bp->b_page_array) {
+ if (bp->b_pages != bp->b_page_array)
kmem_free(bp->b_pages);
- bp->b_pages = NULL;
- }
+ bp->b_pages = NULL;
+ bp->b_flags &= ~_XBF_PAGES;
}
-/*
- * Releases the specified buffer.
- *
- * The modification state of any associated pages is left unchanged.
- * The buffer must not be on any hash - use xfs_buf_rele instead for
- * hashed and refcounted buffers
- */
static void
xfs_buf_free(
struct xfs_buf *bp)
@@ -325,137 +301,103 @@ xfs_buf_free(
ASSERT(list_empty(&bp->b_lru));
- if (bp->b_flags & _XBF_PAGES) {
- uint i;
-
- if (xfs_buf_is_vmapped(bp))
- vm_unmap_ram(bp->b_addr - bp->b_offset,
- bp->b_page_count);
-
- for (i = 0; i < bp->b_page_count; i++) {
- struct page *page = bp->b_pages[i];
-
- __free_page(page);
- }
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab +=
- bp->b_page_count;
- } else if (bp->b_flags & _XBF_KMEM)
+ if (bp->b_flags & _XBF_PAGES)
+ xfs_buf_free_pages(bp);
+ else if (bp->b_flags & _XBF_KMEM)
kmem_free(bp->b_addr);
- _xfs_buf_free_pages(bp);
+
xfs_buf_free_maps(bp);
kmem_cache_free(xfs_buf_zone, bp);
}
-/*
- * Allocates all the pages for buffer in question and builds it's page list.
- */
-STATIC int
-xfs_buf_allocate_memory(
- struct xfs_buf *bp,
- uint flags)
+static int
+xfs_buf_alloc_kmem(
+ struct xfs_buf *bp,
+ xfs_buf_flags_t flags)
{
- size_t size;
- size_t nbytes, offset;
- gfp_t gfp_mask = xb_to_gfp(flags);
- unsigned short page_count, i;
- xfs_off_t start, end;
- int error;
- xfs_km_flags_t kmflag_mask = 0;
+ int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
+ xfs_km_flags_t kmflag_mask = KM_NOFS;
+ size_t size = BBTOB(bp->b_length);
- /*
- * assure zeroed buffer for non-read cases.
- */
- if (!(flags & XBF_READ)) {
+ /* Assure zeroed buffer for non-read cases. */
+ if (!(flags & XBF_READ))
kmflag_mask |= KM_ZERO;
- gfp_mask |= __GFP_ZERO;
- }
- /*
- * for buffers that are contained within a single page, just allocate
- * the memory from the heap - there's no need for the complexity of
- * page arrays to keep allocation down to order 0.
- */
- size = BBTOB(bp->b_length);
- if (size < PAGE_SIZE) {
- int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
- bp->b_addr = kmem_alloc_io(size, align_mask,
- KM_NOFS | kmflag_mask);
- if (!bp->b_addr) {
- /* low memory - use alloc_page loop instead */
- goto use_alloc_page;
- }
+ bp->b_addr = kmem_alloc_io(size, align_mask, kmflag_mask);
+ if (!bp->b_addr)
+ return -ENOMEM;
- if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
- ((unsigned long)bp->b_addr & PAGE_MASK)) {
- /* b_addr spans two pages - use alloc_page instead */
- kmem_free(bp->b_addr);
- bp->b_addr = NULL;
- goto use_alloc_page;
- }
- bp->b_offset = offset_in_page(bp->b_addr);
- bp->b_pages = bp->b_page_array;
- bp->b_pages[0] = kmem_to_page(bp->b_addr);
- bp->b_page_count = 1;
- bp->b_flags |= _XBF_KMEM;
- return 0;
+ if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
+ ((unsigned long)bp->b_addr & PAGE_MASK)) {
+ /* b_addr spans two pages - use alloc_page instead */
+ kmem_free(bp->b_addr);
+ bp->b_addr = NULL;
+ return -ENOMEM;
}
+ bp->b_offset = offset_in_page(bp->b_addr);
+ bp->b_pages = bp->b_page_array;
+ bp->b_pages[0] = kmem_to_page(bp->b_addr);
+ bp->b_page_count = 1;
+ bp->b_flags |= _XBF_KMEM;
+ return 0;
+}
-use_alloc_page:
- start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
- end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
- >> PAGE_SHIFT;
- page_count = end - start;
- error = _xfs_buf_get_pages(bp, page_count);
- if (unlikely(error))
- return error;
+static int
+xfs_buf_alloc_pages(
+ struct xfs_buf *bp,
+ xfs_buf_flags_t flags)
+{
+ gfp_t gfp_mask = __GFP_NOWARN;
+ long filled = 0;
- offset = bp->b_offset;
+ if (flags & XBF_READ_AHEAD)
+ gfp_mask |= __GFP_NORETRY;
+ else
+ gfp_mask |= GFP_NOFS;
+
+ /* Make sure that we have a page list */
+ bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
+ if (bp->b_page_count <= XB_PAGES) {
+ bp->b_pages = bp->b_page_array;
+ } else {
+ bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
+ gfp_mask);
+ if (!bp->b_pages)
+ return -ENOMEM;
+ }
bp->b_flags |= _XBF_PAGES;
- for (i = 0; i < bp->b_page_count; i++) {
- struct page *page;
- uint retries = 0;
-retry:
- page = alloc_page(gfp_mask);
- if (unlikely(page == NULL)) {
- if (flags & XBF_READ_AHEAD) {
- bp->b_page_count = i;
- error = -ENOMEM;
- goto out_free_pages;
- }
+ /* Assure zeroed buffer for non-read cases. */
+ if (!(flags & XBF_READ))
+ gfp_mask |= __GFP_ZERO;
- /*
- * This could deadlock.
- *
- * But until all the XFS lowlevel code is revamped to
- * handle buffer allocation failures we can't do much.
- */
- if (!(++retries % 100))
- xfs_err(NULL,
- "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
- current->comm, current->pid,
- __func__, gfp_mask);
-
- XFS_STATS_INC(bp->b_mount, xb_page_retries);
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- goto retry;
+ /*
+ * Bulk filling of pages can take multiple calls. Not filling the entire
+ * array is not an allocation failure, so don't back off if we get at
+ * least one extra page.
+ */
+ for (;;) {
+ long last = filled;
+
+ filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count,
+ bp->b_pages);
+ if (filled == bp->b_page_count) {
+ XFS_STATS_INC(bp->b_mount, xb_page_found);
+ break;
}
- XFS_STATS_INC(bp->b_mount, xb_page_found);
+ if (filled != last)
+ continue;
- nbytes = min_t(size_t, size, PAGE_SIZE - offset);
- size -= nbytes;
- bp->b_pages[i] = page;
- offset = 0;
+ if (flags & XBF_READ_AHEAD) {
+ xfs_buf_free_pages(bp);
+ return -ENOMEM;
+ }
+
+ XFS_STATS_INC(bp->b_mount, xb_page_retries);
+ congestion_wait(BLK_RW_ASYNC, HZ / 50);
}
return 0;
-
-out_free_pages:
- for (i = 0; i < bp->b_page_count; i++)
- __free_page(bp->b_pages[i]);
- bp->b_flags &= ~_XBF_PAGES;
- return error;
}
/*
@@ -469,7 +411,7 @@ _xfs_buf_map_pages(
ASSERT(bp->b_flags & _XBF_PAGES);
if (bp->b_page_count == 1) {
/* A single page buffer is always mappable */
- bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
+ bp->b_addr = page_address(bp->b_pages[0]);
} else if (flags & XBF_UNMAPPED) {
bp->b_addr = NULL;
} else {
@@ -496,7 +438,6 @@ _xfs_buf_map_pages(
if (!bp->b_addr)
return -ENOMEM;
- bp->b_addr += bp->b_offset;
}
return 0;
@@ -707,7 +648,7 @@ xfs_buf_get_map(
{
struct xfs_buf *bp;
struct xfs_buf *new_bp;
- int error = 0;
+ int error;
*bpp = NULL;
error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
@@ -720,17 +661,22 @@ xfs_buf_get_map(
if (error)
return error;
- error = xfs_buf_allocate_memory(new_bp, flags);
- if (error) {
- xfs_buf_free(new_bp);
- return error;
+ /*
+ * For buffers that fit entirely within a single page, first attempt to
+ * allocate the memory from the heap to minimise memory usage. If we
+ * can't get heap memory for these small buffers, we fall back to using
+ * the page allocator.
+ */
+ if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
+ xfs_buf_alloc_kmem(new_bp, flags) < 0) {
+ error = xfs_buf_alloc_pages(new_bp, flags);
+ if (error)
+ goto out_free_buf;
}
error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
- if (error) {
- xfs_buf_free(new_bp);
- return error;
- }
+ if (error)
+ goto out_free_buf;
if (bp != new_bp)
xfs_buf_free(new_bp);
@@ -758,6 +704,9 @@ found:
trace_xfs_buf_get(bp, flags, _RET_IP_);
*bpp = bp;
return 0;
+out_free_buf:
+ xfs_buf_free(new_bp);
+ return error;
}
int
@@ -950,8 +899,7 @@ xfs_buf_get_uncached(
int flags,
struct xfs_buf **bpp)
{
- unsigned long page_count;
- int error, i;
+ int error;
struct xfs_buf *bp;
DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
@@ -960,41 +908,25 @@ xfs_buf_get_uncached(
/* flags might contain irrelevant bits, pass only what we care about */
error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
if (error)
- goto fail;
+ return error;
- page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
- error = _xfs_buf_get_pages(bp, page_count);
+ error = xfs_buf_alloc_pages(bp, flags);
if (error)
goto fail_free_buf;
- for (i = 0; i < page_count; i++) {
- bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
- if (!bp->b_pages[i]) {
- error = -ENOMEM;
- goto fail_free_mem;
- }
- }
- bp->b_flags |= _XBF_PAGES;
-
error = _xfs_buf_map_pages(bp, 0);
if (unlikely(error)) {
xfs_warn(target->bt_mount,
"%s: failed to map pages", __func__);
- goto fail_free_mem;
+ goto fail_free_buf;
}
trace_xfs_buf_get_uncached(bp, _RET_IP_);
*bpp = bp;
return 0;
- fail_free_mem:
- while (--i >= 0)
- __free_page(bp->b_pages[i]);
- _xfs_buf_free_pages(bp);
- fail_free_buf:
- xfs_buf_free_maps(bp);
- kmem_cache_free(xfs_buf_zone, bp);
- fail:
+fail_free_buf:
+ xfs_buf_free(bp);
return error;
}
@@ -1722,7 +1654,6 @@ xfs_buf_offset(
if (bp->b_addr)
return bp->b_addr + offset;
- offset += bp->b_offset;
page = bp->b_pages[offset >> PAGE_SHIFT];
return page_address(page) + (offset & (PAGE_SIZE-1));
}
@@ -1958,7 +1889,7 @@ xfs_free_buftarg(
percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
- xfs_blkdev_issue_flush(btp);
+ blkdev_issue_flush(btp->bt_bdev);
kmem_free(btp);
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 459ca34f26f5..464dc548fa23 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -167,7 +167,8 @@ struct xfs_buf {
atomic_t b_pin_count; /* pin count */
atomic_t b_io_remaining; /* #outstanding I/O requests */
unsigned int b_page_count; /* size of page array */
- unsigned int b_offset; /* page offset in first page */
+ unsigned int b_offset; /* page offset of b_addr,
+ only for _XBF_KMEM buffers */
int b_error; /* error code on I/O */
/*
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index fb69879e4b2b..2828ce45b701 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -74,14 +74,12 @@ xfs_buf_item_straddle(
}
/*
- * This returns the number of log iovecs needed to log the
- * given buf log item.
+ * Return the number of log iovecs and space needed to log the given buf log
+ * item segment.
*
- * It calculates this as 1 iovec for the buf log format structure
- * and 1 for each stretch of non-contiguous chunks to be logged.
- * Contiguous chunks are logged in a single iovec.
- *
- * If the XFS_BLI_STALE flag has been set, then log nothing.
+ * It calculates this as 1 iovec for the buf log format structure and 1 for each
+ * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged
+ * in a single iovec.
*/
STATIC void
xfs_buf_item_size_segment(
@@ -168,11 +166,8 @@ slow_scan:
}
/*
- * This returns the number of log iovecs needed to log the given buf log item.
- *
- * It calculates this as 1 iovec for the buf log format structure and 1 for each
- * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged
- * in a single iovec.
+ * Return the number of log iovecs and space needed to log the given buf log
+ * item.
*
* Discontiguous buffers need a format structure per region that is being
* logged. This makes the changes in the buffer appear to log recovery as though
@@ -182,7 +177,11 @@ slow_scan:
* what ends up on disk.
*
* If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
- * format structures.
+ * format structures. If the item has previously been logged and has dirty
+ * regions, we do not relog them in stale buffers. This has the effect of
+ * reducing the size of the relogged item by the amount of dirty data tracked
+ * by the log item. This can result in the committing transaction reducing the
+ * amount of space being consumed by the CIL.
*/
STATIC void
xfs_buf_item_size(
@@ -199,9 +198,9 @@ xfs_buf_item_size(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
if (bip->bli_flags & XFS_BLI_STALE) {
/*
- * The buffer is stale, so all we need to log
- * is the buf log format structure with the
- * cancel flag in it.
+ * The buffer is stale, so all we need to log is the buf log
+ * format structure with the cancel flag in it as we are never
+ * going to replay the changes tracked in the log item.
*/
trace_xfs_buf_item_size_stale(bip);
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
@@ -216,9 +215,9 @@ xfs_buf_item_size(
if (bip->bli_flags & XFS_BLI_ORDERED) {
/*
- * The buffer has been logged just to order it.
- * It is not being included in the transaction
- * commit, so no vectors are used at all.
+ * The buffer has been logged just to order it. It is not being
+ * included in the transaction commit, so no vectors are used at
+ * all.
*/
trace_xfs_buf_item_size_ordered(bip);
*nvecs = XFS_LOG_VEC_ORDERED;
@@ -475,17 +474,8 @@ xfs_buf_item_pin(
}
/*
- * This is called to unpin the buffer associated with the buf log
- * item which was previously pinned with a call to xfs_buf_item_pin().
- *
- * Also drop the reference to the buf item for the current transaction.
- * If the XFS_BLI_STALE flag is set and we are the last reference,
- * then free up the buf log item and unlock the buffer.
- *
- * If the remove flag is set we are called from uncommit in the
- * forced-shutdown path. If that is true and the reference count on
- * the log item is going to drop to zero we need to free the item's
- * descriptor in the transaction.
+ * This is called to unpin the buffer associated with the buf log item which
+ * was previously pinned with a call to xfs_buf_item_pin().
*/
STATIC void
xfs_buf_item_unpin(
@@ -502,38 +492,35 @@ xfs_buf_item_unpin(
trace_xfs_buf_item_unpin(bip);
+ /*
+ * Drop the bli ref associated with the pin and grab the hold required
+ * for the I/O simulation failure in the abort case. We have to do this
+ * before the pin count drops because the AIL doesn't acquire a bli
+ * reference. Therefore if the refcount drops to zero, the bli could
+ * still be AIL resident and the buffer submitted for I/O (and freed on
+ * completion) at any point before we return. This can be removed once
+ * the AIL properly holds a reference on the bli.
+ */
freed = atomic_dec_and_test(&bip->bli_refcount);
-
+ if (freed && !stale && remove)
+ xfs_buf_hold(bp);
if (atomic_dec_and_test(&bp->b_pin_count))
wake_up_all(&bp->b_waiters);
- if (freed && stale) {
+ /* nothing to do but drop the pin count if the bli is active */
+ if (!freed)
+ return;
+
+ if (stale) {
ASSERT(bip->bli_flags & XFS_BLI_STALE);
ASSERT(xfs_buf_islocked(bp));
ASSERT(bp->b_flags & XBF_STALE);
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+ ASSERT(list_empty(&lip->li_trans));
+ ASSERT(!bp->b_transp);
trace_xfs_buf_item_unpin_stale(bip);
- if (remove) {
- /*
- * If we are in a transaction context, we have to
- * remove the log item from the transaction as we are
- * about to release our reference to the buffer. If we
- * don't, the unlock that occurs later in
- * xfs_trans_uncommit() will try to reference the
- * buffer which we no longer have a hold on.
- */
- if (!list_empty(&lip->li_trans))
- xfs_trans_del_item(lip);
-
- /*
- * Since the transaction no longer refers to the buffer,
- * the buffer should no longer refer to the transaction.
- */
- bp->b_transp = NULL;
- }
-
/*
* If we get called here because of an IO error, we may or may
* not have the item on the AIL. xfs_trans_ail_delete() will
@@ -550,13 +537,13 @@ xfs_buf_item_unpin(
ASSERT(bp->b_log_item == NULL);
}
xfs_buf_relse(bp);
- } else if (freed && remove) {
+ } else if (remove) {
/*
* The buffer must be locked and held by the caller to simulate
- * an async I/O failure.
+ * an async I/O failure. We acquired the hold for this case
+ * before the buffer was unpinned.
*/
xfs_buf_lock(bp);
- xfs_buf_hold(bp);
bp->b_flags |= XBF_ASYNC;
xfs_buf_ioend_fail(bp);
}
@@ -714,7 +701,7 @@ xfs_buf_item_release(
STATIC void
xfs_buf_item_committing(
struct xfs_log_item *lip,
- xfs_lsn_t commit_lsn)
+ xfs_csn_t seq)
{
return xfs_buf_item_release(lip);
}
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index d44e8b4a3391..4775485b4062 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -698,7 +698,8 @@ xlog_recover_do_inode_buffer(
static xfs_lsn_t
xlog_recover_get_buf_lsn(
struct xfs_mount *mp,
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f)
{
uint32_t magic32;
uint16_t magic16;
@@ -706,11 +707,20 @@ xlog_recover_get_buf_lsn(
void *blk = bp->b_addr;
uuid_t *uuid;
xfs_lsn_t lsn = -1;
+ uint16_t blft;
/* v4 filesystems always recover immediately */
if (!xfs_sb_version_hascrc(&mp->m_sb))
goto recover_immediately;
+ /*
+ * realtime bitmap and summary file blocks do not have magic numbers or
+ * UUIDs, so we must recover them immediately.
+ */
+ blft = xfs_blft_from_flags(buf_f);
+ if (blft == XFS_BLFT_RTBITMAP_BUF || blft == XFS_BLFT_RTSUMMARY_BUF)
+ goto recover_immediately;
+
magic32 = be32_to_cpu(*(__be32 *)blk);
switch (magic32) {
case XFS_ABTB_CRC_MAGIC:
@@ -796,6 +806,7 @@ xlog_recover_get_buf_lsn(
switch (magicda) {
case XFS_DIR3_LEAF1_MAGIC:
case XFS_DIR3_LEAFN_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
case XFS_DA3_NODE_MAGIC:
lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
@@ -919,7 +930,7 @@ xlog_recover_buf_commit_pass2(
* the verifier will be reset to match whatever recover turns that
* buffer into.
*/
- lsn = xlog_recover_get_buf_lsn(mp, bp);
+ lsn = xlog_recover_get_buf_lsn(mp, bp, buf_f);
if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
trace_xfs_log_recover_buf_skip(log, buf_f);
xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index f979d0d7e6cd..736df5660f1f 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -8,7 +8,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_alloc_btree.h"
@@ -18,6 +17,7 @@
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_log.h"
+#include "xfs_ag.h"
STATIC int
xfs_trim_extents(
@@ -50,7 +50,7 @@ xfs_trim_extents(
goto out_put_perag;
agf = agbp->b_addr;
- cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
+ cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT);
/*
* Look up the longest btree in the AGF and start with it.
@@ -108,7 +108,7 @@ xfs_trim_extents(
* If any blocks in the range are still busy, skip the
* discard and try again the next time.
*/
- if (xfs_extent_busy_search(mp, agno, fbno, flen)) {
+ if (xfs_extent_busy_search(mp, pag, fbno, flen)) {
trace_xfs_discard_busy(mp, agno, fbno, flen);
goto next_extent;
}
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 8c1fdf37ee8f..8ed47b739b6c 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -188,7 +188,7 @@ xfs_qm_dquot_logitem_release(
STATIC void
xfs_qm_dquot_logitem_committing(
struct xfs_log_item *lip,
- xfs_lsn_t commit_lsn)
+ xfs_csn_t seq)
{
return xfs_qm_dquot_logitem_release(lip);
}
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index a4075685d9eb..ad22a003f959 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -11,39 +11,37 @@
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_log.h"
+#include "xfs_ag.h"
void
xfs_extent_busy_insert(
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_agblock_t bno,
xfs_extlen_t len,
unsigned int flags)
{
struct xfs_extent_busy *new;
struct xfs_extent_busy *busyp;
- struct xfs_perag *pag;
struct rb_node **rbp;
struct rb_node *parent = NULL;
new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
- new->agno = agno;
+ new->agno = pag->pag_agno;
new->bno = bno;
new->length = len;
INIT_LIST_HEAD(&new->list);
new->flags = flags;
/* trace before insert to be able to see failed inserts */
- trace_xfs_extent_busy(tp->t_mountp, agno, bno, len);
+ trace_xfs_extent_busy(tp->t_mountp, pag->pag_agno, bno, len);
- pag = xfs_perag_get(tp->t_mountp, new->agno);
spin_lock(&pag->pagb_lock);
rbp = &pag->pagb_tree.rb_node;
while (*rbp) {
@@ -66,7 +64,6 @@ xfs_extent_busy_insert(
list_add(&new->list, &tp->t_busy);
spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
}
/*
@@ -81,21 +78,17 @@ xfs_extent_busy_insert(
int
xfs_extent_busy_search(
struct xfs_mount *mp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_agblock_t bno,
xfs_extlen_t len)
{
- struct xfs_perag *pag;
struct rb_node *rbp;
struct xfs_extent_busy *busyp;
int match = 0;
- pag = xfs_perag_get(mp, agno);
+ /* find closest start bno overlap */
spin_lock(&pag->pagb_lock);
-
rbp = pag->pagb_tree.rb_node;
-
- /* find closest start bno overlap */
while (rbp) {
busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
if (bno < busyp->bno) {
@@ -115,7 +108,6 @@ xfs_extent_busy_search(
}
}
spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
return match;
}
@@ -281,17 +273,14 @@ out_force_log:
void
xfs_extent_busy_reuse(
struct xfs_mount *mp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_agblock_t fbno,
xfs_extlen_t flen,
bool userdata)
{
- struct xfs_perag *pag;
struct rb_node *rbp;
ASSERT(flen > 0);
-
- pag = xfs_perag_get(mp, agno);
spin_lock(&pag->pagb_lock);
restart:
rbp = pag->pagb_tree.rb_node;
@@ -314,7 +303,6 @@ restart:
goto restart;
}
spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
}
/*
@@ -605,12 +593,11 @@ void
xfs_extent_busy_wait_all(
struct xfs_mount *mp)
{
+ struct xfs_perag *pag;
DEFINE_WAIT (wait);
xfs_agnumber_t agno;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- struct xfs_perag *pag = xfs_perag_get(mp, agno);
-
+ for_each_perag(mp, agno, pag) {
do {
prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
if (RB_EMPTY_ROOT(&pag->pagb_tree))
@@ -618,8 +605,6 @@ xfs_extent_busy_wait_all(
schedule();
} while (1);
finish_wait(&pag->pagb_wait, &wait);
-
- xfs_perag_put(pag);
}
}
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index 8aea07100092..4a118131059f 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -9,6 +9,7 @@
#define __XFS_EXTENT_BUSY_H__
struct xfs_mount;
+struct xfs_perag;
struct xfs_trans;
struct xfs_alloc_arg;
@@ -31,7 +32,7 @@ struct xfs_extent_busy {
};
void
-xfs_extent_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
+xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_perag *pag,
xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
void
@@ -39,11 +40,11 @@ xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list,
bool do_discard);
int
-xfs_extent_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
+xfs_extent_busy_search(struct xfs_mount *mp, struct xfs_perag *pag,
xfs_agblock_t bno, xfs_extlen_t len);
void
-xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
+xfs_extent_busy_reuse(struct xfs_mount *mp, struct xfs_perag *pag,
xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
bool
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 3c0749ab9e40..cc3cfb12df53 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -119,8 +119,8 @@ xfs_dir_fsync(
return xfs_log_force_inode(ip);
}
-static xfs_lsn_t
-xfs_fsync_lsn(
+static xfs_csn_t
+xfs_fsync_seq(
struct xfs_inode *ip,
bool datasync)
{
@@ -128,7 +128,7 @@ xfs_fsync_lsn(
return 0;
if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
return 0;
- return ip->i_itemp->ili_last_lsn;
+ return ip->i_itemp->ili_commit_seq;
}
/*
@@ -151,12 +151,12 @@ xfs_fsync_flush_log(
int *log_flushed)
{
int error = 0;
- xfs_lsn_t lsn;
+ xfs_csn_t seq;
xfs_ilock(ip, XFS_ILOCK_SHARED);
- lsn = xfs_fsync_lsn(ip, datasync);
- if (lsn) {
- error = xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC,
+ seq = xfs_fsync_seq(ip, datasync);
+ if (seq) {
+ error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
log_flushed);
spin_lock(&ip->i_itemp->ili_lock);
@@ -197,9 +197,9 @@ xfs_file_fsync(
* inode size in case of an extending write.
*/
if (XFS_IS_REALTIME_INODE(ip))
- xfs_blkdev_issue_flush(mp->m_rtdev_targp);
+ blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
else if (mp->m_logdev_targp != mp->m_ddev_targp)
- xfs_blkdev_issue_flush(mp->m_ddev_targp);
+ blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
/*
* Any inode that has dirty modifications in the log is pinned. The
@@ -219,7 +219,7 @@ xfs_file_fsync(
*/
if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
mp->m_logdev_targp == mp->m_ddev_targp)
- xfs_blkdev_issue_flush(mp->m_ddev_targp);
+ blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
return error;
}
@@ -384,21 +384,30 @@ restart:
}
goto restart;
}
+
/*
* If the offset is beyond the size of the file, we need to zero any
* blocks that fall between the existing EOF and the start of this
- * write. If zeroing is needed and we are currently holding the
- * iolock shared, we need to update it to exclusive which implies
- * having to redo all checks before.
+ * write. If zeroing is needed and we are currently holding the iolock
+ * shared, we need to update it to exclusive which implies having to
+ * redo all checks before.
+ *
+ * We need to serialise against EOF updates that occur in IO completions
+ * here. We want to make sure that nobody is changing the size while we
+ * do this check until we have placed an IO barrier (i.e. hold the
+ * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
+ * spinlock effectively forms a memory barrier once we have the
+ * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
+ * hence be able to correctly determine if we need to run zeroing.
*
- * We need to serialise against EOF updates that occur in IO
- * completions here. We want to make sure that nobody is changing the
- * size while we do this check until we have placed an IO barrier (i.e.
- * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
- * The spinlock effectively forms a memory barrier once we have the
- * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
- * and hence be able to correctly determine if we need to run zeroing.
+ * We can do an unlocked check here safely as IO completion can only
+ * extend EOF. Truncate is locked out at this point, so the EOF can
+ * not move backwards, only forwards. Hence we only need to take the
+ * slow path and spin locks when we are at or beyond the current EOF.
*/
+ if (iocb->ki_pos <= i_size_read(inode))
+ goto out;
+
spin_lock(&ip->i_flags_lock);
isize = i_size_read(inode);
if (iocb->ki_pos > isize) {
@@ -426,7 +435,7 @@ restart:
drained_dio = true;
goto restart;
}
-
+
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
NULL, &xfs_buffered_write_iomap_ops);
@@ -435,6 +444,7 @@ restart:
} else
spin_unlock(&ip->i_flags_lock);
+out:
return file_modified(file);
}
@@ -500,7 +510,17 @@ xfs_dio_write_end_io(
* other IO completions here to update the EOF. Failing to serialise
* here can result in EOF moving backwards and Bad Things Happen when
* that occurs.
+ *
+ * As IO completion only ever extends EOF, we can do an unlocked check
+ * here to avoid taking the spinlock. If we land within the current EOF,
+ * then we do not need to do an extending update at all, and we don't
+ * need to take the lock to check this. If we race with an update moving
+ * EOF, then we'll either still be beyond EOF and need to take the lock,
+ * or we'll be within EOF and we don't need to take it at all.
*/
+ if (offset + size <= i_size_read(inode))
+ goto out;
+
spin_lock(&ip->i_flags_lock);
if (offset + size > i_size_read(inode)) {
i_size_write(inode, offset + size);
@@ -749,18 +769,18 @@ write_retry:
*/
if (ret == -EDQUOT && !cleared_space) {
xfs_iunlock(ip, iolock);
- xfs_blockgc_free_quota(ip, XFS_EOF_FLAGS_SYNC);
+ xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
cleared_space = true;
goto write_retry;
} else if (ret == -ENOSPC && !cleared_space) {
- struct xfs_eofblocks eofb = {0};
+ struct xfs_icwalk icw = {0};
cleared_space = true;
xfs_flush_inodes(ip->i_mount);
xfs_iunlock(ip, iolock);
- eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
- xfs_blockgc_free_space(ip->i_mount, &eofb);
+ icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
+ xfs_blockgc_free_space(ip->i_mount, &icw);
goto write_retry;
}
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index db23e455eb91..eed6ca5f8f91 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -9,13 +9,13 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_alloc.h"
#include "xfs_mru_cache.h"
#include "xfs_trace.h"
+#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_trans.h"
#include "xfs_filestream.h"
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 34f2b971ce43..7d0b09c1366e 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -24,6 +24,7 @@
#include "xfs_refcount_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_rtalloc.h"
+#include "xfs_ag.h"
/* Convert an xfs_fsmap to an fsmap. */
static void
@@ -157,10 +158,10 @@ struct xfs_getfsmap_info {
struct xfs_fsmap_head *head;
struct fsmap *fsmap_recs; /* mapping records */
struct xfs_buf *agf_bp; /* AGF, for refcount queries */
+ struct xfs_perag *pag; /* AG info, if applicable */
xfs_daddr_t next_daddr; /* next daddr we expect */
u64 missing_owner; /* owner of holes */
u32 dev; /* device id */
- xfs_agnumber_t agno; /* AG number, if applicable */
struct xfs_rmap_irec low; /* low rmap key */
struct xfs_rmap_irec high; /* high rmap key */
bool last; /* last extent? */
@@ -203,14 +204,13 @@ xfs_getfsmap_is_shared(
*stat = false;
if (!xfs_sb_version_hasreflink(&mp->m_sb))
return 0;
- /* rt files will have agno set to NULLAGNUMBER */
- if (info->agno == NULLAGNUMBER)
+ /* rt files will have no perag structure */
+ if (!info->pag)
return 0;
/* Are there any shared blocks here? */
flen = 0;
- cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp,
- info->agno);
+ cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, info->pag);
error = xfs_refcount_find_shared(cur, rec->rm_startblock,
rec->rm_blockcount, &fbno, &flen, false);
@@ -311,7 +311,8 @@ xfs_getfsmap_helper(
if (info->head->fmh_entries >= info->head->fmh_count)
return -ECANCELED;
- trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec);
+ trace_xfs_fsmap_mapping(mp, info->dev,
+ info->pag ? info->pag->pag_agno : NULLAGNUMBER, rec);
fmr.fmr_device = info->dev;
fmr.fmr_physical = rec_daddr;
@@ -354,7 +355,7 @@ xfs_getfsmap_datadev_helper(
xfs_fsblock_t fsb;
xfs_daddr_t rec_daddr;
- fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.agno, rec->rm_startblock);
+ fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, rec->rm_startblock);
rec_daddr = XFS_FSB_TO_DADDR(mp, fsb);
return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr);
@@ -372,7 +373,7 @@ xfs_getfsmap_datadev_bnobt_helper(
struct xfs_rmap_irec irec;
xfs_daddr_t rec_daddr;
- rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_ag.agno,
+ rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_ag.pag->pag_agno,
rec->ar_startblock);
irec.rm_startblock = rec->ar_startblock;
@@ -429,8 +430,8 @@ xfs_getfsmap_logdev(
info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS;
info->missing_owner = XFS_FMR_OWN_FREE;
- trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low);
- trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high);
+ trace_xfs_fsmap_low_key(mp, info->dev, NULLAGNUMBER, &info->low);
+ trace_xfs_fsmap_high_key(mp, info->dev, NULLAGNUMBER, &info->high);
if (keys[0].fmr_physical > 0)
return 0;
@@ -508,8 +509,8 @@ __xfs_getfsmap_rtdev(
info->high.rm_blockcount = 0;
xfs_getfsmap_set_irec_flags(&info->high, &keys[1]);
- trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low);
- trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high);
+ trace_xfs_fsmap_low_key(mp, info->dev, NULLAGNUMBER, &info->low);
+ trace_xfs_fsmap_high_key(mp, info->dev, NULLAGNUMBER, &info->high);
return query_fn(tp, info);
}
@@ -572,6 +573,7 @@ __xfs_getfsmap_datadev(
void *priv)
{
struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
struct xfs_btree_cur *bt_cur = NULL;
xfs_fsblock_t start_fsb;
xfs_fsblock_t end_fsb;
@@ -610,20 +612,20 @@ __xfs_getfsmap_datadev(
start_ag = XFS_FSB_TO_AGNO(mp, start_fsb);
end_ag = XFS_FSB_TO_AGNO(mp, end_fsb);
- /* Query each AG */
- for (info->agno = start_ag; info->agno <= end_ag; info->agno++) {
+ for_each_perag_range(mp, start_ag, end_ag, pag) {
/*
* Set the AG high key from the fsmap high key if this
* is the last AG that we're querying.
*/
- if (info->agno == end_ag) {
+ info->pag = pag;
+ if (pag->pag_agno == end_ag) {
info->high.rm_startblock = XFS_FSB_TO_AGBNO(mp,
end_fsb);
info->high.rm_offset = XFS_BB_TO_FSBT(mp,
keys[1].fmr_offset);
error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]);
if (error)
- goto err;
+ break;
xfs_getfsmap_set_irec_flags(&info->high, &keys[1]);
}
@@ -634,38 +636,45 @@ __xfs_getfsmap_datadev(
info->agf_bp = NULL;
}
- error = xfs_alloc_read_agf(mp, tp, info->agno, 0,
+ error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0,
&info->agf_bp);
if (error)
- goto err;
+ break;
- trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low);
- trace_xfs_fsmap_high_key(mp, info->dev, info->agno,
+ trace_xfs_fsmap_low_key(mp, info->dev, pag->pag_agno,
+ &info->low);
+ trace_xfs_fsmap_high_key(mp, info->dev, pag->pag_agno,
&info->high);
error = query_fn(tp, info, &bt_cur, priv);
if (error)
- goto err;
+ break;
/*
* Set the AG low key to the start of the AG prior to
* moving on to the next AG.
*/
- if (info->agno == start_ag) {
+ if (pag->pag_agno == start_ag) {
info->low.rm_startblock = 0;
info->low.rm_owner = 0;
info->low.rm_offset = 0;
info->low.rm_flags = 0;
}
- }
- /* Report any gap at the end of the AG */
- info->last = true;
- error = query_fn(tp, info, &bt_cur, priv);
- if (error)
- goto err;
+ /*
+ * If this is the last AG, report any gap at the end of it
+ * before we drop the reference to the perag when the loop
+ * terminates.
+ */
+ if (pag->pag_agno == end_ag) {
+ info->last = true;
+ error = query_fn(tp, info, &bt_cur, priv);
+ if (error)
+ break;
+ }
+ info->pag = NULL;
+ }
-err:
if (bt_cur)
xfs_btree_del_cursor(bt_cur, error < 0 ? XFS_BTREE_ERROR :
XFS_BTREE_NOERROR);
@@ -673,6 +682,13 @@ err:
xfs_trans_brelse(tp, info->agf_bp);
info->agf_bp = NULL;
}
+ if (info->pag) {
+ xfs_perag_put(info->pag);
+ info->pag = NULL;
+ } else if (pag) {
+ /* loop termination case */
+ xfs_perag_put(pag);
+ }
return error;
}
@@ -691,7 +707,7 @@ xfs_getfsmap_datadev_rmapbt_query(
/* Allocate cursor for this AG and query_range it. */
*curpp = xfs_rmapbt_init_cursor(tp->t_mountp, tp, info->agf_bp,
- info->agno);
+ info->pag);
return xfs_rmap_query_range(*curpp, &info->low, &info->high,
xfs_getfsmap_datadev_helper, info);
}
@@ -724,7 +740,7 @@ xfs_getfsmap_datadev_bnobt_query(
/* Allocate cursor for this AG and query_range it. */
*curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp, info->agf_bp,
- info->agno, XFS_BTNUM_BNO);
+ info->pag, XFS_BTNUM_BNO);
key->ar_startblock = info->low.rm_startblock;
key[1].ar_startblock = info->high.rm_startblock;
return xfs_alloc_query_range(*curpp, key, &key[1],
@@ -937,7 +953,7 @@ xfs_getfsmap(
info.dev = handlers[i].dev;
info.last = false;
- info.agno = NULLAGNUMBER;
+ info.pag = NULL;
error = handlers[i].fn(tp, dkeys, &info);
if (error)
break;
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index be9cf88d2ad7..6ed29b158312 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -538,25 +538,25 @@ xfs_do_force_shutdown(
if (flags & SHUTDOWN_FORCE_UMOUNT) {
xfs_alert(mp,
-"User initiated shutdown received. Shutting down filesystem");
+"User initiated shutdown (0x%x) received. Shutting down filesystem",
+ flags);
return;
}
- xfs_notice(mp,
-"%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT,
- __func__, flags, lnnum, fname, __return_address);
-
if (flags & SHUTDOWN_CORRUPT_INCORE) {
xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
-"Corruption of in-memory data detected. Shutting down filesystem");
+"Corruption of in-memory data (0x%x) detected at %pS (%s:%d). Shutting down filesystem",
+ flags, __return_address, fname, lnnum);
if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
xfs_stack_trace();
} else if (logerror) {
xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
- "Log I/O Error Detected. Shutting down filesystem");
+"Log I/O error (0x%x) detected at %pS (%s:%d). Shutting down filesystem",
+ flags, __return_address, fname, lnnum);
} else {
xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
- "I/O Error Detected. Shutting down filesystem");
+"I/O error (0x%x) detected at %pS (%s:%d). Shutting down filesystem",
+ flags, __return_address, fname, lnnum);
}
xfs_alert(mp,
@@ -576,10 +576,8 @@ xfs_fs_reserve_ag_blocks(
int err2;
mp->m_finobt_nores = false;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- pag = xfs_perag_get(mp, agno);
+ for_each_perag(mp, agno, pag) {
err2 = xfs_ag_resv_init(pag, NULL);
- xfs_perag_put(pag);
if (err2 && !error)
error = err2;
}
@@ -605,10 +603,8 @@ xfs_fs_unreserve_ag_blocks(
int error = 0;
int err2;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- pag = xfs_perag_get(mp, agno);
+ for_each_perag(mp, agno, pag) {
err2 = xfs_ag_resv_free(pag);
- xfs_perag_put(pag);
if (err2 && !error)
error = err2;
}
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index 8e0cb05a7142..eb10eacabc8f 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -9,11 +9,11 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trace.h"
#include "xfs_health.h"
+#include "xfs_ag.h"
/*
* Warn about metadata corruption that we detected but haven't fixed, and
@@ -34,14 +34,12 @@ xfs_health_unmount(
return;
/* Measure AG corruption levels. */
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- pag = xfs_perag_get(mp, agno);
+ for_each_perag(mp, agno, pag) {
xfs_ag_measure_sickness(pag, &sick, &checked);
if (sick) {
trace_xfs_ag_unfixed_corruption(mp, agno, sick);
warn = true;
}
- xfs_perag_put(pag);
}
/* Measure realtime volume corruption levels. */
@@ -231,6 +229,15 @@ xfs_inode_mark_sick(
ip->i_sick |= mask;
ip->i_checked |= mask;
spin_unlock(&ip->i_flags_lock);
+
+ /*
+ * Keep this inode around so we don't lose the sickness report. Scrub
+ * grabs inodes with DONTCACHE assuming that most inode are ok, which
+ * is not the case here.
+ */
+ spin_lock(&VFS_I(ip)->i_lock);
+ VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ spin_unlock(&VFS_I(ip)->i_lock);
}
/* Mark parts of an inode healed. */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 3c81daca0e9a..6007683482c6 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -9,7 +9,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
@@ -23,9 +22,65 @@
#include "xfs_dquot.h"
#include "xfs_reflink.h"
#include "xfs_ialloc.h"
+#include "xfs_ag.h"
#include <linux/iversion.h>
+/* Radix tree tags for incore inode tree. */
+
+/* inode is to be reclaimed */
+#define XFS_ICI_RECLAIM_TAG 0
+/* Inode has speculative preallocations (posteof or cow) to clean. */
+#define XFS_ICI_BLOCKGC_TAG 1
+
+/*
+ * The goal for walking incore inodes. These can correspond with incore inode
+ * radix tree tags when convenient. Avoid existing XFS_IWALK namespace.
+ */
+enum xfs_icwalk_goal {
+ /* Goals that are not related to tags; these must be < 0. */
+ XFS_ICWALK_DQRELE = -1,
+
+ /* Goals directly associated with tagged inodes. */
+ XFS_ICWALK_BLOCKGC = XFS_ICI_BLOCKGC_TAG,
+ XFS_ICWALK_RECLAIM = XFS_ICI_RECLAIM_TAG,
+};
+
+#define XFS_ICWALK_NULL_TAG (-1U)
+
+/* Compute the inode radix tree tag for this goal. */
+static inline unsigned int
+xfs_icwalk_tag(enum xfs_icwalk_goal goal)
+{
+ return goal < 0 ? XFS_ICWALK_NULL_TAG : goal;
+}
+
+static int xfs_icwalk(struct xfs_mount *mp,
+ enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
+static int xfs_icwalk_ag(struct xfs_perag *pag,
+ enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
+
+/*
+ * Private inode cache walk flags for struct xfs_icwalk. Must not
+ * coincide with XFS_ICWALK_FLAGS_VALID.
+ */
+#define XFS_ICWALK_FLAG_DROP_UDQUOT (1U << 31)
+#define XFS_ICWALK_FLAG_DROP_GDQUOT (1U << 30)
+#define XFS_ICWALK_FLAG_DROP_PDQUOT (1U << 29)
+
+/* Stop scanning after icw_scan_limit inodes. */
+#define XFS_ICWALK_FLAG_SCAN_LIMIT (1U << 28)
+
+#define XFS_ICWALK_FLAG_RECLAIM_SICK (1U << 27)
+#define XFS_ICWALK_FLAG_UNION (1U << 26) /* union filter algorithm */
+
+#define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_DROP_UDQUOT | \
+ XFS_ICWALK_FLAG_DROP_GDQUOT | \
+ XFS_ICWALK_FLAG_DROP_PDQUOT | \
+ XFS_ICWALK_FLAG_SCAN_LIMIT | \
+ XFS_ICWALK_FLAG_RECLAIM_SICK | \
+ XFS_ICWALK_FLAG_UNION)
+
/*
* Allocate and initialise an xfs_inode.
*/
@@ -157,46 +212,94 @@ xfs_reclaim_work_queue(
rcu_read_unlock();
}
-static void
-xfs_perag_set_reclaim_tag(
+/*
+ * Background scanning to trim preallocated space. This is queued based on the
+ * 'speculative_prealloc_lifetime' tunable (5m by default).
+ */
+static inline void
+xfs_blockgc_queue(
struct xfs_perag *pag)
{
+ rcu_read_lock();
+ if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
+ queue_delayed_work(pag->pag_mount->m_gc_workqueue,
+ &pag->pag_blockgc_work,
+ msecs_to_jiffies(xfs_blockgc_secs * 1000));
+ rcu_read_unlock();
+}
+
+/* Set a tag on both the AG incore inode tree and the AG radix tree. */
+static void
+xfs_perag_set_inode_tag(
+ struct xfs_perag *pag,
+ xfs_agino_t agino,
+ unsigned int tag)
+{
struct xfs_mount *mp = pag->pag_mount;
+ bool was_tagged;
lockdep_assert_held(&pag->pag_ici_lock);
- if (pag->pag_ici_reclaimable++)
+
+ was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
+ radix_tree_tag_set(&pag->pag_ici_root, agino, tag);
+
+ if (tag == XFS_ICI_RECLAIM_TAG)
+ pag->pag_ici_reclaimable++;
+
+ if (was_tagged)
return;
- /* propagate the reclaim tag up into the perag radix tree */
+ /* propagate the tag up into the perag radix tree */
spin_lock(&mp->m_perag_lock);
- radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
- XFS_ICI_RECLAIM_TAG);
+ radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag);
spin_unlock(&mp->m_perag_lock);
- /* schedule periodic background inode reclaim */
- xfs_reclaim_work_queue(mp);
+ /* start background work */
+ switch (tag) {
+ case XFS_ICI_RECLAIM_TAG:
+ xfs_reclaim_work_queue(mp);
+ break;
+ case XFS_ICI_BLOCKGC_TAG:
+ xfs_blockgc_queue(pag);
+ break;
+ }
- trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
+ trace_xfs_perag_set_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
}
+/* Clear a tag on both the AG incore inode tree and the AG radix tree. */
static void
-xfs_perag_clear_reclaim_tag(
- struct xfs_perag *pag)
+xfs_perag_clear_inode_tag(
+ struct xfs_perag *pag,
+ xfs_agino_t agino,
+ unsigned int tag)
{
struct xfs_mount *mp = pag->pag_mount;
lockdep_assert_held(&pag->pag_ici_lock);
- if (--pag->pag_ici_reclaimable)
+
+ /*
+ * Reclaim can signal (with a null agino) that it cleared its own tag
+ * by removing the inode from the radix tree.
+ */
+ if (agino != NULLAGINO)
+ radix_tree_tag_clear(&pag->pag_ici_root, agino, tag);
+ else
+ ASSERT(tag == XFS_ICI_RECLAIM_TAG);
+
+ if (tag == XFS_ICI_RECLAIM_TAG)
+ pag->pag_ici_reclaimable--;
+
+ if (radix_tree_tagged(&pag->pag_ici_root, tag))
return;
- /* clear the reclaim tag from the perag radix tree */
+ /* clear the tag from the perag radix tree */
spin_lock(&mp->m_perag_lock);
- radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
- XFS_ICI_RECLAIM_TAG);
+ radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag);
spin_unlock(&mp->m_perag_lock);
- trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
-}
+ trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
+}
/*
* We set the inode flag atomically with the radix tree tag.
@@ -204,7 +307,7 @@ xfs_perag_clear_reclaim_tag(
* can go away.
*/
void
-xfs_inode_set_reclaim_tag(
+xfs_inode_mark_reclaimable(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
@@ -214,9 +317,8 @@ xfs_inode_set_reclaim_tag(
spin_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock);
- radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
- XFS_ICI_RECLAIM_TAG);
- xfs_perag_set_reclaim_tag(pag);
+ xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
spin_unlock(&ip->i_flags_lock);
@@ -224,18 +326,7 @@ xfs_inode_set_reclaim_tag(
xfs_perag_put(pag);
}
-STATIC void
-xfs_inode_clear_reclaim_tag(
- struct xfs_perag *pag,
- xfs_ino_t ino)
-{
- radix_tree_tag_clear(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(pag->pag_mount, ino),
- XFS_ICI_RECLAIM_TAG);
- xfs_perag_clear_reclaim_tag(pag);
-}
-
-static void
+static inline void
xfs_inew_wait(
struct xfs_inode *ip)
{
@@ -264,14 +355,14 @@ xfs_reinit_inode(
struct xfs_mount *mp,
struct inode *inode)
{
- int error;
- uint32_t nlink = inode->i_nlink;
- uint32_t generation = inode->i_generation;
- uint64_t version = inode_peek_iversion(inode);
- umode_t mode = inode->i_mode;
- dev_t dev = inode->i_rdev;
- kuid_t uid = inode->i_uid;
- kgid_t gid = inode->i_gid;
+ int error;
+ uint32_t nlink = inode->i_nlink;
+ uint32_t generation = inode->i_generation;
+ uint64_t version = inode_peek_iversion(inode);
+ umode_t mode = inode->i_mode;
+ dev_t dev = inode->i_rdev;
+ kuid_t uid = inode->i_uid;
+ kgid_t gid = inode->i_gid;
error = inode_init_always(mp->m_super, inode);
@@ -286,6 +377,74 @@ xfs_reinit_inode(
}
/*
+ * Carefully nudge an inode whose VFS state has been torn down back into a
+ * usable state. Drops the i_flags_lock and the rcu read lock.
+ */
+static int
+xfs_iget_recycle(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip) __releases(&ip->i_flags_lock)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
+ int error;
+
+ trace_xfs_iget_recycle(ip);
+
+ /*
+ * We need to make it look like the inode is being reclaimed to prevent
+ * the actual reclaim workers from stomping over us while we recycle
+ * the inode. We can't clear the radix tree tag yet as it requires
+ * pag_ici_lock to be held exclusive.
+ */
+ ip->i_flags |= XFS_IRECLAIM;
+
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+
+ ASSERT(!rwsem_is_locked(&inode->i_rwsem));
+ error = xfs_reinit_inode(mp, inode);
+ if (error) {
+ bool wake;
+
+ /*
+ * Re-initializing the inode failed, and we are in deep
+ * trouble. Try to re-add it to the reclaim list.
+ */
+ rcu_read_lock();
+ spin_lock(&ip->i_flags_lock);
+ wake = !!__xfs_iflags_test(ip, XFS_INEW);
+ ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
+ if (wake)
+ wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
+ ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+
+ trace_xfs_iget_recycle_fail(ip);
+ return error;
+ }
+
+ spin_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+
+ /*
+ * Clear the per-lifetime state in the inode as we are now effectively
+ * a new inode and need to return to the initial state before reuse
+ * occurs.
+ */
+ ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
+ ip->i_flags |= XFS_INEW;
+ xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+ inode->i_state = I_NEW;
+ spin_unlock(&ip->i_flags_lock);
+ spin_unlock(&pag->pag_ici_lock);
+
+ return 0;
+}
+
+/*
* If we are allocating a new inode, then check what was returned is
* actually a free, empty inode. If we are not allocating an inode,
* then check we didn't find a free inode.
@@ -348,30 +507,21 @@ xfs_iget_cache_hit(
* will not match, so check for that, too.
*/
spin_lock(&ip->i_flags_lock);
- if (ip->i_ino != ino) {
- trace_xfs_iget_skip(ip);
- XFS_STATS_INC(mp, xs_ig_frecycle);
- error = -EAGAIN;
- goto out_error;
- }
-
+ if (ip->i_ino != ino)
+ goto out_skip;
/*
* If we are racing with another cache hit that is currently
* instantiating this inode or currently recycling it out of
- * reclaimabe state, wait for the initialisation to complete
+ * reclaimable state, wait for the initialisation to complete
* before continuing.
*
* XXX(hch): eventually we should do something equivalent to
* wait_on_inode to wait for these flags to be cleared
* instead of polling for it.
*/
- if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
- trace_xfs_iget_skip(ip);
- XFS_STATS_INC(mp, xs_ig_frecycle);
- error = -EAGAIN;
- goto out_error;
- }
+ if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM))
+ goto out_skip;
/*
* Check the inode free state is valid. This also detects lookup
@@ -381,72 +531,21 @@ xfs_iget_cache_hit(
if (error)
goto out_error;
- /*
- * If IRECLAIMABLE is set, we've torn down the VFS inode already.
- * Need to carefully get it back into useable state.
- */
- if (ip->i_flags & XFS_IRECLAIMABLE) {
- trace_xfs_iget_reclaim(ip);
-
- if (flags & XFS_IGET_INCORE) {
- error = -EAGAIN;
- goto out_error;
- }
-
- /*
- * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
- * from stomping over us while we recycle the inode. We can't
- * clear the radix tree reclaimable tag yet as it requires
- * pag_ici_lock to be held exclusive.
- */
- ip->i_flags |= XFS_IRECLAIM;
-
- spin_unlock(&ip->i_flags_lock);
- rcu_read_unlock();
+ /* Skip inodes that have no vfs state. */
+ if ((flags & XFS_IGET_INCORE) &&
+ (ip->i_flags & XFS_IRECLAIMABLE))
+ goto out_skip;
- ASSERT(!rwsem_is_locked(&inode->i_rwsem));
- error = xfs_reinit_inode(mp, inode);
- if (error) {
- bool wake;
- /*
- * Re-initializing the inode failed, and we are in deep
- * trouble. Try to re-add it to the reclaim list.
- */
- rcu_read_lock();
- spin_lock(&ip->i_flags_lock);
- wake = !!__xfs_iflags_test(ip, XFS_INEW);
- ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
- if (wake)
- wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
- ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
- trace_xfs_iget_reclaim_fail(ip);
- goto out_error;
- }
-
- spin_lock(&pag->pag_ici_lock);
- spin_lock(&ip->i_flags_lock);
-
- /*
- * Clear the per-lifetime state in the inode as we are now
- * effectively a new inode and need to return to the initial
- * state before reuse occurs.
- */
- ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
- ip->i_flags |= XFS_INEW;
- xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
- inode->i_state = I_NEW;
- ip->i_sick = 0;
- ip->i_checked = 0;
-
- spin_unlock(&ip->i_flags_lock);
- spin_unlock(&pag->pag_ici_lock);
+ /* The inode fits the selection criteria; process it. */
+ if (ip->i_flags & XFS_IRECLAIMABLE) {
+ /* Drops i_flags_lock and RCU read lock. */
+ error = xfs_iget_recycle(pag, ip);
+ if (error)
+ return error;
} else {
/* If the VFS inode is being torn down, pause and try again. */
- if (!igrab(inode)) {
- trace_xfs_iget_skip(ip);
- error = -EAGAIN;
- goto out_error;
- }
+ if (!igrab(inode))
+ goto out_skip;
/* We've got a live one. */
spin_unlock(&ip->i_flags_lock);
@@ -463,13 +562,16 @@ xfs_iget_cache_hit(
return 0;
+out_skip:
+ trace_xfs_iget_skip(ip);
+ XFS_STATS_INC(mp, xs_ig_frecycle);
+ error = -EAGAIN;
out_error:
spin_unlock(&ip->i_flags_lock);
rcu_read_unlock();
return error;
}
-
static int
xfs_iget_cache_miss(
struct xfs_mount *mp,
@@ -715,207 +817,96 @@ xfs_icache_inode_is_allocated(
return 0;
}
-/*
- * The inode lookup is done in batches to keep the amount of lock traffic and
- * radix tree lookups to a minimum. The batch size is a trade off between
- * lookup reduction and stack usage. This is in the reclaim path, so we can't
- * be too greedy.
- */
-#define XFS_LOOKUP_BATCH 32
-
-/*
- * Decide if the given @ip is eligible to be a part of the inode walk, and
- * grab it if so. Returns true if it's ready to go or false if we should just
- * ignore it.
- */
-STATIC bool
-xfs_inode_walk_ag_grab(
- struct xfs_inode *ip,
- int flags)
+#ifdef CONFIG_XFS_QUOTA
+/* Decide if we want to grab this inode to drop its dquots. */
+static bool
+xfs_dqrele_igrab(
+ struct xfs_inode *ip)
{
- struct inode *inode = VFS_I(ip);
- bool newinos = !!(flags & XFS_INODE_WALK_INEW_WAIT);
+ bool ret = false;
ASSERT(rcu_read_lock_held());
/* Check for stale RCU freed inode */
spin_lock(&ip->i_flags_lock);
if (!ip->i_ino)
- goto out_unlock_noent;
-
- /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
- if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) ||
- __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM))
- goto out_unlock_noent;
- spin_unlock(&ip->i_flags_lock);
-
- /* nothing to sync during shutdown */
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return false;
+ goto out_unlock;
- /* If we can't grab the inode, it must on it's way to reclaim. */
- if (!igrab(inode))
- return false;
+ /*
+ * Skip inodes that are anywhere in the reclaim machinery because we
+ * drop dquots before tagging an inode for reclamation.
+ */
+ if (ip->i_flags & (XFS_IRECLAIM | XFS_IRECLAIMABLE))
+ goto out_unlock;
- /* inode is valid */
- return true;
+ /*
+ * The inode looks alive; try to grab a VFS reference so that it won't
+ * get destroyed. If we got the reference, return true to say that
+ * we grabbed the inode.
+ *
+ * If we can't get the reference, then we know the inode had its VFS
+ * state torn down and hasn't yet entered the reclaim machinery. Since
+ * we also know that dquots are detached from an inode before it enters
+ * reclaim, we can skip the inode.
+ */
+ ret = igrab(VFS_I(ip)) != NULL;
-out_unlock_noent:
+out_unlock:
spin_unlock(&ip->i_flags_lock);
- return false;
+ return ret;
}
-/*
- * For a given per-AG structure @pag, grab, @execute, and rele all incore
- * inodes with the given radix tree @tag.
- */
-STATIC int
-xfs_inode_walk_ag(
- struct xfs_perag *pag,
- int iter_flags,
- int (*execute)(struct xfs_inode *ip, void *args),
- void *args,
- int tag)
+/* Drop this inode's dquots. */
+static void
+xfs_dqrele_inode(
+ struct xfs_inode *ip,
+ struct xfs_icwalk *icw)
{
- struct xfs_mount *mp = pag->pag_mount;
- uint32_t first_index;
- int last_error = 0;
- int skipped;
- bool done;
- int nr_found;
-
-restart:
- done = false;
- skipped = 0;
- first_index = 0;
- nr_found = 0;
- do {
- struct xfs_inode *batch[XFS_LOOKUP_BATCH];
- int error = 0;
- int i;
-
- rcu_read_lock();
-
- if (tag == XFS_ICI_NO_TAG)
- nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
- (void **)batch, first_index,
- XFS_LOOKUP_BATCH);
- else
- nr_found = radix_tree_gang_lookup_tag(
- &pag->pag_ici_root,
- (void **) batch, first_index,
- XFS_LOOKUP_BATCH, tag);
-
- if (!nr_found) {
- rcu_read_unlock();
- break;
- }
-
- /*
- * Grab the inodes before we drop the lock. if we found
- * nothing, nr == 0 and the loop will be skipped.
- */
- for (i = 0; i < nr_found; i++) {
- struct xfs_inode *ip = batch[i];
-
- if (done || !xfs_inode_walk_ag_grab(ip, iter_flags))
- batch[i] = NULL;
-
- /*
- * Update the index for the next lookup. Catch
- * overflows into the next AG range which can occur if
- * we have inodes in the last block of the AG and we
- * are currently pointing to the last inode.
- *
- * Because we may see inodes that are from the wrong AG
- * due to RCU freeing and reallocation, only update the
- * index if it lies in this AG. It was a race that lead
- * us to see this inode, so another lookup from the
- * same index will not find it again.
- */
- if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
- continue;
- first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
- if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
- done = true;
- }
-
- /* unlock now we've grabbed the inodes. */
- rcu_read_unlock();
-
- for (i = 0; i < nr_found; i++) {
- if (!batch[i])
- continue;
- if ((iter_flags & XFS_INODE_WALK_INEW_WAIT) &&
- xfs_iflags_test(batch[i], XFS_INEW))
- xfs_inew_wait(batch[i]);
- error = execute(batch[i], args);
- xfs_irele(batch[i]);
- if (error == -EAGAIN) {
- skipped++;
- continue;
- }
- if (error && last_error != -EFSCORRUPTED)
- last_error = error;
- }
-
- /* bail out if the filesystem is corrupted. */
- if (error == -EFSCORRUPTED)
- break;
+ if (xfs_iflags_test(ip, XFS_INEW))
+ xfs_inew_wait(ip);
- cond_resched();
-
- } while (nr_found && !done);
-
- if (skipped) {
- delay(1);
- goto restart;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_UDQUOT) {
+ xfs_qm_dqrele(ip->i_udquot);
+ ip->i_udquot = NULL;
}
- return last_error;
-}
-
-/* Fetch the next (possibly tagged) per-AG structure. */
-static inline struct xfs_perag *
-xfs_inode_walk_get_perag(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- int tag)
-{
- if (tag == XFS_ICI_NO_TAG)
- return xfs_perag_get(mp, agno);
- return xfs_perag_get_tag(mp, agno, tag);
+ if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_GDQUOT) {
+ xfs_qm_dqrele(ip->i_gdquot);
+ ip->i_gdquot = NULL;
+ }
+ if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_PDQUOT) {
+ xfs_qm_dqrele(ip->i_pdquot);
+ ip->i_pdquot = NULL;
+ }
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
}
/*
- * Call the @execute function on all incore inodes matching the radix tree
- * @tag.
+ * Detach all dquots from incore inodes if we can. The caller must already
+ * have dropped the relevant XFS_[UGP]QUOTA_ACTIVE flags so that dquots will
+ * not get reattached.
*/
int
-xfs_inode_walk(
+xfs_dqrele_all_inodes(
struct xfs_mount *mp,
- int iter_flags,
- int (*execute)(struct xfs_inode *ip, void *args),
- void *args,
- int tag)
+ unsigned int qflags)
{
- struct xfs_perag *pag;
- int error = 0;
- int last_error = 0;
- xfs_agnumber_t ag;
+ struct xfs_icwalk icw = { .icw_flags = 0 };
- ag = 0;
- while ((pag = xfs_inode_walk_get_perag(mp, ag, tag))) {
- ag = pag->pag_agno + 1;
- error = xfs_inode_walk_ag(pag, iter_flags, execute, args, tag);
- xfs_perag_put(pag);
- if (error) {
- last_error = error;
- if (error == -EFSCORRUPTED)
- break;
- }
- }
- return last_error;
+ if (qflags & XFS_UQUOTA_ACCT)
+ icw.icw_flags |= XFS_ICWALK_FLAG_DROP_UDQUOT;
+ if (qflags & XFS_GQUOTA_ACCT)
+ icw.icw_flags |= XFS_ICWALK_FLAG_DROP_GDQUOT;
+ if (qflags & XFS_PQUOTA_ACCT)
+ icw.icw_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT;
+
+ return xfs_icwalk(mp, XFS_ICWALK_DQRELE, &icw);
}
+#else
+# define xfs_dqrele_igrab(ip) (false)
+# define xfs_dqrele_inode(ip, priv) ((void)0)
+#endif /* CONFIG_XFS_QUOTA */
/*
* Grab the inode for reclaim exclusively.
@@ -935,8 +926,9 @@ xfs_inode_walk(
* Return true if we grabbed it, false otherwise.
*/
static bool
-xfs_reclaim_inode_grab(
- struct xfs_inode *ip)
+xfs_reclaim_igrab(
+ struct xfs_inode *ip,
+ struct xfs_icwalk *icw)
{
ASSERT(rcu_read_lock_held());
@@ -947,6 +939,14 @@ xfs_reclaim_inode_grab(
spin_unlock(&ip->i_flags_lock);
return false;
}
+
+ /* Don't reclaim a sick inode unless the caller asked for it. */
+ if (ip->i_sick &&
+ (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) {
+ spin_unlock(&ip->i_flags_lock);
+ return false;
+ }
+
__xfs_iflags_set(ip, XFS_IRECLAIM);
spin_unlock(&ip->i_flags_lock);
return true;
@@ -1002,6 +1002,8 @@ reclaim:
spin_lock(&ip->i_flags_lock);
ip->i_flags = XFS_IRECLAIM;
ip->i_ino = 0;
+ ip->i_sick = 0;
+ ip->i_checked = 0;
spin_unlock(&ip->i_flags_lock);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1018,7 +1020,7 @@ reclaim:
if (!radix_tree_delete(&pag->pag_ici_root,
XFS_INO_TO_AGINO(ip->i_mount, ino)))
ASSERT(0);
- xfs_perag_clear_reclaim_tag(pag);
+ xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG);
spin_unlock(&pag->pag_ici_lock);
/*
@@ -1030,7 +1032,7 @@ reclaim:
* unlocked after the lookup before we go ahead and free it.
*/
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_qm_dqdetach(ip);
+ ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
ASSERT(xfs_inode_clean(ip));
@@ -1045,108 +1047,30 @@ out:
xfs_iflags_clear(ip, XFS_IRECLAIM);
}
-/*
- * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
- * corrupted, we still want to try to reclaim all the inodes. If we don't,
- * then a shut down during filesystem unmount reclaim walk leak all the
- * unreclaimed inodes.
- *
- * Returns non-zero if any AGs or inodes were skipped in the reclaim pass
- * so that callers that want to block until all dirty inodes are written back
- * and reclaimed can sanely loop.
- */
-static void
-xfs_reclaim_inodes_ag(
- struct xfs_mount *mp,
- int *nr_to_scan)
+/* Reclaim sick inodes if we're unmounting or the fs went down. */
+static inline bool
+xfs_want_reclaim_sick(
+ struct xfs_mount *mp)
{
- struct xfs_perag *pag;
- xfs_agnumber_t ag = 0;
-
- while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
- unsigned long first_index = 0;
- int done = 0;
- int nr_found = 0;
-
- ag = pag->pag_agno + 1;
-
- first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
- do {
- struct xfs_inode *batch[XFS_LOOKUP_BATCH];
- int i;
-
- rcu_read_lock();
- nr_found = radix_tree_gang_lookup_tag(
- &pag->pag_ici_root,
- (void **)batch, first_index,
- XFS_LOOKUP_BATCH,
- XFS_ICI_RECLAIM_TAG);
- if (!nr_found) {
- done = 1;
- rcu_read_unlock();
- break;
- }
-
- /*
- * Grab the inodes before we drop the lock. if we found
- * nothing, nr == 0 and the loop will be skipped.
- */
- for (i = 0; i < nr_found; i++) {
- struct xfs_inode *ip = batch[i];
-
- if (done || !xfs_reclaim_inode_grab(ip))
- batch[i] = NULL;
-
- /*
- * Update the index for the next lookup. Catch
- * overflows into the next AG range which can
- * occur if we have inodes in the last block of
- * the AG and we are currently pointing to the
- * last inode.
- *
- * Because we may see inodes that are from the
- * wrong AG due to RCU freeing and
- * reallocation, only update the index if it
- * lies in this AG. It was a race that lead us
- * to see this inode, so another lookup from
- * the same index will not find it again.
- */
- if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
- pag->pag_agno)
- continue;
- first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
- if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
- done = 1;
- }
-
- /* unlock now we've grabbed the inodes. */
- rcu_read_unlock();
-
- for (i = 0; i < nr_found; i++) {
- if (batch[i])
- xfs_reclaim_inode(batch[i], pag);
- }
-
- *nr_to_scan -= XFS_LOOKUP_BATCH;
- cond_resched();
- } while (nr_found && !done && *nr_to_scan > 0);
-
- if (done)
- first_index = 0;
- WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
- xfs_perag_put(pag);
- }
+ return (mp->m_flags & XFS_MOUNT_UNMOUNTING) ||
+ (mp->m_flags & XFS_MOUNT_NORECOVERY) ||
+ XFS_FORCED_SHUTDOWN(mp);
}
void
xfs_reclaim_inodes(
struct xfs_mount *mp)
{
- int nr_to_scan = INT_MAX;
+ struct xfs_icwalk icw = {
+ .icw_flags = 0,
+ };
+
+ if (xfs_want_reclaim_sick(mp))
+ icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
xfs_ail_push_all_sync(mp->m_ail);
- xfs_reclaim_inodes_ag(mp, &nr_to_scan);
+ xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
}
}
@@ -1160,13 +1084,21 @@ xfs_reclaim_inodes(
long
xfs_reclaim_inodes_nr(
struct xfs_mount *mp,
- int nr_to_scan)
+ unsigned long nr_to_scan)
{
+ struct xfs_icwalk icw = {
+ .icw_flags = XFS_ICWALK_FLAG_SCAN_LIMIT,
+ .icw_scan_limit = min_t(unsigned long, LONG_MAX, nr_to_scan),
+ };
+
+ if (xfs_want_reclaim_sick(mp))
+ icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
+
/* kick background reclaimer and push the AIL */
xfs_reclaim_work_queue(mp);
xfs_ail_push_all(mp->m_ail);
- xfs_reclaim_inodes_ag(mp, &nr_to_scan);
+ xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
return 0;
}
@@ -1174,13 +1106,13 @@ xfs_reclaim_inodes_nr(
* Return the number of reclaimable inodes in the filesystem for
* the shrinker to determine how much to reclaim.
*/
-int
+long
xfs_reclaim_inodes_count(
struct xfs_mount *mp)
{
struct xfs_perag *pag;
xfs_agnumber_t ag = 0;
- int reclaimable = 0;
+ long reclaimable = 0;
while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
ag = pag->pag_agno + 1;
@@ -1191,20 +1123,20 @@ xfs_reclaim_inodes_count(
}
STATIC bool
-xfs_inode_match_id(
+xfs_icwalk_match_id(
struct xfs_inode *ip,
- struct xfs_eofblocks *eofb)
+ struct xfs_icwalk *icw)
{
- if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
- !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
+ !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
return false;
- if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
- !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
+ !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
return false;
- if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
- ip->i_projid != eofb->eof_prid)
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
+ ip->i_projid != icw->icw_prid)
return false;
return true;
@@ -1215,20 +1147,20 @@ xfs_inode_match_id(
* criteria match. This is for global/internal scans only.
*/
STATIC bool
-xfs_inode_match_id_union(
+xfs_icwalk_match_id_union(
struct xfs_inode *ip,
- struct xfs_eofblocks *eofb)
+ struct xfs_icwalk *icw)
{
- if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
- uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
+ uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
return true;
- if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
- gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
+ gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
return true;
- if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
- ip->i_projid == eofb->eof_prid)
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
+ ip->i_projid == icw->icw_prid)
return true;
return false;
@@ -1236,29 +1168,29 @@ xfs_inode_match_id_union(
/*
* Is this inode @ip eligible for eof/cow block reclamation, given some
- * filtering parameters @eofb? The inode is eligible if @eofb is null or
+ * filtering parameters @icw? The inode is eligible if @icw is null or
* if the predicate functions match.
*/
static bool
-xfs_inode_matches_eofb(
+xfs_icwalk_match(
struct xfs_inode *ip,
- struct xfs_eofblocks *eofb)
+ struct xfs_icwalk *icw)
{
bool match;
- if (!eofb)
+ if (!icw)
return true;
- if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
- match = xfs_inode_match_id_union(ip, eofb);
+ if (icw->icw_flags & XFS_ICWALK_FLAG_UNION)
+ match = xfs_icwalk_match_id_union(ip, icw);
else
- match = xfs_inode_match_id(ip, eofb);
+ match = xfs_icwalk_match_id(ip, icw);
if (!match)
return false;
/* skip the inode if the file size is too small */
- if ((eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) &&
- XFS_ISIZE(ip) < eofb->eof_min_file_size)
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) &&
+ XFS_ISIZE(ip) < icw->icw_min_file_size)
return false;
return true;
@@ -1276,22 +1208,20 @@ xfs_reclaim_worker(
{
struct xfs_mount *mp = container_of(to_delayed_work(work),
struct xfs_mount, m_reclaim_work);
- int nr_to_scan = INT_MAX;
- xfs_reclaim_inodes_ag(mp, &nr_to_scan);
+ xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
xfs_reclaim_work_queue(mp);
}
STATIC int
xfs_inode_free_eofblocks(
struct xfs_inode *ip,
- void *args,
+ struct xfs_icwalk *icw,
unsigned int *lockflags)
{
- struct xfs_eofblocks *eofb = args;
bool wait;
- wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC);
+ wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
return 0;
@@ -1303,7 +1233,7 @@ xfs_inode_free_eofblocks(
if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
return 0;
- if (!xfs_inode_matches_eofb(ip, eofb))
+ if (!xfs_icwalk_match(ip, icw))
return 0;
/*
@@ -1326,22 +1256,6 @@ xfs_inode_free_eofblocks(
return 0;
}
-/*
- * Background scanning to trim preallocated space. This is queued based on the
- * 'speculative_prealloc_lifetime' tunable (5m by default).
- */
-static inline void
-xfs_blockgc_queue(
- struct xfs_perag *pag)
-{
- rcu_read_lock();
- if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
- queue_delayed_work(pag->pag_mount->m_gc_workqueue,
- &pag->pag_blockgc_work,
- msecs_to_jiffies(xfs_blockgc_secs * 1000));
- rcu_read_unlock();
-}
-
static void
xfs_blockgc_set_iflag(
struct xfs_inode *ip,
@@ -1349,7 +1263,6 @@ xfs_blockgc_set_iflag(
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_perag *pag;
- int tagged;
ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
@@ -1366,24 +1279,8 @@ xfs_blockgc_set_iflag(
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
spin_lock(&pag->pag_ici_lock);
- tagged = radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG);
- radix_tree_tag_set(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
- XFS_ICI_BLOCKGC_TAG);
- if (!tagged) {
- /* propagate the blockgc tag up into the perag radix tree */
- spin_lock(&ip->i_mount->m_perag_lock);
- radix_tree_tag_set(&ip->i_mount->m_perag_tree,
- XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
- XFS_ICI_BLOCKGC_TAG);
- spin_unlock(&ip->i_mount->m_perag_lock);
-
- /* kick off background trimming */
- xfs_blockgc_queue(pag);
-
- trace_xfs_perag_set_blockgc(ip->i_mount, pag->pag_agno, -1,
- _RET_IP_);
- }
+ xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ XFS_ICI_BLOCKGC_TAG);
spin_unlock(&pag->pag_ici_lock);
xfs_perag_put(pag);
@@ -1419,19 +1316,8 @@ xfs_blockgc_clear_iflag(
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
spin_lock(&pag->pag_ici_lock);
- radix_tree_tag_clear(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
- XFS_ICI_BLOCKGC_TAG);
- if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) {
- /* clear the blockgc tag from the perag radix tree */
- spin_lock(&ip->i_mount->m_perag_lock);
- radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
- XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
- XFS_ICI_BLOCKGC_TAG);
- spin_unlock(&ip->i_mount->m_perag_lock);
- trace_xfs_perag_clear_blockgc(ip->i_mount, pag->pag_agno, -1,
- _RET_IP_);
- }
+ xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ XFS_ICI_BLOCKGC_TAG);
spin_unlock(&pag->pag_ici_lock);
xfs_perag_put(pag);
@@ -1492,14 +1378,13 @@ xfs_prep_free_cowblocks(
STATIC int
xfs_inode_free_cowblocks(
struct xfs_inode *ip,
- void *args,
+ struct xfs_icwalk *icw,
unsigned int *lockflags)
{
- struct xfs_eofblocks *eofb = args;
bool wait;
int ret = 0;
- wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC);
+ wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
return 0;
@@ -1507,7 +1392,7 @@ xfs_inode_free_cowblocks(
if (!xfs_prep_free_cowblocks(ip))
return 0;
- if (!xfs_inode_matches_eofb(ip, eofb))
+ if (!xfs_icwalk_match(ip, icw))
return 0;
/*
@@ -1554,14 +1439,6 @@ xfs_inode_clear_cowblocks_tag(
return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
}
-#define for_each_perag_tag(mp, next_agno, pag, tag) \
- for ((next_agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \
- (pag) != NULL; \
- (next_agno) = (pag)->pag_agno + 1, \
- xfs_perag_put(pag), \
- (pag) = xfs_perag_get_tag((mp), (next_agno), (tag)))
-
-
/* Disable post-EOF and CoW block auto-reclamation. */
void
xfs_blockgc_stop(
@@ -1586,23 +1463,66 @@ xfs_blockgc_start(
xfs_blockgc_queue(pag);
}
+/* Don't try to run block gc on an inode that's in any of these states. */
+#define XFS_BLOCKGC_NOGRAB_IFLAGS (XFS_INEW | \
+ XFS_IRECLAIMABLE | \
+ XFS_IRECLAIM)
+/*
+ * Decide if the given @ip is eligible for garbage collection of speculative
+ * preallocations, and grab it if so. Returns true if it's ready to go or
+ * false if we should just ignore it.
+ */
+static bool
+xfs_blockgc_igrab(
+ struct xfs_inode *ip)
+{
+ struct inode *inode = VFS_I(ip);
+
+ ASSERT(rcu_read_lock_held());
+
+ /* Check for stale RCU freed inode */
+ spin_lock(&ip->i_flags_lock);
+ if (!ip->i_ino)
+ goto out_unlock_noent;
+
+ if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS)
+ goto out_unlock_noent;
+ spin_unlock(&ip->i_flags_lock);
+
+ /* nothing to sync during shutdown */
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return false;
+
+ /* If we can't grab the inode, it must on it's way to reclaim. */
+ if (!igrab(inode))
+ return false;
+
+ /* inode is valid */
+ return true;
+
+out_unlock_noent:
+ spin_unlock(&ip->i_flags_lock);
+ return false;
+}
+
/* Scan one incore inode for block preallocations that we can remove. */
static int
xfs_blockgc_scan_inode(
struct xfs_inode *ip,
- void *args)
+ struct xfs_icwalk *icw)
{
unsigned int lockflags = 0;
int error;
- error = xfs_inode_free_eofblocks(ip, args, &lockflags);
+ error = xfs_inode_free_eofblocks(ip, icw, &lockflags);
if (error)
goto unlock;
- error = xfs_inode_free_cowblocks(ip, args, &lockflags);
+ error = xfs_inode_free_cowblocks(ip, icw, &lockflags);
unlock:
if (lockflags)
xfs_iunlock(ip, lockflags);
+ xfs_irele(ip);
return error;
}
@@ -1618,8 +1538,7 @@ xfs_blockgc_worker(
if (!sb_start_write_trylock(mp->m_super))
return;
- error = xfs_inode_walk_ag(pag, 0, xfs_blockgc_scan_inode, NULL,
- XFS_ICI_BLOCKGC_TAG);
+ error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL);
if (error)
xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
pag->pag_agno, error);
@@ -1633,12 +1552,11 @@ xfs_blockgc_worker(
int
xfs_blockgc_free_space(
struct xfs_mount *mp,
- struct xfs_eofblocks *eofb)
+ struct xfs_icwalk *icw)
{
- trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_);
+ trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);
- return xfs_inode_walk(mp, 0, xfs_blockgc_scan_inode, eofb,
- XFS_ICI_BLOCKGC_TAG);
+ return xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw);
}
/*
@@ -1648,7 +1566,7 @@ xfs_blockgc_free_space(
* scan.
*
* Callers must not hold any inode's ILOCK. If requesting a synchronous scan
- * (XFS_EOF_FLAGS_SYNC), the caller also must not hold any inode's IOLOCK or
+ * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or
* MMAPLOCK.
*/
int
@@ -1657,9 +1575,9 @@ xfs_blockgc_free_dquots(
struct xfs_dquot *udqp,
struct xfs_dquot *gdqp,
struct xfs_dquot *pdqp,
- unsigned int eof_flags)
+ unsigned int iwalk_flags)
{
- struct xfs_eofblocks eofb = {0};
+ struct xfs_icwalk icw = {0};
bool do_work = false;
if (!udqp && !gdqp && !pdqp)
@@ -1669,40 +1587,260 @@ xfs_blockgc_free_dquots(
* Run a scan to free blocks using the union filter to cover all
* applicable quotas in a single scan.
*/
- eofb.eof_flags = XFS_EOF_FLAGS_UNION | eof_flags;
+ icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags;
if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
- eofb.eof_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
- eofb.eof_flags |= XFS_EOF_FLAGS_UID;
+ icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
+ icw.icw_flags |= XFS_ICWALK_FLAG_UID;
do_work = true;
}
if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
- eofb.eof_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
- eofb.eof_flags |= XFS_EOF_FLAGS_GID;
+ icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
+ icw.icw_flags |= XFS_ICWALK_FLAG_GID;
do_work = true;
}
if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
- eofb.eof_prid = pdqp->q_id;
- eofb.eof_flags |= XFS_EOF_FLAGS_PRID;
+ icw.icw_prid = pdqp->q_id;
+ icw.icw_flags |= XFS_ICWALK_FLAG_PRID;
do_work = true;
}
if (!do_work)
return 0;
- return xfs_blockgc_free_space(mp, &eofb);
+ return xfs_blockgc_free_space(mp, &icw);
}
/* Run cow/eofblocks scans on the quotas attached to the inode. */
int
xfs_blockgc_free_quota(
struct xfs_inode *ip,
- unsigned int eof_flags)
+ unsigned int iwalk_flags)
{
return xfs_blockgc_free_dquots(ip->i_mount,
xfs_inode_dquot(ip, XFS_DQTYPE_USER),
xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
- xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), eof_flags);
+ xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
+}
+
+/* XFS Inode Cache Walking Code */
+
+/*
+ * The inode lookup is done in batches to keep the amount of lock traffic and
+ * radix tree lookups to a minimum. The batch size is a trade off between
+ * lookup reduction and stack usage. This is in the reclaim path, so we can't
+ * be too greedy.
+ */
+#define XFS_LOOKUP_BATCH 32
+
+
+/*
+ * Decide if we want to grab this inode in anticipation of doing work towards
+ * the goal.
+ */
+static inline bool
+xfs_icwalk_igrab(
+ enum xfs_icwalk_goal goal,
+ struct xfs_inode *ip,
+ struct xfs_icwalk *icw)
+{
+ switch (goal) {
+ case XFS_ICWALK_DQRELE:
+ return xfs_dqrele_igrab(ip);
+ case XFS_ICWALK_BLOCKGC:
+ return xfs_blockgc_igrab(ip);
+ case XFS_ICWALK_RECLAIM:
+ return xfs_reclaim_igrab(ip, icw);
+ default:
+ return false;
+ }
+}
+
+/*
+ * Process an inode. Each processing function must handle any state changes
+ * made by the icwalk igrab function. Return -EAGAIN to skip an inode.
+ */
+static inline int
+xfs_icwalk_process_inode(
+ enum xfs_icwalk_goal goal,
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ struct xfs_icwalk *icw)
+{
+ int error = 0;
+
+ switch (goal) {
+ case XFS_ICWALK_DQRELE:
+ xfs_dqrele_inode(ip, icw);
+ break;
+ case XFS_ICWALK_BLOCKGC:
+ error = xfs_blockgc_scan_inode(ip, icw);
+ break;
+ case XFS_ICWALK_RECLAIM:
+ xfs_reclaim_inode(ip, pag);
+ break;
+ }
+ return error;
+}
+
+/*
+ * For a given per-AG structure @pag and a goal, grab qualifying inodes and
+ * process them in some manner.
+ */
+static int
+xfs_icwalk_ag(
+ struct xfs_perag *pag,
+ enum xfs_icwalk_goal goal,
+ struct xfs_icwalk *icw)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ uint32_t first_index;
+ int last_error = 0;
+ int skipped;
+ bool done;
+ int nr_found;
+
+restart:
+ done = false;
+ skipped = 0;
+ if (goal == XFS_ICWALK_RECLAIM)
+ first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
+ else
+ first_index = 0;
+ nr_found = 0;
+ do {
+ struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+ unsigned int tag = xfs_icwalk_tag(goal);
+ int error = 0;
+ int i;
+
+ rcu_read_lock();
+
+ if (tag == XFS_ICWALK_NULL_TAG)
+ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+ (void **)batch, first_index,
+ XFS_LOOKUP_BATCH);
+ else
+ nr_found = radix_tree_gang_lookup_tag(
+ &pag->pag_ici_root,
+ (void **) batch, first_index,
+ XFS_LOOKUP_BATCH, tag);
+
+ if (!nr_found) {
+ done = true;
+ rcu_read_unlock();
+ break;
+ }
+
+ /*
+ * Grab the inodes before we drop the lock. if we found
+ * nothing, nr == 0 and the loop will be skipped.
+ */
+ for (i = 0; i < nr_found; i++) {
+ struct xfs_inode *ip = batch[i];
+
+ if (done || !xfs_icwalk_igrab(goal, ip, icw))
+ batch[i] = NULL;
+
+ /*
+ * Update the index for the next lookup. Catch
+ * overflows into the next AG range which can occur if
+ * we have inodes in the last block of the AG and we
+ * are currently pointing to the last inode.
+ *
+ * Because we may see inodes that are from the wrong AG
+ * due to RCU freeing and reallocation, only update the
+ * index if it lies in this AG. It was a race that lead
+ * us to see this inode, so another lookup from the
+ * same index will not find it again.
+ */
+ if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+ continue;
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+ if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+ done = true;
+ }
+
+ /* unlock now we've grabbed the inodes. */
+ rcu_read_unlock();
+
+ for (i = 0; i < nr_found; i++) {
+ if (!batch[i])
+ continue;
+ error = xfs_icwalk_process_inode(goal, batch[i], pag,
+ icw);
+ if (error == -EAGAIN) {
+ skipped++;
+ continue;
+ }
+ if (error && last_error != -EFSCORRUPTED)
+ last_error = error;
+ }
+
+ /* bail out if the filesystem is corrupted. */
+ if (error == -EFSCORRUPTED)
+ break;
+
+ cond_resched();
+
+ if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) {
+ icw->icw_scan_limit -= XFS_LOOKUP_BATCH;
+ if (icw->icw_scan_limit <= 0)
+ break;
+ }
+ } while (nr_found && !done);
+
+ if (goal == XFS_ICWALK_RECLAIM) {
+ if (done)
+ first_index = 0;
+ WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
+ }
+
+ if (skipped) {
+ delay(1);
+ goto restart;
+ }
+ return last_error;
+}
+
+/* Fetch the next (possibly tagged) per-AG structure. */
+static inline struct xfs_perag *
+xfs_icwalk_get_perag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ enum xfs_icwalk_goal goal)
+{
+ unsigned int tag = xfs_icwalk_tag(goal);
+
+ if (tag == XFS_ICWALK_NULL_TAG)
+ return xfs_perag_get(mp, agno);
+ return xfs_perag_get_tag(mp, agno, tag);
+}
+
+/* Walk all incore inodes to achieve a given goal. */
+static int
+xfs_icwalk(
+ struct xfs_mount *mp,
+ enum xfs_icwalk_goal goal,
+ struct xfs_icwalk *icw)
+{
+ struct xfs_perag *pag;
+ int error = 0;
+ int last_error = 0;
+ xfs_agnumber_t agno = 0;
+
+ while ((pag = xfs_icwalk_get_perag(mp, agno, goal))) {
+ agno = pag->pag_agno + 1;
+ error = xfs_icwalk_ag(pag, goal, icw);
+ xfs_perag_put(pag);
+ if (error) {
+ last_error = error;
+ if (error == -EFSCORRUPTED)
+ break;
+ }
+ }
+ return last_error;
+ BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID);
}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index d1fddb152420..c751cc32dc46 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -9,22 +9,27 @@
struct xfs_mount;
struct xfs_perag;
-struct xfs_eofblocks {
- __u32 eof_flags;
- kuid_t eof_uid;
- kgid_t eof_gid;
- prid_t eof_prid;
- __u64 eof_min_file_size;
+struct xfs_icwalk {
+ __u32 icw_flags;
+ kuid_t icw_uid;
+ kgid_t icw_gid;
+ prid_t icw_prid;
+ __u64 icw_min_file_size;
+ long icw_scan_limit;
};
-/*
- * tags for inode radix tree
- */
-#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
- in xfs_inode_walk */
-#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
-/* Inode has speculative preallocations (posteof or cow) to clean. */
-#define XFS_ICI_BLOCKGC_TAG 1
+/* Flags that reflect xfs_fs_eofblocks functionality. */
+#define XFS_ICWALK_FLAG_SYNC (1U << 0) /* sync/wait mode scan */
+#define XFS_ICWALK_FLAG_UID (1U << 1) /* filter by uid */
+#define XFS_ICWALK_FLAG_GID (1U << 2) /* filter by gid */
+#define XFS_ICWALK_FLAG_PRID (1U << 3) /* filter by project id */
+#define XFS_ICWALK_FLAG_MINFILESIZE (1U << 4) /* filter by min file size */
+
+#define XFS_ICWALK_FLAGS_VALID (XFS_ICWALK_FLAG_SYNC | \
+ XFS_ICWALK_FLAG_UID | \
+ XFS_ICWALK_FLAG_GID | \
+ XFS_ICWALK_FLAG_PRID | \
+ XFS_ICWALK_FLAG_MINFILESIZE)
/*
* Flags for xfs_iget()
@@ -34,11 +39,6 @@ struct xfs_eofblocks {
#define XFS_IGET_DONTCACHE 0x4
#define XFS_IGET_INCORE 0x8 /* don't read from disk or reinit */
-/*
- * flags for AG inode iterator
- */
-#define XFS_INODE_WALK_INEW_WAIT 0x1 /* wait on new inodes */
-
int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
uint flags, uint lock_flags, xfs_inode_t **ipp);
@@ -49,16 +49,16 @@ void xfs_inode_free(struct xfs_inode *ip);
void xfs_reclaim_worker(struct work_struct *work);
void xfs_reclaim_inodes(struct xfs_mount *mp);
-int xfs_reclaim_inodes_count(struct xfs_mount *mp);
-long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
+long xfs_reclaim_inodes_count(struct xfs_mount *mp);
+long xfs_reclaim_inodes_nr(struct xfs_mount *mp, unsigned long nr_to_scan);
-void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+void xfs_inode_mark_reclaimable(struct xfs_inode *ip);
int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp,
struct xfs_dquot *gdqp, struct xfs_dquot *pdqp,
- unsigned int eof_flags);
-int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int eof_flags);
-int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_eofblocks *eofb);
+ unsigned int iwalk_flags);
+int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags);
+int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_icwalk *icm);
void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
@@ -68,9 +68,11 @@ void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip);
void xfs_blockgc_worker(struct work_struct *work);
-int xfs_inode_walk(struct xfs_mount *mp, int iter_flags,
- int (*execute)(struct xfs_inode *ip, void *args),
- void *args, int tag);
+#ifdef CONFIG_XFS_QUOTA
+int xfs_dqrele_all_inodes(struct xfs_mount *mp, unsigned int qflags);
+#else
+# define xfs_dqrele_all_inodes(mp, qflags) (0)
+#endif
int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_ino_t ino, bool *inuse);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 1db99a909b23..990b72ae3635 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -11,7 +11,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
@@ -35,6 +34,7 @@
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
+#include "xfs_ag.h"
kmem_zone_t *xfs_inode_zone;
@@ -45,7 +45,8 @@ kmem_zone_t *xfs_inode_zone;
#define XFS_ITRUNC_MAX_EXTENTS 2
STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
-STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
+STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
+ struct xfs_inode *);
/*
* helper function to extract extent size hint from inode
@@ -778,7 +779,7 @@ xfs_inode_inherit_flags2(
* Initialise a newly allocated inode and return the in-core inode to the
* caller locked exclusively.
*/
-static int
+int
xfs_init_new_inode(
struct user_namespace *mnt_userns,
struct xfs_trans *tp,
@@ -915,57 +916,6 @@ xfs_init_new_inode(
}
/*
- * Allocates a new inode from disk and return a pointer to the incore copy. This
- * routine will internally commit the current transaction and allocate a new one
- * if we needed to allocate more on-disk free inodes to perform the requested
- * operation.
- *
- * If we are allocating quota inodes, we do not have a parent inode to attach to
- * or associate with (i.e. dp == NULL) because they are not linked into the
- * directory structure - they are attached directly to the superblock - and so
- * have no parent.
- */
-int
-xfs_dir_ialloc(
- struct user_namespace *mnt_userns,
- struct xfs_trans **tpp,
- struct xfs_inode *dp,
- umode_t mode,
- xfs_nlink_t nlink,
- dev_t rdev,
- prid_t prid,
- bool init_xattrs,
- struct xfs_inode **ipp)
-{
- struct xfs_buf *agibp;
- xfs_ino_t parent_ino = dp ? dp->i_ino : 0;
- xfs_ino_t ino;
- int error;
-
- ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES);
-
- /*
- * Call the space management code to pick the on-disk inode to be
- * allocated.
- */
- error = xfs_dialloc_select_ag(tpp, parent_ino, mode, &agibp);
- if (error)
- return error;
-
- if (!agibp)
- return -ENOSPC;
-
- /* Allocate an inode from the selected AG */
- error = xfs_dialloc_ag(*tpp, agibp, parent_ino, &ino);
- if (error)
- return error;
- ASSERT(ino != NULLFSINO);
-
- return xfs_init_new_inode(mnt_userns, *tpp, dp, ino, mode, nlink, rdev,
- prid, init_xattrs, ipp);
-}
-
-/*
* Decrement the link count on an inode & log the change. If this causes the
* link count to go to zero, move the inode to AGI unlinked list so that it can
* be freed when the last active reference goes away via xfs_inactive().
@@ -1022,6 +972,7 @@ xfs_create(
struct xfs_dquot *pdqp = NULL;
struct xfs_trans_res *tres;
uint resblks;
+ xfs_ino_t ino;
trace_xfs_create(dp, name);
@@ -1078,14 +1029,16 @@ xfs_create(
* entry pointing to them, but a directory also the "." entry
* pointing to itself.
*/
- error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, is_dir ? 2 : 1, rdev,
- prid, init_xattrs, &ip);
+ error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
+ if (!error)
+ error = xfs_init_new_inode(mnt_userns, tp, dp, ino, mode,
+ is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip);
if (error)
goto out_trans_cancel;
/*
* Now we join the directory inode to the transaction. We do not do it
- * earlier because xfs_dir_ialloc might commit the previous transaction
+ * earlier because xfs_dialloc might commit the previous transaction
* (and release all the locks). An error from here on will result in
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
@@ -1175,6 +1128,7 @@ xfs_create_tmpfile(
struct xfs_dquot *pdqp = NULL;
struct xfs_trans_res *tres;
uint resblks;
+ xfs_ino_t ino;
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
@@ -1199,8 +1153,10 @@ xfs_create_tmpfile(
if (error)
goto out_release_dquots;
- error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, 0, 0, prid,
- false, &ip);
+ error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
+ if (!error)
+ error = xfs_init_new_inode(mnt_userns, tp, dp, ino, mode,
+ 0, 0, prid, false, &ip);
if (error)
goto out_trans_cancel;
@@ -1315,7 +1271,11 @@ xfs_link(
* Handle initial link state of O_TMPFILE inode
*/
if (VFS_I(sip)->i_nlink == 0) {
- error = xfs_iunlink_remove(tp, sip);
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino));
+ error = xfs_iunlink_remove(tp, pag, sip);
+ xfs_perag_put(pag);
if (error)
goto error_return;
}
@@ -1716,7 +1676,7 @@ xfs_inactive(
*/
if (VFS_I(ip)->i_mode == 0) {
ASSERT(ip->i_df.if_broot_bytes == 0);
- return;
+ goto out;
}
mp = ip->i_mount;
@@ -1724,11 +1684,11 @@ xfs_inactive(
/* If this is a read-only mount, don't do this (would generate I/O) */
if (mp->m_flags & XFS_MOUNT_RDONLY)
- return;
+ goto out;
/* Metadata inodes require explicit resource cleanup. */
if (xfs_is_metadata_inode(ip))
- return;
+ goto out;
/* Try to clean out the cow blocks if there are any. */
if (xfs_inode_has_cow_data(ip))
@@ -1747,7 +1707,7 @@ xfs_inactive(
if (xfs_can_free_eofblocks(ip, true))
xfs_free_eofblocks(ip);
- return;
+ goto out;
}
if (S_ISREG(VFS_I(ip)->i_mode) &&
@@ -1757,14 +1717,14 @@ xfs_inactive(
error = xfs_qm_dqattach(ip);
if (error)
- return;
+ goto out;
if (S_ISLNK(VFS_I(ip)->i_mode))
error = xfs_inactive_symlink(ip);
else if (truncate)
error = xfs_inactive_truncate(ip);
if (error)
- return;
+ goto out;
/*
* If there are attributes associated with the file then blow them away
@@ -1774,7 +1734,7 @@ xfs_inactive(
if (XFS_IFORK_Q(ip)) {
error = xfs_attr_inactive(ip);
if (error)
- return;
+ goto out;
}
ASSERT(!ip->i_afp);
@@ -1783,12 +1743,12 @@ xfs_inactive(
/*
* Free the inode.
*/
- error = xfs_inactive_ifree(ip);
- if (error)
- return;
+ xfs_inactive_ifree(ip);
+out:
/*
- * Release the dquots held by inode, if any.
+ * We're done making metadata updates for this inode, so we can release
+ * the attached dquots.
*/
xfs_qm_dqdetach(ip);
}
@@ -2008,7 +1968,7 @@ xfs_iunlink_destroy(
STATIC int
xfs_iunlink_update_bucket(
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
struct xfs_buf *agibp,
unsigned int bucket_index,
xfs_agino_t new_agino)
@@ -2017,10 +1977,10 @@ xfs_iunlink_update_bucket(
xfs_agino_t old_value;
int offset;
- ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino));
+ ASSERT(xfs_verify_agino_or_null(tp->t_mountp, pag->pag_agno, new_agino));
old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index,
+ trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
old_value, new_agino);
/*
@@ -2044,7 +2004,7 @@ xfs_iunlink_update_bucket(
STATIC void
xfs_iunlink_update_dinode(
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_agino_t agino,
struct xfs_buf *ibp,
struct xfs_dinode *dip,
@@ -2054,9 +2014,9 @@ xfs_iunlink_update_dinode(
struct xfs_mount *mp = tp->t_mountp;
int offset;
- ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+ ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino));
- trace_xfs_iunlink_update_dinode(mp, agno, agino,
+ trace_xfs_iunlink_update_dinode(mp, pag->pag_agno, agino,
be32_to_cpu(dip->di_next_unlinked), next_agino);
dip->di_next_unlinked = cpu_to_be32(next_agino);
@@ -2074,7 +2034,7 @@ STATIC int
xfs_iunlink_update_inode(
struct xfs_trans *tp,
struct xfs_inode *ip,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_agino_t next_agino,
xfs_agino_t *old_next_agino)
{
@@ -2084,7 +2044,7 @@ xfs_iunlink_update_inode(
xfs_agino_t old_value;
int error;
- ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+ ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino));
error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp);
if (error)
@@ -2093,7 +2053,7 @@ xfs_iunlink_update_inode(
/* Make sure the old pointer isn't garbage. */
old_value = be32_to_cpu(dip->di_next_unlinked);
- if (!xfs_verify_agino_or_null(mp, agno, old_value)) {
+ if (!xfs_verify_agino_or_null(mp, pag->pag_agno, old_value)) {
xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
sizeof(*dip), __this_address);
error = -EFSCORRUPTED;
@@ -2116,7 +2076,7 @@ xfs_iunlink_update_inode(
}
/* Ok, update the new pointer. */
- xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ xfs_iunlink_update_dinode(tp, pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
ibp, dip, &ip->i_imap, next_agino);
return 0;
out:
@@ -2137,10 +2097,10 @@ xfs_iunlink(
struct xfs_inode *ip)
{
struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
struct xfs_agi *agi;
struct xfs_buf *agibp;
xfs_agino_t next_agino;
- xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
int error;
@@ -2149,10 +2109,12 @@ xfs_iunlink(
ASSERT(VFS_I(ip)->i_mode != 0);
trace_xfs_iunlink(ip);
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+
/* Get the agi buffer first. It ensures lock ordering on the list. */
- error = xfs_read_agi(mp, tp, agno, &agibp);
+ error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp);
if (error)
- return error;
+ goto out;
agi = agibp->b_addr;
/*
@@ -2162,9 +2124,10 @@ xfs_iunlink(
*/
next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
if (next_agino == agino ||
- !xfs_verify_agino_or_null(mp, agno, next_agino)) {
+ !xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)) {
xfs_buf_mark_corrupt(agibp);
- return -EFSCORRUPTED;
+ error = -EFSCORRUPTED;
+ goto out;
}
if (next_agino != NULLAGINO) {
@@ -2174,23 +2137,26 @@ xfs_iunlink(
* There is already another inode in the bucket, so point this
* inode to the current head of the list.
*/
- error = xfs_iunlink_update_inode(tp, ip, agno, next_agino,
+ error = xfs_iunlink_update_inode(tp, ip, pag, next_agino,
&old_agino);
if (error)
- return error;
+ goto out;
ASSERT(old_agino == NULLAGINO);
/*
* agino has been unlinked, add a backref from the next inode
* back to agino.
*/
- error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino);
+ error = xfs_iunlink_add_backref(pag, agino, next_agino);
if (error)
- return error;
+ goto out;
}
/* Point the head of the list to point to this inode. */
- return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino);
+ error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
+out:
+ xfs_perag_put(pag);
+ return error;
}
/* Return the imap, dinode pointer, and buffer for an inode. */
@@ -2238,14 +2204,13 @@ xfs_iunlink_map_ino(
STATIC int
xfs_iunlink_map_prev(
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_agino_t head_agino,
xfs_agino_t target_agino,
xfs_agino_t *agino,
struct xfs_imap *imap,
struct xfs_dinode **dipp,
- struct xfs_buf **bpp,
- struct xfs_perag *pag)
+ struct xfs_buf **bpp)
{
struct xfs_mount *mp = tp->t_mountp;
xfs_agino_t next_agino;
@@ -2257,7 +2222,8 @@ xfs_iunlink_map_prev(
/* See if our backref cache can find it faster. */
*agino = xfs_iunlink_lookup_backref(pag, target_agino);
if (*agino != NULLAGINO) {
- error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
+ error = xfs_iunlink_map_ino(tp, pag->pag_agno, *agino, imap,
+ dipp, bpp);
if (error)
return error;
@@ -2273,7 +2239,7 @@ xfs_iunlink_map_prev(
WARN_ON_ONCE(1);
}
- trace_xfs_iunlink_map_prev_fallback(mp, agno);
+ trace_xfs_iunlink_map_prev_fallback(mp, pag->pag_agno);
/* Otherwise, walk the entire bucket until we find it. */
next_agino = head_agino;
@@ -2284,8 +2250,8 @@ xfs_iunlink_map_prev(
xfs_trans_brelse(tp, *bpp);
*agino = next_agino;
- error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
- bpp);
+ error = xfs_iunlink_map_ino(tp, pag->pag_agno, next_agino, imap,
+ dipp, bpp);
if (error)
return error;
@@ -2294,7 +2260,7 @@ xfs_iunlink_map_prev(
* Make sure this pointer is valid and isn't an obvious
* infinite loop.
*/
- if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
+ if (!xfs_verify_agino(mp, pag->pag_agno, unlinked_agino) ||
next_agino == unlinked_agino) {
XFS_CORRUPTION_ERROR(__func__,
XFS_ERRLEVEL_LOW, mp,
@@ -2314,6 +2280,7 @@ xfs_iunlink_map_prev(
STATIC int
xfs_iunlink_remove(
struct xfs_trans *tp,
+ struct xfs_perag *pag,
struct xfs_inode *ip)
{
struct xfs_mount *mp = tp->t_mountp;
@@ -2321,7 +2288,6 @@ xfs_iunlink_remove(
struct xfs_buf *agibp;
struct xfs_buf *last_ibp;
struct xfs_dinode *last_dip = NULL;
- xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
xfs_agino_t next_agino;
xfs_agino_t head_agino;
@@ -2331,7 +2297,7 @@ xfs_iunlink_remove(
trace_xfs_iunlink_remove(ip);
/* Get the agi buffer first. It ensures lock ordering on the list. */
- error = xfs_read_agi(mp, tp, agno, &agibp);
+ error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp);
if (error)
return error;
agi = agibp->b_addr;
@@ -2341,7 +2307,7 @@ xfs_iunlink_remove(
* go on. Make sure the head pointer isn't garbage.
*/
head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- if (!xfs_verify_agino(mp, agno, head_agino)) {
+ if (!xfs_verify_agino(mp, pag->pag_agno, head_agino)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
agi, sizeof(*agi));
return -EFSCORRUPTED;
@@ -2352,7 +2318,7 @@ xfs_iunlink_remove(
* the old pointer value so that we can update whatever was previous
* to us in the list to point to whatever was next in the list.
*/
- error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino);
+ error = xfs_iunlink_update_inode(tp, ip, pag, NULLAGINO, &next_agino);
if (error)
return error;
@@ -2364,8 +2330,7 @@ xfs_iunlink_remove(
* this inode's backref to point from the next inode.
*/
if (next_agino != NULLAGINO) {
- error = xfs_iunlink_change_backref(agibp->b_pag, next_agino,
- NULLAGINO);
+ error = xfs_iunlink_change_backref(pag, next_agino, NULLAGINO);
if (error)
return error;
}
@@ -2375,14 +2340,13 @@ xfs_iunlink_remove(
xfs_agino_t prev_agino;
/* We need to search the list for the inode being freed. */
- error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
- &prev_agino, &imap, &last_dip, &last_ibp,
- agibp->b_pag);
+ error = xfs_iunlink_map_prev(tp, pag, head_agino, agino,
+ &prev_agino, &imap, &last_dip, &last_ibp);
if (error)
return error;
/* Point the previous inode on the list to the next inode. */
- xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
+ xfs_iunlink_update_dinode(tp, pag, prev_agino, last_ibp,
last_dip, &imap, next_agino);
/*
@@ -2398,7 +2362,7 @@ xfs_iunlink_remove(
}
/* Point the head of the list to the next unlinked inode. */
- return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
+ return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
next_agino);
}
@@ -2409,12 +2373,11 @@ xfs_iunlink_remove(
*/
static void
xfs_ifree_mark_inode_stale(
- struct xfs_buf *bp,
+ struct xfs_perag *pag,
struct xfs_inode *free_ip,
xfs_ino_t inum)
{
- struct xfs_mount *mp = bp->b_mount;
- struct xfs_perag *pag = bp->b_pag;
+ struct xfs_mount *mp = pag->pag_mount;
struct xfs_inode_log_item *iip;
struct xfs_inode *ip;
@@ -2504,10 +2467,11 @@ out_iflags_unlock:
* inodes that are in memory - they all must be marked stale and attached to
* the cluster buffer.
*/
-STATIC int
+static int
xfs_ifree_cluster(
- struct xfs_inode *free_ip,
struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_inode *free_ip,
struct xfs_icluster *xic)
{
struct xfs_mount *mp = free_ip->i_mount;
@@ -2569,7 +2533,7 @@ xfs_ifree_cluster(
* already marked XFS_ISTALE.
*/
for (i = 0; i < igeo->inodes_per_cluster; i++)
- xfs_ifree_mark_inode_stale(bp, free_ip, inum + i);
+ xfs_ifree_mark_inode_stale(pag, free_ip, inum + i);
xfs_trans_stale_inode_buf(tp, bp);
xfs_trans_binval(tp, bp);
@@ -2592,9 +2556,11 @@ xfs_ifree(
struct xfs_trans *tp,
struct xfs_inode *ip)
{
- int error;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
struct xfs_icluster xic = { 0 };
struct xfs_inode_log_item *iip = ip->i_itemp;
+ int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(VFS_I(ip)->i_nlink == 0);
@@ -2602,16 +2568,18 @@ xfs_ifree(
ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
ASSERT(ip->i_nblocks == 0);
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+
/*
* Pull the on-disk inode from the AGI unlinked list.
*/
- error = xfs_iunlink_remove(tp, ip);
+ error = xfs_iunlink_remove(tp, pag, ip);
if (error)
- return error;
+ goto out;
- error = xfs_difree(tp, ip->i_ino, &xic);
+ error = xfs_difree(tp, pag, ip->i_ino, &xic);
if (error)
- return error;
+ goto out;
/*
* Free any local-format data sitting around before we reset the
@@ -2626,7 +2594,7 @@ xfs_ifree(
VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
ip->i_diflags = 0;
- ip->i_diflags2 = ip->i_mount->m_ino_geo.new_diflags2;
+ ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
ip->i_forkoff = 0; /* mark the attr fork not in use */
ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS))
@@ -2645,8 +2613,9 @@ xfs_ifree(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (xic.deleted)
- error = xfs_ifree_cluster(ip, tp, &xic);
-
+ error = xfs_ifree_cluster(tp, pag, ip, &xic);
+out:
+ xfs_perag_put(pag);
return error;
}
@@ -2664,7 +2633,7 @@ xfs_iunpin(
trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
/* Give the log a push to start the unpinning I/O */
- xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0, NULL);
+ xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
}
@@ -2794,6 +2763,19 @@ xfs_remove(
error = xfs_droplink(tp, ip);
if (error)
goto out_trans_cancel;
+
+ /*
+ * Point the unlinked child directory's ".." entry to the root
+ * directory to eliminate back-references to inodes that may
+ * get freed before the child directory is closed. If the fs
+ * gets shrunk, this can lead to dirent inode validation errors.
+ */
+ if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) {
+ error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
+ tp->t_mountp->m_sb.sb_rootino, 0);
+ if (error)
+ return error;
+ }
} else {
/*
* When removing a non-directory we need to log the parent
@@ -3250,8 +3232,13 @@ xfs_rename(
* in future.
*/
if (wip) {
+ struct xfs_perag *pag;
+
ASSERT(VFS_I(wip)->i_nlink == 0);
- error = xfs_iunlink_remove(tp, wip);
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino));
+ error = xfs_iunlink_remove(tp, pag, wip);
+ xfs_perag_put(pag);
if (error)
goto out_trans_cancel;
@@ -3673,16 +3660,16 @@ int
xfs_log_force_inode(
struct xfs_inode *ip)
{
- xfs_lsn_t lsn = 0;
+ xfs_csn_t seq = 0;
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip))
- lsn = ip->i_itemp->ili_last_lsn;
+ seq = ip->i_itemp->ili_commit_seq;
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- if (!lsn)
+ if (!seq)
return 0;
- return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL);
+ return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL);
}
/*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ca826cfba91c..4b6703dbffb8 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -431,11 +431,10 @@ void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode,
xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip);
-int xfs_dir_ialloc(struct user_namespace *mnt_userns,
- struct xfs_trans **tpp, struct xfs_inode *dp,
- umode_t mode, xfs_nlink_t nlink, dev_t dev,
- prid_t prid, bool need_xattr,
- struct xfs_inode **ipp);
+int xfs_init_new_inode(struct user_namespace *mnt_userns, struct xfs_trans *tp,
+ struct xfs_inode *pip, xfs_ino_t ino, umode_t mode,
+ xfs_nlink_t nlink, dev_t rdev, prid_t prid, bool init_xattrs,
+ struct xfs_inode **ipp);
static inline int
xfs_itruncate_extents(
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 6764d12342da..35de30849fcc 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -28,6 +28,20 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
return container_of(lip, struct xfs_inode_log_item, ili_item);
}
+/*
+ * The logged size of an inode fork is always the current size of the inode
+ * fork. This means that when an inode fork is relogged, the size of the logged
+ * region is determined by the current state, not the combination of the
+ * previously logged state + the current state. This is different relogging
+ * behaviour to most other log items which will retain the size of the
+ * previously logged changes when smaller regions are relogged.
+ *
+ * Hence operations that remove data from the inode fork (e.g. shortform
+ * dir/attr remove, extent form extent removal, etc), the size of the relogged
+ * inode gets -smaller- rather than stays the same size as the previously logged
+ * size and this can result in the committing transaction reducing the amount of
+ * space being consumed by the CIL.
+ */
STATIC void
xfs_inode_item_data_fork_size(
struct xfs_inode_log_item *iip,
@@ -629,9 +643,9 @@ xfs_inode_item_committed(
STATIC void
xfs_inode_item_committing(
struct xfs_log_item *lip,
- xfs_lsn_t commit_lsn)
+ xfs_csn_t seq)
{
- INODE_ITEM(lip)->ili_last_lsn = commit_lsn;
+ INODE_ITEM(lip)->ili_commit_seq = seq;
return xfs_inode_item_release(lip);
}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 4b926e32831c..403b45ab9aa2 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -33,7 +33,7 @@ struct xfs_inode_log_item {
unsigned int ili_fields; /* fields to be logged */
unsigned int ili_fsync_fields; /* logged since last fsync */
xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
- xfs_lsn_t ili_last_lsn; /* lsn at last transaction */
+ xfs_csn_t ili_commit_seq; /* last transaction commit */
};
static inline int xfs_inode_clean(struct xfs_inode *ip)
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 7b79518b6c20..e0072a6cd2d3 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -145,7 +145,8 @@ xfs_log_dinode_to_disk_ts(
STATIC void
xfs_log_dinode_to_disk(
struct xfs_log_dinode *from,
- struct xfs_dinode *to)
+ struct xfs_dinode *to,
+ xfs_lsn_t lsn)
{
to->di_magic = cpu_to_be16(from->di_magic);
to->di_mode = cpu_to_be16(from->di_mode);
@@ -182,7 +183,7 @@ xfs_log_dinode_to_disk(
to->di_flags2 = cpu_to_be64(from->di_flags2);
to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(from->di_ino);
- to->di_lsn = cpu_to_be64(from->di_lsn);
+ to->di_lsn = cpu_to_be64(lsn);
memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &from->di_uuid);
to->di_flushiter = 0;
@@ -261,16 +262,25 @@ xlog_recover_inode_commit_pass2(
}
/*
- * If the inode has an LSN in it, recover the inode only if it's less
- * than the lsn of the transaction we are replaying. Note: we still
- * need to replay an owner change even though the inode is more recent
- * than the transaction as there is no guarantee that all the btree
- * blocks are more recent than this transaction, too.
+ * If the inode has an LSN in it, recover the inode only if the on-disk
+ * inode's LSN is older than the lsn of the transaction we are
+ * replaying. We can have multiple checkpoints with the same start LSN,
+ * so the current LSN being equal to the on-disk LSN doesn't necessarily
+ * mean that the on-disk inode is more recent than the change being
+ * replayed.
+ *
+ * We must check the current_lsn against the on-disk inode
+ * here because the we can't trust the log dinode to contain a valid LSN
+ * (see comment below before replaying the log dinode for details).
+ *
+ * Note: we still need to replay an owner change even though the inode
+ * is more recent than the transaction as there is no guarantee that all
+ * the btree blocks are more recent than this transaction, too.
*/
if (dip->di_version >= 3) {
xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) > 0) {
trace_xfs_log_recover_inode_skip(log, in_f);
error = 0;
goto out_owner_change;
@@ -368,8 +378,17 @@ xlog_recover_inode_commit_pass2(
goto out_release;
}
- /* recover the log dinode inode into the on disk inode */
- xfs_log_dinode_to_disk(ldip, dip);
+ /*
+ * Recover the log dinode inode into the on disk inode.
+ *
+ * The LSN in the log dinode is garbage - it can be zero or reflect
+ * stale in-memory runtime state that isn't coherent with the changes
+ * logged in this transaction or the changes written to the on-disk
+ * inode. Hence we write the current lSN into the inode because that
+ * matches what xfs_iflush() would write inode the inode when flushing
+ * the changes in this transaction.
+ */
+ xfs_log_dinode_to_disk(ldip, dip, current_lsn);
fields = in_f->ilf_fields;
if (fields & XFS_ILOG_DEV)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 7b1796cede10..16039ea10ac9 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1065,7 +1065,24 @@ xfs_fill_fsxattr(
fileattr_fill_xflags(fa, xfs_ip2xflags(ip));
- fa->fsx_extsize = XFS_FSB_TO_B(mp, ip->i_extsize);
+ if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) {
+ fa->fsx_extsize = XFS_FSB_TO_B(mp, ip->i_extsize);
+ } else if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
+ /*
+ * Don't let a misaligned extent size hint on a directory
+ * escape to userspace if it won't pass the setattr checks
+ * later.
+ */
+ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ ip->i_extsize % mp->m_sb.sb_rextsize > 0) {
+ fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE |
+ FS_XFLAG_EXTSZINHERIT);
+ fa->fsx_extsize = 0;
+ } else {
+ fa->fsx_extsize = XFS_FSB_TO_B(mp, ip->i_extsize);
+ }
+ }
+
if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize);
fa->fsx_projid = ip->i_projid;
@@ -1292,10 +1309,10 @@ xfs_ioctl_setattr_check_extsize(
new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
/*
- * Inode verifiers on older kernels don't check that the extent size
- * hint is an integer multiple of the rt extent size on a directory
- * with both rtinherit and extszinherit flags set. Don't let sysadmins
- * misconfigure directories.
+ * Inode verifiers do not check that the extent size hint is an integer
+ * multiple of the rt extent size on a directory with both rtinherit
+ * and extszinherit flags set. Don't let sysadmins misconfigure
+ * directories.
*/
if ((new_diflags & XFS_DIFLAG_RTINHERIT) &&
(new_diflags & XFS_DIFLAG_EXTSZINHERIT)) {
@@ -1875,7 +1892,7 @@ out:
static inline int
xfs_fs_eofblocks_from_user(
struct xfs_fs_eofblocks *src,
- struct xfs_eofblocks *dst)
+ struct xfs_icwalk *dst)
{
if (src->eof_version != XFS_EOFBLOCKS_VERSION)
return -EINVAL;
@@ -1887,21 +1904,32 @@ xfs_fs_eofblocks_from_user(
memchr_inv(src->pad64, 0, sizeof(src->pad64)))
return -EINVAL;
- dst->eof_flags = src->eof_flags;
- dst->eof_prid = src->eof_prid;
- dst->eof_min_file_size = src->eof_min_file_size;
-
- dst->eof_uid = INVALID_UID;
+ dst->icw_flags = 0;
+ if (src->eof_flags & XFS_EOF_FLAGS_SYNC)
+ dst->icw_flags |= XFS_ICWALK_FLAG_SYNC;
+ if (src->eof_flags & XFS_EOF_FLAGS_UID)
+ dst->icw_flags |= XFS_ICWALK_FLAG_UID;
+ if (src->eof_flags & XFS_EOF_FLAGS_GID)
+ dst->icw_flags |= XFS_ICWALK_FLAG_GID;
+ if (src->eof_flags & XFS_EOF_FLAGS_PRID)
+ dst->icw_flags |= XFS_ICWALK_FLAG_PRID;
+ if (src->eof_flags & XFS_EOF_FLAGS_MINFILESIZE)
+ dst->icw_flags |= XFS_ICWALK_FLAG_MINFILESIZE;
+
+ dst->icw_prid = src->eof_prid;
+ dst->icw_min_file_size = src->eof_min_file_size;
+
+ dst->icw_uid = INVALID_UID;
if (src->eof_flags & XFS_EOF_FLAGS_UID) {
- dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid);
- if (!uid_valid(dst->eof_uid))
+ dst->icw_uid = make_kuid(current_user_ns(), src->eof_uid);
+ if (!uid_valid(dst->icw_uid))
return -EINVAL;
}
- dst->eof_gid = INVALID_GID;
+ dst->icw_gid = INVALID_GID;
if (src->eof_flags & XFS_EOF_FLAGS_GID) {
- dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid);
- if (!gid_valid(dst->eof_gid))
+ dst->icw_gid = make_kgid(current_user_ns(), src->eof_gid);
+ if (!gid_valid(dst->icw_gid))
return -EINVAL;
}
return 0;
@@ -2164,8 +2192,8 @@ xfs_file_ioctl(
return xfs_errortag_clearall(mp);
case XFS_IOC_FREE_EOFBLOCKS: {
- struct xfs_fs_eofblocks eofb;
- struct xfs_eofblocks keofb;
+ struct xfs_fs_eofblocks eofb;
+ struct xfs_icwalk icw;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -2176,14 +2204,14 @@ xfs_file_ioctl(
if (copy_from_user(&eofb, arg, sizeof(eofb)))
return -EFAULT;
- error = xfs_fs_eofblocks_from_user(&eofb, &keofb);
+ error = xfs_fs_eofblocks_from_user(&eofb, &icw);
if (error)
return error;
- trace_xfs_ioc_free_eofblocks(mp, &keofb, _RET_IP_);
+ trace_xfs_ioc_free_eofblocks(mp, &icw, _RET_IP_);
sb_start_write(mp->m_super);
- error = xfs_blockgc_free_space(mp, &keofb);
+ error = xfs_blockgc_free_space(mp, &icw);
sb_end_write(mp->m_super);
return error;
}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index dfe24b7f26e5..93c082db04b7 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -543,7 +543,7 @@ xfs_stat_blksize(
* always return the realtime extent size.
*/
if (XFS_IS_REALTIME_INODE(ip))
- return xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
+ return XFS_FSB_TO_B(mp, xfs_get_extsz_hint(ip));
/*
* Allow large block sizes to be reported to userspace programs if the
@@ -560,7 +560,7 @@ xfs_stat_blksize(
*/
if (mp->m_flags & XFS_MOUNT_LARGEIO) {
if (mp->m_swidth)
- return mp->m_swidth << mp->m_sb.sb_blocklog;
+ return XFS_FSB_TO_B(mp, mp->m_swidth);
if (mp->m_flags & XFS_MOUNT_ALLOCSIZE)
return 1U << mp->m_allocsize_log;
}
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index c4a340f1f1e1..917d51eefee3 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -21,6 +21,7 @@
#include "xfs_health.h"
#include "xfs_trans.h"
#include "xfs_pwork.h"
+#include "xfs_ag.h"
/*
* Walking Inodes in the Filesystem
@@ -51,6 +52,7 @@ struct xfs_iwalk_ag {
struct xfs_mount *mp;
struct xfs_trans *tp;
+ struct xfs_perag *pag;
/* Where do we start the traversal? */
xfs_ino_t startino;
@@ -90,7 +92,7 @@ struct xfs_iwalk_ag {
STATIC void
xfs_iwalk_ichunk_ra(
struct xfs_mount *mp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
struct xfs_inobt_rec_incore *irec)
{
struct xfs_ino_geometry *igeo = M_IGEO(mp);
@@ -106,7 +108,7 @@ xfs_iwalk_ichunk_ra(
imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster);
if (imask & ~irec->ir_free) {
- xfs_btree_reada_bufs(mp, agno, agbno,
+ xfs_btree_reada_bufs(mp, pag->pag_agno, agbno,
igeo->blocks_per_cluster,
&xfs_inode_buf_ops);
}
@@ -174,26 +176,25 @@ xfs_iwalk_free(
/* For each inuse inode in each cached inobt record, call our function. */
STATIC int
xfs_iwalk_ag_recs(
- struct xfs_iwalk_ag *iwag)
+ struct xfs_iwalk_ag *iwag)
{
- struct xfs_mount *mp = iwag->mp;
- struct xfs_trans *tp = iwag->tp;
- xfs_ino_t ino;
- unsigned int i, j;
- xfs_agnumber_t agno;
- int error;
+ struct xfs_mount *mp = iwag->mp;
+ struct xfs_trans *tp = iwag->tp;
+ struct xfs_perag *pag = iwag->pag;
+ xfs_ino_t ino;
+ unsigned int i, j;
+ int error;
- agno = XFS_INO_TO_AGNO(mp, iwag->startino);
for (i = 0; i < iwag->nr_recs; i++) {
struct xfs_inobt_rec_incore *irec = &iwag->recs[i];
- trace_xfs_iwalk_ag_rec(mp, agno, irec);
+ trace_xfs_iwalk_ag_rec(mp, pag->pag_agno, irec);
if (xfs_pwork_want_abort(&iwag->pwork))
return 0;
if (iwag->inobt_walk_fn) {
- error = iwag->inobt_walk_fn(mp, tp, agno, irec,
+ error = iwag->inobt_walk_fn(mp, tp, pag->pag_agno, irec,
iwag->data);
if (error)
return error;
@@ -211,7 +212,8 @@ xfs_iwalk_ag_recs(
continue;
/* Otherwise call our function. */
- ino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino + j);
+ ino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
+ irec->ir_startino + j);
error = iwag->iwalk_fn(mp, tp, ino, iwag->data);
if (error)
return error;
@@ -257,7 +259,6 @@ xfs_iwalk_del_inobt(
STATIC int
xfs_iwalk_ag_start(
struct xfs_iwalk_ag *iwag,
- xfs_agnumber_t agno,
xfs_agino_t agino,
struct xfs_btree_cur **curpp,
struct xfs_buf **agi_bpp,
@@ -265,12 +266,13 @@ xfs_iwalk_ag_start(
{
struct xfs_mount *mp = iwag->mp;
struct xfs_trans *tp = iwag->tp;
+ struct xfs_perag *pag = iwag->pag;
struct xfs_inobt_rec_incore *irec;
int error;
/* Set up a fresh cursor and empty the inobt cache. */
iwag->nr_recs = 0;
- error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp);
+ error = xfs_inobt_cur(mp, tp, pag, XFS_BTNUM_INO, curpp, agi_bpp);
if (error)
return error;
@@ -304,7 +306,7 @@ xfs_iwalk_ag_start(
if (XFS_IS_CORRUPT(mp, *has_more != 1))
return -EFSCORRUPTED;
- iwag->lastino = XFS_AGINO_TO_INO(mp, agno,
+ iwag->lastino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
irec->ir_startino + XFS_INODES_PER_CHUNK - 1);
/*
@@ -345,7 +347,6 @@ out_advance:
STATIC int
xfs_iwalk_run_callbacks(
struct xfs_iwalk_ag *iwag,
- xfs_agnumber_t agno,
struct xfs_btree_cur **curpp,
struct xfs_buf **agi_bpp,
int *has_more)
@@ -376,7 +377,7 @@ xfs_iwalk_run_callbacks(
return 0;
/* ...and recreate the cursor just past where we left off. */
- error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp);
+ error = xfs_inobt_cur(mp, tp, iwag->pag, XFS_BTNUM_INO, curpp, agi_bpp);
if (error)
return error;
@@ -390,17 +391,17 @@ xfs_iwalk_ag(
{
struct xfs_mount *mp = iwag->mp;
struct xfs_trans *tp = iwag->tp;
+ struct xfs_perag *pag = iwag->pag;
struct xfs_buf *agi_bp = NULL;
struct xfs_btree_cur *cur = NULL;
- xfs_agnumber_t agno;
xfs_agino_t agino;
int has_more;
int error = 0;
/* Set up our cursor at the right place in the inode btree. */
- agno = XFS_INO_TO_AGNO(mp, iwag->startino);
+ ASSERT(pag->pag_agno == XFS_INO_TO_AGNO(mp, iwag->startino));
agino = XFS_INO_TO_AGINO(mp, iwag->startino);
- error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more);
+ error = xfs_iwalk_ag_start(iwag, agino, &cur, &agi_bp, &has_more);
while (!error && has_more) {
struct xfs_inobt_rec_incore *irec;
@@ -417,7 +418,7 @@ xfs_iwalk_ag(
break;
/* Make sure that we always move forward. */
- rec_fsino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino);
+ rec_fsino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino);
if (iwag->lastino != NULLFSINO &&
XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) {
error = -EFSCORRUPTED;
@@ -438,7 +439,7 @@ xfs_iwalk_ag(
* walking the inodes.
*/
if (iwag->iwalk_fn)
- xfs_iwalk_ichunk_ra(mp, agno, irec);
+ xfs_iwalk_ichunk_ra(mp, pag, irec);
/*
* If there's space in the buffer for more records, increment
@@ -458,15 +459,14 @@ xfs_iwalk_ag(
* we would be if we had been able to increment like above.
*/
ASSERT(has_more);
- error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp,
- &has_more);
+ error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more);
}
if (iwag->nr_recs == 0 || error)
goto out;
/* Walk the unprocessed records in the cache. */
- error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, &has_more);
+ error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more);
out:
xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
@@ -555,6 +555,7 @@ xfs_iwalk(
.pwork = XFS_PWORK_SINGLE_THREADED,
.lastino = NULLFSINO,
};
+ struct xfs_perag *pag;
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
int error;
@@ -565,15 +566,19 @@ xfs_iwalk(
if (error)
return error;
- for (; agno < mp->m_sb.sb_agcount; agno++) {
+ for_each_perag_from(mp, agno, pag) {
+ iwag.pag = pag;
error = xfs_iwalk_ag(&iwag);
if (error)
break;
iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
if (flags & XFS_INOBT_WALK_SAME_AG)
break;
+ iwag.pag = NULL;
}
+ if (iwag.pag)
+ xfs_perag_put(pag);
xfs_iwalk_free(&iwag);
return error;
}
@@ -598,6 +603,7 @@ xfs_iwalk_ag_work(
error = xfs_iwalk_ag(iwag);
xfs_iwalk_free(iwag);
out:
+ xfs_perag_put(iwag->pag);
kmem_free(iwag);
return error;
}
@@ -617,6 +623,7 @@ xfs_iwalk_threaded(
void *data)
{
struct xfs_pwork_ctl pctl;
+ struct xfs_perag *pag;
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
int error;
@@ -627,7 +634,7 @@ xfs_iwalk_threaded(
if (error)
return error;
- for (; agno < mp->m_sb.sb_agcount; agno++) {
+ for_each_perag_from(mp, agno, pag) {
struct xfs_iwalk_ag *iwag;
if (xfs_pwork_ctl_want_abort(&pctl))
@@ -635,17 +642,25 @@ xfs_iwalk_threaded(
iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
iwag->mp = mp;
+
+ /*
+ * perag is being handed off to async work, so take another
+ * reference for the async work to release.
+ */
+ atomic_inc(&pag->pag_ref);
+ iwag->pag = pag;
iwag->iwalk_fn = iwalk_fn;
iwag->data = data;
iwag->startino = startino;
iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
iwag->lastino = NULLFSINO;
xfs_pwork_queue(&pctl, &iwag->pwork);
- startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
+ startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0);
if (flags & XFS_INOBT_WALK_SAME_AG)
break;
}
-
+ if (pag)
+ xfs_perag_put(pag);
if (polled)
xfs_pwork_poll(&pctl);
return xfs_pwork_destroy(&pctl);
@@ -715,6 +730,7 @@ xfs_inobt_walk(
.pwork = XFS_PWORK_SINGLE_THREADED,
.lastino = NULLFSINO,
};
+ struct xfs_perag *pag;
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
int error;
@@ -725,15 +741,19 @@ xfs_inobt_walk(
if (error)
return error;
- for (; agno < mp->m_sb.sb_agcount; agno++) {
+ for_each_perag_from(mp, agno, pag) {
+ iwag.pag = pag;
error = xfs_iwalk_ag(&iwag);
if (error)
break;
- iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
+ iwag.startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0);
if (flags & XFS_INOBT_WALK_SAME_AG)
break;
+ iwag.pag = NULL;
}
+ if (iwag.pag)
+ xfs_perag_put(pag);
xfs_iwalk_free(&iwag);
return error;
}
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 7688663b9773..c174262a074e 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -196,6 +196,8 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
char *data, unsigned int op);
+void xfs_flush_bdev_async(struct bio *bio, struct block_device *bdev,
+ struct completion *done);
#define ASSERT_ALWAYS(expr) \
(likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__))
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index a002425377b5..60ac5fd63f1e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -78,13 +78,12 @@ xlog_verify_iclog(
STATIC void
xlog_verify_tail_lsn(
struct xlog *log,
- struct xlog_in_core *iclog,
- xfs_lsn_t tail_lsn);
+ struct xlog_in_core *iclog);
#else
#define xlog_verify_dest_ptr(a,b)
#define xlog_verify_grant_tail(a)
#define xlog_verify_iclog(a,b,c)
-#define xlog_verify_tail_lsn(a,b,c)
+#define xlog_verify_tail_lsn(a,b)
#endif
STATIC int
@@ -487,67 +486,81 @@ out_error:
return error;
}
-static bool
-__xlog_state_release_iclog(
- struct xlog *log,
- struct xlog_in_core *iclog)
-{
- lockdep_assert_held(&log->l_icloglock);
-
- if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
- /* update tail before writing to iclog */
- xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
-
- iclog->ic_state = XLOG_STATE_SYNCING;
- iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
- xlog_verify_tail_lsn(log, iclog, tail_lsn);
- /* cycle incremented when incrementing curr_block */
- return true;
- }
-
- ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
- return false;
-}
-
/*
* Flush iclog to disk if this is the last reference to the given iclog and the
* it is in the WANT_SYNC state.
+ *
+ * If the caller passes in a non-zero @old_tail_lsn and the current log tail
+ * does not match, there may be metadata on disk that must be persisted before
+ * this iclog is written. To satisfy that requirement, set the
+ * XLOG_ICL_NEED_FLUSH flag as a condition for writing this iclog with the new
+ * log tail value.
+ *
+ * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the
+ * log tail is updated correctly. NEED_FUA indicates that the iclog will be
+ * written to stable storage, and implies that a commit record is contained
+ * within the iclog. We need to ensure that the log tail does not move beyond
+ * the tail that the first commit record in the iclog ordered against, otherwise
+ * correct recovery of that checkpoint becomes dependent on future operations
+ * performed on this iclog.
+ *
+ * Hence if NEED_FUA is set and the current iclog tail lsn is empty, write the
+ * current tail into iclog. Once the iclog tail is set, future operations must
+ * not modify it, otherwise they potentially violate ordering constraints for
+ * the checkpoint commit that wrote the initial tail lsn value. The tail lsn in
+ * the iclog will get zeroed on activation of the iclog after sync, so we
+ * always capture the tail lsn on the iclog on the first NEED_FUA release
+ * regardless of the number of active reference counts on this iclog.
*/
-static int
+
+int
xlog_state_release_iclog(
struct xlog *log,
- struct xlog_in_core *iclog)
+ struct xlog_in_core *iclog,
+ xfs_lsn_t old_tail_lsn)
{
+ xfs_lsn_t tail_lsn;
lockdep_assert_held(&log->l_icloglock);
+ trace_xlog_iclog_release(iclog, _RET_IP_);
if (iclog->ic_state == XLOG_STATE_IOERROR)
return -EIO;
- if (atomic_dec_and_test(&iclog->ic_refcnt) &&
- __xlog_state_release_iclog(log, iclog)) {
- spin_unlock(&log->l_icloglock);
- xlog_sync(log, iclog);
- spin_lock(&log->l_icloglock);
- }
+ /*
+ * Grabbing the current log tail needs to be atomic w.r.t. the writing
+ * of the tail LSN into the iclog so we guarantee that the log tail does
+ * not move between deciding if a cache flush is required and writing
+ * the LSN into the iclog below.
+ */
+ if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+ tail_lsn = xlog_assign_tail_lsn(log->l_mp);
- return 0;
-}
+ if (old_tail_lsn && tail_lsn != old_tail_lsn)
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
-void
-xfs_log_release_iclog(
- struct xlog_in_core *iclog)
-{
- struct xlog *log = iclog->ic_log;
- bool sync = false;
+ if ((iclog->ic_flags & XLOG_ICL_NEED_FUA) &&
+ !iclog->ic_header.h_tail_lsn)
+ iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+ }
- if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) {
- if (iclog->ic_state != XLOG_STATE_IOERROR)
- sync = __xlog_state_release_iclog(log, iclog);
- spin_unlock(&log->l_icloglock);
+ if (!atomic_dec_and_test(&iclog->ic_refcnt))
+ return 0;
+
+ if (iclog->ic_state != XLOG_STATE_WANT_SYNC) {
+ ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+ return 0;
}
- if (sync)
- xlog_sync(log, iclog);
+ iclog->ic_state = XLOG_STATE_SYNCING;
+ if (!iclog->ic_header.h_tail_lsn)
+ iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+ xlog_verify_tail_lsn(log, iclog);
+ trace_xlog_iclog_syncing(iclog, _RET_IP_);
+
+ spin_unlock(&log->l_icloglock);
+ xlog_sync(log, iclog);
+ spin_lock(&log->l_icloglock);
+ return 0;
}
/*
@@ -770,6 +783,9 @@ xfs_log_mount_finish(
if (readonly)
mp->m_flags |= XFS_MOUNT_RDONLY;
+ /* Make sure the log is dead if we're returning failure. */
+ ASSERT(!error || (mp->m_log->l_flags & XLOG_IO_ERROR));
+
return error;
}
@@ -786,16 +802,34 @@ xfs_log_mount_cancel(
}
/*
- * Wait for the iclog to be written disk, or return an error if the log has been
- * shut down.
+ * Flush out the iclog to disk ensuring that device caches are flushed and
+ * the iclog hits stable storage before any completion waiters are woken.
*/
-static int
+static inline int
+xlog_force_iclog(
+ struct xlog_in_core *iclog)
+{
+ atomic_inc(&iclog->ic_refcnt);
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+ if (iclog->ic_state == XLOG_STATE_ACTIVE)
+ xlog_state_switch_iclogs(iclog->ic_log, iclog, 0);
+ return xlog_state_release_iclog(iclog->ic_log, iclog, 0);
+}
+
+/*
+ * Wait for the iclog and all prior iclogs to be written disk as required by the
+ * log force state machine. Waiting on ic_force_wait ensures iclog completions
+ * have been ordered and callbacks run before we are woken here, hence
+ * guaranteeing that all the iclogs up to this one are on stable storage.
+ */
+int
xlog_wait_on_iclog(
struct xlog_in_core *iclog)
__releases(iclog->ic_log->l_icloglock)
{
struct xlog *log = iclog->ic_log;
+ trace_xlog_iclog_wait_on(iclog, _RET_IP_);
if (!XLOG_FORCED_SHUTDOWN(log) &&
iclog->ic_state != XLOG_STATE_ACTIVE &&
iclog->ic_state != XLOG_STATE_DIRTY) {
@@ -818,9 +852,7 @@ xlog_wait_on_iclog(
static int
xlog_write_unmount_record(
struct xlog *log,
- struct xlog_ticket *ticket,
- xfs_lsn_t *lsn,
- uint flags)
+ struct xlog_ticket *ticket)
{
struct xfs_unmount_log_format ulf = {
.magic = XLOG_UNMOUNT_TYPE,
@@ -837,7 +869,8 @@ xlog_write_unmount_record(
/* account for space used by record data */
ticket->t_curr_res -= sizeof(ulf);
- return xlog_write(log, &vec, ticket, lsn, NULL, flags, false);
+
+ return xlog_write(log, &vec, ticket, NULL, NULL, XLOG_UNMOUNT_TRANS);
}
/*
@@ -851,15 +884,13 @@ xlog_unmount_write(
struct xfs_mount *mp = log->l_mp;
struct xlog_in_core *iclog;
struct xlog_ticket *tic = NULL;
- xfs_lsn_t lsn;
- uint flags = XLOG_UNMOUNT_TRANS;
int error;
error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
if (error)
goto out_err;
- error = xlog_write_unmount_record(log, tic, &lsn, flags);
+ error = xlog_write_unmount_record(log, tic);
/*
* At this point, we're umounting anyway, so there's no point in
* transitioning log state to IOERROR. Just continue...
@@ -870,13 +901,7 @@ out_err:
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
- atomic_inc(&iclog->ic_refcnt);
- if (iclog->ic_state == XLOG_STATE_ACTIVE)
- xlog_state_switch_iclogs(log, iclog, 0);
- else
- ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
- iclog->ic_state == XLOG_STATE_IOERROR);
- error = xlog_state_release_iclog(log, iclog);
+ error = xlog_force_iclog(iclog);
xlog_wait_on_iclog(iclog);
if (tic) {
@@ -1401,6 +1426,11 @@ xlog_alloc_log(
xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
+ if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
+ log->l_iclog_roundoff = mp->m_sb.sb_logsunit;
+ else
+ log->l_iclog_roundoff = BBSIZE;
+
xlog_grant_head_init(&log->l_reserve_head);
xlog_grant_head_init(&log->l_write_head);
@@ -1479,7 +1509,6 @@ xlog_alloc_log(
iclog->ic_state = XLOG_STATE_ACTIVE;
iclog->ic_log = log;
atomic_set(&iclog->ic_refcnt, 0);
- spin_lock_init(&iclog->ic_callback_lock);
INIT_LIST_HEAD(&iclog->ic_callbacks);
iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
@@ -1546,8 +1575,7 @@ xlog_commit_record(
if (XLOG_FORCED_SHUTDOWN(log))
return -EIO;
- error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS,
- false);
+ error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS);
if (error)
xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
return error;
@@ -1753,10 +1781,10 @@ xlog_write_iclog(
struct xlog *log,
struct xlog_in_core *iclog,
uint64_t bno,
- unsigned int count,
- bool need_flush)
+ unsigned int count)
{
ASSERT(bno < log->l_logBBsize);
+ trace_xlog_iclog_write(iclog, _RET_IP_);
/*
* We lock the iclogbufs here so that we can serialise against I/O
@@ -1792,10 +1820,22 @@ xlog_write_iclog(
* writeback throttle from throttling log writes behind background
* metadata writeback and causing priority inversions.
*/
- iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC |
- REQ_IDLE | REQ_FUA;
- if (need_flush)
+ iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE;
+ if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) {
iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
+ /*
+ * For external log devices, we also need to flush the data
+ * device cache first to ensure all metadata writeback covered
+ * by the LSN in this iclog is on stable storage. This is slow,
+ * but it *must* complete before we issue the external log IO.
+ */
+ if (log->l_targ != log->l_mp->m_ddev_targp)
+ blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev);
+ }
+ if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
+ iclog->ic_bio.bi_opf |= REQ_FUA;
+
+ iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
@@ -1854,29 +1894,15 @@ xlog_calc_iclog_size(
uint32_t *roundoff)
{
uint32_t count_init, count;
- bool use_lsunit;
-
- use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
- log->l_mp->m_sb.sb_logsunit > 1;
/* Add for LR header */
count_init = log->l_iclog_hsize + iclog->ic_offset;
+ count = roundup(count_init, log->l_iclog_roundoff);
- /* Round out the log write size */
- if (use_lsunit) {
- /* we have a v2 stripe unit to use */
- count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
- } else {
- count = BBTOB(BTOBB(count_init));
- }
-
- ASSERT(count >= count_init);
*roundoff = count - count_init;
- if (use_lsunit)
- ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit);
- else
- ASSERT(*roundoff < BBTOB(1));
+ ASSERT(count >= count_init);
+ ASSERT(*roundoff < log->l_iclog_roundoff);
return count;
}
@@ -1912,9 +1938,9 @@ xlog_sync(
unsigned int roundoff; /* roundoff to BB or stripe */
uint64_t bno;
unsigned int size;
- bool need_flush = true, split = false;
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
+ trace_xlog_iclog_sync(iclog, _RET_IP_);
count = xlog_calc_iclog_size(log, iclog, &roundoff);
@@ -1937,10 +1963,8 @@ xlog_sync(
bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn));
/* Do we need to split this write into 2 parts? */
- if (bno + BTOBB(count) > log->l_logBBsize) {
+ if (bno + BTOBB(count) > log->l_logBBsize)
xlog_split_iclog(log, &iclog->ic_header, bno, count);
- split = true;
- }
/* calculcate the checksum */
iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
@@ -1961,22 +1985,8 @@ xlog_sync(
be64_to_cpu(iclog->ic_header.h_lsn));
}
#endif
-
- /*
- * Flush the data device before flushing the log to make sure all meta
- * data written back from the AIL actually made it to disk before
- * stamping the new log tail LSN into the log buffer. For an external
- * log we need to issue the flush explicitly, and unfortunately
- * synchronously here; for an internal log we can simply use the block
- * layer state machine for preflushes.
- */
- if (log->l_targ != log->l_mp->m_ddev_targp || split) {
- xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
- need_flush = false;
- }
-
xlog_verify_iclog(log, iclog, count);
- xlog_write_iclog(log, iclog, bno, count, need_flush);
+ xlog_write_iclog(log, iclog, bno, count);
}
/*
@@ -2158,13 +2168,16 @@ static int
xlog_write_calc_vec_length(
struct xlog_ticket *ticket,
struct xfs_log_vec *log_vector,
- bool need_start_rec)
+ uint optype)
{
struct xfs_log_vec *lv;
- int headers = need_start_rec ? 1 : 0;
+ int headers = 0;
int len = 0;
int i;
+ if (optype & XLOG_START_TRANS)
+ headers++;
+
for (lv = log_vector; lv; lv = lv->lv_next) {
/* we don't write ordered log vectors */
if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
@@ -2332,7 +2345,7 @@ xlog_write_copy_finish(
return 0;
release_iclog:
- error = xlog_state_release_iclog(log, iclog);
+ error = xlog_state_release_iclog(log, iclog, 0);
spin_unlock(&log->l_icloglock);
return error;
}
@@ -2384,8 +2397,7 @@ xlog_write(
struct xlog_ticket *ticket,
xfs_lsn_t *start_lsn,
struct xlog_in_core **commit_iclog,
- uint flags,
- bool need_start_rec)
+ uint optype)
{
struct xlog_in_core *iclog = NULL;
struct xfs_log_vec *lv = log_vector;
@@ -2413,8 +2425,9 @@ xlog_write(
xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
}
- len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec);
- *start_lsn = 0;
+ len = xlog_write_calc_vec_length(ticket, log_vector, optype);
+ if (start_lsn)
+ *start_lsn = 0;
while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
void *ptr;
int log_offset;
@@ -2427,8 +2440,8 @@ xlog_write(
ASSERT(log_offset <= iclog->ic_size - 1);
ptr = iclog->ic_datap + log_offset;
- /* start_lsn is the first lsn written to. That's all we need. */
- if (!*start_lsn)
+ /* Start_lsn is the first lsn written to. */
+ if (start_lsn && !*start_lsn)
*start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
/*
@@ -2441,6 +2454,7 @@ xlog_write(
int copy_len;
int copy_off;
bool ordered = false;
+ bool wrote_start_rec = false;
/* ordered log vectors have no regions to write */
if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
@@ -2458,13 +2472,15 @@ xlog_write(
* write a start record. Only do this for the first
* iclog we write to.
*/
- if (need_start_rec) {
+ if (optype & XLOG_START_TRANS) {
xlog_write_start_rec(ptr, ticket);
xlog_write_adv_cnt(&ptr, &len, &log_offset,
sizeof(struct xlog_op_header));
+ optype &= ~XLOG_START_TRANS;
+ wrote_start_rec = true;
}
- ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
+ ophdr = xlog_write_setup_ophdr(log, ptr, ticket, optype);
if (!ophdr)
return -EIO;
@@ -2495,14 +2511,13 @@ xlog_write(
}
copy_len += sizeof(struct xlog_op_header);
record_cnt++;
- if (need_start_rec) {
+ if (wrote_start_rec) {
copy_len += sizeof(struct xlog_op_header);
record_cnt++;
- need_start_rec = false;
}
data_cnt += contwr ? copy_len : 0;
- error = xlog_write_copy_finish(log, iclog, flags,
+ error = xlog_write_copy_finish(log, iclog, optype,
&record_cnt, &data_cnt,
&partial_copy,
&partial_copy_len,
@@ -2546,10 +2561,10 @@ next_lv:
spin_lock(&log->l_icloglock);
xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
if (commit_iclog) {
- ASSERT(flags & XLOG_COMMIT_TRANS);
+ ASSERT(optype & XLOG_COMMIT_TRANS);
*commit_iclog = iclog;
} else {
- error = xlog_state_release_iclog(log, iclog);
+ error = xlog_state_release_iclog(log, iclog, 0);
}
spin_unlock(&log->l_icloglock);
@@ -2562,6 +2577,7 @@ xlog_state_activate_iclog(
int *iclogs_changed)
{
ASSERT(list_empty_careful(&iclog->ic_callbacks));
+ trace_xlog_iclog_activate(iclog, _RET_IP_);
/*
* If the number of ops in this iclog indicate it just contains the
@@ -2586,6 +2602,7 @@ xlog_state_activate_iclog(
memset(iclog->ic_header.h_cycle_data, 0,
sizeof(iclog->ic_header.h_cycle_data));
iclog->ic_header.h_lsn = 0;
+ iclog->ic_header.h_tail_lsn = 0;
}
/*
@@ -2652,6 +2669,8 @@ xlog_state_clean_iclog(
{
int iclogs_changed = 0;
+ trace_xlog_iclog_clean(dirty_iclog, _RET_IP_);
+
dirty_iclog->ic_state = XLOG_STATE_DIRTY;
xlog_state_activate_iclogs(log, &iclogs_changed);
@@ -2711,6 +2730,7 @@ xlog_state_set_callback(
struct xlog_in_core *iclog,
xfs_lsn_t header_lsn)
{
+ trace_xlog_iclog_callback(iclog, _RET_IP_);
iclog->ic_state = XLOG_STATE_CALLBACK;
ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
@@ -2776,43 +2796,6 @@ xlog_state_iodone_process_iclog(
}
}
-/*
- * Keep processing entries in the iclog callback list until we come around and
- * it is empty. We need to atomically see that the list is empty and change the
- * state to DIRTY so that we don't miss any more callbacks being added.
- *
- * This function is called with the icloglock held and returns with it held. We
- * drop it while running callbacks, however, as holding it over thousands of
- * callbacks is unnecessary and causes excessive contention if we do.
- */
-static void
-xlog_state_do_iclog_callbacks(
- struct xlog *log,
- struct xlog_in_core *iclog)
- __releases(&log->l_icloglock)
- __acquires(&log->l_icloglock)
-{
- spin_unlock(&log->l_icloglock);
- spin_lock(&iclog->ic_callback_lock);
- while (!list_empty(&iclog->ic_callbacks)) {
- LIST_HEAD(tmp);
-
- list_splice_init(&iclog->ic_callbacks, &tmp);
-
- spin_unlock(&iclog->ic_callback_lock);
- xlog_cil_process_committed(&tmp);
- spin_lock(&iclog->ic_callback_lock);
- }
-
- /*
- * Pick up the icloglock while still holding the callback lock so we
- * serialise against anyone trying to add more callbacks to this iclog
- * now we've finished processing.
- */
- spin_lock(&log->l_icloglock);
- spin_unlock(&iclog->ic_callback_lock);
-}
-
STATIC void
xlog_state_do_callback(
struct xlog *log)
@@ -2841,6 +2824,8 @@ xlog_state_do_callback(
repeats++;
do {
+ LIST_HEAD(cb_list);
+
if (xlog_state_iodone_process_iclog(log, iclog,
&ioerror))
break;
@@ -2850,13 +2835,15 @@ xlog_state_do_callback(
iclog = iclog->ic_next;
continue;
}
+ list_splice_init(&iclog->ic_callbacks, &cb_list);
+ spin_unlock(&log->l_icloglock);
- /*
- * Running callbacks will drop the icloglock which means
- * we'll have to run at least one more complete loop.
- */
+ trace_xlog_iclog_callbacks_start(iclog, _RET_IP_);
+ xlog_cil_process_committed(&cb_list);
+ trace_xlog_iclog_callbacks_done(iclog, _RET_IP_);
cycled_icloglock = true;
- xlog_state_do_iclog_callbacks(log, iclog);
+
+ spin_lock(&log->l_icloglock);
if (XLOG_FORCED_SHUTDOWN(log))
wake_up_all(&iclog->ic_force_wait);
else
@@ -2902,6 +2889,7 @@ xlog_state_done_syncing(
spin_lock(&log->l_icloglock);
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
+ trace_xlog_iclog_sync_done(iclog, _RET_IP_);
/*
* If we got an error, either on the first buffer, or in the case of
@@ -2975,6 +2963,8 @@ restart:
atomic_inc(&iclog->ic_refcnt); /* prevents sync */
log_offset = iclog->ic_offset;
+ trace_xlog_iclog_get_space(iclog, _RET_IP_);
+
/* On the 1st write to an iclog, figure out lsn. This works
* if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are
* committing to. If the offset is set, that's how many blocks
@@ -3013,7 +3003,7 @@ restart:
* reference to the iclog.
*/
if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
- error = xlog_state_release_iclog(log, iclog);
+ error = xlog_state_release_iclog(log, iclog, 0);
spin_unlock(&log->l_icloglock);
if (error)
return error;
@@ -3140,6 +3130,7 @@ xlog_state_switch_iclogs(
{
ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
assert_spin_locked(&log->l_icloglock);
+ trace_xlog_iclog_switch(iclog, _RET_IP_);
if (!eventual_size)
eventual_size = iclog->ic_offset;
@@ -3152,9 +3143,8 @@ xlog_state_switch_iclogs(
log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
/* Round up to next log-sunit */
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
- log->l_mp->m_sb.sb_logsunit > 1) {
- uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit);
+ if (log->l_iclog_roundoff > BBSIZE) {
+ uint32_t sunit_bb = BTOBB(log->l_iclog_roundoff);
log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
}
@@ -3178,6 +3168,35 @@ xlog_state_switch_iclogs(
}
/*
+ * Force the iclog to disk and check if the iclog has been completed before
+ * xlog_force_iclog() returns. This can happen on synchronous (e.g.
+ * pmem) or fast async storage because we drop the icloglock to issue the IO.
+ * If completion has already occurred, tell the caller so that it can avoid an
+ * unnecessary wait on the iclog.
+ */
+static int
+xlog_force_and_check_iclog(
+ struct xlog_in_core *iclog,
+ bool *completed)
+{
+ xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ int error;
+
+ *completed = false;
+ error = xlog_force_iclog(iclog);
+ if (error)
+ return error;
+
+ /*
+ * If the iclog has already been completed and reused the header LSN
+ * will have been rewritten by completion
+ */
+ if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
+ *completed = true;
+ return 0;
+}
+
+/*
* Write out all data in the in-core log as of this exact moment in time.
*
* Data may be written to the in-core log during this call. However,
@@ -3211,7 +3230,6 @@ xfs_log_force(
{
struct xlog *log = mp->m_log;
struct xlog_in_core *iclog;
- xfs_lsn_t lsn;
XFS_STATS_INC(mp, xs_log_force);
trace_xfs_log_force(mp, 0, _RET_IP_);
@@ -3223,6 +3241,8 @@ xfs_log_force(
if (iclog->ic_state == XLOG_STATE_IOERROR)
goto out_error;
+ trace_xlog_iclog_force(iclog, _RET_IP_);
+
if (iclog->ic_state == XLOG_STATE_DIRTY ||
(iclog->ic_state == XLOG_STATE_ACTIVE &&
atomic_read(&iclog->ic_refcnt) == 0 && iclog->ic_offset == 0)) {
@@ -3237,39 +3257,33 @@ xfs_log_force(
iclog = iclog->ic_prev;
} else if (iclog->ic_state == XLOG_STATE_ACTIVE) {
if (atomic_read(&iclog->ic_refcnt) == 0) {
- /*
- * We are the only one with access to this iclog.
- *
- * Flush it out now. There should be a roundoff of zero
- * to show that someone has already taken care of the
- * roundoff from the previous sync.
- */
- atomic_inc(&iclog->ic_refcnt);
- lsn = be64_to_cpu(iclog->ic_header.h_lsn);
- xlog_state_switch_iclogs(log, iclog, 0);
- if (xlog_state_release_iclog(log, iclog))
+ /* We have exclusive access to this iclog. */
+ bool completed;
+
+ if (xlog_force_and_check_iclog(iclog, &completed))
goto out_error;
- if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
+ if (completed)
goto out_unlock;
} else {
/*
- * Someone else is writing to this iclog.
- *
- * Use its call to flush out the data. However, the
- * other thread may not force out this LR, so we mark
- * it WANT_SYNC.
+ * Someone else is still writing to this iclog, so we
+ * need to ensure that when they release the iclog it
+ * gets synced immediately as we may be waiting on it.
*/
xlog_state_switch_iclogs(log, iclog, 0);
}
- } else {
- /*
- * If the head iclog is not active nor dirty, we just attach
- * ourselves to the head and go to sleep if necessary.
- */
- ;
}
+ /*
+ * The iclog we are about to wait on may contain the checkpoint pushed
+ * by the above xlog_cil_force() call, but it may not have been pushed
+ * to disk yet. Like the ACTIVE case above, we need to make sure caches
+ * are flushed when this iclog is written.
+ */
+ if (iclog->ic_state == XLOG_STATE_WANT_SYNC)
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+
if (flags & XFS_LOG_SYNC)
return xlog_wait_on_iclog(iclog);
out_unlock:
@@ -3281,15 +3295,15 @@ out_error:
}
static int
-__xfs_log_force_lsn(
- struct xfs_mount *mp,
+xlog_force_lsn(
+ struct xlog *log,
xfs_lsn_t lsn,
uint flags,
int *log_flushed,
bool already_slept)
{
- struct xlog *log = mp->m_log;
struct xlog_in_core *iclog;
+ bool completed;
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
@@ -3297,12 +3311,14 @@ __xfs_log_force_lsn(
goto out_error;
while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
+ trace_xlog_iclog_force_lsn(iclog, _RET_IP_);
iclog = iclog->ic_next;
if (iclog == log->l_iclog)
goto out_unlock;
}
- if (iclog->ic_state == XLOG_STATE_ACTIVE) {
+ switch (iclog->ic_state) {
+ case XLOG_STATE_ACTIVE:
/*
* We sleep here if we haven't already slept (e.g. this is the
* first time we've looked at the correct iclog buf) and the
@@ -3321,18 +3337,35 @@ __xfs_log_force_lsn(
if (!already_slept &&
(iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC ||
iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) {
- XFS_STATS_INC(mp, xs_log_force_sleep);
-
xlog_wait(&iclog->ic_prev->ic_write_wait,
&log->l_icloglock);
return -EAGAIN;
}
- atomic_inc(&iclog->ic_refcnt);
- xlog_state_switch_iclogs(log, iclog, 0);
- if (xlog_state_release_iclog(log, iclog))
+ if (xlog_force_and_check_iclog(iclog, &completed))
goto out_error;
if (log_flushed)
*log_flushed = 1;
+ if (completed)
+ goto out_unlock;
+ break;
+ case XLOG_STATE_WANT_SYNC:
+ /*
+ * This iclog may contain the checkpoint pushed by the
+ * xlog_cil_force_seq() call, but there are other writers still
+ * accessing it so it hasn't been pushed to disk yet. Like the
+ * ACTIVE case above, we need to make sure caches are flushed
+ * when this iclog is written.
+ */
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+ break;
+ default:
+ /*
+ * The entire checkpoint was written by the CIL force and is on
+ * its way to disk already. It will be stable when it
+ * completes, so we don't need to manipulate caches here at all.
+ * We just need to wait for completion if necessary.
+ */
+ break;
}
if (flags & XFS_LOG_SYNC)
@@ -3360,25 +3393,29 @@ out_error:
* to disk, that thread will wake up all threads waiting on the queue.
*/
int
-xfs_log_force_lsn(
+xfs_log_force_seq(
struct xfs_mount *mp,
- xfs_lsn_t lsn,
+ xfs_csn_t seq,
uint flags,
int *log_flushed)
{
+ struct xlog *log = mp->m_log;
+ xfs_lsn_t lsn;
int ret;
- ASSERT(lsn != 0);
+ ASSERT(seq != 0);
XFS_STATS_INC(mp, xs_log_force);
- trace_xfs_log_force(mp, lsn, _RET_IP_);
+ trace_xfs_log_force(mp, seq, _RET_IP_);
- lsn = xlog_cil_force_lsn(mp->m_log, lsn);
+ lsn = xlog_cil_force_seq(log, seq);
if (lsn == NULLCOMMITLSN)
return 0;
- ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false);
- if (ret == -EAGAIN)
- ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true);
+ ret = xlog_force_lsn(log, lsn, flags, log_flushed, false);
+ if (ret == -EAGAIN) {
+ XFS_STATS_INC(mp, xs_log_force_sleep);
+ ret = xlog_force_lsn(log, lsn, flags, log_flushed, true);
+ }
return ret;
}
@@ -3407,12 +3444,11 @@ xfs_log_ticket_get(
* Figure out the total log space unit (in bytes) that would be
* required for a log ticket.
*/
-int
-xfs_log_calc_unit_res(
- struct xfs_mount *mp,
+static int
+xlog_calc_unit_res(
+ struct xlog *log,
int unit_bytes)
{
- struct xlog *log = mp->m_log;
int iclog_space;
uint num_headers;
@@ -3488,18 +3524,20 @@ xfs_log_calc_unit_res(
/* for commit-rec LR header - note: padding will subsume the ophdr */
unit_bytes += log->l_iclog_hsize;
- /* for roundoff padding for transaction data and one for commit record */
- if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) {
- /* log su roundoff */
- unit_bytes += 2 * mp->m_sb.sb_logsunit;
- } else {
- /* BB roundoff */
- unit_bytes += 2 * BBSIZE;
- }
+ /* roundoff padding for transaction data and one for commit record */
+ unit_bytes += 2 * log->l_iclog_roundoff;
return unit_bytes;
}
+int
+xfs_log_calc_unit_res(
+ struct xfs_mount *mp,
+ int unit_bytes)
+{
+ return xlog_calc_unit_res(mp->m_log, unit_bytes);
+}
+
/*
* Allocate and initialise a new log ticket.
*/
@@ -3516,7 +3554,7 @@ xlog_ticket_alloc(
tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL);
- unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes);
+ unit_res = xlog_calc_unit_res(log, unit_bytes);
atomic_set(&tic->t_ref, 1);
tic->t_task = current;
@@ -3600,10 +3638,10 @@ xlog_verify_grant_tail(
STATIC void
xlog_verify_tail_lsn(
struct xlog *log,
- struct xlog_in_core *iclog,
- xfs_lsn_t tail_lsn)
+ struct xlog_in_core *iclog)
{
- int blocks;
+ xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn);
+ int blocks;
if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
blocks =
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 044e02cb8921..813b972e9788 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -106,7 +106,7 @@ struct xfs_item_ops;
struct xfs_trans;
int xfs_log_force(struct xfs_mount *mp, uint flags);
-int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags,
+int xfs_log_force_seq(struct xfs_mount *mp, xfs_csn_t seq, uint flags,
int *log_forced);
int xfs_log_mount(struct xfs_mount *mp,
struct xfs_buftarg *log_target,
@@ -117,7 +117,6 @@ void xfs_log_mount_cancel(struct xfs_mount *);
xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
void xfs_log_space_wake(struct xfs_mount *mp);
-void xfs_log_release_iclog(struct xlog_in_core *iclog);
int xfs_log_reserve(struct xfs_mount *mp,
int length,
int count,
@@ -132,8 +131,6 @@ bool xfs_log_writable(struct xfs_mount *mp);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
void xfs_log_ticket_put(struct xlog_ticket *ticket);
-void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_lsn_t *commit_lsn, bool regrant);
void xlog_cil_process_committed(struct list_head *list);
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index b0ef071b3cb5..4c44bc3786c0 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -654,8 +654,11 @@ xlog_cil_push_work(
struct xfs_trans_header thdr;
struct xfs_log_iovec lhdr;
struct xfs_log_vec lvhdr = { NULL };
+ xfs_lsn_t preflush_tail_lsn;
xfs_lsn_t commit_lsn;
- xfs_lsn_t push_seq;
+ xfs_csn_t push_seq;
+ struct bio bio;
+ DECLARE_COMPLETION_ONSTACK(bdev_flush);
new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS);
new_ctx->ticket = xlog_cil_ticket_alloc(log);
@@ -668,9 +671,14 @@ xlog_cil_push_work(
ASSERT(push_seq <= ctx->sequence);
/*
- * Wake up any background push waiters now this context is being pushed.
+ * As we are about to switch to a new, empty CIL context, we no longer
+ * need to throttle tasks on CIL space overruns. Wake any waiters that
+ * the hard push throttle may have caught so they can start committing
+ * to the new context. The ctx->xc_push_lock provides the serialisation
+ * necessary for safely using the lockless waitqueue_active() check in
+ * this context.
*/
- if (ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log))
+ if (waitqueue_active(&cil->xc_push_wait))
wake_up_all(&cil->xc_push_wait);
/*
@@ -719,10 +727,27 @@ xlog_cil_push_work(
spin_unlock(&cil->xc_push_lock);
/*
- * pull all the log vectors off the items in the CIL, and
- * remove the items from the CIL. We don't need the CIL lock
- * here because it's only needed on the transaction commit
- * side which is currently locked out by the flush lock.
+ * The CIL is stable at this point - nothing new will be added to it
+ * because we hold the flush lock exclusively. Hence we can now issue
+ * a cache flush to ensure all the completed metadata in the journal we
+ * are about to overwrite is on stable storage.
+ *
+ * Because we are issuing this cache flush before we've written the
+ * tail lsn to the iclog, we can have metadata IO completions move the
+ * tail forwards between the completion of this flush and the iclog
+ * being written. In this case, we need to re-issue the cache flush
+ * before the iclog write. To detect whether the log tail moves, sample
+ * the tail LSN *before* we issue the flush.
+ */
+ preflush_tail_lsn = atomic64_read(&log->l_tail_lsn);
+ xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev,
+ &bdev_flush);
+
+ /*
+ * Pull all the log vectors off the items in the CIL, and remove the
+ * items from the CIL. We don't need the CIL lock here because it's only
+ * needed on the transaction commit side which is currently locked out
+ * by the flush lock.
*/
lv = NULL;
num_iovecs = 0;
@@ -772,7 +797,7 @@ xlog_cil_push_work(
* that higher sequences will wait for us to write out a commit record
* before they do.
*
- * xfs_log_force_lsn requires us to mirror the new sequence into the cil
+ * xfs_log_force_seq requires us to mirror the new sequence into the cil
* structure atomically with the addition of this sequence to the
* committing list. This also ensures that we can do unlocked checks
* against the current sequence in log forces without risking
@@ -806,7 +831,14 @@ xlog_cil_push_work(
lvhdr.lv_iovecp = &lhdr;
lvhdr.lv_next = ctx->lv_chain;
- error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true);
+ /*
+ * Before we format and submit the first iclog, we have to ensure that
+ * the metadata writeback ordering cache flush is complete.
+ */
+ wait_for_completion(&bdev_flush);
+
+ error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL,
+ XLOG_START_TRANS);
if (error)
goto out_abort_free_ticket;
@@ -850,15 +882,21 @@ restart:
xfs_log_ticket_ungrant(log, tic);
- spin_lock(&commit_iclog->ic_callback_lock);
+ /*
+ * Once we attach the ctx to the iclog, a shutdown can process the
+ * iclog, run the callbacks and free the ctx. The only thing preventing
+ * this potential UAF situation here is that we are holding the
+ * icloglock. Hence we cannot access the ctx once we have attached the
+ * callbacks and dropped the icloglock.
+ */
+ spin_lock(&log->l_icloglock);
if (commit_iclog->ic_state == XLOG_STATE_IOERROR) {
- spin_unlock(&commit_iclog->ic_callback_lock);
+ spin_unlock(&log->l_icloglock);
goto out_abort;
}
ASSERT_ALWAYS(commit_iclog->ic_state == XLOG_STATE_ACTIVE ||
commit_iclog->ic_state == XLOG_STATE_WANT_SYNC);
list_add_tail(&ctx->iclog_entry, &commit_iclog->ic_callbacks);
- spin_unlock(&commit_iclog->ic_callback_lock);
/*
* now the checkpoint commit is complete and we've attached the
@@ -870,8 +908,50 @@ restart:
wake_up_all(&cil->xc_commit_wait);
spin_unlock(&cil->xc_push_lock);
- /* release the hounds! */
- xfs_log_release_iclog(commit_iclog);
+ /*
+ * If the checkpoint spans multiple iclogs, wait for all previous iclogs
+ * to complete before we submit the commit_iclog. We can't use state
+ * checks for this - ACTIVE can be either a past completed iclog or a
+ * future iclog being filled, while WANT_SYNC through SYNC_DONE can be a
+ * past or future iclog awaiting IO or ordered IO completion to be run.
+ * In the latter case, if it's a future iclog and we wait on it, the we
+ * will hang because it won't get processed through to ic_force_wait
+ * wakeup until this commit_iclog is written to disk. Hence we use the
+ * iclog header lsn and compare it to the commit lsn to determine if we
+ * need to wait on iclogs or not.
+ *
+ * NOTE: It is not safe to reference the ctx after this check as we drop
+ * the icloglock if we have to wait for completion of other iclogs.
+ */
+ if (ctx->start_lsn != commit_lsn) {
+ xfs_lsn_t plsn;
+
+ plsn = be64_to_cpu(commit_iclog->ic_prev->ic_header.h_lsn);
+ if (plsn && XFS_LSN_CMP(plsn, commit_lsn) < 0) {
+ /*
+ * Waiting on ic_force_wait orders the completion of
+ * iclogs older than ic_prev. Hence we only need to wait
+ * on the most recent older iclog here.
+ */
+ xlog_wait_on_iclog(commit_iclog->ic_prev);
+ spin_lock(&log->l_icloglock);
+ }
+
+ /*
+ * We need to issue a pre-flush so that the ordering for this
+ * checkpoint is correctly preserved down to stable storage.
+ */
+ commit_iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
+ }
+
+ /*
+ * The commit iclog must be written to stable storage to guarantee
+ * journal IO vs metadata writeback IO is correctly ordered on stable
+ * storage.
+ */
+ commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA;
+ xlog_state_release_iclog(log, commit_iclog, preflush_tail_lsn);
+ spin_unlock(&log->l_icloglock);
return;
out_skip:
@@ -907,7 +987,7 @@ xlog_cil_push_background(
ASSERT(!list_empty(&cil->xc_cil));
/*
- * don't do a background push if we haven't used up all the
+ * Don't do a background push if we haven't used up all the
* space available yet.
*/
if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
@@ -931,9 +1011,16 @@ xlog_cil_push_background(
/*
* If we are well over the space limit, throttle the work that is being
- * done until the push work on this context has begun.
+ * done until the push work on this context has begun. Enforce the hard
+ * throttle on all transaction commits once it has been activated, even
+ * if the committing transactions have resulted in the space usage
+ * dipping back down under the hard limit.
+ *
+ * The ctx->xc_push_lock provides the serialisation necessary for safely
+ * using the lockless waitqueue_active() check in this context.
*/
- if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
+ if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) ||
+ waitqueue_active(&cil->xc_push_wait)) {
trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
ASSERT(cil->xc_ctx->space_used < log->l_logsize);
xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
@@ -1008,16 +1095,14 @@ xlog_cil_empty(
* allowed again.
*/
void
-xfs_log_commit_cil(
- struct xfs_mount *mp,
+xlog_cil_commit(
+ struct xlog *log,
struct xfs_trans *tp,
- xfs_lsn_t *commit_lsn,
+ xfs_csn_t *commit_seq,
bool regrant)
{
- struct xlog *log = mp->m_log;
struct xfs_cil *cil = log->l_cilp;
struct xfs_log_item *lip, *next;
- xfs_lsn_t xc_commit_lsn;
/*
* Do all necessary memory allocation before we lock the CIL.
@@ -1031,10 +1116,6 @@ xfs_log_commit_cil(
xlog_cil_insert_items(log, tp);
- xc_commit_lsn = cil->xc_ctx->sequence;
- if (commit_lsn)
- *commit_lsn = xc_commit_lsn;
-
if (regrant && !XLOG_FORCED_SHUTDOWN(log))
xfs_log_ticket_regrant(log, tp->t_ticket);
else
@@ -1057,8 +1138,10 @@ xfs_log_commit_cil(
list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
xfs_trans_del_item(lip);
if (lip->li_ops->iop_committing)
- lip->li_ops->iop_committing(lip, xc_commit_lsn);
+ lip->li_ops->iop_committing(lip, cil->xc_ctx->sequence);
}
+ if (commit_seq)
+ *commit_seq = cil->xc_ctx->sequence;
/* xlog_cil_push_background() releases cil->xc_ctx_lock */
xlog_cil_push_background(log);
@@ -1075,9 +1158,9 @@ xfs_log_commit_cil(
* iclog flush is necessary following this call.
*/
xfs_lsn_t
-xlog_cil_force_lsn(
+xlog_cil_force_seq(
struct xlog *log,
- xfs_lsn_t sequence)
+ xfs_csn_t sequence)
{
struct xfs_cil *cil = log->l_cilp;
struct xfs_cil_ctx *ctx;
@@ -1173,21 +1256,17 @@ bool
xfs_log_item_in_current_chkpt(
struct xfs_log_item *lip)
{
- struct xfs_cil_ctx *ctx;
+ struct xfs_cil_ctx *ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
if (list_empty(&lip->li_cil))
return false;
- ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
-
/*
* li_seq is written on the first commit of a log item to record the
* first checkpoint it is written to. Hence if it is different to the
* current sequence, we're in a new checkpoint.
*/
- if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
- return false;
- return true;
+ return lip->li_seq == ctx->sequence;
}
/*
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 1c6fdbf3d506..f3e79a45d60a 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -50,6 +50,26 @@ enum xlog_iclog_state {
XLOG_STATE_IOERROR, /* IO error happened in sync'ing log */
};
+#define XLOG_STATE_STRINGS \
+ { XLOG_STATE_ACTIVE, "XLOG_STATE_ACTIVE" }, \
+ { XLOG_STATE_WANT_SYNC, "XLOG_STATE_WANT_SYNC" }, \
+ { XLOG_STATE_SYNCING, "XLOG_STATE_SYNCING" }, \
+ { XLOG_STATE_DONE_SYNC, "XLOG_STATE_DONE_SYNC" }, \
+ { XLOG_STATE_CALLBACK, "XLOG_STATE_CALLBACK" }, \
+ { XLOG_STATE_DIRTY, "XLOG_STATE_DIRTY" }, \
+ { XLOG_STATE_IOERROR, "XLOG_STATE_IOERROR" }
+
+/*
+ * In core log flags
+ */
+#define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */
+#define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */
+
+#define XLOG_ICL_STRINGS \
+ { XLOG_ICL_NEED_FLUSH, "XLOG_ICL_NEED_FLUSH" }, \
+ { XLOG_ICL_NEED_FUA, "XLOG_ICL_NEED_FUA" }
+
+
/*
* Log ticket flags
*/
@@ -201,10 +221,8 @@ typedef struct xlog_in_core {
u32 ic_size;
u32 ic_offset;
enum xlog_iclog_state ic_state;
+ unsigned int ic_flags;
char *ic_datap; /* pointer to iclog data */
-
- /* Callback structures need their own cacheline */
- spinlock_t ic_callback_lock ____cacheline_aligned_in_smp;
struct list_head ic_callbacks;
/* reference counts need their own cacheline */
@@ -230,7 +248,7 @@ struct xfs_cil;
struct xfs_cil_ctx {
struct xfs_cil *cil;
- xfs_lsn_t sequence; /* chkpt sequence # */
+ xfs_csn_t sequence; /* chkpt sequence # */
xfs_lsn_t start_lsn; /* first LSN of chkpt commit */
xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
struct xlog_ticket *ticket; /* chkpt ticket */
@@ -268,10 +286,10 @@ struct xfs_cil {
struct xfs_cil_ctx *xc_ctx;
spinlock_t xc_push_lock ____cacheline_aligned_in_smp;
- xfs_lsn_t xc_push_seq;
+ xfs_csn_t xc_push_seq;
struct list_head xc_committing;
wait_queue_head_t xc_commit_wait;
- xfs_lsn_t xc_current_sequence;
+ xfs_csn_t xc_current_sequence;
struct work_struct xc_push_work;
wait_queue_head_t xc_push_wait; /* background push throttle */
} ____cacheline_aligned_in_smp;
@@ -436,6 +454,8 @@ struct xlog {
#endif
/* log recovery lsn tracking (for buffer submission */
xfs_lsn_t l_recovery_lsn;
+
+ uint32_t l_iclog_roundoff;/* padding roundoff */
};
#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
@@ -478,13 +498,15 @@ void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
void xlog_print_trans(struct xfs_trans *);
int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector,
struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
- struct xlog_in_core **commit_iclog, uint flags,
- bool need_start_rec);
+ struct xlog_in_core **commit_iclog, uint optype);
int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket,
struct xlog_in_core **iclog, xfs_lsn_t *lsn);
void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
+int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog,
+ xfs_lsn_t log_tail_lsn);
+
/*
* When we crack an atomic LSN, we sample it first so that the value will not
* change while we are cracking it into the component values. This means we
@@ -547,19 +569,18 @@ int xlog_cil_init(struct xlog *log);
void xlog_cil_init_post_recovery(struct xlog *log);
void xlog_cil_destroy(struct xlog *log);
bool xlog_cil_empty(struct xlog *log);
+void xlog_cil_commit(struct xlog *log, struct xfs_trans *tp,
+ xfs_csn_t *commit_seq, bool regrant);
/*
* CIL force routines
*/
-xfs_lsn_t
-xlog_cil_force_lsn(
- struct xlog *log,
- xfs_lsn_t sequence);
+xfs_lsn_t xlog_cil_force_seq(struct xlog *log, xfs_csn_t sequence);
static inline void
xlog_cil_force(struct xlog *log)
{
- xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
+ xlog_cil_force_seq(log, log->l_cilp->xc_current_sequence);
}
/*
@@ -582,6 +603,8 @@ xlog_wait(
remove_wait_queue(wq, &wait);
}
+int xlog_wait_on_iclog(struct xlog_in_core *iclog);
+
/*
* The LSN is valid so long as it is behind the current LSN. If it isn't, this
* means that the next log record that includes this metadata could have a
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index e5dd1c0c2f03..1721fce2ec94 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -25,6 +25,7 @@
#include "xfs_icache.h"
#include "xfs_error.h"
#include "xfs_buf_item.h"
+#include "xfs_ag.h"
#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
@@ -2457,8 +2458,10 @@ xlog_finish_defer_ops(
error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
- if (error)
+ if (error) {
+ xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
return error;
+ }
/*
* Transfer to this new transaction all the dfops we captured
@@ -2741,21 +2744,17 @@ STATIC void
xlog_recover_process_iunlinks(
struct xlog *log)
{
- xfs_mount_t *mp;
- xfs_agnumber_t agno;
- xfs_agi_t *agi;
- struct xfs_buf *agibp;
- xfs_agino_t agino;
- int bucket;
- int error;
-
- mp = log->l_mp;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ struct xfs_agi *agi;
+ struct xfs_buf *agibp;
+ xfs_agino_t agino;
+ int bucket;
+ int error;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- /*
- * Find the agi for this ag.
- */
- error = xfs_read_agi(mp, NULL, agno, &agibp);
+ for_each_perag(mp, agno, pag) {
+ error = xfs_read_agi(mp, NULL, pag->pag_agno, &agibp);
if (error) {
/*
* AGI is b0rked. Don't process it.
@@ -2781,7 +2780,7 @@ xlog_recover_process_iunlinks(
agino = be32_to_cpu(agi->agi_unlinked[bucket]);
while (agino != NULLAGINO) {
agino = xlog_recover_process_one_iunlink(mp,
- agno, agino, bucket);
+ pag->pag_agno, agino, bucket);
cond_resched();
}
}
@@ -3452,6 +3451,7 @@ xlog_recover_finish(
* this) before we get around to xfs_log_mount_cancel.
*/
xlog_recover_cancel_intents(log);
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
xfs_alert(log->l_mp, "Failed to recover intents");
return error;
}
@@ -3493,27 +3493,28 @@ xlog_recover_cancel(
*/
STATIC void
xlog_recover_check_summary(
- struct xlog *log)
+ struct xlog *log)
{
- xfs_mount_t *mp;
- struct xfs_buf *agfbp;
- struct xfs_buf *agibp;
- xfs_agnumber_t agno;
- uint64_t freeblks;
- uint64_t itotal;
- uint64_t ifree;
- int error;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_perag *pag;
+ struct xfs_buf *agfbp;
+ struct xfs_buf *agibp;
+ xfs_agnumber_t agno;
+ uint64_t freeblks;
+ uint64_t itotal;
+ uint64_t ifree;
+ int error;
mp = log->l_mp;
freeblks = 0LL;
itotal = 0LL;
ifree = 0LL;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
+ for_each_perag(mp, agno, pag) {
+ error = xfs_read_agf(mp, NULL, pag->pag_agno, 0, &agfbp);
if (error) {
xfs_alert(mp, "%s agf read failed agno %d error %d",
- __func__, agno, error);
+ __func__, pag->pag_agno, error);
} else {
struct xfs_agf *agfp = agfbp->b_addr;
@@ -3522,10 +3523,10 @@ xlog_recover_check_summary(
xfs_buf_relse(agfbp);
}
- error = xfs_read_agi(mp, NULL, agno, &agibp);
+ error = xfs_read_agi(mp, NULL, pag->pag_agno, &agibp);
if (error) {
xfs_alert(mp, "%s agi read failed agno %d error %d",
- __func__, agno, error);
+ __func__, pag->pag_agno, error);
} else {
struct xfs_agi *agi = agibp->b_addr;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index bdfee1943796..d0755494597f 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -32,6 +32,7 @@
#include "xfs_extent_busy.h"
#include "xfs_health.h"
#include "xfs_trace.h"
+#include "xfs_ag.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
@@ -119,41 +120,6 @@ xfs_uuid_unmount(
mutex_unlock(&xfs_uuid_table_mutex);
}
-
-STATIC void
-__xfs_free_perag(
- struct rcu_head *head)
-{
- struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
-
- ASSERT(!delayed_work_pending(&pag->pag_blockgc_work));
- ASSERT(atomic_read(&pag->pag_ref) == 0);
- kmem_free(pag);
-}
-
-/*
- * Free up the per-ag resources associated with the mount structure.
- */
-STATIC void
-xfs_free_perag(
- xfs_mount_t *mp)
-{
- xfs_agnumber_t agno;
- struct xfs_perag *pag;
-
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- spin_lock(&mp->m_perag_lock);
- pag = radix_tree_delete(&mp->m_perag_tree, agno);
- spin_unlock(&mp->m_perag_lock);
- ASSERT(pag);
- ASSERT(atomic_read(&pag->pag_ref) == 0);
- cancel_delayed_work_sync(&pag->pag_blockgc_work);
- xfs_iunlink_destroy(pag);
- xfs_buf_hash_destroy(pag);
- call_rcu(&pag->rcu_head, __xfs_free_perag);
- }
-}
-
/*
* Check size of device based on the (data/realtime) block count.
* Note: this check is used by the growfs code as well as mount.
@@ -172,96 +138,6 @@ xfs_sb_validate_fsb_count(
return 0;
}
-int
-xfs_initialize_perag(
- xfs_mount_t *mp,
- xfs_agnumber_t agcount,
- xfs_agnumber_t *maxagi)
-{
- xfs_agnumber_t index;
- xfs_agnumber_t first_initialised = NULLAGNUMBER;
- xfs_perag_t *pag;
- int error = -ENOMEM;
-
- /*
- * Walk the current per-ag tree so we don't try to initialise AGs
- * that already exist (growfs case). Allocate and insert all the
- * AGs we don't find ready for initialisation.
- */
- for (index = 0; index < agcount; index++) {
- pag = xfs_perag_get(mp, index);
- if (pag) {
- xfs_perag_put(pag);
- continue;
- }
-
- pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
- if (!pag) {
- error = -ENOMEM;
- goto out_unwind_new_pags;
- }
- pag->pag_agno = index;
- pag->pag_mount = mp;
- spin_lock_init(&pag->pag_ici_lock);
- INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
- INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
-
- error = xfs_buf_hash_init(pag);
- if (error)
- goto out_free_pag;
- init_waitqueue_head(&pag->pagb_wait);
- spin_lock_init(&pag->pagb_lock);
- pag->pagb_count = 0;
- pag->pagb_tree = RB_ROOT;
-
- error = radix_tree_preload(GFP_NOFS);
- if (error)
- goto out_hash_destroy;
-
- spin_lock(&mp->m_perag_lock);
- if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
- WARN_ON_ONCE(1);
- spin_unlock(&mp->m_perag_lock);
- radix_tree_preload_end();
- error = -EEXIST;
- goto out_hash_destroy;
- }
- spin_unlock(&mp->m_perag_lock);
- radix_tree_preload_end();
- /* first new pag is fully initialized */
- if (first_initialised == NULLAGNUMBER)
- first_initialised = index;
- error = xfs_iunlink_init(pag);
- if (error)
- goto out_hash_destroy;
- spin_lock_init(&pag->pag_state_lock);
- }
-
- index = xfs_set_inode_alloc(mp, agcount);
-
- if (maxagi)
- *maxagi = index;
-
- mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
- return 0;
-
-out_hash_destroy:
- xfs_buf_hash_destroy(pag);
-out_free_pag:
- kmem_free(pag);
-out_unwind_new_pags:
- /* unwind any prior newly initialized pags */
- for (index = first_initialised; index < agcount; index++) {
- pag = radix_tree_delete(&mp->m_perag_tree, index);
- if (!pag)
- break;
- xfs_buf_hash_destroy(pag);
- xfs_iunlink_destroy(pag);
- kmem_free(pag);
- }
- return error;
-}
-
/*
* xfs_readsb
*
@@ -983,9 +859,17 @@ xfs_mountfs(
/*
* Finish recovering the file system. This part needed to be delayed
* until after the root and real-time bitmap inodes were consistently
- * read in.
+ * read in. Temporarily create per-AG space reservations for metadata
+ * btree shape changes because space freeing transactions (for inode
+ * inactivation) require the per-AG reservation in lieu of reserving
+ * blocks.
*/
+ error = xfs_fs_reserve_ag_blocks(mp);
+ if (error && error == -ENOSPC)
+ xfs_warn(mp,
+ "ENOSPC reserving per-AG metadata pool, log recovery may fail.");
error = xfs_log_mount_finish(mp);
+ xfs_fs_unreserve_ag_blocks(mp);
if (error) {
xfs_warn(mp, "log mount finish failed");
goto out_rtunmount;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index bb67274ee23f..c78b63fe779a 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -12,6 +12,7 @@ struct xfs_mru_cache;
struct xfs_ail;
struct xfs_quotainfo;
struct xfs_da_geometry;
+struct xfs_perag;
/* dynamic preallocation free space thresholds, 5% down to 1% */
enum {
@@ -297,117 +298,12 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
}
-/* per-AG block reservation data structures*/
-struct xfs_ag_resv {
- /* number of blocks originally reserved here */
- xfs_extlen_t ar_orig_reserved;
- /* number of blocks reserved here */
- xfs_extlen_t ar_reserved;
- /* number of blocks originally asked for */
- xfs_extlen_t ar_asked;
-};
-
-/*
- * Per-ag incore structure, copies of information in agf and agi, to improve the
- * performance of allocation group selection.
- */
-typedef struct xfs_perag {
- struct xfs_mount *pag_mount; /* owner filesystem */
- xfs_agnumber_t pag_agno; /* AG this structure belongs to */
- atomic_t pag_ref; /* perag reference count */
- char pagf_init; /* this agf's entry is initialized */
- char pagi_init; /* this agi's entry is initialized */
- char pagf_metadata; /* the agf is preferred to be metadata */
- char pagi_inodeok; /* The agi is ok for inodes */
- uint8_t pagf_levels[XFS_BTNUM_AGF];
- /* # of levels in bno & cnt btree */
- bool pagf_agflreset; /* agfl requires reset before use */
- uint32_t pagf_flcount; /* count of blocks in freelist */
- xfs_extlen_t pagf_freeblks; /* total free blocks */
- xfs_extlen_t pagf_longest; /* longest free space */
- uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
- xfs_agino_t pagi_freecount; /* number of free inodes */
- xfs_agino_t pagi_count; /* number of allocated inodes */
-
- /*
- * Inode allocation search lookup optimisation.
- * If the pagino matches, the search for new inodes
- * doesn't need to search the near ones again straight away
- */
- xfs_agino_t pagl_pagino;
- xfs_agino_t pagl_leftrec;
- xfs_agino_t pagl_rightrec;
-
- /*
- * Bitsets of per-ag metadata that have been checked and/or are sick.
- * Callers should hold pag_state_lock before accessing this field.
- */
- uint16_t pag_checked;
- uint16_t pag_sick;
- spinlock_t pag_state_lock;
-
- spinlock_t pagb_lock; /* lock for pagb_tree */
- struct rb_root pagb_tree; /* ordered tree of busy extents */
- unsigned int pagb_gen; /* generation count for pagb_tree */
- wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */
-
- atomic_t pagf_fstrms; /* # of filestreams active in this AG */
-
- spinlock_t pag_ici_lock; /* incore inode cache lock */
- struct radix_tree_root pag_ici_root; /* incore inode cache root */
- int pag_ici_reclaimable; /* reclaimable inodes */
- unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
-
- /* buffer cache index */
- spinlock_t pag_buf_lock; /* lock for pag_buf_hash */
- struct rhashtable pag_buf_hash;
-
- /* for rcu-safe freeing */
- struct rcu_head rcu_head;
- int pagb_count; /* pagb slots in use */
-
- /* Blocks reserved for all kinds of metadata. */
- struct xfs_ag_resv pag_meta_resv;
- /* Blocks reserved for the reverse mapping btree. */
- struct xfs_ag_resv pag_rmapbt_resv;
-
- /* background prealloc block trimming */
- struct delayed_work pag_blockgc_work;
-
- /* reference count */
- uint8_t pagf_refcount_level;
-
- /*
- * Unlinked inode information. This incore information reflects
- * data stored in the AGI, so callers must hold the AGI buffer lock
- * or have some other means to control concurrency.
- */
- struct rhashtable pagi_unlinked_hash;
-} xfs_perag_t;
-
-static inline struct xfs_ag_resv *
-xfs_perag_resv(
- struct xfs_perag *pag,
- enum xfs_ag_resv_type type)
-{
- switch (type) {
- case XFS_AG_RESV_METADATA:
- return &pag->pag_meta_resv;
- case XFS_AG_RESV_RMAPBT:
- return &pag->pag_rmapbt_resv;
- default:
- return NULL;
- }
-}
-
-int xfs_buf_hash_init(xfs_perag_t *pag);
-void xfs_buf_hash_destroy(xfs_perag_t *pag);
+int xfs_buf_hash_init(struct xfs_perag *pag);
+void xfs_buf_hash_destroy(struct xfs_perag *pag);
extern void xfs_uuid_table_free(void);
extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
extern int xfs_mountfs(xfs_mount_t *mp);
-extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount,
- xfs_agnumber_t *maxagi);
extern void xfs_unmountfs(xfs_mount_t *);
extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 4bf949a89d0d..fe341f3fd419 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -23,6 +23,8 @@
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_error.h"
+#include "xfs_ag.h"
+#include "xfs_ialloc.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -787,8 +789,12 @@ xfs_qm_qino_alloc(
return error;
if (need_alloc) {
- error = xfs_dir_ialloc(&init_user_ns, &tp, NULL, S_IFREG, 1, 0,
- 0, false, ipp);
+ xfs_ino_t ino;
+
+ error = xfs_dialloc(&tp, 0, S_IFREG, &ino);
+ if (!error)
+ error = xfs_init_new_inode(&init_user_ns, tp, NULL, ino,
+ S_IFREG, 1, 0, 0, false, ipp);
if (error) {
xfs_trans_cancel(tp);
return error;
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index e3dabab44097..ebbb484c49dc 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -142,7 +142,6 @@ extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
/* dquot stuff */
extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint);
-extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
/* quota ops */
extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 11f1e2fbf22f..13a56e1ea15c 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -201,7 +201,8 @@ xfs_qm_scall_quotaoff(
* depend on the quota inodes (and other things) being valid as long as
* we keep the lock(s).
*/
- xfs_qm_dqrele_all_inodes(mp, flags);
+ error = xfs_dqrele_all_inodes(mp, flags);
+ ASSERT(!error);
/*
* Next we make the changes in the quota flag in the mount struct.
@@ -747,54 +748,3 @@ xfs_qm_scall_getquota_next(
xfs_qm_dqput(dqp);
return error;
}
-
-STATIC int
-xfs_dqrele_inode(
- struct xfs_inode *ip,
- void *args)
-{
- uint *flags = args;
-
- /* skip quota inodes */
- if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
- ip == ip->i_mount->m_quotainfo->qi_gquotaip ||
- ip == ip->i_mount->m_quotainfo->qi_pquotaip) {
- ASSERT(ip->i_udquot == NULL);
- ASSERT(ip->i_gdquot == NULL);
- ASSERT(ip->i_pdquot == NULL);
- return 0;
- }
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if ((*flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
- xfs_qm_dqrele(ip->i_udquot);
- ip->i_udquot = NULL;
- }
- if ((*flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) {
- xfs_qm_dqrele(ip->i_gdquot);
- ip->i_gdquot = NULL;
- }
- if ((*flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) {
- xfs_qm_dqrele(ip->i_pdquot);
- ip->i_pdquot = NULL;
- }
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return 0;
-}
-
-
-/*
- * Go thru all the inodes in the file system, releasing their dquots.
- *
- * Note that the mount structure gets modified to indicate that quotas are off
- * AFTER this, in the case of quotaoff.
- */
-void
-xfs_qm_dqrele_all_inodes(
- struct xfs_mount *mp,
- uint flags)
-{
- ASSERT(mp->m_quotainfo);
- xfs_inode_walk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode,
- &flags, XFS_ICI_NO_TAG);
-}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 060695d6d56a..c256104772cb 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -27,7 +27,7 @@
#include "xfs_quota.h"
#include "xfs_reflink.h"
#include "xfs_iomap.h"
-#include "xfs_sb.h"
+#include "xfs_ag.h"
#include "xfs_ag_resv.h"
/*
@@ -144,7 +144,7 @@ xfs_reflink_find_shared(
if (error)
return error;
- cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);
+ cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agbp->b_pag);
error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
find_end_of_shared);
@@ -755,16 +755,19 @@ int
xfs_reflink_recover_cow(
struct xfs_mount *mp)
{
+ struct xfs_perag *pag;
xfs_agnumber_t agno;
int error = 0;
if (!xfs_sb_version_hasreflink(&mp->m_sb))
return 0;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- error = xfs_refcount_recover_cow_leftovers(mp, agno);
- if (error)
+ for_each_perag(mp, agno, pag) {
+ error = xfs_refcount_recover_cow_leftovers(mp, pag);
+ if (error) {
+ xfs_perag_put(pag);
break;
+ }
}
return error;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 4e7be6b4ca8e..699066fb9052 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -923,16 +923,41 @@ xfs_growfs_rt(
uint8_t *rsum_cache; /* old summary cache */
sbp = &mp->m_sb;
- /*
- * Initial error checking.
- */
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
- (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
- (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
+
+ /* Needs to have been mounted with an rt device. */
+ if (!XFS_IS_REALTIME_MOUNT(mp))
+ return -EINVAL;
+ /*
+ * Mount should fail if the rt bitmap/summary files don't load, but
+ * we'll check anyway.
+ */
+ if (!mp->m_rbmip || !mp->m_rsumip)
+ return -EINVAL;
+
+ /* Shrink not supported. */
+ if (in->newblocks <= sbp->sb_rblocks)
+ return -EINVAL;
+
+ /* Can only change rt extent size when adding rt volume. */
+ if (sbp->sb_rblocks > 0 && in->extsize != sbp->sb_rextsize)
+ return -EINVAL;
+
+ /* Range check the extent size. */
+ if (XFS_FSB_TO_B(mp, in->extsize) > XFS_MAX_RTEXTSIZE ||
+ XFS_FSB_TO_B(mp, in->extsize) < XFS_MIN_RTEXTSIZE)
return -EINVAL;
- if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks)))
+
+ /* Unsupported realtime features. */
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb) ||
+ xfs_sb_version_hasreflink(&mp->m_sb))
+ return -EOPNOTSUPP;
+
+ nrblocks = in->newblocks;
+ error = xfs_sb_validate_fsb_count(sbp, nrblocks);
+ if (error)
return error;
/*
* Read in the last block of the device, make sure it exists.
@@ -996,7 +1021,8 @@ xfs_growfs_rt(
((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
bmbno < nrbmblocks;
bmbno++) {
- xfs_trans_t *tp;
+ struct xfs_trans *tp;
+ xfs_rfsblock_t nrblocks_step;
*nmp = *mp;
nsbp = &nmp->m_sb;
@@ -1005,10 +1031,9 @@ xfs_growfs_rt(
*/
nsbp->sb_rextsize = in->extsize;
nsbp->sb_rbmblocks = bmbno + 1;
- nsbp->sb_rblocks =
- XFS_RTMIN(nrblocks,
- nsbp->sb_rbmblocks * NBBY *
- nsbp->sb_blocksize * nsbp->sb_rextsize);
+ nrblocks_step = (bmbno + 1) * NBBY * nsbp->sb_blocksize *
+ nsbp->sb_rextsize;
+ nsbp->sb_rblocks = min(nrblocks, nrblocks_step);
nsbp->sb_rextents = nsbp->sb_rblocks;
do_div(nsbp->sb_rextents, nsbp->sb_rextsize);
ASSERT(nsbp->sb_rextents != 0);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index a2dab05332ac..2c9e26a44546 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -36,6 +36,7 @@
#include "xfs_bmap_item.h"
#include "xfs_reflink.h"
#include "xfs_pwork.h"
+#include "xfs_ag.h"
#include <linux/magic.h>
#include <linux/fs_context.h>
@@ -339,13 +340,6 @@ xfs_blkdev_put(
blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
-void
-xfs_blkdev_issue_flush(
- xfs_buftarg_t *buftarg)
-{
- blkdev_issue_flush(buftarg->bt_bdev);
-}
-
STATIC void
xfs_close_devices(
struct xfs_mount *mp)
@@ -667,7 +661,7 @@ xfs_fs_destroy_inode(
* reclaim path handles this more efficiently than we can here, so
* simply let background reclaim tear down all inodes.
*/
- xfs_inode_set_reclaim_tag(ip);
+ xfs_inode_mark_reclaimable(ip);
}
static void
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index d2b40dc60dfc..167d23f92ffe 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -87,7 +87,6 @@ struct xfs_buftarg;
struct block_device;
extern void xfs_flush_inodes(struct xfs_mount *mp);
-extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
xfs_agnumber_t agcount);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 99fbec32c10a..1525636f4065 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -21,6 +21,7 @@
#include "xfs_trans_space.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
+#include "xfs_ialloc.h"
/* ----- Kernel only functions below ----- */
int
@@ -161,6 +162,7 @@ xfs_symlink(
struct xfs_dquot *gdqp = NULL;
struct xfs_dquot *pdqp = NULL;
uint resblks;
+ xfs_ino_t ino;
*ipp = NULL;
@@ -223,8 +225,11 @@ xfs_symlink(
/*
* Allocate an inode for the symlink.
*/
- error = xfs_dir_ialloc(mnt_userns, &tp, dp, S_IFLNK | (mode & ~S_IFMT),
- 1, 0, prid, false, &ip);
+ error = xfs_dialloc(&tp, dp->i_ino, S_IFLNK, &ino);
+ if (!error)
+ error = xfs_init_new_inode(mnt_userns, tp, dp, ino,
+ S_IFLNK | (mode & ~S_IFMT), 1, 0, prid,
+ false, &ip);
if (error)
goto out_trans_cancel;
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 9b8d703dc9fd..7e01e00550ac 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -30,6 +30,8 @@
#include "xfs_fsmap.h"
#include "xfs_btree_staging.h"
#include "xfs_icache.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 808ae337b222..19260291ff8b 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -24,6 +24,7 @@ struct xlog_ticket;
struct xlog_recover;
struct xlog_recover_item;
struct xlog_rec_header;
+struct xlog_in_core;
struct xfs_buf_log_format;
struct xfs_inode_log_format;
struct xfs_bmbt_irec;
@@ -37,7 +38,7 @@ struct xfs_trans_res;
struct xfs_inobt_rec_incore;
union xfs_btree_ptr;
struct xfs_dqtrx;
-struct xfs_eofblocks;
+struct xfs_icwalk;
#define XFS_ATTR_FILTER_FLAGS \
{ XFS_ATTR_ROOT, "ROOT" }, \
@@ -153,10 +154,8 @@ DEFINE_EVENT(xfs_perag_class, name, \
DEFINE_PERAG_REF_EVENT(xfs_perag_get);
DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
DEFINE_PERAG_REF_EVENT(xfs_perag_put);
-DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
-DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
-DEFINE_PERAG_REF_EVENT(xfs_perag_set_blockgc);
-DEFINE_PERAG_REF_EVENT(xfs_perag_clear_blockgc);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag);
DECLARE_EVENT_CLASS(xfs_ag_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
@@ -632,8 +631,8 @@ DEFINE_EVENT(xfs_inode_class, name, \
TP_PROTO(struct xfs_inode *ip), \
TP_ARGS(ip))
DEFINE_INODE_EVENT(xfs_iget_skip);
-DEFINE_INODE_EVENT(xfs_iget_reclaim);
-DEFINE_INODE_EVENT(xfs_iget_reclaim_fail);
+DEFINE_INODE_EVENT(xfs_iget_recycle);
+DEFINE_INODE_EVENT(xfs_iget_recycle_fail);
DEFINE_INODE_EVENT(xfs_iget_hit);
DEFINE_INODE_EVENT(xfs_iget_miss);
@@ -1914,7 +1913,6 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old);
DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new);
DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work);
-DEFINE_ATTR_EVENT(xfs_attr_leaf_addname);
DEFINE_ATTR_EVENT(xfs_attr_leaf_create);
DEFINE_ATTR_EVENT(xfs_attr_leaf_compact);
DEFINE_ATTR_EVENT(xfs_attr_leaf_get);
@@ -1944,7 +1942,6 @@ DEFINE_ATTR_EVENT(xfs_attr_refillstate);
DEFINE_ATTR_EVENT(xfs_attr_rmtval_get);
DEFINE_ATTR_EVENT(xfs_attr_rmtval_set);
-DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove);
#define DEFINE_DA_EVENT(name) \
DEFINE_EVENT(xfs_da_class, name, \
@@ -3730,7 +3727,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot,
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
__entry->btnum = cur->bc_btnum;
- __entry->agno = cur->bc_ag.agno;
+ __entry->agno = cur->bc_ag.pag->pag_agno;
__entry->agbno = cur->bc_ag.afake->af_root;
__entry->levels = cur->bc_ag.afake->af_levels;
__entry->blocks = cur->bc_ag.afake->af_blocks;
@@ -3845,7 +3842,7 @@ TRACE_EVENT(xfs_btree_bload_block,
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb);
__entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsb);
} else {
- __entry->agno = cur->bc_ag.agno;
+ __entry->agno = cur->bc_ag.pag->pag_agno;
__entry->agbno = be32_to_cpu(ptr->s);
}
__entry->nr_records = nr_records;
@@ -3887,10 +3884,10 @@ DEFINE_EVENT(xfs_timestamp_range_class, name, \
DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range);
DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range);
-DECLARE_EVENT_CLASS(xfs_eofblocks_class,
- TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb,
+DECLARE_EVENT_CLASS(xfs_icwalk_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_icwalk *icw,
unsigned long caller_ip),
- TP_ARGS(mp, eofb, caller_ip),
+ TP_ARGS(mp, icw, caller_ip),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(__u32, flags)
@@ -3898,35 +3895,100 @@ DECLARE_EVENT_CLASS(xfs_eofblocks_class,
__field(uint32_t, gid)
__field(prid_t, prid)
__field(__u64, min_file_size)
+ __field(long, scan_limit)
__field(unsigned long, caller_ip)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
- __entry->flags = eofb ? eofb->eof_flags : 0;
- __entry->uid = eofb ? from_kuid(mp->m_super->s_user_ns,
- eofb->eof_uid) : 0;
- __entry->gid = eofb ? from_kgid(mp->m_super->s_user_ns,
- eofb->eof_gid) : 0;
- __entry->prid = eofb ? eofb->eof_prid : 0;
- __entry->min_file_size = eofb ? eofb->eof_min_file_size : 0;
+ __entry->flags = icw ? icw->icw_flags : 0;
+ __entry->uid = icw ? from_kuid(mp->m_super->s_user_ns,
+ icw->icw_uid) : 0;
+ __entry->gid = icw ? from_kgid(mp->m_super->s_user_ns,
+ icw->icw_gid) : 0;
+ __entry->prid = icw ? icw->icw_prid : 0;
+ __entry->min_file_size = icw ? icw->icw_min_file_size : 0;
+ __entry->scan_limit = icw ? icw->icw_scan_limit : 0;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu caller %pS",
+ TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu scan_limit %ld caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->flags,
__entry->uid,
__entry->gid,
__entry->prid,
__entry->min_file_size,
+ __entry->scan_limit,
(char *)__entry->caller_ip)
);
-#define DEFINE_EOFBLOCKS_EVENT(name) \
-DEFINE_EVENT(xfs_eofblocks_class, name, \
- TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb, \
+#define DEFINE_ICWALK_EVENT(name) \
+DEFINE_EVENT(xfs_icwalk_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_icwalk *icw, \
unsigned long caller_ip), \
- TP_ARGS(mp, eofb, caller_ip))
-DEFINE_EOFBLOCKS_EVENT(xfs_ioc_free_eofblocks);
-DEFINE_EOFBLOCKS_EVENT(xfs_blockgc_free_space);
+ TP_ARGS(mp, icw, caller_ip))
+DEFINE_ICWALK_EVENT(xfs_ioc_free_eofblocks);
+DEFINE_ICWALK_EVENT(xfs_blockgc_free_space);
+
+TRACE_DEFINE_ENUM(XLOG_STATE_ACTIVE);
+TRACE_DEFINE_ENUM(XLOG_STATE_WANT_SYNC);
+TRACE_DEFINE_ENUM(XLOG_STATE_SYNCING);
+TRACE_DEFINE_ENUM(XLOG_STATE_DONE_SYNC);
+TRACE_DEFINE_ENUM(XLOG_STATE_CALLBACK);
+TRACE_DEFINE_ENUM(XLOG_STATE_DIRTY);
+TRACE_DEFINE_ENUM(XLOG_STATE_IOERROR);
+
+DECLARE_EVENT_CLASS(xlog_iclog_class,
+ TP_PROTO(struct xlog_in_core *iclog, unsigned long caller_ip),
+ TP_ARGS(iclog, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(uint32_t, state)
+ __field(int32_t, refcount)
+ __field(uint32_t, offset)
+ __field(uint32_t, flags)
+ __field(unsigned long long, lsn)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = iclog->ic_log->l_mp->m_super->s_dev;
+ __entry->state = iclog->ic_state;
+ __entry->refcount = atomic_read(&iclog->ic_refcnt);
+ __entry->offset = iclog->ic_offset;
+ __entry->flags = iclog->ic_flags;
+ __entry->lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx flags %s caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->state, XLOG_STATE_STRINGS),
+ __entry->refcount,
+ __entry->offset,
+ __entry->lsn,
+ __print_flags(__entry->flags, "|", XLOG_ICL_STRINGS),
+ (char *)__entry->caller_ip)
+
+);
+
+#define DEFINE_ICLOG_EVENT(name) \
+DEFINE_EVENT(xlog_iclog_class, name, \
+ TP_PROTO(struct xlog_in_core *iclog, unsigned long caller_ip), \
+ TP_ARGS(iclog, caller_ip))
+
+DEFINE_ICLOG_EVENT(xlog_iclog_activate);
+DEFINE_ICLOG_EVENT(xlog_iclog_clean);
+DEFINE_ICLOG_EVENT(xlog_iclog_callback);
+DEFINE_ICLOG_EVENT(xlog_iclog_callbacks_start);
+DEFINE_ICLOG_EVENT(xlog_iclog_callbacks_done);
+DEFINE_ICLOG_EVENT(xlog_iclog_force);
+DEFINE_ICLOG_EVENT(xlog_iclog_force_lsn);
+DEFINE_ICLOG_EVENT(xlog_iclog_get_space);
+DEFINE_ICLOG_EVENT(xlog_iclog_release);
+DEFINE_ICLOG_EVENT(xlog_iclog_switch);
+DEFINE_ICLOG_EVENT(xlog_iclog_sync);
+DEFINE_ICLOG_EVENT(xlog_iclog_syncing);
+DEFINE_ICLOG_EVENT(xlog_iclog_sync_done);
+DEFINE_ICLOG_EVENT(xlog_iclog_want_sync);
+DEFINE_ICLOG_EVENT(xlog_iclog_wait_on);
+DEFINE_ICLOG_EVENT(xlog_iclog_write);
#endif /* _TRACE_XFS_H */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 586f2992b789..87bffd12c20c 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -839,7 +839,7 @@ __xfs_trans_commit(
bool regrant)
{
struct xfs_mount *mp = tp->t_mountp;
- xfs_lsn_t commit_lsn = -1;
+ xfs_csn_t commit_seq = 0;
int error = 0;
int sync = tp->t_flags & XFS_TRANS_SYNC;
@@ -881,7 +881,7 @@ __xfs_trans_commit(
xfs_trans_apply_sb_deltas(tp);
xfs_trans_apply_dquot_deltas(tp);
- xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
+ xlog_cil_commit(mp->m_log, tp, &commit_seq, regrant);
xfs_trans_free(tp);
@@ -890,7 +890,7 @@ __xfs_trans_commit(
* log out now and wait for it.
*/
if (sync) {
- error = xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
+ error = xfs_log_force_seq(mp, commit_seq, XFS_LOG_SYNC, NULL);
XFS_STATS_INC(mp, xs_trans_sync);
} else {
XFS_STATS_INC(mp, xs_trans_async);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index ee42d98d9011..50da47f23a07 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -43,7 +43,7 @@ struct xfs_log_item {
struct list_head li_cil; /* CIL pointers */
struct xfs_log_vec *li_lv; /* active log vector */
struct xfs_log_vec *li_lv_shadow; /* standby vector */
- xfs_lsn_t li_seq; /* CIL commit seq */
+ xfs_csn_t li_seq; /* CIL commit seq */
};
/*
@@ -69,7 +69,7 @@ struct xfs_item_ops {
void (*iop_pin)(struct xfs_log_item *);
void (*iop_unpin)(struct xfs_log_item *, int remove);
uint (*iop_push)(struct xfs_log_item *, struct list_head *);
- void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn);
+ void (*iop_committing)(struct xfs_log_item *lip, xfs_csn_t seq);
void (*iop_release)(struct xfs_log_item *);
xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
int (*iop_recover)(struct xfs_log_item *lip,
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index dbf03635869c..70055d486bf7 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -705,9 +705,6 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
return 0;
bio = bio_alloc(GFP_NOFS, nr_pages);
- if (!bio)
- return -ENOMEM;
-
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = zi->i_zsector;
bio->bi_write_hint = iocb->ki_hint;